diff --git a/Cargo.lock b/Cargo.lock index 8c73cb0553..78838acb2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -282,6 +282,7 @@ dependencies = [ "rustc-workspace-hack", "rustc_version", "shell-escape", + "smallvec", ] [[package]] @@ -496,6 +497,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "45bb67a18fa91266cc7807181f62f9178a6873bfad7dc788c42e6430db40184f" +[[package]] +name = "smallvec" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbee7696b84bbf3d89a1c2eccff0850e3047ed46bfcd2e92c29a2d074d57e252" + [[package]] name = "socket2" version = "0.3.15" diff --git a/Cargo.toml b/Cargo.toml index c36a97bb0a..4413dab321 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ log = "0.4" shell-escape = "0.1.4" hex = "0.4.0" rand = "0.7" +smallvec = "1.4.2" # A noop dependency that changes in the Rust repository, it's a bit of a hack. # See the `src/tools/rustc-workspace-hack/README.md` file in `rust-lang/rust` diff --git a/bench-cargo-miri/mse/src/main.rs b/bench-cargo-miri/mse/src/main.rs index b4ad157510..57e2860710 100644 --- a/bench-cargo-miri/mse/src/main.rs +++ b/bench-cargo-miri/mse/src/main.rs @@ -2,6 +2,9 @@ static EXPECTED: &[u8] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, static PCM: &[i16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -2, 0, -2, 0, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 1, 0, 1, 0, 0, -2, 0, -2, 0, -2, 0, -2, -2, -2, -3, -3, -3, -3, -4, -2, -5, -2, -5, -2, -4, 0, -4, 0, -4, 0, -4, 1, -4, 1, -4, 2, -4, 2, -4, 2, -4, 2, -4, 2, -3, 1, -4, 0, -4, 0, -5, 0, -5, 0, -5, 0, -4, 2, -4, 3, -4, 4, -3, 5, -2, 5, -3, 6, -3, 6, -3, 5, -3, 5, -2, 4, -2, 3, -5, 0, -6, 0, -3, -2, -4, -4, -9, -5, -9, -4, -4, -2, -4, -2, -4, 0, -2, 
1, 1, 1, 4, 2, 8, 2, 12, 1, 13, 0, 12, 0, 11, 0, 8, -2, 7, 0, 7, -3, 11, -8, 15, -9, 17, -6, 17, -5, 13, -3, 7, 0, 3, 0, -2, 0, -4, 0, -4, -2, -6, 0, -14, -2, -17, -4, -8, 0, -7, 5, -17, 7, -18, 10, -7, 18, -2, 25, -3, 27, 0, 31, 4, 34, 4, 34, 8, 36, 8, 37, 2, 36, 4, 34, 8, 28, 3, 15, 0, 11, 0, 12, -5, 8, -4, 10, 0, 23, -4, 31, -8, 30, -2, 30, 0, 26, -6, 22, -6, 20, -12, 15, -19, 10, -10, 13, -14, 6, -43, -13, -43, -16, -9, -12, -10, -29, -42, -40, -37, -28, -5, -21, 1, -24, -8, -20, 4, -18, 26, -24, 44, -26, 66, -30, 86, -37, 88, -41, 72, -46, 50, -31, 28, 23, 14, 64, 16, 51, 26, 32, 34, 39, 42, 48, 35, 58, 0, 72, -36, 69, -59, 58, -98, 54, -124, 36, -103, 12, -110, 5, -173, -19, -146, -59, -4, -42, 51, 1, -23, -6, -30, -6, 45, 46, 47, 70, 6, 55, 19, 60, 38, 62, 42, 47, 61, 46, 40, 42, -19, 22, -34, 6, -35, -50, -61, -141, -37, -171, 17, -163, 26, -180, 46, -154, 80, -63, 48, -4, 18, 20, 50, 47, 58, 53, 44, 61, 57, 85, 37, 80, 0, 86, -8, 106, -95, 49, -213, -8, -131, 47, 49, 63, 40, -39, -69, -74, -37, -20, 63, -12, 58, -14, -12, 25, -31, 41, 11, 45, 76, 47, 167, 5, 261, -37, 277, -83, 183, -172, 35, -122, -79, 138, -70, 266, 69, 124, 228, 0, 391, -29, 594, -84, 702, -78, 627, -8, 551, -13, 509, 13, 372, 120, 352, 125, 622, 127, 691, 223, 362, 126, 386, -33, 915, 198, 958, 457, 456, 298, 500, 233, 1027, 469, 1096, 426, 918, 160, 1067, 141, 1220, 189, 1245, 164, 1375, 297, 1378, 503, 1299, 702, 1550, 929, 1799, 855, 1752, 547, 1830, 602, 1928, 832, 1736, 796, 1735, 933, 1961, 1385, 1935, 1562, 2105, 1485, 2716, 1449, 2948, 1305, 2768, 1205, 2716, 1346, 2531, 1450, 2470, 1653, 3117, 2111, 3370, 2176, 2696, 1947, 2925, 2305, 3846, 2658, 2425, 2184, -877, 1981, -2261, 2623, -1645, 2908, -1876, 2732, -2704, 2953, -2484, 3116, -2120, 2954, -2442, 3216, -2466, 3499, -2192, 3234, -2392, 3361, -2497, 3869, -2078, 3772, -1858, 3915, -2066, 4438, -2285, 2934, -2294, -280, -2066, -1762, -1992, -1412, -2298, -1535, -2399, -1789, -2223, -1419, -2244, -1334, -2092, -1476, -1777, -1396, -2014, -1571, -2199, -1574, -1843, -1167, -1910, -1446, -2007, -1818]; fn main() { + #[cfg(increase_thread_usage)] + let thread = std::thread::spawn(|| 4); + for _ in 0..2 { mse(PCM.len(), PCM, EXPECTED); } diff --git a/src/bin/miri.rs b/src/bin/miri.rs index ef1429a350..1117b69116 100644 --- a/src/bin/miri.rs +++ b/src/bin/miri.rs @@ -195,6 +195,9 @@ fn main() { "-Zmiri-disable-stacked-borrows" => { miri_config.stacked_borrows = false; } + "-Zmiri-disable-data-race-detector" => { + miri_config.data_race_detector = false; + } "-Zmiri-disable-alignment-check" => { miri_config.check_alignment = miri::AlignmentCheck::None; } diff --git a/src/data_race.rs b/src/data_race.rs new file mode 100644 index 0000000000..aca735e6f2 --- /dev/null +++ b/src/data_race.rs @@ -0,0 +1,1381 @@ +//! Implementation of a data-race detector using Lamport Timestamps / Vector-clocks +//! based on the Dyamic Race Detection for C++: +//! https://www.doc.ic.ac.uk/~afd/homepages/papers/pdfs/2017/POPL.pdf +//! which does not report false-positives when fences are used, and gives better +//! accuracy in presence of read-modify-write operations. +//! +//! This does not explore weak memory orders and so can still miss data-races +//! but should not report false-positives +//! +//! Data-race definiton from(https://en.cppreference.com/w/cpp/language/memory_model#Threads_and_data_races): +//! a data race occurs between two memory accesses if they are on different threads, at least one operation +//! 
is non-atomic, at least one operation is a write and neither access happens-before the other. Read the link +//! for the full definition. +//! +//! This re-uses vector indexes for threads that are known to be unable to report data-races; this is valid +//! because it only re-uses vector indexes once all currently-active (not-terminated) threads have an internal +//! vector clock that happens-after the join operation of the candidate thread. Threads that have not been joined +//! on are not considered. Since the thread's vector clock will only increase and a data-race implies that +//! there is some index x where clock[x] > thread_clock, when this is true clock[candidate-idx] > thread_clock +//! can never hold and hence a data-race can never be reported in that vector index again. +//! This means that the thread-index can be safely re-used, starting on the next timestamp for the newly created +//! thread. +//! +//! The sequentially consistent ordering corresponds to the order in which the threads +//! are currently scheduled; this means that the data-race detector has no additional +//! logic for sequentially consistent accesses at the moment since they are indistinguishable +//! from acquire/release operations. If weak memory orderings are explored then this +//! may need to change or be updated accordingly. +//! +//! Per the C++ spec for the memory model a sequentially consistent operation: +//! "A load operation with this memory order performs an acquire operation, +//! a store performs a release operation, and read-modify-write performs +//! both an acquire operation and a release operation, plus a single total +//! order exists in which all threads observe all modifications in the same +//! order (see Sequentially-consistent ordering below)" +//! So in the absence of weak memory effects a seq-cst load & a seq-cst store are identical +//! to an acquire load and a release store given the global sequentially consistent order +//! of the schedule. +//! +//! The timestamps used in the data-race detector assign each sequence of non-atomic operations +//! followed by a single atomic or concurrent operation a single timestamp. +//! Write, Read, Write, ThreadJoin will be represented by a single timestamp value on a thread. +//! This is because extra increment operations between the operations in the sequence are not +//! required for accurate reporting of data-race values. +//! +//! As per the paper, a thread's timestamp is only incremented after a release operation is performed, +//! so some atomic operations that only perform acquires do not increment the timestamp. Due to shared +//! code, some atomic operations may increment the timestamp when not necessary, but this has no effect +//! on the data-race detection code. +//! +//! FIXME: +//! currently we have our own local copy of the currently active thread index and names; this is due +//! in part to the inability to access the current location of threads.active_thread inside the AllocExtra +//! read, write and deallocate functions, and should be cleaned up in the future. 
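// --- Illustrative sketch (not one of the diff hunks; names and diagnostic text are assumptions) ---
// The kind of program this detector is meant to flag, matching the cppreference definition quoted
// in the module docs above: two threads access the same location, at least one access is a
// non-atomic write, and no happens-before edge orders them. The exact error wording comes from
// `report_data_race` further down in this file and is only approximated here.
use std::thread;

static mut COUNTER: u64 = 0;

fn main() {
    let t = thread::spawn(|| unsafe {
        COUNTER += 1; // non-atomic write on the spawned thread
    });
    unsafe {
        COUNTER += 1; // concurrent, unsynchronized non-atomic write on the main thread
    }
    t.join().unwrap();
    // Expected under Miri (roughly): a "Data race detected between WRITE on Thread(...)
    // and WRITE on Thread(...)" error, produced by `report_data_race` below.
}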
+ +use std::{ + cell::{Cell, Ref, RefCell, RefMut}, + fmt::Debug, + mem, + rc::Rc, +}; + +use rustc_data_structures::fx::{FxHashMap, FxHashSet}; +use rustc_index::vec::{Idx, IndexVec}; +use rustc_middle::{mir, ty::layout::TyAndLayout}; +use rustc_target::abi::Size; + +use crate::{ + ImmTy, Immediate, InterpResult, MPlaceTy, MemPlaceMeta, MiriEvalContext, MiriEvalContextExt, + OpTy, Pointer, RangeMap, ScalarMaybeUninit, Tag, ThreadId, VClock, VSmallClockMap, VTimestamp, + VectorIdx, +}; + +pub type AllocExtra = VClockAlloc; +pub type MemoryExtra = Rc; + +/// Valid atomic read-write operations, alias of atomic::Ordering (not non-exhaustive). +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum AtomicRwOp { + Relaxed, + Acquire, + Release, + AcqRel, + SeqCst, +} + +/// Valid atomic read operations, subset of atomic::Ordering. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum AtomicReadOp { + Relaxed, + Acquire, + SeqCst, +} + +/// Valid atomic write operations, subset of atomic::Ordering. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum AtomicWriteOp { + Relaxed, + Release, + SeqCst, +} + +/// Valid atomic fence operations, subset of atomic::Ordering. +#[derive(Copy, Clone, PartialEq, Eq, Debug)] +pub enum AtomicFenceOp { + Acquire, + Release, + AcqRel, + SeqCst, +} + +/// The current set of vector clocks describing the state +/// of a thread, contains the happens-before clock and +/// additional metadata to model atomic fence operations. +#[derive(Clone, Default, Debug)] +struct ThreadClockSet { + /// The increasing clock representing timestamps + /// that happen-before this thread. + clock: VClock, + + /// The set of timestamps that will happen-before this + /// thread once it performs an acquire fence. + fence_acquire: VClock, + + /// The last timesamp of happens-before relations that + /// have been released by this thread by a fence. + fence_release: VClock, +} + +impl ThreadClockSet { + /// Apply the effects of a release fence to this + /// set of thread vector clocks. + #[inline] + fn apply_release_fence(&mut self) { + self.fence_release.clone_from(&self.clock); + } + + /// Apply the effects of a acquire fence to this + /// set of thread vector clocks. + #[inline] + fn apply_acquire_fence(&mut self) { + self.clock.join(&self.fence_acquire); + } + + /// Increment the happens-before clock at a + /// known index. + #[inline] + fn increment_clock(&mut self, index: VectorIdx) { + self.clock.increment_index(index); + } + + /// Join the happens-before clock with that of + /// another thread, used to model thread join + /// operations. + fn join_with(&mut self, other: &ThreadClockSet) { + self.clock.join(&other.clock); + } +} + +/// Error returned by finding a data race +/// should be elaborated upon. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +pub struct DataRace; + +/// Externally stored memory cell clocks +/// explicitly to reduce memory usage for the +/// common case where no atomic operations +/// exists on the memory cell. +#[derive(Clone, PartialEq, Eq, Default, Debug)] +struct AtomicMemoryCellClocks { + /// The clock-vector of the timestamp of the last atomic + /// read operation performed by each thread. + /// This detects potential data-races between atomic read + /// and non-atomic write operations. + read_vector: VClock, + + /// The clock-vector of the timestamp of the last atomic + /// write operation performed by each thread. + /// This detects potential data-races between atomic write + /// and non-atomic read or write operations. 
+ write_vector: VClock, + + /// Synchronization vector for acquire-release semantics + /// contains the vector of timestamps that will + /// happen-before a thread if an acquire-load is + /// performed on the data. + sync_vector: VClock, + + /// The Hash-Map of all threads for which a release + /// sequence exists in the memory cell, required + /// since read-modify-write operations do not + /// invalidate existing release sequences. + /// See page 6 of linked paper. + release_sequences: VSmallClockMap, +} + +/// Memory Cell vector clock metadata +/// for data-race detection. +#[derive(Clone, PartialEq, Eq, Debug)] +struct MemoryCellClocks { + /// The vector-clock timestamp of the last write + /// corresponding to the writing threads timestamp. + write: VTimestamp, + + /// The identifier of the vector index, corresponding to a thread + /// that performed the last write operation. + write_index: VectorIdx, + + /// The vector-clock of the timestamp of the last read operation + /// performed by a thread since the last write operation occured. + /// It is reset to zero on each write operation. + read: VClock, + + /// Atomic acquire & release sequence tracking clocks. + /// For non-atomic memory in the common case this + /// value is set to None. + atomic_ops: Option>, +} + +/// Create a default memory cell clocks instance +/// for uninitialized memory. +impl Default for MemoryCellClocks { + fn default() -> Self { + MemoryCellClocks { + read: VClock::default(), + write: 0, + write_index: VectorIdx::MAX_INDEX, + atomic_ops: None, + } + } +} + +impl MemoryCellClocks { + /// Load the internal atomic memory cells if they exist. + #[inline] + fn atomic(&self) -> Option<&AtomicMemoryCellClocks> { + match &self.atomic_ops { + Some(op) => Some(&*op), + None => None, + } + } + + /// Load or create the internal atomic memory metadata + /// if it does not exist. + #[inline] + fn atomic_mut(&mut self) -> &mut AtomicMemoryCellClocks { + self.atomic_ops.get_or_insert_with(Default::default) + } + + /// Update memory cell data-race tracking for atomic + /// load acquire semantics, is a no-op if this memory was + /// not used previously as atomic memory. + fn load_acquire( + &mut self, + clocks: &mut ThreadClockSet, + index: VectorIdx, + ) -> Result<(), DataRace> { + self.atomic_read_detect(clocks, index)?; + if let Some(atomic) = self.atomic() { + clocks.clock.join(&atomic.sync_vector); + } + Ok(()) + } + + /// Update memory cell data-race tracking for atomic + /// load relaxed semantics, is a no-op if this memory was + /// not used previously as atomic memory. + fn load_relaxed( + &mut self, + clocks: &mut ThreadClockSet, + index: VectorIdx, + ) -> Result<(), DataRace> { + self.atomic_read_detect(clocks, index)?; + if let Some(atomic) = self.atomic() { + clocks.fence_acquire.join(&atomic.sync_vector); + } + Ok(()) + } + + /// Update the memory cell data-race tracking for atomic + /// store release semantics. + fn store_release(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { + self.atomic_write_detect(clocks, index)?; + let atomic = self.atomic_mut(); + atomic.sync_vector.clone_from(&clocks.clock); + atomic.release_sequences.clear(); + atomic.release_sequences.insert(index, &clocks.clock); + Ok(()) + } + + /// Update the memory cell data-race tracking for atomic + /// store relaxed semantics. 
+ fn store_relaxed(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { + self.atomic_write_detect(clocks, index)?; + let atomic = self.atomic_mut(); + atomic.sync_vector.clone_from(&clocks.fence_release); + if let Some(release) = atomic.release_sequences.get(index) { + atomic.sync_vector.join(release); + } + atomic.release_sequences.retain_index(index); + Ok(()) + } + + /// Update the memory cell data-race tracking for atomic + /// store release semantics for RMW operations. + fn rmw_release(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { + self.atomic_write_detect(clocks, index)?; + let atomic = self.atomic_mut(); + atomic.sync_vector.join(&clocks.clock); + atomic.release_sequences.insert(index, &clocks.clock); + Ok(()) + } + + /// Update the memory cell data-race tracking for atomic + /// store relaxed semantics for RMW operations. + fn rmw_relaxed(&mut self, clocks: &ThreadClockSet, index: VectorIdx) -> Result<(), DataRace> { + self.atomic_write_detect(clocks, index)?; + let atomic = self.atomic_mut(); + atomic.sync_vector.join(&clocks.fence_release); + Ok(()) + } + + /// Detect data-races with an atomic read, caused by a non-atomic write that does + /// not happen-before the atomic-read. + fn atomic_read_detect( + &mut self, + clocks: &ThreadClockSet, + index: VectorIdx, + ) -> Result<(), DataRace> { + log::trace!("Atomic read with vectors: {:#?} :: {:#?}", self, clocks); + if self.write <= clocks.clock[self.write_index] { + let atomic = self.atomic_mut(); + atomic.read_vector.set_at_index(&clocks.clock, index); + Ok(()) + } else { + Err(DataRace) + } + } + + /// Detect data-races with an atomic write, either with a non-atomic read or with + /// a non-atomic write. + fn atomic_write_detect( + &mut self, + clocks: &ThreadClockSet, + index: VectorIdx, + ) -> Result<(), DataRace> { + log::trace!("Atomic write with vectors: {:#?} :: {:#?}", self, clocks); + if self.write <= clocks.clock[self.write_index] && self.read <= clocks.clock { + let atomic = self.atomic_mut(); + atomic.write_vector.set_at_index(&clocks.clock, index); + Ok(()) + } else { + Err(DataRace) + } + } + + /// Detect races for non-atomic read operations at the current memory cell + /// returns true if a data-race is detected. + fn read_race_detect( + &mut self, + clocks: &ThreadClockSet, + index: VectorIdx, + ) -> Result<(), DataRace> { + log::trace!("Unsynchronized read with vectors: {:#?} :: {:#?}", self, clocks); + if self.write <= clocks.clock[self.write_index] { + let race_free = if let Some(atomic) = self.atomic() { + atomic.write_vector <= clocks.clock + } else { + true + }; + if race_free { + self.read.set_at_index(&clocks.clock, index); + Ok(()) + } else { + Err(DataRace) + } + } else { + Err(DataRace) + } + } + + /// Detect races for non-atomic write operations at the current memory cell + /// returns true if a data-race is detected. 
+ fn write_race_detect( + &mut self, + clocks: &ThreadClockSet, + index: VectorIdx, + ) -> Result<(), DataRace> { + log::trace!("Unsynchronized write with vectors: {:#?} :: {:#?}", self, clocks); + if self.write <= clocks.clock[self.write_index] && self.read <= clocks.clock { + let race_free = if let Some(atomic) = self.atomic() { + atomic.write_vector <= clocks.clock && atomic.read_vector <= clocks.clock + } else { + true + }; + if race_free { + self.write = clocks.clock[index]; + self.write_index = index; + self.read.set_zero_vector(); + Ok(()) + } else { + Err(DataRace) + } + } else { + Err(DataRace) + } + } +} + +/// Evaluation context extensions. +impl<'mir, 'tcx: 'mir> EvalContextExt<'mir, 'tcx> for MiriEvalContext<'mir, 'tcx> {} +pub trait EvalContextExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> { + /// Atomic variant of read_scalar_at_offset. + fn read_scalar_at_offset_atomic( + &self, + op: OpTy<'tcx, Tag>, + offset: u64, + layout: TyAndLayout<'tcx>, + atomic: AtomicReadOp, + ) -> InterpResult<'tcx, ScalarMaybeUninit> { + let this = self.eval_context_ref(); + let op_place = this.deref_operand(op)?; + let offset = Size::from_bytes(offset); + + // Ensure that the following read at an offset is within bounds. + assert!(op_place.layout.size >= offset + layout.size); + let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?; + this.read_scalar_atomic(value_place, atomic) + } + + /// Atomic variant of write_scalar_at_offset. + fn write_scalar_at_offset_atomic( + &mut self, + op: OpTy<'tcx, Tag>, + offset: u64, + value: impl Into>, + layout: TyAndLayout<'tcx>, + atomic: AtomicWriteOp, + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + let op_place = this.deref_operand(op)?; + let offset = Size::from_bytes(offset); + + // Ensure that the following read at an offset is within bounds. + assert!(op_place.layout.size >= offset + layout.size); + let value_place = op_place.offset(offset, MemPlaceMeta::None, layout, this)?; + this.write_scalar_atomic(value.into(), value_place, atomic) + } + + /// Perform an atomic read operation at the memory location. + fn read_scalar_atomic( + &self, + place: MPlaceTy<'tcx, Tag>, + atomic: AtomicReadOp, + ) -> InterpResult<'tcx, ScalarMaybeUninit> { + let this = self.eval_context_ref(); + let scalar = this.allow_data_races_ref(move |this| this.read_scalar(place.into()))?; + self.validate_atomic_load(place, atomic)?; + Ok(scalar) + } + + /// Perform an atomic write operation at the memory location. + fn write_scalar_atomic( + &mut self, + val: ScalarMaybeUninit, + dest: MPlaceTy<'tcx, Tag>, + atomic: AtomicWriteOp, + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + this.allow_data_races_mut(move |this| this.write_scalar(val, dest.into()))?; + self.validate_atomic_store(dest, atomic) + } + + /// Perform a atomic operation on a memory location. + fn atomic_op_immediate( + &mut self, + place: MPlaceTy<'tcx, Tag>, + rhs: ImmTy<'tcx, Tag>, + op: mir::BinOp, + neg: bool, + atomic: AtomicRwOp, + ) -> InterpResult<'tcx, ImmTy<'tcx, Tag>> { + let this = self.eval_context_mut(); + + let old = this.allow_data_races_mut(|this| this.read_immediate(place.into()))?; + + // Atomics wrap around on overflow. + let val = this.binary_op(op, old, rhs)?; + let val = if neg { this.unary_op(mir::UnOp::Not, val)? 
} else { val }; + this.allow_data_races_mut(|this| this.write_immediate(*val, place.into()))?; + + this.validate_atomic_rmw(place, atomic)?; + Ok(old) + } + + /// Perform an atomic exchange with a memory place and a new + /// scalar value, the old value is returned. + fn atomic_exchange_scalar( + &mut self, + place: MPlaceTy<'tcx, Tag>, + new: ScalarMaybeUninit, + atomic: AtomicRwOp, + ) -> InterpResult<'tcx, ScalarMaybeUninit> { + let this = self.eval_context_mut(); + + let old = this.allow_data_races_mut(|this| this.read_scalar(place.into()))?; + this.allow_data_races_mut(|this| this.write_scalar(new, place.into()))?; + this.validate_atomic_rmw(place, atomic)?; + Ok(old) + } + + /// Perform an atomic compare and exchange at a given memory location. + /// On success an atomic RMW operation is performed and on failure + /// only an atomic read occurs. + fn atomic_compare_exchange_scalar( + &mut self, + place: MPlaceTy<'tcx, Tag>, + expect_old: ImmTy<'tcx, Tag>, + new: ScalarMaybeUninit, + success: AtomicRwOp, + fail: AtomicReadOp, + ) -> InterpResult<'tcx, Immediate> { + let this = self.eval_context_mut(); + + // Failure ordering cannot be stronger than success ordering, therefore first attempt + // to read with the failure ordering and if successfull then try again with the success + // read ordering and write in the success case. + // Read as immediate for the sake of `binary_op()` + let old = this.allow_data_races_mut(|this| this.read_immediate(place.into()))?; + + // `binary_op` will bail if either of them is not a scalar. + let eq = this.overflowing_binary_op(mir::BinOp::Eq, old, expect_old)?.0; + let res = Immediate::ScalarPair(old.to_scalar_or_uninit(), eq.into()); + + // Update ptr depending on comparison. + // if successful, perform a full rw-atomic validation + // otherwise treat this as an atomic load with the fail ordering. + if eq.to_bool()? { + this.allow_data_races_mut(|this| this.write_scalar(new, place.into()))?; + this.validate_atomic_rmw(place, success)?; + } else { + this.validate_atomic_load(place, fail)?; + } + + // Return the old value. + Ok(res) + } + + /// Update the data-race detector for an atomic read occuring at the + /// associated memory-place and on the current thread. + fn validate_atomic_load( + &self, + place: MPlaceTy<'tcx, Tag>, + atomic: AtomicReadOp, + ) -> InterpResult<'tcx> { + let this = self.eval_context_ref(); + this.validate_atomic_op( + place, + atomic, + "Atomic Load", + move |memory, clocks, index, atomic| { + if atomic == AtomicReadOp::Relaxed { + memory.load_relaxed(&mut *clocks, index) + } else { + memory.load_acquire(&mut *clocks, index) + } + }, + ) + } + + /// Update the data-race detector for an atomic write occuring at the + /// associated memory-place and on the current thread. + fn validate_atomic_store( + &mut self, + place: MPlaceTy<'tcx, Tag>, + atomic: AtomicWriteOp, + ) -> InterpResult<'tcx> { + let this = self.eval_context_ref(); + this.validate_atomic_op( + place, + atomic, + "Atomic Store", + move |memory, clocks, index, atomic| { + if atomic == AtomicWriteOp::Relaxed { + memory.store_relaxed(clocks, index) + } else { + memory.store_release(clocks, index) + } + }, + ) + } + + /// Update the data-race detector for an atomic read-modify-write occuring + /// at the associated memory place and on the current thread. 
+ fn validate_atomic_rmw( + &mut self, + place: MPlaceTy<'tcx, Tag>, + atomic: AtomicRwOp, + ) -> InterpResult<'tcx> { + use AtomicRwOp::*; + let acquire = matches!(atomic, Acquire | AcqRel | SeqCst); + let release = matches!(atomic, Release | AcqRel | SeqCst); + let this = self.eval_context_ref(); + this.validate_atomic_op(place, atomic, "Atomic RMW", move |memory, clocks, index, _| { + if acquire { + memory.load_acquire(clocks, index)?; + } else { + memory.load_relaxed(clocks, index)?; + } + if release { + memory.rmw_release(clocks, index) + } else { + memory.rmw_relaxed(clocks, index) + } + }) + } + + /// Update the data-race detector for an atomic fence on the current thread. + fn validate_atomic_fence(&mut self, atomic: AtomicFenceOp) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.maybe_perform_sync_operation(move |index, mut clocks| { + log::trace!("Atomic fence on {:?} with ordering {:?}", index, atomic); + + // Apply data-race detection for the current fences + // this treats AcqRel and SeqCst as the same as a acquire + // and release fence applied in the same timestamp. + if atomic != AtomicFenceOp::Release { + // Either Acquire | AcqRel | SeqCst + clocks.apply_acquire_fence(); + } + if atomic != AtomicFenceOp::Acquire { + // Either Release | AcqRel | SeqCst + clocks.apply_release_fence(); + } + + // Increment timestamp in case of release semantics. + Ok(atomic != AtomicFenceOp::Acquire) + }) + } else { + Ok(()) + } + } +} + +/// Vector clock metadata for a logical memory allocation. +#[derive(Debug, Clone)] +pub struct VClockAlloc { + /// Assigning each byte a MemoryCellClocks. + alloc_ranges: RefCell>, + + // Pointer to global state. + global: MemoryExtra, +} + +impl VClockAlloc { + /// Create a new data-race allocation detector. + pub fn new_allocation(global: &MemoryExtra, len: Size) -> VClockAlloc { + VClockAlloc { + global: Rc::clone(global), + alloc_ranges: RefCell::new(RangeMap::new(len, MemoryCellClocks::default())), + } + } + + // Find an index, if one exists where the value + // in `l` is greater than the value in `r`. + fn find_gt_index(l: &VClock, r: &VClock) -> Option { + let l_slice = l.as_slice(); + let r_slice = r.as_slice(); + l_slice + .iter() + .zip(r_slice.iter()) + .enumerate() + .find_map(|(idx, (&l, &r))| if l > r { Some(idx) } else { None }) + .or_else(|| { + if l_slice.len() > r_slice.len() { + // By invariant, if l_slice is longer + // then one element must be larger. + // This just validates that this is true + // and reports earlier elements first. + let l_remainder_slice = &l_slice[r_slice.len()..]; + let idx = l_remainder_slice + .iter() + .enumerate() + .find_map(|(idx, &r)| if r == 0 { None } else { Some(idx) }) + .expect("Invalid VClock Invariant"); + Some(idx) + } else { + None + } + }) + .map(|idx| VectorIdx::new(idx)) + } + + /// Report a data-race found in the program. + /// This finds the two racing threads and the type + /// of data-race that occured. This will also + /// return info about the memory location the data-race + /// occured in. 
+ #[cold] + #[inline(never)] + fn report_data_race<'tcx>( + global: &MemoryExtra, + range: &MemoryCellClocks, + action: &str, + is_atomic: bool, + pointer: Pointer, + len: Size, + ) -> InterpResult<'tcx> { + let (current_index, current_clocks) = global.current_thread_state(); + let write_clock; + let (other_action, other_thread, other_clock) = if range.write + > current_clocks.clock[range.write_index] + { + // Convert the write action into the vector clock it + // represents for diagnostic purposes. + write_clock = VClock::new_with_index(range.write_index, range.write); + ("WRITE", range.write_index, &write_clock) + } else if let Some(idx) = Self::find_gt_index(&range.read, ¤t_clocks.clock) { + ("READ", idx, &range.read) + } else if !is_atomic { + if let Some(atomic) = range.atomic() { + if let Some(idx) = Self::find_gt_index(&atomic.write_vector, ¤t_clocks.clock) + { + ("ATOMIC_STORE", idx, &atomic.write_vector) + } else if let Some(idx) = + Self::find_gt_index(&atomic.read_vector, ¤t_clocks.clock) + { + ("ATOMIC_LOAD", idx, &atomic.read_vector) + } else { + unreachable!( + "Failed to report data-race for non-atomic operation: no race found" + ) + } + } else { + unreachable!( + "Failed to report data-race for non-atomic operation: no atomic component" + ) + } + } else { + unreachable!("Failed to report data-race for atomic operation") + }; + + // Load elaborated thread information about the racing thread actions. + let current_thread_info = global.print_thread_metadata(current_index); + let other_thread_info = global.print_thread_metadata(other_thread); + + // Throw the data-race detection. + throw_ub_format!( + "Data race detected between {} on {} and {} on {}, memory({:?},offset={},size={})\ + \n\t\t -current vector clock = {:?}\ + \n\t\t -conflicting timestamp = {:?}", + action, + current_thread_info, + other_action, + other_thread_info, + pointer.alloc_id, + pointer.offset.bytes(), + len.bytes(), + current_clocks.clock, + other_clock + ) + } + + /// Detect data-races for an unsychronized read operation, will not perform + /// data-race detection if `multi-threaded` is false, either due to no threads + /// being created or if it is temporarily disabled during a racy read or write + /// operation for which data-race detection is handled separately, for example + /// atomic read operations. + pub fn read<'tcx>(&self, pointer: Pointer, len: Size) -> InterpResult<'tcx> { + if self.global.multi_threaded.get() { + let (index, clocks) = self.global.current_thread_state(); + let mut alloc_ranges = self.alloc_ranges.borrow_mut(); + for (_, range) in alloc_ranges.iter_mut(pointer.offset, len) { + if let Err(DataRace) = range.read_race_detect(&*clocks, index) { + // Report data-race. 
+ return Self::report_data_race( + &self.global, + range, + "READ", + false, + pointer, + len, + ); + } + } + Ok(()) + } else { + Ok(()) + } + } + + // Shared code for detecting data-races on unique access to a section of memory + fn unique_access<'tcx>( + &mut self, + pointer: Pointer, + len: Size, + action: &str, + ) -> InterpResult<'tcx> { + if self.global.multi_threaded.get() { + let (index, clocks) = self.global.current_thread_state(); + for (_, range) in self.alloc_ranges.get_mut().iter_mut(pointer.offset, len) { + if let Err(DataRace) = range.write_race_detect(&*clocks, index) { + // Report data-race + return Self::report_data_race( + &self.global, + range, + action, + false, + pointer, + len, + ); + } + } + Ok(()) + } else { + Ok(()) + } + } + + /// Detect data-races for an unsychronized write operation, will not perform + /// data-race threads if `multi-threaded` is false, either due to no threads + /// being created or if it is temporarily disabled during a racy read or write + /// operation + pub fn write<'tcx>(&mut self, pointer: Pointer, len: Size) -> InterpResult<'tcx> { + self.unique_access(pointer, len, "Write") + } + + /// Detect data-races for an unsychronized deallocate operation, will not perform + /// data-race threads if `multi-threaded` is false, either due to no threads + /// being created or if it is temporarily disabled during a racy read or write + /// operation + pub fn deallocate<'tcx>(&mut self, pointer: Pointer, len: Size) -> InterpResult<'tcx> { + self.unique_access(pointer, len, "Deallocate") + } +} + +impl<'mir, 'tcx: 'mir> EvalContextPrivExt<'mir, 'tcx> for MiriEvalContext<'mir, 'tcx> {} +trait EvalContextPrivExt<'mir, 'tcx: 'mir>: MiriEvalContextExt<'mir, 'tcx> { + // Temporarily allow data-races to occur, this should only be + // used if either one of the appropiate `validate_atomic` functions + // will be called to treat a memory access as atomic or if the memory + // being accessed should be treated as internal state, that cannot be + // accessed by the interpreted program. + #[inline] + fn allow_data_races_ref(&self, op: impl FnOnce(&MiriEvalContext<'mir, 'tcx>) -> R) -> R { + let this = self.eval_context_ref(); + let old = if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.replace(false) + } else { + false + }; + let result = op(this); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.set(old); + } + result + } + + /// Same as `allow_data_races_ref`, this temporarily disables any data-race detection and + /// so should only be used for atomic operations or internal state that the program cannot + /// access. + #[inline] + fn allow_data_races_mut( + &mut self, + op: impl FnOnce(&mut MiriEvalContext<'mir, 'tcx>) -> R, + ) -> R { + let this = self.eval_context_mut(); + let old = if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.replace(false) + } else { + false + }; + let result = op(this); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.multi_threaded.set(old); + } + result + } + + /// Generic atomic operation implementation, + /// this accesses memory via get_raw instead of + /// get_raw_mut, due to issues calling get_raw_mut + /// for atomic loads from read-only memory. + /// FIXME: is this valid, or should get_raw_mut be used for + /// atomic-stores/atomic-rmw? 
+ fn validate_atomic_op( + &self, + place: MPlaceTy<'tcx, Tag>, + atomic: A, + description: &str, + mut op: impl FnMut( + &mut MemoryCellClocks, + &mut ThreadClockSet, + VectorIdx, + A, + ) -> Result<(), DataRace>, + ) -> InterpResult<'tcx> { + let this = self.eval_context_ref(); + if let Some(data_race) = &this.memory.extra.data_race { + if data_race.multi_threaded.get() { + // Load and log the atomic operation. + let place_ptr = place.ptr.assert_ptr(); + let size = place.layout.size; + let alloc_meta = + &this.memory.get_raw(place_ptr.alloc_id)?.extra.data_race.as_ref().unwrap(); + log::trace!( + "Atomic op({}) with ordering {:?} on memory({:?}, offset={}, size={})", + description, + &atomic, + place_ptr.alloc_id, + place_ptr.offset.bytes(), + size.bytes() + ); + + // Perform the atomic operation. + let data_race = &alloc_meta.global; + data_race.maybe_perform_sync_operation(|index, mut clocks| { + for (_, range) in + alloc_meta.alloc_ranges.borrow_mut().iter_mut(place_ptr.offset, size) + { + if let Err(DataRace) = op(range, &mut *clocks, index, atomic) { + mem::drop(clocks); + return VClockAlloc::report_data_race( + &alloc_meta.global, + range, + description, + true, + place_ptr, + size, + ).map(|_| true); + } + } + + // This conservatively assumes all operations have release semantics + Ok(true) + })?; + + // Log changes to atomic memory. + if log::log_enabled!(log::Level::Trace) { + for (_, range) in alloc_meta.alloc_ranges.borrow().iter(place_ptr.offset, size) + { + log::trace!( + "Updated atomic memory({:?}, offset={}, size={}) to {:#?}", + place.ptr.assert_ptr().alloc_id, + place_ptr.offset.bytes(), + size.bytes(), + range.atomic_ops + ); + } + } + } + } + Ok(()) + } +} + +/// Extra metadata associated with a thread. +#[derive(Debug, Clone, Default)] +struct ThreadExtraState { + /// The current vector index in use by the + /// thread currently, this is set to None + /// after the vector index has been re-used + /// and hence the value will never need to be + /// read during data-race reporting. + vector_index: Option, + + /// The name of the thread, updated for better + /// diagnostics when reporting detected data + /// races. + thread_name: Option>, + + /// Thread termination vector clock, this + /// is set on thread termination and is used + /// for joining on threads since the vector_index + /// may be re-used when the join operation occurs. + termination_vector_clock: Option, +} + +/// Global data-race detection state, contains the currently +/// executing thread as well as the vector-clocks associated +/// with each of the threads. +#[derive(Debug, Clone)] +pub struct GlobalState { + /// Set to true once the first additional + /// thread has launched, due to the dependency + /// between before and after a thread launch. + /// Any data-races must be recorded after this + /// so concurrent execution can ignore recording + /// any data-races. + multi_threaded: Cell, + + /// Mapping of a vector index to a known set of thread + /// clocks, this is not directly mapping from a thread id + /// since it may refer to multiple threads. + vector_clocks: RefCell>, + + /// Mapping of a given vector index to the current thread + /// that the execution is representing, this may change + /// if a vector index is re-assigned to a new thread. + vector_info: RefCell>, + + /// The mapping of a given thread to assocaited thread metadata. + thread_info: RefCell>, + + /// The current vector index being executed. 
+ current_index: Cell, + + /// Potential vector indices that could be re-used on thread creation + /// values are inserted here on after the thread has terminated and + /// been joined with, and hence may potentially become free + /// for use as the index for a new thread. + /// Elements in this set may still require the vector index to + /// report data-races, and can only be re-used after all + /// active vector-clocks catch up with the threads timestamp. + reuse_candidates: RefCell>, + + /// Counts the number of threads that are currently active + /// if the number of active threads reduces to 1 and then + /// a join operation occures with the remaining main thread + /// then multi-threaded execution may be disabled. + active_thread_count: Cell, + + /// This contains threads that have terminated, but not yet joined + /// and so cannot become re-use candidates until a join operation + /// occurs. + /// The associated vector index will be moved into re-use candidates + /// after the join operation occurs. + terminated_threads: RefCell>, +} + +impl GlobalState { + /// Create a new global state, setup with just thread-id=0 + /// advanced to timestamp = 1. + pub fn new() -> Self { + let global_state = GlobalState { + multi_threaded: Cell::new(false), + vector_clocks: RefCell::new(IndexVec::new()), + vector_info: RefCell::new(IndexVec::new()), + thread_info: RefCell::new(IndexVec::new()), + current_index: Cell::new(VectorIdx::new(0)), + active_thread_count: Cell::new(1), + reuse_candidates: RefCell::new(FxHashSet::default()), + terminated_threads: RefCell::new(FxHashMap::default()), + }; + + // Setup the main-thread since it is not explicitly created: + // uses vector index and thread-id 0, also the rust runtime gives + // the main-thread a name of "main". + let index = global_state.vector_clocks.borrow_mut().push(ThreadClockSet::default()); + global_state.vector_info.borrow_mut().push(ThreadId::new(0)); + global_state.thread_info.borrow_mut().push(ThreadExtraState { + vector_index: Some(index), + thread_name: Some("main".to_string().into_boxed_str()), + termination_vector_clock: None, + }); + + global_state + } + + // Try to find vector index values that can potentially be re-used + // by a new thread instead of a new vector index being created. + fn find_vector_index_reuse_candidate(&self) -> Option { + let mut reuse = self.reuse_candidates.borrow_mut(); + let vector_clocks = self.vector_clocks.borrow(); + let vector_info = self.vector_info.borrow(); + let terminated_threads = self.terminated_threads.borrow(); + for &candidate in reuse.iter() { + let target_timestamp = vector_clocks[candidate].clock[candidate]; + if vector_clocks.iter_enumerated().all(|(clock_idx, clock)| { + // The thread happens before the clock, and hence cannot report + // a data-race with this the candidate index. + let no_data_race = clock.clock[candidate] >= target_timestamp; + + // The vector represents a thread that has terminated and hence cannot + // report a data-race with the candidate index. + let thread_id = vector_info[clock_idx]; + let vector_terminated = + reuse.contains(&clock_idx) || terminated_threads.contains_key(&thread_id); + + // The vector index cannot report a race with the candidate index + // and hence allows the candidate index to be re-used. + no_data_race || vector_terminated + }) { + // All vector clocks for each vector index are equal to + // the target timestamp, and the thread is known to have + // terminated, therefore this vector clock index cannot + // report any more data-races. 
+ assert!(reuse.remove(&candidate)); + return Some(candidate); + } + } + None + } + + // Hook for thread creation, enabled multi-threaded execution and marks + // the current thread timestamp as happening-before the current thread. + #[inline] + pub fn thread_created(&self, thread: ThreadId) { + let current_index = self.current_index(); + + // Increment the number of active threads. + let active_threads = self.active_thread_count.get(); + self.active_thread_count.set(active_threads + 1); + + // Enable multi-threaded execution, there are now two threads + // so data-races are now possible. + self.multi_threaded.set(true); + + // Load and setup the associated thread metadata + let mut thread_info = self.thread_info.borrow_mut(); + thread_info.ensure_contains_elem(thread, Default::default); + + // Assign a vector index for the thread, attempting to re-use an old + // vector index that can no longer report any data-races if possible. + let created_index = if let Some(reuse_index) = self.find_vector_index_reuse_candidate() { + // Now re-configure the re-use candidate, increment the clock + // for the new sync use of the vector. + let mut vector_clocks = self.vector_clocks.borrow_mut(); + vector_clocks[reuse_index].increment_clock(reuse_index); + + // Locate the old thread the vector was associated with and update + // it to represent the new thread instead. + let mut vector_info = self.vector_info.borrow_mut(); + let old_thread = vector_info[reuse_index]; + vector_info[reuse_index] = thread; + + // Mark the thread the vector index was associated with as no longer + // representing a thread index. + thread_info[old_thread].vector_index = None; + + reuse_index + } else { + // No vector re-use candidates available, instead create + // a new vector index. + let mut vector_info = self.vector_info.borrow_mut(); + vector_info.push(thread) + }; + + // Mark the chosen vector index as in use by the thread. + thread_info[thread].vector_index = Some(created_index); + + // Create a thread clock set if applicable. + let mut vector_clocks = self.vector_clocks.borrow_mut(); + if created_index == vector_clocks.next_index() { + vector_clocks.push(ThreadClockSet::default()); + } + + // Now load the two clocks and configure the initial state. + let (current, created) = vector_clocks.pick2_mut(current_index, created_index); + + // Join the created with current, since the current threads + // previous actions happen-before the created thread. + created.join_with(current); + + // Advance both threads after the synchronized operation. + // Both operations are considered to have release semantics. + current.increment_clock(current_index); + created.increment_clock(created_index); + } + + /// Hook on a thread join to update the implicit happens-before relation + /// between the joined thead and the current thread. + #[inline] + pub fn thread_joined(&self, current_thread: ThreadId, join_thread: ThreadId) { + let mut clocks_vec = self.vector_clocks.borrow_mut(); + let thread_info = self.thread_info.borrow(); + + // Load the vector clock of the current thread. + let current_index = thread_info[current_thread] + .vector_index + .expect("Performed thread join on thread with no assigned vector"); + let current = &mut clocks_vec[current_index]; + + // Load the associated vector clock for the terminated thread. 
+ let join_clock = thread_info[join_thread] + .termination_vector_clock + .as_ref() + .expect("Joined with thread but thread has not terminated"); + + + // The join thread happens-before the current thread + // so update the current vector clock. + // Is not a release operation so the clock is not incremented. + current.clock.join(join_clock); + + // Check the number of active threads, if the value is 1 + // then test for potentially disabling multi-threaded execution. + let active_threads = self.active_thread_count.get(); + if active_threads == 1 { + // May potentially be able to disable multi-threaded execution. + let current_clock = &clocks_vec[current_index]; + if clocks_vec + .iter_enumerated() + .all(|(idx, clocks)| clocks.clock[idx] <= current_clock.clock[idx]) + { + // The all thread termations happen-before the current clock + // therefore no data-races can be reported until a new thread + // is created, so disable multi-threaded execution. + self.multi_threaded.set(false); + } + } + + // If the thread is marked as terminated but not joined + // then move the thread to the re-use set. + let mut termination = self.terminated_threads.borrow_mut(); + if let Some(index) = termination.remove(&join_thread) { + let mut reuse = self.reuse_candidates.borrow_mut(); + reuse.insert(index); + } + } + + /// On thread termination, the vector-clock may re-used + /// in the future once all remaining thread-clocks catch + /// up with the time index of the terminated thread. + /// This assiges thread termination with a unique index + /// which will be used to join the thread + /// This should be called strictly before any calls to + /// `thread_joined`. + #[inline] + pub fn thread_terminated(&self) { + let current_index = self.current_index(); + + // Increment the clock to a unique termination timestamp. + let mut vector_clocks = self.vector_clocks.borrow_mut(); + let current_clocks = &mut vector_clocks[current_index]; + current_clocks.increment_clock(current_index); + + // Load the current thread id for the executing vector. + let vector_info = self.vector_info.borrow(); + let current_thread = vector_info[current_index]; + + // Load the current thread metadata, and move to a terminated + // vector state. Setting up the vector clock all join operations + // will use. + let mut thread_info = self.thread_info.borrow_mut(); + let current = &mut thread_info[current_thread]; + current.termination_vector_clock = Some(current_clocks.clock.clone()); + + // Add this thread as a candidate for re-use after a thread join + // occurs. + let mut termination = self.terminated_threads.borrow_mut(); + termination.insert(current_thread, current_index); + + // Reduce the number of active threads, now that a thread has + // terminated. + let mut active_threads = self.active_thread_count.get(); + active_threads -= 1; + self.active_thread_count.set(active_threads); + } + + /// Hook for updating the local tracker of the currently + /// enabled thread, should always be updated whenever + /// `active_thread` in thread.rs is updated. + #[inline] + pub fn thread_set_active(&self, thread: ThreadId) { + let thread_info = self.thread_info.borrow(); + let vector_idx = thread_info[thread] + .vector_index + .expect("Setting thread active with no assigned vector"); + self.current_index.set(vector_idx); + } + + /// Hook for updating the local tracker of the threads name + /// this should always mirror the local value in thread.rs + /// the thread name is used for improved diagnostics + /// during a data-race. 
+ #[inline] + pub fn thread_set_name(&self, thread: ThreadId, name: String) { + let name = name.into_boxed_str(); + let mut thread_info = self.thread_info.borrow_mut(); + thread_info[thread].thread_name = Some(name); + } + + /// Attempt to perform a synchronized operation, this + /// will perform no operation if multi-threading is + /// not currently enabled. + /// Otherwise it will increment the clock for the current + /// vector before and after the operation for data-race + /// detection between any happens-before edges the + /// operation may create. + fn maybe_perform_sync_operation<'tcx>( + &self, + op: impl FnOnce(VectorIdx, RefMut<'_, ThreadClockSet>) -> InterpResult<'tcx, bool>, + ) -> InterpResult<'tcx> { + if self.multi_threaded.get() { + let (index, clocks) = self.current_thread_state_mut(); + if op(index, clocks)? { + let (_, mut clocks) = self.current_thread_state_mut(); + clocks.increment_clock(index); + } + } + Ok(()) + } + + /// Internal utility to identify a thread stored internally + /// returns the id and the name for better diagnostics. + fn print_thread_metadata(&self, vector: VectorIdx) -> String { + let thread = self.vector_info.borrow()[vector]; + let thread_name = &self.thread_info.borrow()[thread].thread_name; + if let Some(name) = thread_name { + let name: &str = name; + format!("Thread(id = {:?}, name = {:?})", thread.to_u32(), &*name) + } else { + format!("Thread(id = {:?})", thread.to_u32()) + } + } + + /// Acquire a lock, express that the previous call of + /// `validate_lock_release` must happen before this. + /// As this is an acquire operation, the thread timestamp is not + /// incremented. + pub fn validate_lock_acquire(&self, lock: &VClock, thread: ThreadId) { + let (_, mut clocks) = self.load_thread_state_mut(thread); + clocks.clock.join(&lock); + } + + /// Release a lock handle, express that this happens-before + /// any subsequent calls to `validate_lock_acquire`. + /// For normal locks this should be equivalent to `validate_lock_release_shared` + /// since an acquire operation should have occured before, however + /// for futex & cond-var operations this is not the case and this + /// operation must be used. + pub fn validate_lock_release(&self, lock: &mut VClock, thread: ThreadId) { + let (index, mut clocks) = self.load_thread_state_mut(thread); + lock.clone_from(&clocks.clock); + clocks.increment_clock(index); + } + + /// Release a lock handle, express that this happens-before + /// any subsequent calls to `validate_lock_acquire` as well + /// as any previous calls to this function after any + /// `validate_lock_release` calls. + /// For normal locks this should be equivalent to `validate_lock_release`. + /// This function only exists for joining over the set of concurrent readers + /// in a read-write lock and should not be used for anything else. + pub fn validate_lock_release_shared(&self, lock: &mut VClock, thread: ThreadId) { + let (index, mut clocks) = self.load_thread_state_mut(thread); + lock.join(&clocks.clock); + clocks.increment_clock(index); + } + + /// Load the vector index used by the given thread as well as the set of vector clocks + /// used by the thread. 
+ #[inline] + fn load_thread_state_mut(&self, thread: ThreadId) -> (VectorIdx, RefMut<'_, ThreadClockSet>) { + let index = self.thread_info.borrow()[thread] + .vector_index + .expect("Loading thread state for thread with no assigned vector"); + let ref_vector = self.vector_clocks.borrow_mut(); + let clocks = RefMut::map(ref_vector, |vec| &mut vec[index]); + (index, clocks) + } + + /// Load the current vector clock in use and the current set of thread clocks + /// in use for the vector. + #[inline] + fn current_thread_state(&self) -> (VectorIdx, Ref<'_, ThreadClockSet>) { + let index = self.current_index(); + let ref_vector = self.vector_clocks.borrow(); + let clocks = Ref::map(ref_vector, |vec| &vec[index]); + (index, clocks) + } + + /// Load the current vector clock in use and the current set of thread clocks + /// in use for the vector mutably for modification. + #[inline] + fn current_thread_state_mut(&self) -> (VectorIdx, RefMut<'_, ThreadClockSet>) { + let index = self.current_index(); + let ref_vector = self.vector_clocks.borrow_mut(); + let clocks = RefMut::map(ref_vector, |vec| &mut vec[index]); + (index, clocks) + } + + /// Return the current thread, should be the same + /// as the data-race active thread. + #[inline] + fn current_index(&self) -> VectorIdx { + self.current_index.get() + } +} diff --git a/src/eval.rs b/src/eval.rs index 54d06feec3..0a62f14dd3 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -48,6 +48,8 @@ pub struct MiriConfig { pub tracked_alloc_id: Option, /// Whether to track raw pointers in stacked borrows. pub track_raw: bool, + /// Determine if data race detection should be enabled + pub data_race_detector: bool, } impl Default for MiriConfig { @@ -65,6 +67,7 @@ impl Default for MiriConfig { tracked_call_id: None, tracked_alloc_id: None, track_raw: false, + data_race_detector: true, } } } diff --git a/src/lib.rs b/src/lib.rs index d4802f3b11..87effe9c68 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -22,6 +22,7 @@ extern crate rustc_mir; extern crate rustc_span; extern crate rustc_target; +mod data_race; mod diagnostics; mod eval; mod helpers; @@ -34,6 +35,7 @@ mod shims; mod stacked_borrows; mod sync; mod thread; +mod vector_clock; // Establish a "crate-wide prelude": we often import `crate::*`. @@ -52,6 +54,10 @@ pub use crate::shims::panic::{CatchUnwindData, EvalContextExt as _}; pub use crate::shims::tls::{EvalContextExt as _, TlsData}; pub use crate::shims::EvalContextExt as _; +pub use crate::data_race::{ + AtomicReadOp, AtomicWriteOp, AtomicRwOp, AtomicFenceOp, + EvalContextExt as DataRaceEvalContextExt +}; pub use crate::diagnostics::{ register_diagnostic, report_error, EvalContextExt as DiagnosticsEvalContextExt, TerminationInfo, NonHaltingDiagnostic, @@ -74,6 +80,9 @@ pub use crate::thread::{ pub use crate::sync::{ EvalContextExt as SyncEvalContextExt, CondvarId, MutexId, RwLockId }; +pub use crate::vector_clock::{ + VClock, VSmallClockMap, VectorIdx, VTimestamp +}; /// Insert rustc arguments at the beginning of the argument list that Miri wants to be /// set per default, for maximal validation power. diff --git a/src/machine.rs b/src/machine.rs index e9f9298e56..02c6691556 100644 --- a/src/machine.rs +++ b/src/machine.rs @@ -109,12 +109,16 @@ impl fmt::Display for MiriMemoryKind { pub struct AllocExtra { /// Stacked Borrows state is only added if it is enabled. pub stacked_borrows: Option, + /// Data race detection via the use of a vector-clock, + /// this is only added if it is enabled. 
+ pub data_race: Option, } /// Extra global memory data #[derive(Clone, Debug)] pub struct MemoryExtra { pub stacked_borrows: Option, + pub data_race: Option, pub intptrcast: intptrcast::MemoryExtra, /// Mapping extern static names to their canonical allocation. @@ -144,8 +148,14 @@ impl MemoryExtra { } else { None }; + let data_race = if config.data_race_detector { + Some(Rc::new(data_race::GlobalState::new())) + } else { + None + }; MemoryExtra { stacked_borrows, + data_race, intptrcast: Default::default(), extern_statics: FxHashMap::default(), rng: RefCell::new(rng), @@ -467,6 +477,11 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for Evaluator<'mir, 'tcx> { // No stacks, no tag. (None, Tag::Untagged) }; + let race_alloc = if let Some(data_race) = &memory_extra.data_race { + Some(data_race::AllocExtra::new_allocation(&data_race, alloc.size)) + } else { + None + }; let mut stacked_borrows = memory_extra.stacked_borrows.as_ref().map(|sb| sb.borrow_mut()); let alloc: Allocation = alloc.with_tags_and_extra( |alloc| { @@ -478,7 +493,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for Evaluator<'mir, 'tcx> { Tag::Untagged } }, - AllocExtra { stacked_borrows: stacks }, + AllocExtra { stacked_borrows: stacks, data_race: race_alloc }, ); (Cow::Owned(alloc), base_tag) } @@ -584,6 +599,9 @@ impl AllocationExtra for AllocExtra { ptr: Pointer, size: Size, ) -> InterpResult<'tcx> { + if let Some(data_race) = &alloc.extra.data_race { + data_race.read(ptr, size)?; + } if let Some(stacked_borrows) = &alloc.extra.stacked_borrows { stacked_borrows.memory_read(ptr, size) } else { @@ -597,6 +615,9 @@ impl AllocationExtra for AllocExtra { ptr: Pointer, size: Size, ) -> InterpResult<'tcx> { + if let Some(data_race) = &mut alloc.extra.data_race { + data_race.write(ptr, size)?; + } if let Some(stacked_borrows) = &mut alloc.extra.stacked_borrows { stacked_borrows.memory_written(ptr, size) } else { @@ -610,6 +631,9 @@ impl AllocationExtra for AllocExtra { ptr: Pointer, size: Size, ) -> InterpResult<'tcx> { + if let Some(data_race) = &mut alloc.extra.data_race { + data_race.deallocate(ptr, size)?; + } if let Some(stacked_borrows) = &mut alloc.extra.stacked_borrows { stacked_borrows.memory_deallocated(ptr, size) } else { diff --git a/src/shims/intrinsics.rs b/src/shims/intrinsics.rs index b401bd8ada..8f7ae6bebb 100644 --- a/src/shims/intrinsics.rs +++ b/src/shims/intrinsics.rs @@ -4,7 +4,7 @@ use log::trace; use rustc_attr as attr; use rustc_ast::ast::FloatTy; -use rustc_middle::{mir, ty}; +use rustc_middle::{mir, mir::BinOp, ty}; use rustc_middle::ty::layout::IntegerExt; use rustc_apfloat::{Float, Round}; use rustc_target::abi::{Align, Integer, LayoutOf}; @@ -306,157 +306,117 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx } // Atomic operations - #[rustfmt::skip] - | "atomic_load" - | "atomic_load_relaxed" - | "atomic_load_acq" - => { - let &[place] = check_arg_count(args)?; - let place = this.deref_operand(place)?; - let val = this.read_scalar(place.into())?; // make sure it fits into a scalar; otherwise it cannot be atomic - - // Check alignment requirements. Atomics must always be aligned to their size, - // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must - // be 8-aligned). 
- let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); - this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; - - this.write_scalar(val, dest)?; - } - - #[rustfmt::skip] - | "atomic_store" - | "atomic_store_relaxed" - | "atomic_store_rel" - => { - let &[place, val] = check_arg_count(args)?; - let place = this.deref_operand(place)?; - let val = this.read_scalar(val)?; // make sure it fits into a scalar; otherwise it cannot be atomic + "atomic_load" => this.atomic_load(args, dest, AtomicReadOp::SeqCst)?, + "atomic_load_relaxed" => this.atomic_load(args, dest, AtomicReadOp::Relaxed)?, + "atomic_load_acq" => this.atomic_load(args, dest, AtomicReadOp::Acquire)?, + + "atomic_store" => this.atomic_store(args, AtomicWriteOp::SeqCst)?, + "atomic_store_relaxed" => this.atomic_store(args, AtomicWriteOp::Relaxed)?, + "atomic_store_rel" => this.atomic_store(args, AtomicWriteOp::Release)?, + + "atomic_fence_acq" => this.atomic_fence(args, AtomicFenceOp::Acquire)?, + "atomic_fence_rel" => this.atomic_fence(args, AtomicFenceOp::Release)?, + "atomic_fence_acqrel" => this.atomic_fence(args, AtomicFenceOp::AcqRel)?, + "atomic_fence" => this.atomic_fence(args, AtomicFenceOp::SeqCst)?, + + "atomic_singlethreadfence_acq" => this.compiler_fence(args, AtomicFenceOp::Acquire)?, + "atomic_singlethreadfence_rel" => this.compiler_fence(args, AtomicFenceOp::Release)?, + "atomic_singlethreadfence_acqrel" => this.compiler_fence(args, AtomicFenceOp::AcqRel)?, + "atomic_singlethreadfence" => this.compiler_fence(args, AtomicFenceOp::SeqCst)?, + + "atomic_xchg" => this.atomic_exchange(args, dest, AtomicRwOp::SeqCst)?, + "atomic_xchg_acq" => this.atomic_exchange(args, dest, AtomicRwOp::Acquire)?, + "atomic_xchg_rel" => this.atomic_exchange(args, dest, AtomicRwOp::Release)?, + "atomic_xchg_acqrel" => this.atomic_exchange(args, dest, AtomicRwOp::AcqRel)?, + "atomic_xchg_relaxed" => this.atomic_exchange(args, dest, AtomicRwOp::Relaxed)?, + + "atomic_cxchg" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::SeqCst + )?, + "atomic_cxchg_acq" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::Acquire, AtomicReadOp::Acquire + )?, + "atomic_cxchg_rel" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::Release, AtomicReadOp::Relaxed + )?, + "atomic_cxchg_acqrel" => this.atomic_compare_exchange + (args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Acquire + )?, + "atomic_cxchg_relaxed" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::Relaxed, AtomicReadOp::Relaxed + )?, + "atomic_cxchg_acq_failrelaxed" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::Acquire, AtomicReadOp::Relaxed + )?, + "atomic_cxchg_acqrel_failrelaxed" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Relaxed + )?, + "atomic_cxchg_failrelaxed" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Relaxed + )?, + "atomic_cxchg_failacq" => this.atomic_compare_exchange( + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Acquire + )?, + + "atomic_cxchgweak" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::SeqCst + )?, + "atomic_cxchgweak_acq" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::Acquire, AtomicReadOp::Acquire + )?, + "atomic_cxchgweak_rel" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::Release, AtomicReadOp::Relaxed + )?, + "atomic_cxchgweak_acqrel" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Acquire + )?, 
+ "atomic_cxchgweak_relaxed" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::Relaxed, AtomicReadOp::Relaxed + )?, + "atomic_cxchgweak_acq_failrelaxed" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::Acquire, AtomicReadOp::Relaxed + )?, + "atomic_cxchgweak_acqrel_failrelaxed" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::AcqRel, AtomicReadOp::Relaxed + )?, + "atomic_cxchgweak_failrelaxed" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Relaxed + )?, + "atomic_cxchgweak_failacq" => this.atomic_compare_exchange_weak( + args, dest, AtomicRwOp::SeqCst, AtomicReadOp::Acquire + )?, + + "atomic_or" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::SeqCst)?, + "atomic_or_acq" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::Acquire)?, + "atomic_or_rel" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::Release)?, + "atomic_or_acqrel" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::AcqRel)?, + "atomic_or_relaxed" => this.atomic_op(args, dest, BinOp::BitOr, false, AtomicRwOp::Relaxed)?, + "atomic_xor" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::SeqCst)?, + "atomic_xor_acq" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::Acquire)?, + "atomic_xor_rel" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::Release)?, + "atomic_xor_acqrel" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::AcqRel)?, + "atomic_xor_relaxed" => this.atomic_op(args, dest, BinOp::BitXor, false, AtomicRwOp::Relaxed)?, + "atomic_and" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::SeqCst)?, + "atomic_and_acq" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::Acquire)?, + "atomic_and_rel" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::Release)?, + "atomic_and_acqrel" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::AcqRel)?, + "atomic_and_relaxed" => this.atomic_op(args, dest, BinOp::BitAnd, false, AtomicRwOp::Relaxed)?, + "atomic_nand" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::SeqCst)?, + "atomic_nand_acq" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::Acquire)?, + "atomic_nand_rel" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::Release)?, + "atomic_nand_acqrel" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::AcqRel)?, + "atomic_nand_relaxed" => this.atomic_op(args, dest, BinOp::BitAnd, true, AtomicRwOp::Relaxed)?, + "atomic_xadd" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::SeqCst)?, + "atomic_xadd_acq" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::Acquire)?, + "atomic_xadd_rel" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::Release)?, + "atomic_xadd_acqrel" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::AcqRel)?, + "atomic_xadd_relaxed" => this.atomic_op(args, dest, BinOp::Add, false, AtomicRwOp::Relaxed)?, + "atomic_xsub" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::SeqCst)?, + "atomic_xsub_acq" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::Acquire)?, + "atomic_xsub_rel" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::Release)?, + "atomic_xsub_acqrel" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::AcqRel)?, + "atomic_xsub_relaxed" => this.atomic_op(args, dest, BinOp::Sub, false, AtomicRwOp::Relaxed)?, - // Check alignment requirements. 
Atomics must always be aligned to their size, - // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must - // be 8-aligned). - let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); - this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; - - this.write_scalar(val, place.into())?; - } - - #[rustfmt::skip] - | "atomic_fence_acq" - | "atomic_fence_rel" - | "atomic_fence_acqrel" - | "atomic_fence" - | "atomic_singlethreadfence_acq" - | "atomic_singlethreadfence_rel" - | "atomic_singlethreadfence_acqrel" - | "atomic_singlethreadfence" - => { - let &[] = check_arg_count(args)?; - // FIXME: this will become relevant once we try to detect data races. - } - - _ if intrinsic_name.starts_with("atomic_xchg") => { - let &[place, new] = check_arg_count(args)?; - let place = this.deref_operand(place)?; - let new = this.read_scalar(new)?; - let old = this.read_scalar(place.into())?; - - // Check alignment requirements. Atomics must always be aligned to their size, - // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must - // be 8-aligned). - let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); - this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; - - this.write_scalar(old, dest)?; // old value is returned - this.write_scalar(new, place.into())?; - } - - _ if intrinsic_name.starts_with("atomic_cxchg") => { - let &[place, expect_old, new] = check_arg_count(args)?; - let place = this.deref_operand(place)?; - let expect_old = this.read_immediate(expect_old)?; // read as immediate for the sake of `binary_op()` - let new = this.read_scalar(new)?; - let old = this.read_immediate(place.into())?; // read as immediate for the sake of `binary_op()` - - // Check alignment requirements. Atomics must always be aligned to their size, - // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must - // be 8-aligned). - let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); - this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; - - // `binary_op` will bail if either of them is not a scalar. - let eq = this.overflowing_binary_op(mir::BinOp::Eq, old, expect_old)?.0; - let res = Immediate::ScalarPair(old.to_scalar_or_uninit(), eq.into()); - // Return old value. - this.write_immediate(res, dest)?; - // Update ptr depending on comparison. - if eq.to_bool()? { - this.write_scalar(new, place.into())?; - } - } - - #[rustfmt::skip] - | "atomic_or" - | "atomic_or_acq" - | "atomic_or_rel" - | "atomic_or_acqrel" - | "atomic_or_relaxed" - | "atomic_xor" - | "atomic_xor_acq" - | "atomic_xor_rel" - | "atomic_xor_acqrel" - | "atomic_xor_relaxed" - | "atomic_and" - | "atomic_and_acq" - | "atomic_and_rel" - | "atomic_and_acqrel" - | "atomic_and_relaxed" - | "atomic_nand" - | "atomic_nand_acq" - | "atomic_nand_rel" - | "atomic_nand_acqrel" - | "atomic_nand_relaxed" - | "atomic_xadd" - | "atomic_xadd_acq" - | "atomic_xadd_rel" - | "atomic_xadd_acqrel" - | "atomic_xadd_relaxed" - | "atomic_xsub" - | "atomic_xsub_acq" - | "atomic_xsub_rel" - | "atomic_xsub_acqrel" - | "atomic_xsub_relaxed" - => { - let &[place, rhs] = check_arg_count(args)?; - let place = this.deref_operand(place)?; - if !place.layout.ty.is_integral() { - bug!("Atomic arithmetic operations only work on integer types"); - } - let rhs = this.read_immediate(rhs)?; - let old = this.read_immediate(place.into())?; - - // Check alignment requirements. 
Atomics must always be aligned to their size, - // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must - // be 8-aligned). - let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); - this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; - - this.write_immediate(*old, dest)?; // old value is returned - let (op, neg) = match intrinsic_name.split('_').nth(1).unwrap() { - "or" => (mir::BinOp::BitOr, false), - "xor" => (mir::BinOp::BitXor, false), - "and" => (mir::BinOp::BitAnd, false), - "xadd" => (mir::BinOp::Add, false), - "xsub" => (mir::BinOp::Sub, false), - "nand" => (mir::BinOp::BitAnd, true), - _ => bug!(), - }; - // Atomics wrap around on overflow. - let val = this.binary_op(op, old, rhs)?; - let val = if neg { this.unary_op(mir::UnOp::Not, val)? } else { val }; - this.write_immediate(*val, place.into())?; - } // Query type information "assert_inhabited" | @@ -498,6 +458,142 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx Ok(()) } + fn atomic_load( + &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, + atomic: AtomicReadOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + + + let &[place] = check_arg_count(args)?; + let place = this.deref_operand(place)?; + + // make sure it fits into a scalar; otherwise it cannot be atomic + let val = this.read_scalar_atomic(place, atomic)?; + + // Check alignment requirements. Atomics must always be aligned to their size, + // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must + // be 8-aligned). + let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); + this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; + this.write_scalar(val, dest)?; + Ok(()) + } + + fn atomic_store(&mut self, args: &[OpTy<'tcx, Tag>], atomic: AtomicWriteOp) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + + let &[place, val] = check_arg_count(args)?; + let place = this.deref_operand(place)?; + let val = this.read_scalar(val)?; // make sure it fits into a scalar; otherwise it cannot be atomic + + // Check alignment requirements. Atomics must always be aligned to their size, + // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must + // be 8-aligned). + let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); + this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; + + // Perform atomic store + this.write_scalar_atomic(val, place, atomic)?; + Ok(()) + } + + fn compiler_fence(&mut self, args: &[OpTy<'tcx, Tag>], atomic: AtomicFenceOp) -> InterpResult<'tcx> { + let &[] = check_arg_count(args)?; + let _ = atomic; + //FIXME: compiler fences are currently ignored + Ok(()) + } + + fn atomic_fence(&mut self, args: &[OpTy<'tcx, Tag>], atomic: AtomicFenceOp) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + let &[] = check_arg_count(args)?; + this.validate_atomic_fence(atomic)?; + Ok(()) + } + + fn atomic_op( + &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, + op: mir::BinOp, neg: bool, atomic: AtomicRwOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + + let &[place, rhs] = check_arg_count(args)?; + let place = this.deref_operand(place)?; + if !place.layout.ty.is_integral() { + bug!("Atomic arithmetic operations only work on integer types"); + } + let rhs = this.read_immediate(rhs)?; + + // Check alignment requirements. 
Atomics must always be aligned to their size, + // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must + // be 8-aligned). + let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); + this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; + + let old = this.atomic_op_immediate(place, rhs, op, neg, atomic)?; + this.write_immediate(*old, dest)?; // old value is returned + Ok(()) + } + + fn atomic_exchange( + &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, atomic: AtomicRwOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + + let &[place, new] = check_arg_count(args)?; + let place = this.deref_operand(place)?; + let new = this.read_scalar(new)?; + + // Check alignment requirements. Atomics must always be aligned to their size, + // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must + // be 8-aligned). + let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); + this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; + + let old = this.atomic_exchange_scalar(place, new, atomic)?; + this.write_scalar(old, dest)?; // old value is returned + Ok(()) + } + + fn atomic_compare_exchange( + &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, + success: AtomicRwOp, fail: AtomicReadOp + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + + let &[place, expect_old, new] = check_arg_count(args)?; + let place = this.deref_operand(place)?; + let expect_old = this.read_immediate(expect_old)?; // read as immediate for the sake of `binary_op()` + let new = this.read_scalar(new)?; + + + // Check alignment requirements. Atomics must always be aligned to their size, + // even if the type they wrap would be less aligned (e.g. AtomicU64 on 32bit must + // be 8-aligned). + let align = Align::from_bytes(place.layout.size.bytes()).unwrap(); + this.memory.check_ptr_access(place.ptr, place.layout.size, align)?; + + + let old = this.atomic_compare_exchange_scalar( + place, expect_old, new, success, fail + )?; + + // Return old value. + this.write_immediate(old, dest)?; + Ok(()) + } + + fn atomic_compare_exchange_weak( + &mut self, args: &[OpTy<'tcx, Tag>], dest: PlaceTy<'tcx, Tag>, + success: AtomicRwOp, fail: AtomicReadOp + ) -> InterpResult<'tcx> { + + // FIXME: the weak part of this is currently not modelled, + // it is assumed to always succeed unconditionally. + self.atomic_compare_exchange(args, dest, success, fail) + } + fn float_to_int_unchecked( &self, f: F, diff --git a/src/shims/posix/linux/sync.rs b/src/shims/posix/linux/sync.rs index 9d124872f5..5243431194 100644 --- a/src/shims/posix/linux/sync.rs +++ b/src/shims/posix/linux/sync.rs @@ -78,7 +78,18 @@ pub fn futex<'tcx>( // Read an `i32` through the pointer, regardless of any wrapper types. // It's not uncommon for `addr` to be passed as another type than `*mut i32`, such as `*const AtomicI32`. // FIXME: this fails if `addr` is not a pointer type. - let futex_val = this.read_scalar_at_offset(addr.into(), 0, this.machine.layouts.i32)?.to_i32()?; + // The atomic ordering for futex(https://man7.org/linux/man-pages/man2/futex.2.html): + // "The load of the value of the futex word is an + // atomic memory access (i.e., using atomic machine instructions + // of the respective architecture). This load, the comparison + // with the expected value, and starting to sleep are performed + // atomically and totally ordered with respect to other futex + // operations on the same futex word." 
+ // SeqCst is total order over all operations. + // FIXME: check if this should be changed when weak memory orders are added. + let futex_val = this.read_scalar_at_offset_atomic( + addr.into(), 0, this.machine.layouts.i32, AtomicReadOp::SeqCst + )?.to_i32()?; if val == futex_val { // The value still matches, so we block the trait make it wait for FUTEX_WAKE. this.block_thread(thread); diff --git a/src/shims/posix/sync.rs b/src/shims/posix/sync.rs index a0b5db42ed..868c72289a 100644 --- a/src/shims/posix/sync.rs +++ b/src/shims/posix/sync.rs @@ -62,7 +62,10 @@ fn mutex_get_kind<'mir, 'tcx: 'mir>( mutex_op: OpTy<'tcx, Tag>, ) -> InterpResult<'tcx, ScalarMaybeUninit> { let offset = if ecx.pointer_size().bytes() == 8 { 16 } else { 12 }; - ecx.read_scalar_at_offset(mutex_op, offset, ecx.machine.layouts.i32) + ecx.read_scalar_at_offset_atomic( + mutex_op, offset, ecx.machine.layouts.i32, + AtomicReadOp::Relaxed + ) } fn mutex_set_kind<'mir, 'tcx: 'mir>( @@ -71,14 +74,20 @@ fn mutex_set_kind<'mir, 'tcx: 'mir>( kind: impl Into>, ) -> InterpResult<'tcx, ()> { let offset = if ecx.pointer_size().bytes() == 8 { 16 } else { 12 }; - ecx.write_scalar_at_offset(mutex_op, offset, kind, ecx.machine.layouts.i32) + ecx.write_scalar_at_offset_atomic( + mutex_op, offset, kind, ecx.machine.layouts.i32, + AtomicWriteOp::Relaxed + ) } fn mutex_get_id<'mir, 'tcx: 'mir>( ecx: &MiriEvalContext<'mir, 'tcx>, mutex_op: OpTy<'tcx, Tag>, ) -> InterpResult<'tcx, ScalarMaybeUninit> { - ecx.read_scalar_at_offset(mutex_op, 4, ecx.machine.layouts.u32) + ecx.read_scalar_at_offset_atomic( + mutex_op, 4, ecx.machine.layouts.u32, + AtomicReadOp::Relaxed + ) } fn mutex_set_id<'mir, 'tcx: 'mir>( @@ -86,7 +95,10 @@ fn mutex_set_id<'mir, 'tcx: 'mir>( mutex_op: OpTy<'tcx, Tag>, id: impl Into>, ) -> InterpResult<'tcx, ()> { - ecx.write_scalar_at_offset(mutex_op, 4, id, ecx.machine.layouts.u32) + ecx.write_scalar_at_offset_atomic( + mutex_op, 4, id, ecx.machine.layouts.u32, + AtomicWriteOp::Relaxed + ) } fn mutex_get_or_create_id<'mir, 'tcx: 'mir>( @@ -116,7 +128,10 @@ fn rwlock_get_id<'mir, 'tcx: 'mir>( ecx: &MiriEvalContext<'mir, 'tcx>, rwlock_op: OpTy<'tcx, Tag>, ) -> InterpResult<'tcx, ScalarMaybeUninit> { - ecx.read_scalar_at_offset(rwlock_op, 4, ecx.machine.layouts.u32) + ecx.read_scalar_at_offset_atomic( + rwlock_op, 4, ecx.machine.layouts.u32, + AtomicReadOp::Relaxed + ) } fn rwlock_set_id<'mir, 'tcx: 'mir>( @@ -124,7 +139,10 @@ fn rwlock_set_id<'mir, 'tcx: 'mir>( rwlock_op: OpTy<'tcx, Tag>, id: impl Into>, ) -> InterpResult<'tcx, ()> { - ecx.write_scalar_at_offset(rwlock_op, 4, id, ecx.machine.layouts.u32) + ecx.write_scalar_at_offset_atomic( + rwlock_op, 4, id, ecx.machine.layouts.u32, + AtomicWriteOp::Relaxed + ) } fn rwlock_get_or_create_id<'mir, 'tcx: 'mir>( @@ -177,7 +195,10 @@ fn cond_get_id<'mir, 'tcx: 'mir>( ecx: &MiriEvalContext<'mir, 'tcx>, cond_op: OpTy<'tcx, Tag>, ) -> InterpResult<'tcx, ScalarMaybeUninit> { - ecx.read_scalar_at_offset(cond_op, 4, ecx.machine.layouts.u32) + ecx.read_scalar_at_offset_atomic( + cond_op, 4, ecx.machine.layouts.u32, + AtomicReadOp::Relaxed + ) } fn cond_set_id<'mir, 'tcx: 'mir>( @@ -185,7 +206,10 @@ fn cond_set_id<'mir, 'tcx: 'mir>( cond_op: OpTy<'tcx, Tag>, id: impl Into>, ) -> InterpResult<'tcx, ()> { - ecx.write_scalar_at_offset(cond_op, 4, id, ecx.machine.layouts.u32) + ecx.write_scalar_at_offset_atomic( + cond_op, 4, id, ecx.machine.layouts.u32, + AtomicWriteOp::Relaxed + ) } fn cond_get_or_create_id<'mir, 'tcx: 'mir>( diff --git a/src/shims/posix/thread.rs 
b/src/shims/posix/thread.rs index 7c9c489e6f..0ea20cdff6 100644 --- a/src/shims/posix/thread.rs +++ b/src/shims/posix/thread.rs @@ -15,25 +15,33 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let this = self.eval_context_mut(); this.tcx.sess.warn( - "thread support is experimental. \ - For example, Miri does not detect data races yet.", + "thread support is experimental and incomplete: weak memory effects are not emulated." ); + // Create the new thread let new_thread_id = this.create_thread(); - // Also switch to new thread so that we can push the first stackframe. - let old_thread_id = this.set_active_thread(new_thread_id); + // Write the current thread-id, switch to the next thread later + // to treat this write operation as occurring on the current thread. let thread_info_place = this.deref_operand(thread)?; this.write_scalar( Scalar::from_uint(new_thread_id.to_u32(), thread_info_place.layout.size), thread_info_place.into(), )?; + // Read the function argument that will be sent to the new thread + // before the thread starts executing since reading after the + // context switch will incorrectly report a data-race. let fn_ptr = this.read_scalar(start_routine)?.check_init()?; - let instance = this.memory.get_fn(fn_ptr)?.as_instance()?; - let func_arg = this.read_immediate(arg)?; + // Finally switch to new thread so that we can push the first stackframe. + // After this all accesses will be treated as occurring in the new thread. + let old_thread_id = this.set_active_thread(new_thread_id); + + // Perform the function pointer load in the new thread frame. + let instance = this.memory.get_fn(fn_ptr)?.as_instance()?; + // Note: the returned value is currently ignored (see the FIXME in // pthread_join below) because the Rust standard library does not use // it. @@ -47,6 +55,7 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx StackPopCleanup::None { cleanup: true }, )?; + // Restore the old active thread frame. this.set_active_thread(old_thread_id); Ok(0) diff --git a/src/sync.rs b/src/sync.rs index 0c12da8d68..4d488565fa 100644 --- a/src/sync.rs +++ b/src/sync.rs @@ -61,6 +61,12 @@ struct Mutex { lock_count: usize, /// The queue of threads waiting for this mutex. queue: VecDeque<ThreadId>, + /// Data race handle, this tracks the happens-before + /// relationship between each mutex access. It is + /// released to during unlock and acquired from during + /// locking, and therefore stores the clock of the last + /// thread to release this mutex. + data_race: VClock } declare_id!(RwLockId); @@ -77,6 +83,25 @@ struct RwLock { writer_queue: VecDeque<ThreadId>, /// The queue of reader threads waiting for this lock. reader_queue: VecDeque<ThreadId>, + /// Data race handle for writers, tracks the happens-before + /// ordering between each write access to a rwlock and is updated + /// after a sequence of concurrent readers to track the happens- + /// before ordering between the set of previous readers and + /// the current writer. + /// Contains the clock of the last thread to release a writer + /// lock or the joined clock of the set of last threads to release + /// shared reader locks. + data_race: VClock, + /// Data race handle for readers, this is temporary storage + /// for the combined happens-before ordering between all + /// concurrent readers and the next writer, and the value + /// is stored to the main data_race variable once all + /// readers are finished.
+ /// Has to be stored separately since reader lock acquires + /// must load the clock of the last write and must not + /// add happens-before orderings between shared reader + /// locks. + data_race_reader: VClock, } declare_id!(CondvarId); @@ -94,12 +119,24 @@ struct CondvarWaiter { #[derive(Default, Debug)] struct Condvar { waiters: VecDeque<CondvarWaiter>, + /// Tracks the happens-before relationship + /// between a cond-var signal and a cond-var + /// wait during a non-spurious signal event. + /// Contains the clock of the last thread to + /// perform a condvar-signal. + data_race: VClock, } /// The futex state. #[derive(Default, Debug)] struct Futex { waiters: VecDeque<FutexWaiter>, + /// Tracks the happens-before relationship + /// between a futex-wake and a futex-wait + /// during a non-spurious wake event. + /// Contains the clock of the last thread to + /// perform a futex-wake. + data_race: VClock, } /// A thread waiting on a futex. @@ -205,6 +242,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx mutex.owner = Some(thread); } mutex.lock_count = mutex.lock_count.checked_add(1).unwrap(); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_acquire(&mutex.data_race, thread); + } } /// Try unlocking by decreasing the lock count and returning the old lock @@ -232,6 +272,9 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx mutex.owner = None; // The mutex is completely unlocked. Try transfering ownership // to another thread. + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_release(&mut mutex.data_race, current_owner); + } this.mutex_dequeue_and_lock(id); } Some(old_lock_count) @@ -284,15 +327,20 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let this = self.eval_context_mut(); assert!(!this.rwlock_is_write_locked(id), "the lock is write locked"); trace!("rwlock_reader_lock: {:?} now also held (one more time) by {:?}", id, reader); - let count = this.machine.threads.sync.rwlocks[id].readers.entry(reader).or_insert(0); + let rwlock = &mut this.machine.threads.sync.rwlocks[id]; + let count = rwlock.readers.entry(reader).or_insert(0); *count = count.checked_add(1).expect("the reader counter overflowed"); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_acquire(&rwlock.data_race, reader); + } } /// Try read-unlock the lock for `reader` and potentially give the lock to a new owner. /// Returns `true` if succeeded, `false` if this `reader` did not hold the lock. fn rwlock_reader_unlock(&mut self, id: RwLockId, reader: ThreadId) -> bool { let this = self.eval_context_mut(); - match this.machine.threads.sync.rwlocks[id].readers.entry(reader) { + let rwlock = &mut this.machine.threads.sync.rwlocks[id]; + match rwlock.readers.entry(reader) { Entry::Occupied(mut entry) => { let count = entry.get_mut(); assert!(*count > 0, "rwlock locked with count == 0"); @@ -306,8 +354,18 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx } Entry::Vacant(_) => return false, // we did not even own this lock } + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_release_shared(&mut rwlock.data_race_reader, reader); + } + // The thread was a reader. If the lock is not held any more, give it to a writer.
if this.rwlock_is_locked(id).not() { + + // All the readers are finished, so set the writer data-race handle to the value + // of the union of all reader data race handles, since the set of readers + // happen-before the writers + let rwlock = &mut this.machine.threads.sync.rwlocks[id]; + rwlock.data_race.clone_from(&rwlock.data_race_reader); this.rwlock_dequeue_and_lock_writer(id); } true @@ -332,7 +390,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx let this = self.eval_context_mut(); assert!(!this.rwlock_is_locked(id), "the rwlock is already locked"); trace!("rwlock_writer_lock: {:?} now held by {:?}", id, writer); - this.machine.threads.sync.rwlocks[id].writer = Some(writer); + let rwlock = &mut this.machine.threads.sync.rwlocks[id]; + rwlock.writer = Some(writer); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_acquire(&rwlock.data_race, writer); + } } #[inline] @@ -347,6 +409,13 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx } rwlock.writer = None; trace!("rwlock_writer_unlock: {:?} unlocked by {:?}", id, expected_writer); + // Release memory to both reader and writer vector clocks + // since this writer happens-before both the union of readers once they are finished + // and the next writer + if let Some(data_race) = &this.memory.extra.data_race { + data_race.validate_lock_release(&mut rwlock.data_race, current_writer); + data_race.validate_lock_release(&mut rwlock.data_race_reader, current_writer); + } // The thread was a writer. // // We are prioritizing writers here against the readers. As a @@ -405,10 +474,22 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx /// variable. fn condvar_signal(&mut self, id: CondvarId) -> Option<(ThreadId, MutexId)> { let this = self.eval_context_mut(); - this.machine.threads.sync.condvars[id] - .waiters + let current_thread = this.get_active_thread(); + let condvar = &mut this.machine.threads.sync.condvars[id]; + let data_race = &this.memory.extra.data_race; + + // Each condvar signal happens-before the end of the condvar wake + if let Some(data_race) = data_race { + data_race.validate_lock_release(&mut condvar.data_race, current_thread); + } + condvar.waiters .pop_front() - .map(|waiter| (waiter.thread, waiter.mutex)) + .map(|waiter| { + if let Some(data_race) = data_race { + data_race.validate_lock_acquire(&mut condvar.data_race, waiter.thread); + } + (waiter.thread, waiter.mutex) + }) } #[inline] @@ -420,15 +501,29 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx fn futex_wait(&mut self, addr: Pointer, thread: ThreadId) { let this = self.eval_context_mut(); - let waiters = &mut this.machine.threads.sync.futexes.entry(addr.erase_tag()).or_default().waiters; + let futex = &mut this.machine.threads.sync.futexes.entry(addr.erase_tag()).or_default(); + let waiters = &mut futex.waiters; assert!(waiters.iter().all(|waiter| waiter.thread != thread), "thread is already waiting"); waiters.push_back(FutexWaiter { thread }); } fn futex_wake(&mut self, addr: Pointer) -> Option { let this = self.eval_context_mut(); - let waiters = &mut this.machine.threads.sync.futexes.get_mut(&addr.erase_tag())?.waiters; - waiters.pop_front().map(|waiter| waiter.thread) + let current_thread = this.get_active_thread(); + let futex = &mut this.machine.threads.sync.futexes.get_mut(&addr.erase_tag())?; + let data_race = &this.memory.extra.data_race; + + // Each futex-wake happens-before the end of the 
futex wait + if let Some(data_race) = data_race { + data_race.validate_lock_release(&mut futex.data_race, current_thread); + } + let res = futex.waiters.pop_front().map(|waiter| { + if let Some(data_race) = data_race { + data_race.validate_lock_acquire(&futex.data_race, waiter.thread); + } + waiter.thread + }); + res } fn futex_remove_waiter(&mut self, addr: Pointer, thread: ThreadId) { diff --git a/src/thread.rs b/src/thread.rs index eeaee7dc44..5d78343041 100644 --- a/src/thread.rs +++ b/src/thread.rs @@ -3,6 +3,7 @@ use std::cell::RefCell; use std::collections::hash_map::Entry; use std::convert::TryFrom; +use std::rc::Rc; use std::num::TryFromIntError; use std::time::{Duration, Instant, SystemTime}; @@ -327,7 +328,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { } /// Mark that the active thread tries to join the thread with `joined_thread_id`. - fn join_thread(&mut self, joined_thread_id: ThreadId) -> InterpResult<'tcx> { + fn join_thread(&mut self, joined_thread_id: ThreadId, data_race: &Option>) -> InterpResult<'tcx> { if self.threads[joined_thread_id].join_status != ThreadJoinStatus::Joinable { throw_ub_format!("trying to join a detached or already joined thread"); } @@ -351,6 +352,11 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { self.active_thread, joined_thread_id ); + } else { + // The thread has already terminated - mark join happens-before + if let Some(data_race) = data_race { + data_race.thread_joined(self.active_thread, joined_thread_id); + } } Ok(()) } @@ -425,7 +431,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { /// Wakes up threads joining on the active one and deallocates thread-local statics. /// The `AllocId` that can now be freed is returned. - fn thread_terminated(&mut self) -> Vec { + fn thread_terminated(&mut self, data_race: &Option>) -> Vec { let mut free_tls_statics = Vec::new(); { let mut thread_local_statics = self.thread_local_alloc_ids.borrow_mut(); @@ -440,9 +446,17 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { return false; }); } + // Set the thread into a terminated state in the data-race detector + if let Some(data_race) = data_race { + data_race.thread_terminated(); + } // Check if we need to unblock any threads. for (i, thread) in self.threads.iter_enumerated_mut() { if thread.state == ThreadState::BlockedOnJoin(self.active_thread) { + // The thread has terminated, mark happens-before edge to joining thread + if let Some(data_race) = data_race { + data_race.thread_joined(i, self.active_thread); + } trace!("unblocking {:?} because {:?} terminated", i, self.active_thread); thread.state = ThreadState::Enabled; } @@ -456,7 +470,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { /// used in stateless model checkers such as Loom: run the active thread as /// long as we can and switch only when we have to (the active thread was /// blocked, terminated, or has explicitly asked to be preempted). - fn schedule(&mut self) -> InterpResult<'tcx, SchedulingAction> { + fn schedule(&mut self, data_race: &Option>) -> InterpResult<'tcx, SchedulingAction> { // Check whether the thread has **just** terminated (`check_terminated` // checks whether the thread has popped all its stack and if yes, sets // the thread state to terminated). 
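For orientation, the thread-lifecycle hooks in this file (thread creation, join, termination, scheduling) all reduce to a handful of vector-clock operations. A minimal standalone sketch of the happens-before edges involved, using a simplified clock type rather than the actual Miri `VClock`/`GlobalState` API (all names below are illustrative, not part of this patch), might look like this:

#[derive(Clone, Default, Debug, PartialEq)]
struct Clock(Vec<u32>);

impl Clock {
    /// Pairwise maximum: the receiving thread has now observed everything
    /// the other clock has observed.
    fn join(&mut self, other: &Clock) {
        if self.0.len() < other.0.len() {
            self.0.resize(other.0.len(), 0);
        }
        for (l, &r) in self.0.iter_mut().zip(other.0.iter()) {
            *l = (*l).max(r);
        }
    }
    /// Advance this thread's own component of the clock.
    fn tick(&mut self, tid: usize) {
        if self.0.len() <= tid {
            self.0.resize(tid + 1, 0);
        }
        self.0[tid] += 1;
    }
}

fn main() {
    let mut parent = Clock::default();
    parent.tick(0);

    // On thread creation the child starts from a copy of the parent's clock,
    // so everything the parent did before the spawn happens-before the child.
    let mut child = parent.clone();
    child.tick(1);

    // On join (or termination observed by a joiner) the joining thread absorbs
    // the terminated thread's clock, so the child's work happens-before the join.
    parent.join(&child);
    assert_eq!(parent, Clock(vec![1, 1]));
}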
@@ -501,6 +515,9 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> { if thread.state == ThreadState::Enabled { if !self.yield_active_thread || id != self.active_thread { self.active_thread = id; + if let Some(data_race) = data_race { + data_race.thread_set_active(self.active_thread); + } break; } } @@ -554,7 +571,11 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn create_thread(&mut self) -> ThreadId { let this = self.eval_context_mut(); - this.machine.threads.create_thread() + let id = this.machine.threads.create_thread(); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.thread_created(id); + } + id } #[inline] @@ -566,12 +587,17 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn join_thread(&mut self, joined_thread_id: ThreadId) -> InterpResult<'tcx> { let this = self.eval_context_mut(); - this.machine.threads.join_thread(joined_thread_id) + let data_race = &this.memory.extra.data_race; + this.machine.threads.join_thread(joined_thread_id, data_race)?; + Ok(()) } #[inline] fn set_active_thread(&mut self, thread_id: ThreadId) -> ThreadId { let this = self.eval_context_mut(); + if let Some(data_race) = &this.memory.extra.data_race { + data_race.thread_set_active(thread_id); + } this.machine.threads.set_active_thread_id(thread_id) } @@ -626,6 +652,13 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn set_active_thread_name(&mut self, new_thread_name: Vec<u8>) { let this = self.eval_context_mut(); + if let Some(data_race) = &this.memory.extra.data_race { + if let Ok(string) = String::from_utf8(new_thread_name.clone()) { + data_race.thread_set_name( + this.machine.threads.active_thread, string + ); + } + } this.machine.threads.set_thread_name(new_thread_name); } @@ -695,7 +728,8 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn schedule(&mut self) -> InterpResult<'tcx, SchedulingAction> { let this = self.eval_context_mut(); - this.machine.threads.schedule() + let data_race = &this.memory.extra.data_race; + this.machine.threads.schedule(data_race) } /// Handles thread termination of the active thread: wakes up threads joining on this one, @@ -705,7 +739,8 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriEvalContextExt<'mir, 'tcx #[inline] fn thread_terminated(&mut self) -> InterpResult<'tcx> { let this = self.eval_context_mut(); - for alloc_id in this.machine.threads.thread_terminated() { + let data_race = &this.memory.extra.data_race; + for alloc_id in this.machine.threads.thread_terminated(data_race) { let ptr = this.memory.global_base_pointer(alloc_id.into())?; this.memory.deallocate(ptr, None, MiriMemoryKind::Tls.into())?; } diff --git a/src/vector_clock.rs b/src/vector_clock.rs new file mode 100644 index 0000000000..6840d7e6cb --- /dev/null +++ b/src/vector_clock.rs @@ -0,0 +1,660 @@ +use rustc_data_structures::fx::FxHashMap; +use rustc_index::vec::Idx; +use smallvec::SmallVec; +use std::{ + cmp::Ordering, + convert::TryFrom, + fmt::{self, Debug}, + mem, + ops::Index, +}; + +/// A vector clock index, this is associated with a thread id +/// but in some cases one vector index may be shared with +/// multiple thread ids if it is safe to do so.
+#[derive(Clone, Copy, Debug, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct VectorIdx(u32); + +impl VectorIdx { + #[inline(always)] + pub fn to_u32(self) -> u32 { + self.0 + } + + pub const MAX_INDEX: VectorIdx = VectorIdx(u32::MAX); +} + +impl Idx for VectorIdx { + #[inline] + fn new(idx: usize) -> Self { + VectorIdx(u32::try_from(idx).unwrap()) + } + + #[inline] + fn index(self) -> usize { + usize::try_from(self.0).unwrap() + } +} + +impl From for VectorIdx { + #[inline] + fn from(id: u32) -> Self { + Self(id) + } +} + +/// A sparse mapping of vector index values to vector clocks, this +/// is optimized for the common case with only one element stored +/// inside the map. +/// This is used to store the set of currently active release +/// sequences at a given memory location, since RMW operations +/// allow for multiple release sequences to be active at once +/// and to be collapsed back to one active release sequence +/// once a non RMW atomic store operation occurs. +/// An all zero vector is considered to be equal to no +/// element stored internally since it will never be +/// stored and has no meaning as a release sequence +/// vector clock. +#[derive(Clone)] +pub struct VSmallClockMap(VSmallClockMapInner); + +#[derive(Clone)] +enum VSmallClockMapInner { + /// Zero or 1 vector elements, common + /// case for the sparse set. + /// The all zero vector clock is treated + /// as equal to the empty element. + Small(VectorIdx, VClock), + + /// Hash-map of vector clocks. + Large(FxHashMap), +} + +impl VSmallClockMap { + /// Remove all clock vectors from the map, setting them + /// to the zero vector. + pub fn clear(&mut self) { + match &mut self.0 { + VSmallClockMapInner::Small(_, clock) => clock.set_zero_vector(), + VSmallClockMapInner::Large(hash_map) => { + hash_map.clear(); + } + } + } + + /// Remove all clock vectors except for the clock vector + /// stored at the given index, which is retained. + pub fn retain_index(&mut self, index: VectorIdx) { + match &mut self.0 { + VSmallClockMapInner::Small(small_idx, clock) => { + if index != *small_idx { + // The zero-vector is considered to equal + // the empty element. + clock.set_zero_vector() + } + } + VSmallClockMapInner::Large(hash_map) => { + let value = hash_map.remove(&index).unwrap_or_default(); + self.0 = VSmallClockMapInner::Small(index, value); + } + } + } + + /// Insert the vector clock into the associated vector + /// index. + pub fn insert(&mut self, index: VectorIdx, clock: &VClock) { + match &mut self.0 { + VSmallClockMapInner::Small(small_idx, small_clock) => { + if small_clock.is_zero_vector() { + *small_idx = index; + small_clock.clone_from(clock); + } else if !clock.is_zero_vector() { + // Convert to using the hash-map representation. + let mut hash_map = FxHashMap::default(); + hash_map.insert(*small_idx, mem::take(small_clock)); + hash_map.insert(index, clock.clone()); + self.0 = VSmallClockMapInner::Large(hash_map); + } + } + VSmallClockMapInner::Large(hash_map) => + if !clock.is_zero_vector() { + hash_map.insert(index, clock.clone()); + }, + } + } + + /// Try to load the vector clock associated with the current + /// vector index. 
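+    /// Returns `None` when no clock is stored for `index`, i.e. when the
+    /// clock associated with that index is the all-zero vector.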
+ pub fn get(&self, index: VectorIdx) -> Option<&VClock> { + match &self.0 { + VSmallClockMapInner::Small(small_idx, small_clock) => { + if *small_idx == index && !small_clock.is_zero_vector() { + Some(small_clock) + } else { + None + } + } + VSmallClockMapInner::Large(hash_map) => hash_map.get(&index), + } + } +} + +impl Default for VSmallClockMap { + #[inline] + fn default() -> Self { + VSmallClockMap(VSmallClockMapInner::Small(VectorIdx::new(0), VClock::default())) + } +} + +impl Debug for VSmallClockMap { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Print the contents of the small vector clock set as the map + // of vector index to vector clock that they represent. + let mut map = f.debug_map(); + match &self.0 { + VSmallClockMapInner::Small(small_idx, small_clock) => + if !small_clock.is_zero_vector() { + map.entry(&small_idx, &small_clock); + }, + VSmallClockMapInner::Large(hash_map) => + for (idx, elem) in hash_map.iter() { + map.entry(idx, elem); + }, + } + map.finish() + } +} + +impl PartialEq for VSmallClockMap { + fn eq(&self, other: &Self) -> bool { + use VSmallClockMapInner::*; + match (&self.0, &other.0) { + (Small(i1, c1), Small(i2, c2)) => { + if c1.is_zero_vector() { + // Either they are both zero or they are non-equal + c2.is_zero_vector() + } else { + // At least one is non-zero, so the full comparison is correct + i1 == i2 && c1 == c2 + } + } + (Small(idx, clock), Large(hash_map)) | (Large(hash_map), Small(idx, clock)) => { + if hash_map.len() == 0 { + // Equal to the empty hash-map + clock.is_zero_vector() + } else if hash_map.len() == 1 { + // Equal to the hash-map with one element + let (hash_idx, hash_clock) = hash_map.iter().next().unwrap(); + hash_idx == idx && hash_clock == clock + } else { + false + } + } + (Large(map1), Large(map2)) => map1 == map2, + } + } +} + +impl Eq for VSmallClockMap {} + +/// The size of the vector-clock to store inline +/// clock vectors larger than this will be stored on the heap +const SMALL_VECTOR: usize = 4; + +/// The type of the time-stamps recorded in the data-race detector +/// set to a type of unsigned integer +pub type VTimestamp = u32; + +/// A vector clock for detecting data-races, this is conceptually +/// a map from a vector index (and thus a thread id) to a timestamp. +/// The compare operations require that the invariant that the last +/// element in the internal timestamp slice must not be a 0, hence +/// all zero vector clocks are always represented by the empty slice; +/// and allows for the implementation of compare operations to short +/// circuit the calculation and return the correct result faster, +/// also this means that there is only one unique valid length +/// for each set of vector clock values and hence the PartialEq +// and Eq derivations are correct. 
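+/// For example, the clock that maps vector index 0 to 3 and index 2 to 1 is
+/// stored as `[3, 0, 1]`; storing it as `[3, 0, 1, 0]` would violate the
+/// invariant, so trimming trailing zeros gives every clock a single canonical
+/// representation and slice equality coincides with clock equality.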
+#[derive(PartialEq, Eq, Default, Debug)] +pub struct VClock(SmallVec<[VTimestamp; SMALL_VECTOR]>); + +impl VClock { + /// Create a new vector-clock containing all zeros except + /// for a value at the given index + pub fn new_with_index(index: VectorIdx, timestamp: VTimestamp) -> VClock { + let len = index.index() + 1; + let mut vec = smallvec::smallvec![0; len]; + vec[index.index()] = timestamp; + VClock(vec) + } + + /// Load the internal timestamp slice in the vector clock + #[inline] + pub fn as_slice(&self) -> &[VTimestamp] { + self.0.as_slice() + } + + /// Get a mutable slice to the internal vector with minimum `min_len` + /// elements, to preserve invariants this vector must modify + /// the `min_len`-1 nth element to a non-zero value + #[inline] + fn get_mut_with_min_len(&mut self, min_len: usize) -> &mut [VTimestamp] { + if self.0.len() < min_len { + self.0.resize(min_len, 0); + } + assert!(self.0.len() >= min_len); + self.0.as_mut_slice() + } + + /// Increment the vector clock at a known index + /// this will panic if the vector index overflows + #[inline] + pub fn increment_index(&mut self, idx: VectorIdx) { + let idx = idx.index(); + let mut_slice = self.get_mut_with_min_len(idx + 1); + let idx_ref = &mut mut_slice[idx]; + *idx_ref = idx_ref.checked_add(1).expect("Vector clock overflow") + } + + // Join the two vector-clocks together, this + // sets each vector-element to the maximum value + // of that element in either of the two source elements. + pub fn join(&mut self, other: &Self) { + let rhs_slice = other.as_slice(); + let lhs_slice = self.get_mut_with_min_len(rhs_slice.len()); + for (l, &r) in lhs_slice.iter_mut().zip(rhs_slice.iter()) { + *l = r.max(*l); + } + } + + /// Set the element at the current index of the vector + pub fn set_at_index(&mut self, other: &Self, idx: VectorIdx) { + let idx = idx.index(); + let mut_slice = self.get_mut_with_min_len(idx + 1); + let slice = other.as_slice(); + mut_slice[idx] = slice[idx]; + } + + /// Set the vector to the all-zero vector + #[inline] + pub fn set_zero_vector(&mut self) { + self.0.clear(); + } + + /// Return if this vector is the all-zero vector + pub fn is_zero_vector(&self) -> bool { + self.0.is_empty() + } +} + +impl Clone for VClock { + fn clone(&self) -> Self { + VClock(self.0.clone()) + } + + // Optimized clone-from, can be removed + // and replaced with a derive once a similar + // optimization is inserted into SmallVec's + // clone implementation. + fn clone_from(&mut self, source: &Self) { + let source_slice = source.as_slice(); + self.0.clear(); + self.0.extend_from_slice(source_slice); + } +} + +impl PartialOrd for VClock { + fn partial_cmp(&self, other: &VClock) -> Option { + // Load the values as slices + let lhs_slice = self.as_slice(); + let rhs_slice = other.as_slice(); + + // Iterate through the combined vector slice continuously updating + // the value of `order` to the current comparison of the vector from + // index 0 to the currently checked index. + // An Equal ordering can be converted into Less or Greater ordering + // on finding an element that is less than or greater than the other + // but if one Greater and one Less element-wise comparison is found + // then no ordering is possible and so directly return an ordering + // of None. 
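+        // For example, `[1, 3]` vs `[2, 2]` compares Less at index 0 and
+        // Greater at index 1, so the result is None; `[1, 2]` vs `[1, 2, 4]`
+        // is Equal on the common prefix and only the right-hand side has a
+        // non-zero tail, so the result is Some(Ordering::Less).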
+ let mut iter = lhs_slice.iter().zip(rhs_slice.iter()); + let mut order = match iter.next() { + Some((lhs, rhs)) => lhs.cmp(rhs), + None => Ordering::Equal, + }; + for (l, r) in iter { + match order { + Ordering::Equal => order = l.cmp(r), + Ordering::Less => + if l > r { + return None; + }, + Ordering::Greater => + if l < r { + return None; + }, + } + } + + // Now test if either left or right have trailing elements, + // by the invariant the trailing elements have at least 1 + // non zero value, so no additional calculation is required + // to determine the result of the PartialOrder. + let l_len = lhs_slice.len(); + let r_len = rhs_slice.len(); + match l_len.cmp(&r_len) { + // Equal means no additional elements: return current order + Ordering::Equal => Some(order), + // Right has at least 1 element > than the implicit 0, + // so the only valid values are Ordering::Less or None. + Ordering::Less => match order { + Ordering::Less | Ordering::Equal => Some(Ordering::Less), + Ordering::Greater => None, + }, + // Left has at least 1 element > than the implicit 0, + // so the only valid values are Ordering::Greater or None. + Ordering::Greater => match order { + Ordering::Greater | Ordering::Equal => Some(Ordering::Greater), + Ordering::Less => None, + }, + } + } + + fn lt(&self, other: &VClock) -> bool { + // Load the values as slices + let lhs_slice = self.as_slice(); + let rhs_slice = other.as_slice(); + + // If l_len > r_len then at least one element + // in l_len is > than r_len, therefore the result + // is either Some(Greater) or None, so return false + // early. + let l_len = lhs_slice.len(); + let r_len = rhs_slice.len(); + if l_len <= r_len { + // If any elements on the left are greater than the right + // then the result is None or Some(Greater), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. Otherwise l <= r, finally + // the case where the values are potentially equal needs to be considered + // and false returned as well + let mut equal = l_len == r_len; + for (&l, &r) in lhs_slice.iter().zip(rhs_slice.iter()) { + if l > r { + return false; + } else if l < r { + equal = false; + } + } + !equal + } else { + false + } + } + + fn le(&self, other: &VClock) -> bool { + // Load the values as slices + let lhs_slice = self.as_slice(); + let rhs_slice = other.as_slice(); + + // If l_len > r_len then at least one element + // in l_len is > than r_len, therefore the result + // is either Some(Greater) or None, so return false + // early. + let l_len = lhs_slice.len(); + let r_len = rhs_slice.len(); + if l_len <= r_len { + // If any elements on the left are greater than the right + // then the result is None or Some(Greater), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. Otherwise l <= r + !lhs_slice.iter().zip(rhs_slice.iter()).any(|(&l, &r)| l > r) + } else { + false + } + } + + fn gt(&self, other: &VClock) -> bool { + // Load the values as slices + let lhs_slice = self.as_slice(); + let rhs_slice = other.as_slice(); + + // If r_len > l_len then at least one element + // in r_len is > than l_len, therefore the result + // is either Some(Less) or None, so return false + // early. 
+ let l_len = lhs_slice.len(); + let r_len = rhs_slice.len(); + if l_len >= r_len { + // If any elements on the left are less than the right + // then the result is None or Some(Less), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. Otherwise l >=, finally + // the case where the values are potentially equal needs to be considered + // and false returned as well + let mut equal = l_len == r_len; + for (&l, &r) in lhs_slice.iter().zip(rhs_slice.iter()) { + if l < r { + return false; + } else if l > r { + equal = false; + } + } + !equal + } else { + false + } + } + + fn ge(&self, other: &VClock) -> bool { + // Load the values as slices + let lhs_slice = self.as_slice(); + let rhs_slice = other.as_slice(); + + // If r_len > l_len then at least one element + // in r_len is > than l_len, therefore the result + // is either Some(Less) or None, so return false + // early. + let l_len = lhs_slice.len(); + let r_len = rhs_slice.len(); + if l_len >= r_len { + // If any elements on the left are less than the right + // then the result is None or Some(Less), both of which + // return false, the earlier test asserts that no elements in the + // extended tail violate this assumption. Otherwise l >= r + !lhs_slice.iter().zip(rhs_slice.iter()).any(|(&l, &r)| l < r) + } else { + false + } + } +} + +impl Index for VClock { + type Output = VTimestamp; + + #[inline] + fn index(&self, index: VectorIdx) -> &VTimestamp { + self.as_slice().get(index.to_u32() as usize).unwrap_or(&0) + } +} + +/// Test vector clock ordering operations +/// data-race detection is tested in the external +/// test suite +#[cfg(test)] +mod tests { + + use super::{VClock, VSmallClockMap, VTimestamp, VectorIdx}; + use std::cmp::Ordering; + + #[test] + fn test_equal() { + let mut c1 = VClock::default(); + let mut c2 = VClock::default(); + assert_eq!(c1, c2); + c1.increment_index(VectorIdx(5)); + assert_ne!(c1, c2); + c2.increment_index(VectorIdx(53)); + assert_ne!(c1, c2); + c1.increment_index(VectorIdx(53)); + assert_ne!(c1, c2); + c2.increment_index(VectorIdx(5)); + assert_eq!(c1, c2); + } + + #[test] + fn test_partial_order() { + // Small test + assert_order(&[1], &[1], Some(Ordering::Equal)); + assert_order(&[1], &[2], Some(Ordering::Less)); + assert_order(&[2], &[1], Some(Ordering::Greater)); + assert_order(&[1], &[1, 2], Some(Ordering::Less)); + assert_order(&[2], &[1, 2], None); + + // Misc tests + assert_order(&[400], &[0, 1], None); + + // Large test + assert_order( + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0], + Some(Ordering::Equal), + ); + assert_order( + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 0], + Some(Ordering::Less), + ); + assert_order( + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0], + Some(Ordering::Greater), + ); + assert_order( + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 0], + None, + ); + assert_order( + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0, 0], + Some(Ordering::Less), + ); + assert_order( + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9], + &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 0], + Some(Ordering::Less), + ); + } + + fn from_slice(mut slice: &[VTimestamp]) -> VClock { + while let Some(0) = slice.last() { + slice = &slice[..slice.len() - 1] + } + VClock(smallvec::SmallVec::from_slice(slice)) + } + + fn assert_order(l: &[VTimestamp], r: &[VTimestamp], 
o: Option) { + let l = from_slice(l); + let r = from_slice(r); + + //Test partial_cmp + let compare = l.partial_cmp(&r); + assert_eq!(compare, o, "Invalid comparison\n l: {:?}\n r: {:?}", l, r); + let alt_compare = r.partial_cmp(&l); + assert_eq!( + alt_compare, + o.map(Ordering::reverse), + "Invalid alt comparison\n l: {:?}\n r: {:?}", + l, + r + ); + + //Test operators with faster implementations + assert_eq!( + matches!(compare, Some(Ordering::Less)), + l < r, + "Invalid (<):\n l: {:?}\n r: {:?}", + l, + r + ); + assert_eq!( + matches!(compare, Some(Ordering::Less) | Some(Ordering::Equal)), + l <= r, + "Invalid (<=):\n l: {:?}\n r: {:?}", + l, + r + ); + assert_eq!( + matches!(compare, Some(Ordering::Greater)), + l > r, + "Invalid (>):\n l: {:?}\n r: {:?}", + l, + r + ); + assert_eq!( + matches!(compare, Some(Ordering::Greater) | Some(Ordering::Equal)), + l >= r, + "Invalid (>=):\n l: {:?}\n r: {:?}", + l, + r + ); + assert_eq!( + matches!(alt_compare, Some(Ordering::Less)), + r < l, + "Invalid alt (<):\n l: {:?}\n r: {:?}", + l, + r + ); + assert_eq!( + matches!(alt_compare, Some(Ordering::Less) | Some(Ordering::Equal)), + r <= l, + "Invalid alt (<=):\n l: {:?}\n r: {:?}", + l, + r + ); + assert_eq!( + matches!(alt_compare, Some(Ordering::Greater)), + r > l, + "Invalid alt (>):\n l: {:?}\n r: {:?}", + l, + r + ); + assert_eq!( + matches!(alt_compare, Some(Ordering::Greater) | Some(Ordering::Equal)), + r >= l, + "Invalid alt (>=):\n l: {:?}\n r: {:?}", + l, + r + ); + } + + #[test] + pub fn test_vclock_set() { + let mut map = VSmallClockMap::default(); + let v1 = from_slice(&[3, 0, 1]); + let v2 = from_slice(&[4, 2, 3]); + let v3 = from_slice(&[4, 8, 3]); + map.insert(VectorIdx(0), &v1); + assert_eq!(map.get(VectorIdx(0)), Some(&v1)); + map.insert(VectorIdx(5), &v2); + assert_eq!(map.get(VectorIdx(0)), Some(&v1)); + assert_eq!(map.get(VectorIdx(5)), Some(&v2)); + map.insert(VectorIdx(53), &v3); + assert_eq!(map.get(VectorIdx(0)), Some(&v1)); + assert_eq!(map.get(VectorIdx(5)), Some(&v2)); + assert_eq!(map.get(VectorIdx(53)), Some(&v3)); + map.retain_index(VectorIdx(53)); + assert_eq!(map.get(VectorIdx(0)), None); + assert_eq!(map.get(VectorIdx(5)), None); + assert_eq!(map.get(VectorIdx(53)), Some(&v3)); + map.clear(); + assert_eq!(map.get(VectorIdx(0)), None); + assert_eq!(map.get(VectorIdx(5)), None); + assert_eq!(map.get(VectorIdx(53)), None); + map.insert(VectorIdx(53), &v3); + assert_eq!(map.get(VectorIdx(0)), None); + assert_eq!(map.get(VectorIdx(5)), None); + assert_eq!(map.get(VectorIdx(53)), Some(&v3)); + } +} diff --git a/tests/compile-fail/data_race/atomic_read_na_write_race1.rs b/tests/compile-fail/data_race/atomic_read_na_write_race1.rs new file mode 100644 index 0000000000..0b9610edc6 --- /dev/null +++ b/tests/compile-fail/data_race/atomic_read_na_write_race1.rs @@ -0,0 +1,31 @@ +// ignore-windows: Concurrency on Windows is not supported yet. 
+#![feature(core_intrinsics)] + +use std::thread::spawn; +use std::sync::atomic::AtomicUsize; +use std::intrinsics::atomic_load; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = AtomicUsize::new(0); + let b = &mut a as *mut AtomicUsize; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + *(c.0 as *mut usize) = 32; + }); + + let j2 = spawn(move || { + //Equivalent to: (&*c.0).load(Ordering::SeqCst) + atomic_load(c.0 as *mut usize) //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/atomic_read_na_write_race2.rs b/tests/compile-fail/data_race/atomic_read_na_write_race2.rs new file mode 100644 index 0000000000..779babefd8 --- /dev/null +++ b/tests/compile-fail/data_race/atomic_read_na_write_race2.rs @@ -0,0 +1,31 @@ +// ignore-windows: Concurrency on Windows is not supported yet. + +use std::thread::spawn; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = AtomicUsize::new(0); + let b = &mut a as *mut AtomicUsize; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + let atomic_ref = &mut *c.0; + atomic_ref.load(Ordering::SeqCst) + }); + + let j2 = spawn(move || { + let atomic_ref = &mut *c.0; + *atomic_ref.get_mut() = 32; //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/atomic_write_na_read_race1.rs b/tests/compile-fail/data_race/atomic_write_na_read_race1.rs new file mode 100644 index 0000000000..3211a5ae53 --- /dev/null +++ b/tests/compile-fail/data_race/atomic_write_na_read_race1.rs @@ -0,0 +1,31 @@ +// ignore-windows: Concurrency on Windows is not supported yet. + +use std::thread::spawn; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = AtomicUsize::new(0); + let b = &mut a as *mut AtomicUsize; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + let atomic_ref = &mut *c.0; + atomic_ref.store(32, Ordering::SeqCst) + }); + + let j2 = spawn(move || { + let atomic_ref = &mut *c.0; + *atomic_ref.get_mut() //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/atomic_write_na_read_race2.rs b/tests/compile-fail/data_race/atomic_write_na_read_race2.rs new file mode 100644 index 0000000000..131d4e07b8 --- /dev/null +++ b/tests/compile-fail/data_race/atomic_write_na_read_race2.rs @@ -0,0 +1,31 @@ +// ignore-windows: Concurrency on Windows is not supported yet. 
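The atomic-versus-non-atomic compile-fail tests above all pair one access through the atomic API (or the `atomic_load`/`atomic_store` intrinsics) with a plain access to the same location, which is what the detector reports. For contrast, when both threads go through `AtomicUsize`, atomic accesses never race with each other; a minimal race-free sketch using only stable `std` APIs (not one of the tests in this PR):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread::spawn;

fn main() {
    let a = Arc::new(AtomicUsize::new(0));

    let a1 = Arc::clone(&a);
    let j1 = spawn(move || a1.store(32, Ordering::SeqCst)); // atomic write

    let a2 = Arc::clone(&a);
    let j2 = spawn(move || a2.load(Ordering::SeqCst)); // atomic read: no race

    j1.join().unwrap();
    let v = j2.join().unwrap();
    // The interleaving is still nondeterministic, but neither outcome is UB.
    assert!(v == 0 || v == 32);
}
```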
+#![feature(core_intrinsics)] + +use std::thread::spawn; +use std::sync::atomic::AtomicUsize; +use std::intrinsics::atomic_store; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = AtomicUsize::new(0); + let b = &mut a as *mut AtomicUsize; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + *(c.0 as *mut usize) + }); + + let j2 = spawn(move || { + //Equivalent to: (&*c.0).store(32, Ordering::SeqCst) + atomic_store(c.0 as *mut usize, 32); //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/atomic_write_na_write_race1.rs b/tests/compile-fail/data_race/atomic_write_na_write_race1.rs new file mode 100644 index 0000000000..74adf7ae4b --- /dev/null +++ b/tests/compile-fail/data_race/atomic_write_na_write_race1.rs @@ -0,0 +1,31 @@ +// ignore-windows: Concurrency on Windows is not supported yet. +#![feature(core_intrinsics)] + +use std::thread::spawn; +use std::sync::atomic::AtomicUsize; +use std::intrinsics::atomic_store; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = AtomicUsize::new(0); + let b = &mut a as *mut AtomicUsize; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + *(c.0 as *mut usize) = 32; + }); + + let j2 = spawn(move || { + //Equivalent to: (&*c.0).store(64, Ordering::SeqCst) + atomic_store(c.0 as *mut usize, 64); //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/atomic_write_na_write_race2.rs b/tests/compile-fail/data_race/atomic_write_na_write_race2.rs new file mode 100644 index 0000000000..75ad755fbd --- /dev/null +++ b/tests/compile-fail/data_race/atomic_write_na_write_race2.rs @@ -0,0 +1,31 @@ +// ignore-windows: Concurrency on Windows is not supported yet. + +use std::thread::spawn; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = AtomicUsize::new(0); + let b = &mut a as *mut AtomicUsize; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + let atomic_ref = &mut *c.0; + atomic_ref.store(64, Ordering::SeqCst); + }); + + let j2 = spawn(move || { + let atomic_ref = &mut *c.0; + *atomic_ref.get_mut() = 32; //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/dangling_thread_async_race.rs b/tests/compile-fail/data_race/dangling_thread_async_race.rs new file mode 100644 index 0000000000..d8b5d82f83 --- /dev/null +++ b/tests/compile-fail/data_race/dangling_thread_async_race.rs @@ -0,0 +1,44 @@ +// ignore-windows: Concurrency on Windows is not supported yet. 
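Several of the race2 variants above reach the non-atomic side of the race through `AtomicUsize::get_mut`, which hands out a plain `&mut usize` and is therefore an ordinary, non-atomic access. Used as intended, with exclusive access and no concurrently running threads, it is fine; a small sketch of the safe pattern (again only a sketch, not part of this test suite):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread::spawn;

fn main() {
    let mut a = AtomicUsize::new(0);

    // While we hold exclusive access, a plain write through get_mut is fine:
    // no other thread can observe `a` yet.
    *a.get_mut() = 32;

    // Once the value is shared, only the atomic API is used.
    let shared = Arc::new(a);
    let reader = {
        let shared = Arc::clone(&shared);
        spawn(move || shared.load(Ordering::SeqCst))
    };
    // The spawn happens-after the plain write, so the reader must see 32.
    assert_eq!(reader.join().unwrap(), 32);
}
```

The tests trigger the error precisely because they defeat this exclusivity guarantee with a raw pointer smuggled across threads via `EvilSend`.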
+// compile-flags: -Zmiri-disable-isolation + +use std::thread::{spawn, sleep}; +use std::time::Duration; +use std::mem; + + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + + +fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + + let join = unsafe { + spawn(move || { + *c.0 = 32; + }) + }; + + // Detatch the thread and sleep until it terminates + mem::drop(join); + sleep(Duration::from_millis(100)); + + // Spawn and immediately join a thread + // to execute the join code-path + // and ensure that data-race detection + // remains enabled nevertheless. + spawn(|| ()).join().unwrap(); + + let join2 = unsafe { + spawn(move || { + *c.0 = 64; //~ ERROR Data race + }) + }; + + join2.join().unwrap(); +} diff --git a/tests/compile-fail/data_race/dangling_thread_race.rs b/tests/compile-fail/data_race/dangling_thread_race.rs new file mode 100644 index 0000000000..172b05bd4f --- /dev/null +++ b/tests/compile-fail/data_race/dangling_thread_race.rs @@ -0,0 +1,41 @@ +// ignore-windows: Concurrency on Windows is not supported yet. +// compile-flags: -Zmiri-disable-isolation + +use std::thread::{spawn, sleep}; +use std::time::Duration; +use std::mem; + + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + + +fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + + let join = unsafe { + spawn(move || { + *c.0 = 32; + }) + }; + + // Detatch the thread and sleep until it terminates + mem::drop(join); + sleep(Duration::from_millis(100)); + + // Spawn and immediately join a thread + // to execute the join code-path + // and ensure that data-race detection + // remains enabled nevertheless. + spawn(|| ()).join().unwrap(); + + + unsafe { + *c.0 = 64; //~ ERROR Data race + } +} diff --git a/tests/compile-fail/data_race/enable_after_join_to_main.rs b/tests/compile-fail/data_race/enable_after_join_to_main.rs new file mode 100644 index 0000000000..c294317771 --- /dev/null +++ b/tests/compile-fail/data_race/enable_after_join_to_main.rs @@ -0,0 +1,38 @@ +// ignore-windows: Concurrency on Windows is not supported yet. + +use std::thread::spawn; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + // Enable and then join with multiple threads. + let t1 = spawn(|| ()); + let t2 = spawn(|| ()); + let t3 = spawn(|| ()); + let t4 = spawn(|| ()); + t1.join().unwrap(); + t2.join().unwrap(); + t3.join().unwrap(); + t4.join().unwrap(); + + // Perform write-write data race detection. + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + *c.0 = 32; + }); + + let j2 = spawn(move || { + *c.0 = 64; //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/read_write_race.rs b/tests/compile-fail/data_race/read_write_race.rs new file mode 100644 index 0000000000..42fd7a51ff --- /dev/null +++ b/tests/compile-fail/data_race/read_write_race.rs @@ -0,0 +1,27 @@ +// ignore-windows: Concurrency on Windows is not supported yet. 
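The two dangling-thread tests above rely on the fact that dropping a `JoinHandle` and sleeping until the detached thread has (probably) finished establishes no happens-before edge, so the later write still races; `enable_after_join_to_main.rs` additionally checks that joining unrelated threads does not switch the detector off. Actually joining the handle does synchronize; a race-free counterpart might look like this sketch (the `SendPtr` wrapper is hypothetical, the same idea as the `EvilSend` helper used throughout these tests):

```rust
use std::thread::spawn;

// Hypothetical wrapper so a raw pointer can be moved into a thread.
struct SendPtr(*mut u32);
unsafe impl Send for SendPtr {}

fn main() {
    let mut a = 0u32;
    let ptr = &mut a as *mut u32;
    let p = SendPtr(ptr);

    let handle = spawn(move || unsafe { *p.0 = 32 });

    // join() synchronizes-with the child thread's termination, so the write
    // below cannot race with the write inside the closure.
    handle.join().unwrap();

    unsafe {
        *ptr = 64;
        assert_eq!(*ptr, 64);
    }
}
```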
+ +use std::thread::spawn; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + *c.0 + }); + + let j2 = spawn(move || { + *c.0 = 64; //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/relax_acquire_race.rs b/tests/compile-fail/data_race/relax_acquire_race.rs new file mode 100644 index 0000000000..2ae0aacbcf --- /dev/null +++ b/tests/compile-fail/data_race/relax_acquire_race.rs @@ -0,0 +1,50 @@ +// ignore-windows: Concurrency on Windows is not supported yet. + +use std::thread::spawn; +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +static SYNC: AtomicUsize = AtomicUsize::new(0); + +pub fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + + // Note: this is scheduler-dependent + // the operations need to occur in + // order: + // 1. store release : 1 + // 2. load acquire : 1 + // 3. store relaxed : 2 + // 4. load acquire : 2 + unsafe { + let j1 = spawn(move || { + *c.0 = 1; + SYNC.store(1, Ordering::Release); + }); + + let j2 = spawn(move || { + if SYNC.load(Ordering::Acquire) == 1 { + SYNC.store(2, Ordering::Relaxed); + } + }); + + let j3 = spawn(move || { + if SYNC.load(Ordering::Acquire) == 2 { + *c.0 //~ ERROR Data race + } else { + 0 + } + }); + + j1.join().unwrap(); + j2.join().unwrap(); + j3.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/release_seq_race.rs b/tests/compile-fail/data_race/release_seq_race.rs new file mode 100644 index 0000000000..59263cb712 --- /dev/null +++ b/tests/compile-fail/data_race/release_seq_race.rs @@ -0,0 +1,55 @@ +// ignore-windows: Concurrency on Windows is not supported yet. +// compile-flags: -Zmiri-disable-isolation + +use std::thread::{spawn, sleep}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::time::Duration; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +static SYNC: AtomicUsize = AtomicUsize::new(0); + +pub fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + + // Note: this is scheduler-dependent + // the operations need to occur in + // order, the sleep operations currently + // force the desired ordering: + // 1. store release : 1 + // 2. store relaxed : 2 + // 3. store relaxed : 3 + // 4. load acquire : 3 + unsafe { + let j1 = spawn(move || { + *c.0 = 1; + SYNC.store(1, Ordering::Release); + sleep(Duration::from_millis(100)); + SYNC.store(3, Ordering::Relaxed); + }); + + let j2 = spawn(move || { + // Blocks the acquire-release sequence + SYNC.store(2, Ordering::Relaxed); + }); + + let j3 = spawn(move || { + sleep(Duration::from_millis(1000)); + if SYNC.load(Ordering::Acquire) == 3 { + *c.0 //~ ERROR Data race + } else { + 0 + } + }); + + j1.join().unwrap(); + j2.join().unwrap(); + j3.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/rmw_race.rs b/tests/compile-fail/data_race/rmw_race.rs new file mode 100644 index 0000000000..e523f8b374 --- /dev/null +++ b/tests/compile-fail/data_race/rmw_race.rs @@ -0,0 +1,51 @@ +// ignore-windows: Concurrency on Windows is not supported yet. 
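`relax_acquire_race.rs` above breaks the synchronization chain deliberately: thread 3 acquire-loads a value that was only ever written with a `Relaxed` store, so nothing in thread 1 happens-before the final read of `*c.0`. Upgrading that middle store to `Release` restores a release/acquire chain (thread 1 → thread 2 → thread 3, by transitivity of happens-before) and removes the race. A sketch of the fixed pattern, with a hypothetical `SendPtr` standing in for `EvilSend`:

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread::spawn;

#[derive(Copy, Clone)]
struct SendPtr(*mut u32); // hypothetical stand-in for EvilSend
unsafe impl Send for SendPtr {}

static SYNC: AtomicUsize = AtomicUsize::new(0);

fn main() {
    let mut a = 0u32;
    let c = SendPtr(&mut a as *mut u32);

    let j1 = spawn(move || unsafe {
        *c.0 = 1;
        SYNC.store(1, Ordering::Release);
    });

    let j2 = spawn(move || {
        if SYNC.load(Ordering::Acquire) == 1 {
            // Release (not Relaxed) forwards the synchronization to thread 3.
            SYNC.store(2, Ordering::Release);
        }
    });

    let j3 = spawn(move || unsafe {
        if SYNC.load(Ordering::Acquire) == 2 {
            *c.0 // ordered after thread 1's write: no data race
        } else {
            0
        }
    });

    j1.join().unwrap();
    j2.join().unwrap();
    let v = j3.join().unwrap();
    assert!(v == 0 || v == 1);
}
```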
+ +use std::thread::spawn; +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +static SYNC: AtomicUsize = AtomicUsize::new(0); + +pub fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + + // Note: this is scheduler-dependent + // the operations need to occur in + // order: + // 1. store release : 1 + // 2. RMW relaxed : 1 -> 2 + // 3. store relaxed : 3 + // 4. load acquire : 3 + unsafe { + let j1 = spawn(move || { + *c.0 = 1; + SYNC.store(1, Ordering::Release); + }); + + let j2 = spawn(move || { + if SYNC.swap(2, Ordering::Relaxed) == 1 { + // Blocks the acquire-release sequence + SYNC.store(3, Ordering::Relaxed); + } + }); + + let j3 = spawn(move || { + if SYNC.load(Ordering::Acquire) == 3 { + *c.0 //~ ERROR Data race + } else { + 0 + } + }); + + j1.join().unwrap(); + j2.join().unwrap(); + j3.join().unwrap(); + } +} diff --git a/tests/compile-fail/data_race/write_write_race.rs b/tests/compile-fail/data_race/write_write_race.rs new file mode 100644 index 0000000000..aca19a46c1 --- /dev/null +++ b/tests/compile-fail/data_race/write_write_race.rs @@ -0,0 +1,27 @@ +// ignore-windows: Concurrency on Windows is not supported yet. + +use std::thread::spawn; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + *c.0 = 32; + }); + + let j2 = spawn(move || { + *c.0 = 64; //~ ERROR Data race + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/run-pass/concurrency/data_race.rs b/tests/run-pass/concurrency/data_race.rs new file mode 100644 index 0000000000..64e90024ed --- /dev/null +++ b/tests/run-pass/concurrency/data_race.rs @@ -0,0 +1,122 @@ +// ignore-windows: Concurrency on Windows is not supported yet. 
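`write_write_race.rs` above is the most basic case: two unsynchronized plain writes to the same `u32`. Any form of synchronization removes the race; the most direct fix is to put the value behind a `Mutex`. A minimal sketch (not part of this test suite):

```rust
use std::sync::{Arc, Mutex};
use std::thread::spawn;

fn main() {
    let a = Arc::new(Mutex::new(0u32));

    let a1 = Arc::clone(&a);
    let j1 = spawn(move || *a1.lock().unwrap() = 32);

    let a2 = Arc::clone(&a);
    let j2 = spawn(move || *a2.lock().unwrap() = 64);

    j1.join().unwrap();
    j2.join().unwrap();

    // Whichever thread locked last wins, but both writes were synchronized.
    let v = *a.lock().unwrap();
    assert!(v == 32 || v == 64);
}
```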
+ + +use std::sync::atomic::{AtomicUsize, fence, Ordering}; +use std::thread::spawn; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +static SYNC: AtomicUsize = AtomicUsize::new(0); + +fn test_fence_sync() { + let mut var = 0u32; + let ptr = &mut var as *mut u32; + let evil_ptr = EvilSend(ptr); + + + let j1 = spawn(move || { + unsafe { *evil_ptr.0 = 1; } + fence(Ordering::Release); + SYNC.store(1, Ordering::Relaxed) + }); + + let j2 = spawn(move || { + if SYNC.load(Ordering::Relaxed) == 1 { + fence(Ordering::Acquire); + unsafe { *evil_ptr.0 } + } else { + 0 + } + }); + + j1.join().unwrap(); + j2.join().unwrap(); +} + + +fn test_multiple_reads() { + let mut var = 42u32; + let ptr = &mut var as *mut u32; + let evil_ptr = EvilSend(ptr); + + let j1 = spawn(move || unsafe {*evil_ptr.0}); + let j2 = spawn(move || unsafe {*evil_ptr.0}); + let j3 = spawn(move || unsafe {*evil_ptr.0}); + let j4 = spawn(move || unsafe {*evil_ptr.0}); + + assert_eq!(j1.join().unwrap(), 42); + assert_eq!(j2.join().unwrap(), 42); + assert_eq!(j3.join().unwrap(), 42); + assert_eq!(j4.join().unwrap(), 42); + + var = 10; + assert_eq!(var, 10); +} + +pub fn test_rmw_no_block() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + + unsafe { + let j1 = spawn(move || { + *c.0 = 1; + SYNC.store(1, Ordering::Release); + }); + + let j2 = spawn(move || { + if SYNC.swap(2, Ordering::Relaxed) == 1 { + //No op, blocking store removed + } + }); + + let j3 = spawn(move || { + if SYNC.load(Ordering::Acquire) == 2 { + *c.0 + } else { + 0 + } + }); + + j1.join().unwrap(); + j2.join().unwrap(); + let v = j3.join().unwrap(); + assert!(v == 1 || v == 2); + } +} + +pub fn test_release_no_block() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + + unsafe { + let j1 = spawn(move || { + *c.0 = 1; + SYNC.store(1, Ordering::Release); + SYNC.store(3, Ordering::Relaxed); + }); + + let j2 = spawn(move || { + if SYNC.load(Ordering::Acquire) == 3 { + *c.0 + } else { + 0 + } + }); + + j1.join().unwrap(); + assert_eq!(j2.join().unwrap(),1); + } +} + +pub fn main() { + test_fence_sync(); + test_multiple_reads(); + test_rmw_no_block(); + test_release_no_block(); +} diff --git a/tests/run-pass/concurrency/data_race.stderr b/tests/run-pass/concurrency/data_race.stderr new file mode 100644 index 0000000000..03676519d4 --- /dev/null +++ b/tests/run-pass/concurrency/data_race.stderr @@ -0,0 +1,2 @@ +warning: thread support is experimental and incomplete: weak memory effects are not emulated. + diff --git a/tests/run-pass/concurrency/disable_data_race_detector.rs b/tests/run-pass/concurrency/disable_data_race_detector.rs new file mode 100644 index 0000000000..8b2d180f11 --- /dev/null +++ b/tests/run-pass/concurrency/disable_data_race_detector.rs @@ -0,0 +1,28 @@ +// ignore-windows: Concurrency on Windows is not supported yet. 
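`test_fence_sync` in the run-pass test above synchronizes through `fence(Release)` before a `Relaxed` store and `fence(Acquire)` after a `Relaxed` load. For this particular message-passing shape, tagging the flag accesses themselves gives the same happens-before edge with less machinery; the sketch below shows the fence-free equivalent, with the raw-pointer payload swapped for a second `Relaxed` atomic so the example stays in safe code (fences are more general, since they cover all neighbouring atomics, so this equivalence is only claimed for this shape):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread::spawn;

static DATA: AtomicUsize = AtomicUsize::new(0);
static READY: AtomicUsize = AtomicUsize::new(0);

fn main() {
    let producer = spawn(|| {
        DATA.store(42, Ordering::Relaxed);
        // Release store replaces fence(Release) followed by a Relaxed store.
        READY.store(1, Ordering::Release);
    });

    let consumer = spawn(|| {
        // Acquire load replaces a Relaxed load followed by fence(Acquire).
        if READY.load(Ordering::Acquire) == 1 {
            DATA.load(Ordering::Relaxed) // must observe 42
        } else {
            0
        }
    });

    producer.join().unwrap();
    let v = consumer.join().unwrap();
    assert!(v == 0 || v == 42);
}
```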
+// compile-flags: -Zmiri-disable-data-race-detector + +use std::thread::spawn; + +#[derive(Copy, Clone)] +struct EvilSend(pub T); + +unsafe impl Send for EvilSend {} +unsafe impl Sync for EvilSend {} + +pub fn main() { + let mut a = 0u32; + let b = &mut a as *mut u32; + let c = EvilSend(b); + unsafe { + let j1 = spawn(move || { + *c.0 = 32; + }); + + let j2 = spawn(move || { + *c.0 = 64; //~ ERROR Data race (but not detected as the detector is disabled) + }); + + j1.join().unwrap(); + j2.join().unwrap(); + } +} diff --git a/tests/run-pass/concurrency/disable_data_race_detector.stderr b/tests/run-pass/concurrency/disable_data_race_detector.stderr new file mode 100644 index 0000000000..03676519d4 --- /dev/null +++ b/tests/run-pass/concurrency/disable_data_race_detector.stderr @@ -0,0 +1,2 @@ +warning: thread support is experimental and incomplete: weak memory effects are not emulated. + diff --git a/tests/run-pass/concurrency/linux-futex.stderr b/tests/run-pass/concurrency/linux-futex.stderr index 2dbfb7721d..03676519d4 100644 --- a/tests/run-pass/concurrency/linux-futex.stderr +++ b/tests/run-pass/concurrency/linux-futex.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. For example, Miri does not detect data races yet. +warning: thread support is experimental and incomplete: weak memory effects are not emulated. diff --git a/tests/run-pass/concurrency/simple.stderr b/tests/run-pass/concurrency/simple.stderr index 7060411278..f46b1442d7 100644 --- a/tests/run-pass/concurrency/simple.stderr +++ b/tests/run-pass/concurrency/simple.stderr @@ -1,4 +1,4 @@ -warning: thread support is experimental. For example, Miri does not detect data races yet. +warning: thread support is experimental and incomplete: weak memory effects are not emulated. thread '' panicked at 'Hello!', $DIR/simple.rs:54:9 note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace diff --git a/tests/run-pass/concurrency/sync.stderr b/tests/run-pass/concurrency/sync.stderr index 2dbfb7721d..03676519d4 100644 --- a/tests/run-pass/concurrency/sync.stderr +++ b/tests/run-pass/concurrency/sync.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. For example, Miri does not detect data races yet. +warning: thread support is experimental and incomplete: weak memory effects are not emulated. diff --git a/tests/run-pass/concurrency/thread_locals.stderr b/tests/run-pass/concurrency/thread_locals.stderr index 2dbfb7721d..03676519d4 100644 --- a/tests/run-pass/concurrency/thread_locals.stderr +++ b/tests/run-pass/concurrency/thread_locals.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. For example, Miri does not detect data races yet. +warning: thread support is experimental and incomplete: weak memory effects are not emulated. diff --git a/tests/run-pass/concurrency/tls_lib_drop.stderr b/tests/run-pass/concurrency/tls_lib_drop.stderr index 2dbfb7721d..03676519d4 100644 --- a/tests/run-pass/concurrency/tls_lib_drop.stderr +++ b/tests/run-pass/concurrency/tls_lib_drop.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. For example, Miri does not detect data races yet. +warning: thread support is experimental and incomplete: weak memory effects are not emulated. diff --git a/tests/run-pass/libc.stderr b/tests/run-pass/libc.stderr index 2dbfb7721d..03676519d4 100644 --- a/tests/run-pass/libc.stderr +++ b/tests/run-pass/libc.stderr @@ -1,2 +1,2 @@ -warning: thread support is experimental. For example, Miri does not detect data races yet. 
+warning: thread support is experimental and incomplete: weak memory effects are not emulated. diff --git a/tests/run-pass/panic/concurrent-panic.stderr b/tests/run-pass/panic/concurrent-panic.stderr index eb5b5f59a0..1ee688c1d3 100644 --- a/tests/run-pass/panic/concurrent-panic.stderr +++ b/tests/run-pass/panic/concurrent-panic.stderr @@ -1,4 +1,4 @@ -warning: thread support is experimental. For example, Miri does not detect data races yet. +warning: thread support is experimental and incomplete: weak memory effects are not emulated. Thread 1 starting, will block on mutex Thread 1 reported it has started