diff --git a/build.rs b/build.rs
index af52789fcd03..bcf81cce58e9 100644
--- a/build.rs
+++ b/build.rs
@@ -183,7 +183,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             // to be a big chunk of work to implement them all there!
             ("simd", _) if target.contains("aarch64") => return true,
 
-            ("simd", "simd_bit_shift") => return true, // FIXME Unsupported feature: proposed SIMD operator I8x16Shl
             ("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I16x8NarrowI32x4S
             ("simd", "simd_f32x4") => return true, // FIXME expected V128(F32x4([CanonicalNan, CanonicalNan, Value(Float32 { bits: 0 }), Value(Float32 { bits: 0 })])), got V128(18428729675200069632)
             ("simd", "simd_f64x2") => return true, // FIXME expected V128(F64x2([Value(Float64 { bits: 9221120237041090560 }), Value(Float64 { bits: 0 })])), got V128(0)
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 6bcc2b94f40e..f38e4249bfa4 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -493,8 +493,8 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }
 
-    // SIMD shift right (arithmetic)
-    for ty in &[I16, I32, I64] {
+    // SIMD shift right (arithmetic, i16x8 and i32x4)
+    for ty in &[I16, I32] {
         let sshr = sshr.bind(vector(*ty, sse_vector_size));
         let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
         narrow.legalize(
@@ -502,6 +502,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
             vec![def!(b = bitcast_i64x2(y)), def!(a = x86_psra(x, b))],
         );
     }
+    // SIMD shift right (arithmetic, i8x16)
     {
         let sshr = sshr.bind(vector(I8, sse_vector_size));
         let bitcast_i64x2 = bitcast.bind(vector(I64, sse_vector_size));
@@ -526,6 +527,25 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
             ],
         );
     }
+    // SIMD shift right (arithmetic, i64x2): expanded lane by lane into scalar i64 `sshr`.
+    {
+        let sshr_vector = sshr.bind(vector(I64, sse_vector_size));
+        let sshr_scalar_lane0 = sshr.bind(I64); // scalar i64 shift applied to lane 0
+        let sshr_scalar_lane1 = sshr.bind(I64); // identical binding, applied to lane 1
+        narrow.legalize(
+            def!(z = sshr_vector(x, y)),
+            vec![
+                // Lane 0: extract it, shift it as a scalar by `y`, insert the result into `x`.
+                def!(a = extractlane(x, uimm8_zero)),
+                def!(b = sshr_scalar_lane0(a, y)),
+                def!(c = insertlane(x, uimm8_zero, b)),
+                // Lane 1: same steps, but insert into `c` so the shifted lane 0 is kept.
+                def!(d = extractlane(x, uimm8_one)),
+                def!(e = sshr_scalar_lane1(d, y)),
+                def!(z = insertlane(c, uimm8_one, e)),
+            ],
+        );
+    }
 
     // SIMD select
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
index 0b14984ed691..102719351b9d 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-legalize.clif
@@ -84,6 +84,20 @@ block0:
     return v2
 }
 
+function %sshr_i64x2() -> i64x2 {
+block0:
+    v0 = iconst.i32 1
+    v1 = vconst.i64x2 [1 2]
+    v2 = sshr v1, v0  ; expected to legalize into the extract/shift/insert sequence below
+    ; check:  v3 = x86_pextr v1, 0
+    ; nextln: v4 = sshr v3, v0
+    ; nextln: v5 = x86_pinsr v1, 0, v4
+    ; nextln: v6 = x86_pextr v1, 1
+    ; nextln: v7 = sshr v6, v0
+    ; nextln: v2 = x86_pinsr v5, 1, v7
+    return v2
+}
+
 function %bitselect_i16x8() -> i16x8 {
 block0:
     v0 = vconst.i16x8 [0 0 0 0 0 0 0 0]
diff --git a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
index cceb63cddfc1..0f6ba31ed811 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-bitwise-run.clif
@@ -145,6 +145,16 @@ block0:
 }
 ; run
 
+function %sshr_i64x2(i64x2, i32) -> i64x2 {
+block0(v0:i64x2, v1:i32):
+    v2 = sshr v0, v1  ; arithmetic (sign-propagating) right shift of each i64 lane by v1
+    return v2
+}
+; run: %sshr_i64x2([1 -1], 0) == [1 -1]
+; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; the sign bit is shifted in, so -1 stays -1
+; run: %sshr_i64x2([2 -2], 1) == [1 -1]
+; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
+
 function %bitselect_i8x16() -> b1 {
 block0:
     v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255]  ; the selector vector
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index f98dee0232ab..7513e456e3d6 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1402,7 +1402,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         Operator::I8x16Shl | Operator::I16x8Shl | Operator::I32x4Shl | Operator::I64x2Shl => {
             let (a, b) = state.pop2();
             let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
-            let bitwidth = i64::from(builder.func.dfg.value_type(a).bits());
+            let bitwidth = i64::from(type_of(op).lane_bits());
             // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
             // we do `b AND 15`; this means fewer instructions than `iconst + urem`.
             let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
@@ -1411,16 +1411,16 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         Operator::I8x16ShrU | Operator::I16x8ShrU | Operator::I32x4ShrU | Operator::I64x2ShrU => {
             let (a, b) = state.pop2();
             let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
-            let bitwidth = i64::from(builder.func.dfg.value_type(a).bits());
+            let bitwidth = i64::from(type_of(op).lane_bits());
             // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
             // we do `b AND 15`; this means fewer instructions than `iconst + urem`.
             let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
             state.push1(builder.ins().ushr(bitcast_a, b_mod_bitwidth))
         }
-        Operator::I8x16ShrS | Operator::I16x8ShrS | Operator::I32x4ShrS => {
+        Operator::I8x16ShrS | Operator::I16x8ShrS | Operator::I32x4ShrS | Operator::I64x2ShrS => {
             let (a, b) = state.pop2();
             let bitcast_a = optionally_bitcast_vector(a, type_of(op), builder);
-            let bitwidth = i64::from(builder.func.dfg.value_type(a).bits());
+            let bitwidth = i64::from(type_of(op).lane_bits());
             // The spec expects to shift with `b mod lanewidth`; so, e.g., for 16 bit lane-width
             // we do `b AND 15`; this means fewer instructions than `iconst + urem`.
             let b_mod_bitwidth = builder.ins().band_imm(b, bitwidth - 1);
@@ -1544,7 +1544,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         }
         Operator::I8x16Mul
         | Operator::I64x2Mul
-        | Operator::I64x2ShrS
         | Operator::I32x4TruncSatF32x4S
         | Operator::I32x4TruncSatF32x4U
         | Operator::I64x2TruncSatF64x2S