diff --git a/compiler/rustc_codegen_ssa/src/base.rs b/compiler/rustc_codegen_ssa/src/base.rs index 3e9d29df02cdf..19e36442dce19 100644 --- a/compiler/rustc_codegen_ssa/src/base.rs +++ b/compiler/rustc_codegen_ssa/src/base.rs @@ -17,10 +17,7 @@ use rustc_ast::expand::allocator::AllocatorKind; use rustc_attr as attr; use rustc_data_structures::fx::{FxHashMap, FxHashSet}; use rustc_data_structures::profiling::{get_resident_set_size, print_time_passes_entry}; - -use rustc_data_structures::sync::par_iter; -#[cfg(parallel_compiler)] -use rustc_data_structures::sync::ParallelIterator; +use rustc_data_structures::sync::par_map; use rustc_hir as hir; use rustc_hir::def_id::{DefId, LOCAL_CRATE}; use rustc_hir::lang_items::LangItem; @@ -682,7 +679,7 @@ pub fn codegen_crate( // This likely is a temporary measure. Once we don't have to support the // non-parallel compiler anymore, we can compile CGUs end-to-end in // parallel and get rid of the complicated scheduling logic. - let mut pre_compiled_cgus = if cfg!(parallel_compiler) { + let mut pre_compiled_cgus = if rustc_data_structures::sync::active() { tcx.sess.time("compile_first_CGU_batch", || { // Try to find one CGU to compile per thread. let cgus: Vec<_> = cgu_reuse @@ -695,12 +692,10 @@ pub fn codegen_crate( // Compile the found CGUs in parallel. let start_time = Instant::now(); - let pre_compiled_cgus = par_iter(cgus) - .map(|(i, _)| { - let module = backend.compile_codegen_unit(tcx, codegen_units[i].name()); - (i, module) - }) - .collect(); + let pre_compiled_cgus = par_map(cgus, |(i, _)| { + let module = backend.compile_codegen_unit(tcx, codegen_units[i].name()); + (i, module) + }); total_codegen_time += start_time.elapsed(); diff --git a/compiler/rustc_data_structures/Cargo.toml b/compiler/rustc_data_structures/Cargo.toml index 39f4bc63c88d1..06ce1adc685c9 100644 --- a/compiler/rustc_data_structures/Cargo.toml +++ b/compiler/rustc_data_structures/Cargo.toml @@ -10,11 +10,11 @@ arrayvec = { version = "0.7", default-features = false } bitflags = "1.2.1" cfg-if = "1.0" ena = "0.14.2" -indexmap = { version = "1.9.3" } +indexmap = { version = "1.9.3", features = ["rustc-rayon"] } jobserver_crate = { version = "0.1.13", package = "jobserver" } libc = "0.2" measureme = "10.0.0" -rustc-rayon-core = { version = "0.5.0", optional = true } +rustc-rayon-core = { version = "0.5.0" } rustc-rayon = { version = "0.5.0", optional = true } rustc_graphviz = { path = "../rustc_graphviz" } rustc-hash = "1.1.0" @@ -51,4 +51,4 @@ features = [ memmap2 = "0.2.1" [features] -rustc_use_parallel_compiler = ["indexmap/rustc-rayon", "rustc-rayon", "rustc-rayon-core"] +rustc_use_parallel_compiler = ["rustc-rayon"] \ No newline at end of file diff --git a/compiler/rustc_data_structures/src/sharded.rs b/compiler/rustc_data_structures/src/sharded.rs index bd7a86f67800f..f720e5110a93d 100644 --- a/compiler/rustc_data_structures/src/sharded.rs +++ b/compiler/rustc_data_structures/src/sharded.rs @@ -1,5 +1,5 @@ use crate::fx::{FxHashMap, FxHasher}; -use crate::sync::{Lock, LockGuard}; +use crate::sync::{active, Lock, LockGuard}; use std::borrow::Borrow; use std::collections::hash_map::RawEntryMut; use std::hash::{Hash, Hasher}; @@ -9,20 +9,17 @@ use std::mem; #[cfg_attr(parallel_compiler, repr(align(64)))] struct CacheAligned(T); -#[cfg(parallel_compiler)] // 32 shards is sufficient to reduce contention on an 8-core Ryzen 7 1700, // but this should be tested on higher core count CPUs. How the `Sharded` type gets used // may also affect the ideal number of shards. 
const SHARD_BITS: usize = 5; -#[cfg(not(parallel_compiler))] -const SHARD_BITS: usize = 0; - pub const SHARDS: usize = 1 << SHARD_BITS; /// An array of cache-line aligned inner locked structures with convenience methods. pub struct Sharded { shards: [CacheAligned>; SHARDS], + single_thread: bool, } impl Default for Sharded { @@ -35,31 +32,41 @@ impl Default for Sharded { impl Sharded { #[inline] pub fn new(mut value: impl FnMut() -> T) -> Self { - Sharded { shards: [(); SHARDS].map(|()| CacheAligned(Lock::new(value()))) } + Sharded { + shards: [(); SHARDS].map(|()| CacheAligned(Lock::new(value()))), + single_thread: !active(), + } } /// The shard is selected by hashing `val` with `FxHasher`. #[inline] pub fn get_shard_by_value(&self, val: &K) -> &Lock { - if SHARDS == 1 { &self.shards[0].0 } else { self.get_shard_by_hash(make_hash(val)) } + if self.single_thread { &self.shards[0].0 } else { self.get_shard_by_hash(make_hash(val)) } } #[inline] pub fn get_shard_by_hash(&self, hash: u64) -> &Lock { - &self.shards[get_shard_index_by_hash(hash)].0 - } - - #[inline] - pub fn get_shard_by_index(&self, i: usize) -> &Lock { - &self.shards[i].0 + if self.single_thread { + &self.shards[0].0 + } else { + &self.shards[get_shard_index_by_hash(hash)].0 + } } pub fn lock_shards(&self) -> Vec> { - (0..SHARDS).map(|i| self.shards[i].0.lock()).collect() + if self.single_thread { + vec![self.shards[0].0.lock()] + } else { + (0..SHARDS).map(|i| self.shards[i].0.lock()).collect() + } } pub fn try_lock_shards(&self) -> Option>> { - (0..SHARDS).map(|i| self.shards[i].0.try_lock()).collect() + if self.single_thread { + Some(vec![self.shards[0].0.try_lock()?]) + } else { + (0..SHARDS).map(|i| self.shards[i].0.try_lock()).collect() + } } } @@ -141,7 +148,7 @@ pub fn make_hash(val: &K) -> u64 { /// consistently for each `Sharded` instance. #[inline] #[allow(clippy::modulo_one)] -pub fn get_shard_index_by_hash(hash: u64) -> usize { +fn get_shard_index_by_hash(hash: u64) -> usize { let hash_len = mem::size_of::(); // Ignore the top 7 bits as hashbrown uses these and get the next SHARD_BITS highest bits. // hashbrown also uses the lowest bits, so we can't use those diff --git a/compiler/rustc_data_structures/src/sync.rs b/compiler/rustc_data_structures/src/sync.rs index ef1da85198fd4..534ba7f9c15e4 100644 --- a/compiler/rustc_data_structures/src/sync.rs +++ b/compiler/rustc_data_structures/src/sync.rs @@ -40,8 +40,13 @@ //! [^2] `MTLockRef` is a typedef. use crate::owned_slice::OwnedSlice; +use std::cell::{Cell, UnsafeCell}; use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; use std::hash::{BuildHasher, Hash}; +use std::intrinsics::likely; +use std::marker::PhantomData; +use std::mem::MaybeUninit; use std::ops::{Deref, DerefMut}; use std::panic::{catch_unwind, resume_unwind, AssertUnwindSafe}; @@ -51,91 +56,49 @@ pub use std::sync::atomic::Ordering::SeqCst; pub use vec::{AppendOnlyIndexVec, AppendOnlyVec}; mod vec; +use parking_lot::lock_api::RawMutex as _; +use parking_lot::RawMutex; -cfg_if! { - if #[cfg(not(parallel_compiler))] { - pub unsafe auto trait Send {} - pub unsafe auto trait Sync {} - - unsafe impl Send for T {} - unsafe impl Sync for T {} - - use std::ops::Add; - - /// This is a single threaded variant of `AtomicU64`, `AtomicUsize`, etc. - /// It has explicit ordering arguments and is only intended for use with - /// the native atomic types. - /// You should use this type through the `AtomicU64`, `AtomicUsize`, etc, type aliases - /// as it's not intended to be used separately. 
- #[derive(Debug, Default)] - pub struct Atomic(Cell); +mod mode { + use super::Ordering; + use std::sync::atomic::AtomicU8; - impl Atomic { - #[inline] - pub fn new(v: T) -> Self { - Atomic(Cell::new(v)) - } + const UNINITIALIZED: u8 = 0; + const INACTIVE: u8 = 1; + const ACTIVE: u8 = 2; - #[inline] - pub fn into_inner(self) -> T { - self.0.into_inner() - } + static MODE: AtomicU8 = AtomicU8::new(UNINITIALIZED); - #[inline] - pub fn load(&self, _: Ordering) -> T { - self.0.get() - } - - #[inline] - pub fn store(&self, val: T, _: Ordering) { - self.0.set(val) - } - - #[inline] - pub fn swap(&self, val: T, _: Ordering) -> T { - self.0.replace(val) - } + #[inline] + pub fn active() -> bool { + match MODE.load(Ordering::Relaxed) { + INACTIVE => false, + ACTIVE => true, + // Should panic here. Just for speed test. + // _ => panic!("uninitialized parallel mode!"), + _ => false, } + } - impl Atomic { - pub fn fetch_or(&self, val: bool, _: Ordering) -> bool { - let result = self.0.get() | val; - self.0.set(val); - result - } - } + // Only set by the `-Z threads` compile option + pub fn set(parallel: bool) { + let set: u8 = if parallel { ACTIVE } else { INACTIVE }; + let previous = + MODE.compare_exchange(UNINITIALIZED, set, Ordering::Relaxed, Ordering::Relaxed); - impl Atomic { - #[inline] - pub fn compare_exchange(&self, - current: T, - new: T, - _: Ordering, - _: Ordering) - -> Result { - let read = self.0.get(); - if read == current { - self.0.set(new); - Ok(read) - } else { - Err(read) - } - } - } + // Check that the mode was either uninitialized or was already set to the requested mode. + assert!(previous.is_ok() || previous == Err(set)); + } +} - impl + Copy> Atomic { - #[inline] - pub fn fetch_add(&self, val: T, _: Ordering) -> T { - let old = self.0.get(); - self.0.set(old + val); - old - } - } +pub use mode::{active, set}; +cfg_if! { + if #[cfg(not(parallel_compiler))] { + pub unsafe auto trait Send {} + pub unsafe auto trait Sync {} - pub type AtomicUsize = Atomic; - pub type AtomicBool = Atomic; - pub type AtomicU32 = Atomic; - pub type AtomicU64 = Atomic; + unsafe impl Send for T {} + unsafe impl Sync for T {} pub fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA, @@ -146,7 +109,7 @@ cfg_if! { #[macro_export] macro_rules! parallel { - ($($blocks:tt),*) => { + ($($blocks:block),*) => { // We catch panics here ensuring that all the blocks execute. // This makes behavior consistent with the parallel compiler. let mut panic = None; @@ -165,12 +128,6 @@ cfg_if! { } } - pub use Iterator as ParallelIterator; - - pub fn par_iter(t: T) -> T::IntoIter { - t.into_iter() - } - pub fn par_for_each_in(t: T, mut for_each: impl FnMut(T::Item) + Sync + Send) { // We catch panics here ensuring that all the loop iterations execute. // This makes behavior consistent with the parallel compiler. @@ -187,194 +144,290 @@ cfg_if! 
{ } } - pub type MetadataRef = OwnedSlice; - - pub use std::rc::Rc as Lrc; - pub use std::rc::Weak as Weak; - pub use std::cell::Ref as ReadGuard; - pub use std::cell::Ref as MappedReadGuard; - pub use std::cell::RefMut as WriteGuard; - pub use std::cell::RefMut as MappedWriteGuard; - pub use std::cell::RefMut as LockGuard; - pub use std::cell::RefMut as MappedLockGuard; - - pub use std::cell::OnceCell; - - use std::cell::RefCell as InnerRwLock; - use std::cell::RefCell as InnerLock; - - use std::cell::Cell; - - #[derive(Debug)] - pub struct WorkerLocal(OneThread); - - impl WorkerLocal { - /// Creates a new worker local where the `initial` closure computes the - /// value this worker local should take for each thread in the thread pool. - #[inline] - pub fn new T>(mut f: F) -> WorkerLocal { - WorkerLocal(OneThread::new(f(0))) - } - - /// Returns the worker-local value for each thread - #[inline] - pub fn into_inner(self) -> Vec { - vec![OneThread::into_inner(self.0)] + pub fn par_map>( + t: T, + mut map: impl FnMut(<::IntoIter as Iterator>::Item) -> R, + ) -> C { + // We catch panics here ensuring that all the loop iterations execute. + let mut panic = None; + let r = t.into_iter().filter_map(|i| { + match catch_unwind(AssertUnwindSafe(|| map(i))) { + Ok(r) => Some(r), + Err(p) => { + if panic.is_none() { + panic = Some(p); + } + None + } + } + }).collect(); + if let Some(panic) = panic { + resume_unwind(panic); } + r } - impl Deref for WorkerLocal { - type Target = T; + pub type MetadataRef = OwnedSlice; + } else { + pub use std::marker::Send as Send; + pub use std::marker::Sync as Sync; - #[inline(always)] - fn deref(&self) -> &T { - &self.0 + #[inline] + pub fn join(oper_a: A, oper_b: B) -> (RA, RB) + where + A: FnOnce() -> RA + DynSend, + B: FnOnce() -> RB + DynSend, + { + if mode::active() { + let oper_a = FromDyn::from(oper_a); + let oper_b = FromDyn::from(oper_b); + let (a, b) = rayon::join(move || FromDyn::from(oper_a.into_inner()()), move || FromDyn::from(oper_b.into_inner()())); + (a.into_inner(), b.into_inner()) + } else { + (oper_a(), oper_b()) } } - pub type MTLockRef<'a, T> = &'a mut MTLock; + // This function only works when `mode::active()`. + pub fn scope<'scope, OP, R>(op: OP) -> R + where + OP: FnOnce(&rayon::Scope<'scope>) -> R + DynSend, + R: DynSend, + { + let op = FromDyn::from(op); + rayon::scope(|s| FromDyn::from(op.into_inner()(s))).into_inner() + } - #[derive(Debug, Default)] - pub struct MTLock(T); + /// Runs a list of blocks in parallel. The first block is executed immediately on + /// the current thread. Use that for the longest running block. + #[macro_export] + macro_rules! parallel { + (impl $fblock:block [$($c:expr,)*] [$block:expr $(, $rest:expr)*]) => { + parallel!(impl $fblock [$block, $($c,)*] [$($rest),*]) + }; + (impl $fblock:block [$($blocks:expr,)*] []) => { + ::rustc_data_structures::sync::scope(|s| { + $(let block = rustc_data_structures::sync::FromDyn::from(|| $blocks); + s.spawn(move |_| block.into_inner()());)* + (|| $fblock)(); + }); + }; + ($fblock:block, $($blocks:block),*) => { + if rustc_data_structures::sync::active() { + // Reverse the order of the later blocks since Rayon executes them in reverse order + // when using a single thread. This ensures the execution order matches that + // of a single threaded rustc + parallel!(impl $fblock [] [$($blocks),*]); + } else { + // We catch panics here ensuring that all the blocks execute. + // This makes behavior consistent with the parallel compiler. 
+ let mut panic = None; + if let Err(p) = ::std::panic::catch_unwind( + ::std::panic::AssertUnwindSafe(|| $fblock) + ) { + if panic.is_none() { + panic = Some(p); + } + } + $( + if let Err(p) = ::std::panic::catch_unwind( + ::std::panic::AssertUnwindSafe(|| $blocks) + ) { + if panic.is_none() { + panic = Some(p); + } + } + )* + if let Some(panic) = panic { + ::std::panic::resume_unwind(panic); + } + } + }; + } - impl MTLock { - #[inline(always)] - pub fn new(inner: T) -> Self { - MTLock(inner) - } + use rayon::iter::{FromParallelIterator, IntoParallelIterator, ParallelIterator}; - #[inline(always)] - pub fn into_inner(self) -> T { - self.0 - } + pub fn par_for_each_in + IntoParallelIterator>( + t: T, + for_each: impl Fn(I) + DynSync + DynSend + ) { + if mode::active() { + let for_each = FromDyn::from(for_each); + let panic: Lock> = Lock::new(None); + t.into_par_iter().for_each(|i| if let Err(p) = catch_unwind(AssertUnwindSafe(|| for_each(i))) { + let mut l = panic.lock(); + if l.is_none() { + *l = Some(p) + } + }); - #[inline(always)] - pub fn get_mut(&mut self) -> &mut T { - &mut self.0 + if let Some(panic) = panic.into_inner() { + resume_unwind(panic); + } + } else { + // We catch panics here ensuring that all the loop iterations execute. + // This makes behavior consistent with the parallel compiler. + let mut panic = None; + t.into_iter().for_each(|i| { + if let Err(p) = catch_unwind(AssertUnwindSafe(|| for_each(i))) { + if panic.is_none() { + panic = Some(p); + } + } + }); + if let Some(panic) = panic { + resume_unwind(panic); + } } + } - #[inline(always)] - pub fn lock(&self) -> &T { - &self.0 - } + pub fn par_map< + I, + T: IntoIterator + IntoParallelIterator, + R: std::marker::Send, + C: FromIterator + FromParallelIterator + >( + t: T, + map: impl Fn(I) -> R + DynSync + DynSend + ) -> C { + if mode::active() { + let panic: Lock> = Lock::new(None); + let map = FromDyn::from(map); + // We catch panics here ensuring that all the loop iterations execute. + let r = t.into_par_iter().filter_map(|i| { + match catch_unwind(AssertUnwindSafe(|| map(i))) { + Ok(r) => Some(r), + Err(p) => { + let mut l = panic.lock(); + if l.is_none() { + *l = Some(p); + } + None + }, + } + }).collect(); - #[inline(always)] - pub fn lock_mut(&mut self) -> &mut T { - &mut self.0 + if let Some(panic) = panic.into_inner() { + resume_unwind(panic); + } + r + } else { + // We catch panics here ensuring that all the loop iterations execute. 
+ let mut panic = None; + let r = t.into_iter().filter_map(|i| { + match catch_unwind(AssertUnwindSafe(|| map(i))) { + Ok(r) => Some(r), + Err(p) => { + if panic.is_none() { + panic = Some(p); + } + None + } + } + }).collect(); + if let Some(panic) = panic { + resume_unwind(panic); + } + r } } - // FIXME: Probably a bad idea (in the threaded case) - impl Clone for MTLock { - #[inline] - fn clone(&self) -> Self { - MTLock(self.0.clone()) - } - } - } else { - pub use std::marker::Send as Send; - pub use std::marker::Sync as Sync; + pub type MetadataRef = OwnedSlice; - pub use parking_lot::RwLockReadGuard as ReadGuard; - pub use parking_lot::MappedRwLockReadGuard as MappedReadGuard; - pub use parking_lot::RwLockWriteGuard as WriteGuard; - pub use parking_lot::MappedRwLockWriteGuard as MappedWriteGuard; + } +} - pub use parking_lot::MutexGuard as LockGuard; - pub use parking_lot::MappedMutexGuard as MappedLockGuard; +pub use std::sync::Arc as Lrc; - pub use std::sync::OnceLock as OnceCell; +use std::thread; - pub use std::sync::atomic::{AtomicBool, AtomicUsize, AtomicU32, AtomicU64}; +use parking_lot::RwLock as InnerRwLock; - pub use std::sync::Arc as Lrc; - pub use std::sync::Weak as Weak; +pub use parking_lot::MappedRwLockReadGuard as MappedReadGuard; +pub use parking_lot::MappedRwLockWriteGuard as MappedWriteGuard; +pub use parking_lot::RwLockReadGuard as ReadGuard; +pub use parking_lot::RwLockWriteGuard as WriteGuard; - pub type MTLockRef<'a, T> = &'a MTLock; +pub use std::sync::OnceLock as OnceCell; - #[derive(Debug, Default)] - pub struct MTLock(Lock); +pub use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, AtomicUsize}; - impl MTLock { - #[inline(always)] - pub fn new(inner: T) -> Self { - MTLock(Lock::new(inner)) - } +pub type MTLockRef<'a, T> = &'a MTLock; - #[inline(always)] - pub fn into_inner(self) -> T { - self.0.into_inner() - } +#[derive(Debug, Default)] +pub struct MTLock(Lock); - #[inline(always)] - pub fn get_mut(&mut self) -> &mut T { - self.0.get_mut() - } +impl MTLock { + #[inline(always)] + pub fn new(inner: T) -> Self { + MTLock(Lock::new(inner)) + } - #[inline(always)] - pub fn lock(&self) -> LockGuard<'_, T> { - self.0.lock() - } + #[inline(always)] + pub fn into_inner(self) -> T { + self.0.into_inner() + } - #[inline(always)] - pub fn lock_mut(&self) -> LockGuard<'_, T> { - self.lock() - } - } + #[inline(always)] + pub fn get_mut(&mut self) -> &mut T { + self.0.get_mut() + } - use parking_lot::Mutex as InnerLock; - use parking_lot::RwLock as InnerRwLock; + #[inline(always)] + pub fn lock(&self) -> LockGuard<'_, T> { + self.0.lock() + } - use std::thread; - pub use rayon::{join, scope}; + #[inline(always)] + pub fn lock_mut(&self) -> LockGuard<'_, T> { + self.lock() + } +} - /// Runs a list of blocks in parallel. The first block is executed immediately on - /// the current thread. Use that for the longest running block. - #[macro_export] - macro_rules! parallel { - (impl $fblock:tt [$($c:tt,)*] [$block:tt $(, $rest:tt)*]) => { - parallel!(impl $fblock [$block, $($c,)*] [$($rest),*]) - }; - (impl $fblock:tt [$($blocks:tt,)*] []) => { - ::rustc_data_structures::sync::scope(|s| { - $( - s.spawn(|_| $blocks); - )* - $fblock; - }) - }; - ($fblock:tt, $($blocks:tt),*) => { - // Reverse the order of the later blocks since Rayon executes them in reverse order - // when using a single thread. 
This ensures the execution order matches that - // of a single threaded rustc - parallel!(impl $fblock [] [$($blocks),*]); - }; - } +/// This makes locks panic if they are already held. +/// It is only useful when you are running in a single thread +const ERROR_CHECKING: bool = false; - pub use rayon_core::WorkerLocal; +pub unsafe trait DynSend {} +pub unsafe trait DynSync {} - pub use rayon::iter::ParallelIterator; - use rayon::iter::IntoParallelIterator; +unsafe impl DynSend for T where T: Send {} +unsafe impl DynSync for T where T: Sync {} - pub fn par_iter(t: T) -> T::Iter { - t.into_par_iter() - } +#[derive(Copy, Clone)] +pub struct FromDyn(T); - pub fn par_for_each_in( - t: T, - for_each: impl Fn(T::Item) + Sync + Send, - ) { - let ps: Vec<_> = t.into_par_iter().map(|i| catch_unwind(AssertUnwindSafe(|| for_each(i)))).collect(); - ps.into_iter().for_each(|p| if let Err(panic) = p { - resume_unwind(panic) - }); - } +impl FromDyn { + #[inline(always)] + pub fn from(val: T) -> Self { + // Check that `sync::active()` is true on creation so we can + // implement `Send` and `Sync` for this structure when `T` + // implements `DynSend` and `DynSync` respectively. + #[cfg(parallel_compiler)] + assert!(mode::active()); + FromDyn(val) + } - pub type MetadataRef = OwnedSlice; + #[inline(always)] + pub fn into_inner(self) -> T { + self.0 + } +} + +// `FromDyn` is `Send` if `T` is `DynSend`, since it ensures that sync::active() is true. +#[cfg(parallel_compiler)] +unsafe impl Send for FromDyn {} + +// `FromDyn` is `Sync` if `T` is `DynSync`, since it ensures that sync::active() is true. +#[cfg(parallel_compiler)] +unsafe impl Sync for FromDyn {} + +impl std::ops::Deref for FromDyn { + type Target = T; - /// This makes locks panic if they are already held. - /// It is only useful when you are running in a single thread - const ERROR_CHECKING: bool = false; + #[inline(always)] + fn deref(&self) -> &Self::Target { + &self.0 } } @@ -395,53 +448,86 @@ impl HashMapExt for HashMap } } -#[derive(Debug)] -pub struct Lock(InnerLock); +pub struct Lock { + single_thread: bool, + data: UnsafeCell, + borrow: Cell, + mutex: RawMutex, +} + +impl Debug for Lock { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self.try_lock() { + Some(guard) => f.debug_struct("Lock").field("data", &&*guard).finish(), + None => { + struct LockedPlaceholder; + impl Debug for LockedPlaceholder { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.write_str("") + } + } + + f.debug_struct("Lock").field("data", &LockedPlaceholder).finish() + } + } + } +} impl Lock { - #[inline(always)] - pub fn new(inner: T) -> Self { - Lock(InnerLock::new(inner)) + #[inline] + pub fn new(val: T) -> Self { + Lock { + single_thread: !active(), + data: UnsafeCell::new(val), + borrow: Cell::new(false), + mutex: RawMutex::INIT, + } } - #[inline(always)] + #[inline] pub fn into_inner(self) -> T { - self.0.into_inner() + self.data.into_inner() } - #[inline(always)] + #[inline] pub fn get_mut(&mut self) -> &mut T { - self.0.get_mut() + self.data.get_mut() } - #[cfg(parallel_compiler)] - #[inline(always)] - pub fn try_lock(&self) -> Option> { - self.0.try_lock() - } - - #[cfg(not(parallel_compiler))] - #[inline(always)] + #[inline] pub fn try_lock(&self) -> Option> { - self.0.try_borrow_mut().ok() + // SAFETY: the `&mut T` is accessible as long as self exists. 
+ if likely(self.single_thread) { + if self.borrow.get() { + None + } else { + self.borrow.set(true); + Some(LockGuard { lock: &self, marker: PhantomData }) + } + } else { + if !self.mutex.try_lock() { + None + } else { + Some(LockGuard { lock: &self, marker: PhantomData }) + } + } } - #[cfg(parallel_compiler)] - #[inline(always)] - #[track_caller] - pub fn lock(&self) -> LockGuard<'_, T> { - if ERROR_CHECKING { - self.0.try_lock().expect("lock was already held") + #[inline(never)] + fn lock_raw(&self) { + if likely(self.single_thread) { + assert!(!self.borrow.get()); + self.borrow.set(true); } else { - self.0.lock() + self.mutex.lock(); } } - #[cfg(not(parallel_compiler))] #[inline(always)] #[track_caller] pub fn lock(&self) -> LockGuard<'_, T> { - self.0.borrow_mut() + self.lock_raw(); + LockGuard { lock: &self, marker: PhantomData } } #[inline(always)] @@ -470,6 +556,48 @@ impl Default for Lock { } } +// Just for speed test +unsafe impl std::marker::Send for Lock {} +unsafe impl std::marker::Sync for Lock {} + +pub struct LockGuard<'a, T> { + lock: &'a Lock, + marker: PhantomData<&'a mut T>, +} + +impl Deref for LockGuard<'_, T> { + type Target = T; + + #[inline(always)] + fn deref(&self) -> &T { + unsafe { &*self.lock.data.get() } + } +} + +impl DerefMut for LockGuard<'_, T> { + #[inline(always)] + fn deref_mut(&mut self) -> &mut T { + unsafe { &mut *self.lock.data.get() } + } +} + +#[inline(never)] +fn unlock_mt(guard: &mut LockGuard<'_, T>) { + unsafe { guard.lock.mutex.unlock() } +} + +impl<'a, T> Drop for LockGuard<'a, T> { + #[inline] + fn drop(&mut self) { + if likely(self.lock.single_thread) { + debug_assert!(self.lock.borrow.get()); + self.lock.borrow.set(false); + } else { + unlock_mt(self) + } + } +} + #[derive(Debug, Default)] pub struct RwLock(InnerRwLock); @@ -489,14 +617,6 @@ impl RwLock { self.0.get_mut() } - #[cfg(not(parallel_compiler))] - #[inline(always)] - #[track_caller] - pub fn read(&self) -> ReadGuard<'_, T> { - self.0.borrow() - } - - #[cfg(parallel_compiler)] #[inline(always)] pub fn read(&self) -> ReadGuard<'_, T> { if ERROR_CHECKING { @@ -512,26 +632,11 @@ impl RwLock { f(&*self.read()) } - #[cfg(not(parallel_compiler))] - #[inline(always)] - pub fn try_write(&self) -> Result, ()> { - self.0.try_borrow_mut().map_err(|_| ()) - } - - #[cfg(parallel_compiler)] #[inline(always)] pub fn try_write(&self) -> Result, ()> { self.0.try_write().ok_or(()) } - #[cfg(not(parallel_compiler))] - #[inline(always)] - #[track_caller] - pub fn write(&self) -> WriteGuard<'_, T> { - self.0.borrow_mut() - } - - #[cfg(parallel_compiler)] #[inline(always)] pub fn write(&self) -> WriteGuard<'_, T> { if ERROR_CHECKING { @@ -559,13 +664,6 @@ impl RwLock { self.write() } - #[cfg(not(parallel_compiler))] - #[inline(always)] - pub fn leak(&self) -> &T { - ReadGuard::leak(self.read()) - } - - #[cfg(parallel_compiler)] #[inline(always)] pub fn leak(&self) -> &T { let guard = self.read(); @@ -583,34 +681,68 @@ impl Clone for RwLock { } } +#[derive(Debug)] +pub struct WorkerLocal { + single_thread: bool, + inner: T, + mt_inner: Option>, +} + +impl WorkerLocal { + /// Creates a new worker local where the `initial` closure computes the + /// value this worker local should take for each thread in the thread pool. 
+ #[inline] + pub fn new T>(mut f: F) -> WorkerLocal { + if !active() { + WorkerLocal { single_thread: true, inner: f(0), mt_inner: None } + } else { + WorkerLocal { + single_thread: false, + inner: unsafe { MaybeUninit::uninit().assume_init() }, + mt_inner: Some(rayon_core::WorkerLocal::new(f)), + } + } + } + + /// Returns the worker-local value for each thread + #[inline] + pub fn into_inner(self) -> Vec { + if self.single_thread { vec![self.inner] } else { self.mt_inner.unwrap().into_inner() } + } +} + +impl Deref for WorkerLocal { + type Target = T; + + #[inline(always)] + fn deref(&self) -> &T { + if self.single_thread { &self.inner } else { self.mt_inner.as_ref().unwrap().deref() } + } +} + +// Just for speed test +unsafe impl std::marker::Sync for WorkerLocal {} + /// A type which only allows its inner value to be used in one thread. /// It will panic if it is used on multiple threads. #[derive(Debug)] pub struct OneThread { - #[cfg(parallel_compiler)] thread: thread::ThreadId, inner: T, } -#[cfg(parallel_compiler)] unsafe impl std::marker::Sync for OneThread {} -#[cfg(parallel_compiler)] unsafe impl std::marker::Send for OneThread {} impl OneThread { #[inline(always)] fn check(&self) { - #[cfg(parallel_compiler)] assert_eq!(thread::current().id(), self.thread); } #[inline(always)] pub fn new(inner: T) -> Self { - OneThread { - #[cfg(parallel_compiler)] - thread: thread::current().id(), - inner, - } + OneThread { thread: thread::current().id(), inner } } #[inline(always)] diff --git a/compiler/rustc_data_structures/src/sync/vec.rs b/compiler/rustc_data_structures/src/sync/vec.rs index 1783b4b357257..f3430b8c736d3 100644 --- a/compiler/rustc_data_structures/src/sync/vec.rs +++ b/compiler/rustc_data_structures/src/sync/vec.rs @@ -4,39 +4,22 @@ use rustc_index::vec::Idx; #[derive(Default)] pub struct AppendOnlyIndexVec { - #[cfg(not(parallel_compiler))] - vec: elsa::vec::FrozenVec, - #[cfg(parallel_compiler)] vec: elsa::sync::LockFreeFrozenVec, _marker: PhantomData, } impl AppendOnlyIndexVec { pub fn new() -> Self { - Self { - #[cfg(not(parallel_compiler))] - vec: elsa::vec::FrozenVec::new(), - #[cfg(parallel_compiler)] - vec: elsa::sync::LockFreeFrozenVec::new(), - _marker: PhantomData, - } + Self { vec: elsa::sync::LockFreeFrozenVec::new(), _marker: PhantomData } } pub fn push(&self, val: T) -> I { - #[cfg(not(parallel_compiler))] - let i = self.vec.len(); - #[cfg(not(parallel_compiler))] - self.vec.push(val); - #[cfg(parallel_compiler)] let i = self.vec.push(val); I::new(i) } pub fn get(&self, i: I) -> Option { let i = i.index(); - #[cfg(not(parallel_compiler))] - return self.vec.get_copy(i); - #[cfg(parallel_compiler)] return self.vec.get(i); } } diff --git a/compiler/rustc_error_messages/Cargo.toml b/compiler/rustc_error_messages/Cargo.toml index 481c94266f271..19e62918aaadc 100644 --- a/compiler/rustc_error_messages/Cargo.toml +++ b/compiler/rustc_error_messages/Cargo.toml @@ -9,7 +9,7 @@ edition = "2021" fluent-bundle = "0.15.2" fluent-syntax = "0.11" intl-memoizer = "0.5.1" -rustc_baked_icu_data = { path = "../rustc_baked_icu_data" } +rustc_baked_icu_data = { path = "../rustc_baked_icu_data", features = ["rustc_use_parallel_compiler"] } rustc_data_structures = { path = "../rustc_data_structures" } rustc_fluent_macro = { path = "../rustc_fluent_macro" } rustc_serialize = { path = "../rustc_serialize" } @@ -22,4 +22,4 @@ icu_locid = "1.1.0" icu_provider_adapters = "1.1.0" [features] -rustc_use_parallel_compiler = ['rustc_baked_icu_data/rustc_use_parallel_compiler'] 
+rustc_use_parallel_compiler = [] diff --git a/compiler/rustc_error_messages/src/lib.rs b/compiler/rustc_error_messages/src/lib.rs index 88d94c93bf520..e5da8f2b2c8e4 100644 --- a/compiler/rustc_error_messages/src/lib.rs +++ b/compiler/rustc_error_messages/src/lib.rs @@ -22,15 +22,9 @@ use std::fs; use std::io; use std::path::{Path, PathBuf}; -#[cfg(not(parallel_compiler))] -use std::cell::LazyCell as Lazy; -#[cfg(parallel_compiler)] use std::sync::LazyLock as Lazy; -#[cfg(parallel_compiler)] use intl_memoizer::concurrent::IntlLangMemoizer; -#[cfg(not(parallel_compiler))] -use intl_memoizer::IntlLangMemoizer; pub use fluent_bundle::{self, types::FluentType, FluentArgs, FluentError, FluentValue}; pub use unic_langid::{langid, LanguageIdentifier}; @@ -39,16 +33,10 @@ fluent_messages! { "../messages.ftl" } pub type FluentBundle = fluent_bundle::bundle::FluentBundle; -#[cfg(parallel_compiler)] fn new_bundle(locales: Vec) -> FluentBundle { FluentBundle::new_concurrent(locales) } -#[cfg(not(parallel_compiler))] -fn new_bundle(locales: Vec) -> FluentBundle { - FluentBundle::new(locales) -} - #[derive(Debug)] pub enum TranslationBundleError { /// Failed to read from `.ftl` file. @@ -547,15 +535,6 @@ pub fn fluent_value_from_str_list_sep_by_and(l: Vec>) -> FluentValu Cow::Owned(result) } - #[cfg(not(parallel_compiler))] - fn as_string_threadsafe( - &self, - _intls: &intl_memoizer::concurrent::IntlLangMemoizer, - ) -> Cow<'static, str> { - unreachable!("`as_string_threadsafe` is not used in non-parallel rustc") - } - - #[cfg(parallel_compiler)] fn as_string_threadsafe( &self, intls: &intl_memoizer::concurrent::IntlLangMemoizer, diff --git a/compiler/rustc_errors/src/tests.rs b/compiler/rustc_errors/src/tests.rs index 52103e4609770..8d029eb6e9429 100644 --- a/compiler/rustc_errors/src/tests.rs +++ b/compiler/rustc_errors/src/tests.rs @@ -26,12 +26,8 @@ fn make_dummy(ftl: &'static str) -> Dummy { let langid_en = langid!("en-US"); - #[cfg(parallel_compiler)] let mut bundle = FluentBundle::new_concurrent(vec![langid_en]); - #[cfg(not(parallel_compiler))] - let mut bundle = FluentBundle::new(vec![langid_en]); - bundle.add_resource(resource).expect("Failed to add FTL resources to the bundle."); Dummy { bundle } diff --git a/compiler/rustc_interface/src/interface.rs b/compiler/rustc_interface/src/interface.rs index be7fa9378ca66..3ed82947fe9af 100644 --- a/compiler/rustc_interface/src/interface.rs +++ b/compiler/rustc_interface/src/interface.rs @@ -58,6 +58,11 @@ impl Compiler { } } +#[allow(rustc::bad_opt_access)] +pub fn set_parallel_mode(sopts: &config::UnstableOptions) { + rustc_data_structures::sync::set(sopts.threads > 1); +} + /// Converts strings provided as `--cfg [cfgspec]` into a `crate_cfg`. pub fn parse_cfgspecs(cfgspecs: Vec) -> FxHashSet<(String, Option)> { rustc_span::create_default_session_if_not_set_then(move |_| { diff --git a/compiler/rustc_interface/src/util.rs b/compiler/rustc_interface/src/util.rs index 612903810d211..52719eaa967ad 100644 --- a/compiler/rustc_interface/src/util.rs +++ b/compiler/rustc_interface/src/util.rs @@ -126,12 +126,15 @@ fn get_stack_size() -> Option { env::var_os("RUST_MIN_STACK").is_none().then_some(STACK_SIZE) } -#[cfg(not(parallel_compiler))] pub(crate) fn run_in_thread_pool_with_globals R + Send, R: Send>( edition: Edition, _threads: usize, f: F, ) -> R { + #[cfg(parallel_compiler)] + if _threads > 1 { + return run_in_threads_pool_with_globals(edition, _threads, f); + } // The "thread pool" is a single spawned thread in the non-parallel // compiler. 
We run on a spawned thread instead of the main thread (a) to // provide control over the stack size, and (b) to increase similarity with @@ -161,7 +164,7 @@ pub(crate) fn run_in_thread_pool_with_globals R + Send, R: Send>( } #[cfg(parallel_compiler)] -pub(crate) fn run_in_thread_pool_with_globals R + Send, R: Send>( +pub(crate) fn run_in_threads_pool_with_globals R + Send, R: Send>( edition: Edition, threads: usize, f: F, diff --git a/compiler/rustc_metadata/src/rmeta/encoder.rs b/compiler/rustc_metadata/src/rmeta/encoder.rs index fd8e49efea03a..065e7c3bf5c77 100644 --- a/compiler/rustc_metadata/src/rmeta/encoder.rs +++ b/compiler/rustc_metadata/src/rmeta/encoder.rs @@ -8,7 +8,7 @@ use rustc_data_structures::fingerprint::Fingerprint; use rustc_data_structures::fx::{FxHashMap, FxIndexSet}; use rustc_data_structures::memmap::{Mmap, MmapMut}; use rustc_data_structures::stable_hasher::{Hash128, HashStable, StableHasher}; -use rustc_data_structures::sync::{join, par_iter, Lrc, ParallelIterator}; +use rustc_data_structures::sync::{join, par_for_each_in, Lrc}; use rustc_data_structures::temp_dir::MaybeTempDir; use rustc_hir as hir; use rustc_hir::def::DefKind; @@ -2125,7 +2125,7 @@ fn prefetch_mir(tcx: TyCtxt<'_>) { return; } - par_iter(tcx.mir_keys(())).for_each(|&def_id| { + par_for_each_in(tcx.mir_keys(()), |&def_id| { let (encode_const, encode_opt) = should_encode_mir(tcx, def_id); if encode_const { diff --git a/compiler/rustc_middle/Cargo.toml b/compiler/rustc_middle/Cargo.toml index dfbe8ac0ba3dc..323ec9a3a8fc9 100644 --- a/compiler/rustc_middle/Cargo.toml +++ b/compiler/rustc_middle/Cargo.toml @@ -27,7 +27,7 @@ rustc_hir = { path = "../rustc_hir" } rustc_index = { path = "../rustc_index" } rustc_macros = { path = "../rustc_macros" } rustc_query_system = { path = "../rustc_query_system" } -rustc-rayon-core = { version = "0.5.0", optional = true } +rustc-rayon-core = { version = "0.5.0" } rustc-rayon = { version = "0.5.0", optional = true } rustc_serialize = { path = "../rustc_serialize" } rustc_session = { path = "../rustc_session" } @@ -39,4 +39,4 @@ thin-vec = "0.2.12" tracing = "0.1" [features] -rustc_use_parallel_compiler = ["rustc-rayon", "rustc-rayon-core"] +rustc_use_parallel_compiler = ["rustc-rayon"] diff --git a/compiler/rustc_middle/src/ty/context/tls.rs b/compiler/rustc_middle/src/ty/context/tls.rs index fb0d909307e78..6404efe3eca2a 100644 --- a/compiler/rustc_middle/src/ty/context/tls.rs +++ b/compiler/rustc_middle/src/ty/context/tls.rs @@ -4,8 +4,6 @@ use crate::dep_graph::TaskDepsRef; use crate::ty::query; use rustc_data_structures::sync::{self, Lock}; use rustc_errors::Diagnostic; -#[cfg(not(parallel_compiler))] -use std::cell::Cell; use std::mem; use std::ptr; use thin_vec::ThinVec; @@ -49,17 +47,8 @@ impl<'a, 'tcx> ImplicitCtxt<'a, 'tcx> { } } -// Import the thread-local variable from Rayon, which is preserved for Rayon jobs. -#[cfg(parallel_compiler)] use rayon_core::tlv::TLV; -// Otherwise define our own -#[cfg(not(parallel_compiler))] -thread_local! { - /// A thread local variable that stores a pointer to the current `ImplicitCtxt`. 
- static TLV: Cell<*const ()> = const { Cell::new(ptr::null()) }; -} - #[inline] fn erase(context: &ImplicitCtxt<'_, '_>) -> *const () { context as *const _ as *const () diff --git a/compiler/rustc_query_system/Cargo.toml b/compiler/rustc_query_system/Cargo.toml index 34e576379762b..308a36af1e556 100644 --- a/compiler/rustc_query_system/Cargo.toml +++ b/compiler/rustc_query_system/Cargo.toml @@ -16,7 +16,7 @@ rustc_fluent_macro = { path = "../rustc_fluent_macro" } rustc_hir = { path = "../rustc_hir" } rustc_index = { path = "../rustc_index" } rustc_macros = { path = "../rustc_macros" } -rustc-rayon-core = { version = "0.5.0", optional = true } +rustc-rayon-core = { version = "0.5.0" } rustc_serialize = { path = "../rustc_serialize" } rustc_session = { path = "../rustc_session" } rustc_span = { path = "../rustc_span" } @@ -27,4 +27,4 @@ thin-vec = "0.2.12" tracing = "0.1" [features] -rustc_use_parallel_compiler = ["rustc-rayon-core"] +rustc_use_parallel_compiler = [] diff --git a/compiler/rustc_query_system/src/dep_graph/graph.rs b/compiler/rustc_query_system/src/dep_graph/graph.rs index fd9e685ab8085..c27ca68cd306e 100644 --- a/compiler/rustc_query_system/src/dep_graph/graph.rs +++ b/compiler/rustc_query_system/src/dep_graph/graph.rs @@ -1159,6 +1159,12 @@ impl CurrentDepGraph { .get_or_alloc_cached_string("incr_comp_intern_dep_graph_node") .map(EventId::from_label); + let capacity = if rustc_data_structures::sync::active() { + new_node_count_estimate / sharded::SHARDS + } else { + new_node_count_estimate + }; + CurrentDepGraph { encoder: Steal::new(GraphEncoder::new( encoder, @@ -1167,10 +1173,7 @@ impl CurrentDepGraph { record_stats, )), new_node_to_index: Sharded::new(|| { - FxHashMap::with_capacity_and_hasher( - new_node_count_estimate / sharded::SHARDS, - Default::default(), - ) + FxHashMap::with_capacity_and_hasher(capacity, Default::default()) }), prev_index_to_index: Lock::new(IndexVec::from_elem_n(None, prev_graph_node_count)), anon_id_seed, diff --git a/compiler/rustc_query_system/src/query/caches.rs b/compiler/rustc_query_system/src/query/caches.rs index 29f6a07e81beb..ea6a75a6367e1 100644 --- a/compiler/rustc_query_system/src/query/caches.rs +++ b/compiler/rustc_query_system/src/query/caches.rs @@ -2,7 +2,6 @@ use crate::dep_graph::DepNodeIndex; use rustc_data_structures::fx::FxHashMap; use rustc_data_structures::sharded; -#[cfg(parallel_compiler)] use rustc_data_structures::sharded::Sharded; use rustc_data_structures::sync::Lock; use rustc_index::vec::{Idx, IndexVec}; @@ -37,10 +36,7 @@ impl<'tcx, K: Eq + Hash, V: 'tcx> CacheSelector<'tcx, V> for DefaultCacheSelecto } pub struct DefaultCache { - #[cfg(parallel_compiler)] cache: Sharded>, - #[cfg(not(parallel_compiler))] - cache: Lock>, } impl Default for DefaultCache { @@ -60,10 +56,8 @@ where #[inline(always)] fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> { let key_hash = sharded::make_hash(key); - #[cfg(parallel_compiler)] let lock = self.cache.get_shard_by_hash(key_hash).lock(); - #[cfg(not(parallel_compiler))] - let lock = self.cache.lock(); + let result = lock.raw_entry().from_key_hashed_nocheck(key_hash, key); if let Some((_, value)) = result { Some(*value) } else { None } @@ -71,29 +65,17 @@ where #[inline] fn complete(&self, key: K, value: V, index: DepNodeIndex) { - #[cfg(parallel_compiler)] let mut lock = self.cache.get_shard_by_value(&key).lock(); - #[cfg(not(parallel_compiler))] - let mut lock = self.cache.lock(); + // We may be overwriting another value. 
This is all right, since the dep-graph // will check that the fingerprint matches. lock.insert(key, (value, index)); } fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) { - #[cfg(parallel_compiler)] - { - let shards = self.cache.lock_shards(); - for shard in shards.iter() { - for (k, v) in shard.iter() { - f(k, &v.0, v.1); - } - } - } - #[cfg(not(parallel_compiler))] - { - let map = self.cache.lock(); - for (k, v) in map.iter() { + let shards = self.cache.lock_shards(); + for shard in shards.iter() { + for (k, v) in shard.iter() { f(k, &v.0, v.1); } } @@ -151,10 +133,7 @@ impl<'tcx, K: Idx, V: 'tcx> CacheSelector<'tcx, V> for VecCacheSelector { } pub struct VecCache { - #[cfg(parallel_compiler)] cache: Sharded>>, - #[cfg(not(parallel_compiler))] - cache: Lock>>, } impl Default for VecCache { @@ -173,38 +152,22 @@ where #[inline(always)] fn lookup(&self, key: &K) -> Option<(V, DepNodeIndex)> { - #[cfg(parallel_compiler)] let lock = self.cache.get_shard_by_hash(key.index() as u64).lock(); - #[cfg(not(parallel_compiler))] - let lock = self.cache.lock(); + if let Some(Some(value)) = lock.get(*key) { Some(*value) } else { None } } #[inline] fn complete(&self, key: K, value: V, index: DepNodeIndex) { - #[cfg(parallel_compiler)] let mut lock = self.cache.get_shard_by_hash(key.index() as u64).lock(); - #[cfg(not(parallel_compiler))] - let mut lock = self.cache.lock(); + lock.insert(key, (value, index)); } fn iter(&self, f: &mut dyn FnMut(&Self::Key, &Self::Value, DepNodeIndex)) { - #[cfg(parallel_compiler)] - { - let shards = self.cache.lock_shards(); - for shard in shards.iter() { - for (k, v) in shard.iter_enumerated() { - if let Some(v) = v { - f(&k, &v.0, v.1); - } - } - } - } - #[cfg(not(parallel_compiler))] - { - let map = self.cache.lock(); - for (k, v) in map.iter_enumerated() { + let shards = self.cache.lock_shards(); + for shard in shards.iter() { + for (k, v) in shard.iter_enumerated() { if let Some(v) = v { f(&k, &v.0, v.1); } diff --git a/compiler/rustc_query_system/src/query/job.rs b/compiler/rustc_query_system/src/query/job.rs index a534b54070cd0..6789f2f5f0976 100644 --- a/compiler/rustc_query_system/src/query/job.rs +++ b/compiler/rustc_query_system/src/query/job.rs @@ -15,7 +15,6 @@ use rustc_span::Span; use std::hash::Hash; use std::num::NonZeroU64; -#[cfg(parallel_compiler)] use { parking_lot::{Condvar, Mutex}, rayon_core, @@ -47,17 +46,14 @@ impl QueryJobId { map.get(&self).unwrap().query.clone() } - #[cfg(parallel_compiler)] fn span(self, map: &QueryMap) -> Span { map.get(&self).unwrap().job.span } - #[cfg(parallel_compiler)] fn parent(self, map: &QueryMap) -> Option { map.get(&self).unwrap().job.parent } - #[cfg(parallel_compiler)] fn latch(self, map: &QueryMap) -> Option<&QueryLatch> { map.get(&self).unwrap().job.latch.as_ref() } @@ -81,7 +77,6 @@ pub struct QueryJob { pub parent: Option, /// The latch that is used to wait on this job. - #[cfg(parallel_compiler)] latch: Option>, spooky: core::marker::PhantomData, } @@ -90,17 +85,9 @@ impl QueryJob { /// Creates a new query job. 
#[inline] pub fn new(id: QueryJobId, span: Span, parent: Option) -> Self { - QueryJob { - id, - span, - parent, - #[cfg(parallel_compiler)] - latch: None, - spooky: PhantomData, - } + QueryJob { id, span, parent, latch: None, spooky: PhantomData } } - #[cfg(parallel_compiler)] pub(super) fn latch(&mut self) -> QueryLatch { if self.latch.is_none() { self.latch = Some(QueryLatch::new()); @@ -114,17 +101,13 @@ impl QueryJob { /// as there are no concurrent jobs which could be waiting on us #[inline] pub fn signal_complete(self) { - #[cfg(parallel_compiler)] - { - if let Some(latch) = self.latch { - latch.set(); - } + if let Some(latch) = self.latch { + latch.set(); } } } impl QueryJobId { - #[cfg(not(parallel_compiler))] pub(super) fn find_cycle_in_stack( &self, query_map: QueryMap, @@ -185,7 +168,6 @@ impl QueryJobId { } } -#[cfg(parallel_compiler)] struct QueryWaiter { query: Option, condvar: Condvar, @@ -193,7 +175,6 @@ struct QueryWaiter { cycle: Lock>>, } -#[cfg(parallel_compiler)] impl QueryWaiter { fn notify(&self, registry: &rayon_core::Registry) { rayon_core::mark_unblocked(registry); @@ -201,19 +182,16 @@ impl QueryWaiter { } } -#[cfg(parallel_compiler)] struct QueryLatchInfo { complete: bool, waiters: Vec>>, } -#[cfg(parallel_compiler)] #[derive(Clone)] pub(super) struct QueryLatch { info: Lrc>>, } -#[cfg(parallel_compiler)] impl QueryLatch { fn new() -> Self { QueryLatch { @@ -284,7 +262,6 @@ impl QueryLatch { } /// A resumable waiter of a query. The usize is the index into waiters in the query's latch -#[cfg(parallel_compiler)] type Waiter = (QueryJobId, usize); /// Visits all the non-resumable and resumable waiters of a query. @@ -296,7 +273,6 @@ type Waiter = (QueryJobId, usize); /// For visits of resumable waiters it returns Some(Some(Waiter)) which has the /// required information to resume the waiter. /// If all `visit` calls returns None, this function also returns None. -#[cfg(parallel_compiler)] fn visit_waiters( query_map: &QueryMap, query: QueryJobId, @@ -332,7 +308,6 @@ where /// `span` is the reason for the `query` to execute. This is initially DUMMY_SP. /// If a cycle is detected, this initial value is replaced with the span causing /// the cycle. -#[cfg(parallel_compiler)] fn cycle_check( query_map: &QueryMap, query: QueryJobId, @@ -373,7 +348,6 @@ fn cycle_check( /// Finds out if there's a path to the compiler root (aka. code which isn't in a query) /// from `query` without going through any of the queries in `visited`. /// This is achieved with a depth first search. -#[cfg(parallel_compiler)] fn connected_to_root( query_map: &QueryMap, query: QueryJobId, @@ -396,7 +370,6 @@ fn connected_to_root( } // Deterministically pick an query from a list -#[cfg(parallel_compiler)] fn pick_query<'a, T, F, D>(query_map: &QueryMap, queries: &'a [T], f: F) -> &'a T where F: Fn(&T) -> (Span, QueryJobId), @@ -423,7 +396,6 @@ where /// the function return true. /// If a cycle was not found, the starting query is removed from `jobs` and /// the function returns false. -#[cfg(parallel_compiler)] fn remove_cycle( query_map: &QueryMap, jobs: &mut Vec, @@ -528,7 +500,6 @@ fn remove_cycle( /// uses a query latch and then resuming that waiter. /// There may be multiple cycles involved in a deadlock, so this searches /// all active queries for cycles before finally resuming all the waiters at once. 
-#[cfg(parallel_compiler)] pub fn deadlock(query_map: QueryMap, registry: &rayon_core::Registry) { let on_panic = OnDrop(|| { eprintln!("deadlock handler panicked, aborting process"); diff --git a/compiler/rustc_query_system/src/query/mod.rs b/compiler/rustc_query_system/src/query/mod.rs index fa1f51b04da77..8abf8501b2ea1 100644 --- a/compiler/rustc_query_system/src/query/mod.rs +++ b/compiler/rustc_query_system/src/query/mod.rs @@ -2,7 +2,6 @@ mod plumbing; pub use self::plumbing::*; mod job; -#[cfg(parallel_compiler)] pub use self::job::deadlock; pub use self::job::{print_query_stack, QueryInfo, QueryJob, QueryJobId, QueryJobInfo, QueryMap}; @@ -37,7 +36,6 @@ pub struct QueryStackFrame { pub dep_kind: D, /// This hash is used to deterministically pick /// a query to remove cycles in the parallel compiler. - #[cfg(parallel_compiler)] hash: Hash64, } @@ -52,16 +50,7 @@ impl QueryStackFrame { ty_adt_id: Option, _hash: impl FnOnce() -> Hash64, ) -> Self { - Self { - description, - span, - def_id, - def_kind, - ty_adt_id, - dep_kind, - #[cfg(parallel_compiler)] - hash: _hash(), - } + Self { description, span, def_id, def_kind, ty_adt_id, dep_kind, hash: _hash() } } // FIXME(eddyb) Get more valid `Span`s on queries. diff --git a/compiler/rustc_query_system/src/query/plumbing.rs b/compiler/rustc_query_system/src/query/plumbing.rs index bce01debc53b0..0bc35a6265dc4 100644 --- a/compiler/rustc_query_system/src/query/plumbing.rs +++ b/compiler/rustc_query_system/src/query/plumbing.rs @@ -6,18 +6,17 @@ use crate::dep_graph::{DepContext, DepKind, DepNode, DepNodeIndex, DepNodeParams use crate::dep_graph::{DepGraphData, HasDepContext}; use crate::ich::StableHashingContext; use crate::query::caches::QueryCache; -#[cfg(parallel_compiler)] use crate::query::job::QueryLatch; use crate::query::job::{report_cycle, QueryInfo, QueryJob, QueryJobId, QueryJobInfo}; use crate::query::SerializedDepNodeIndex; use crate::query::{QueryContext, QueryMap, QuerySideEffects, QueryStackFrame}; use crate::HandleCycleError; +use rustc_data_structures::cold_path; use rustc_data_structures::fingerprint::Fingerprint; use rustc_data_structures::fx::FxHashMap; +use rustc_data_structures::sharded::Sharded; use rustc_data_structures::stack::ensure_sufficient_stack; use rustc_data_structures::sync::Lock; -#[cfg(parallel_compiler)] -use rustc_data_structures::{cold_path, sharded::Sharded}; use rustc_errors::{DiagnosticBuilder, ErrorGuaranteed, FatalError}; use rustc_span::{Span, DUMMY_SP}; use std::cell::Cell; @@ -30,10 +29,7 @@ use thin_vec::ThinVec; use super::QueryConfig; pub struct QueryState { - #[cfg(parallel_compiler)] active: Sharded>>, - #[cfg(not(parallel_compiler))] - active: Lock>>, } /// Indicates the state of a query for a given key in a query map. @@ -52,15 +48,8 @@ where D: DepKind, { pub fn all_inactive(&self) -> bool { - #[cfg(parallel_compiler)] - { - let shards = self.active.lock_shards(); - shards.iter().all(|shard| shard.is_empty()) - } - #[cfg(not(parallel_compiler))] - { - self.active.lock().is_empty() - } + let shards = self.active.lock_shards(); + shards.iter().all(|shard| shard.is_empty()) } pub fn try_collect_active_jobs( @@ -69,27 +58,11 @@ where make_query: fn(Qcx, K) -> QueryStackFrame, jobs: &mut QueryMap, ) -> Option<()> { - #[cfg(parallel_compiler)] - { - // We use try_lock_shards here since we are called from the - // deadlock handler, and this shouldn't be locked. 
- let shards = self.active.try_lock_shards()?; - for shard in shards.iter() { - for (k, v) in shard.iter() { - if let QueryResult::Started(ref job) = *v { - let query = make_query(qcx, *k); - jobs.insert(job.id, QueryJobInfo { query, job: job.clone() }); - } - } - } - } - #[cfg(not(parallel_compiler))] - { - // We use try_lock here since we are called from the - // deadlock handler, and this shouldn't be locked. - // (FIXME: Is this relevant for non-parallel compilers? It doesn't - // really hurt much.) - for (k, v) in self.active.try_lock()?.iter() { + // We use try_lock_shards here since we are called from the + // deadlock handler, and this shouldn't be locked. + let shards = self.active.try_lock_shards()?; + for shard in shards.iter() { + for (k, v) in shard.iter() { if let QueryResult::Started(ref job) = *v { let query = make_query(qcx, *k); jobs.insert(job.id, QueryJobInfo { query, job: job.clone() }); @@ -183,10 +156,8 @@ where cache.complete(key, result, dep_node_index); let job = { - #[cfg(parallel_compiler)] let mut lock = state.active.get_shard_by_value(&key).lock(); - #[cfg(not(parallel_compiler))] - let mut lock = state.active.lock(); + match lock.remove(&key).unwrap() { QueryResult::Started(job) => job, QueryResult::Poisoned => panic!(), @@ -208,10 +179,8 @@ where // Poison the query so jobs waiting on it panic. let state = self.state; let job = { - #[cfg(parallel_compiler)] let mut shard = state.active.get_shard_by_value(&self.key).lock(); - #[cfg(not(parallel_compiler))] - let mut shard = state.active.lock(); + let job = match shard.remove(&self.key).unwrap() { QueryResult::Started(job) => job, QueryResult::Poisoned => panic!(), @@ -254,7 +223,6 @@ where #[cold] #[inline(never)] -#[cfg(not(parallel_compiler))] fn cycle_error( query: Q, qcx: Qcx, @@ -274,7 +242,6 @@ where } #[inline(always)] -#[cfg(parallel_compiler)] fn wait_for_query( query: Q, qcx: Qcx, @@ -324,18 +291,14 @@ where Qcx: QueryContext, { let state = query.query_state(qcx); - #[cfg(parallel_compiler)] let mut state_lock = state.active.get_shard_by_value(&key).lock(); - #[cfg(not(parallel_compiler))] - let mut state_lock = state.active.lock(); - // For the parallel compiler we need to check both the query cache and query state structures // while holding the state lock to ensure that 1) the query has not yet completed and 2) the // query is not still executing. Without checking the query cache here, we can end up // re-executing the query since `try_start` only checks that the query is not currently // executing, but another thread may have already completed the query and stores it result // in the query cache. - if cfg!(parallel_compiler) && qcx.dep_context().sess().threads() > 1 { + if qcx.dep_context().sess().threads() > 1 { if let Some((value, index)) = query.query_cache(qcx).lookup(&key) { qcx.dep_context().profiler().query_cache_hit(index.into()); return (value, Some(index)); @@ -359,17 +322,15 @@ where } Entry::Occupied(mut entry) => { match entry.get_mut() { - #[cfg(not(parallel_compiler))] QueryResult::Started(job) => { - let id = job.id; - drop(state_lock); + if !rustc_data_structures::sync::active() { + let id = job.id; + drop(state_lock); - // If we are single-threaded we know that we have cycle error, - // so we just return the error. - cycle_error(query, qcx, id, span) - } - #[cfg(parallel_compiler)] - QueryResult::Started(job) => { + // If we are single-threaded we know that we have cycle error, + // so we just return the error. 
+                        return cycle_error(query, qcx, id, span);
+                    }
                     // Get the latch out
                     let latch = job.latch();
                     drop(state_lock);
diff --git a/src/librustdoc/clean/utils.rs b/src/librustdoc/clean/utils.rs
index b802fd065fe54..aa456921c8f2f 100644
--- a/src/librustdoc/clean/utils.rs
+++ b/src/librustdoc/clean/utils.rs
@@ -470,12 +470,6 @@ pub(crate) fn get_auto_trait_and_blanket_impls(
     cx: &mut DocContext<'_>,
     item_def_id: DefId,
 ) -> impl Iterator<Item = Item> {
-    // FIXME: To be removed once `parallel_compiler` bugs are fixed!
-    // More information in .
-    if cfg!(parallel_compiler) {
-        return vec![].into_iter().chain(vec![].into_iter());
-    }
-
     let auto_impls = cx
         .sess()
         .prof
diff --git a/src/librustdoc/lib.rs b/src/librustdoc/lib.rs
index 4a88dc5254de6..48ff9f1c30eb4 100644
--- a/src/librustdoc/lib.rs
+++ b/src/librustdoc/lib.rs
@@ -728,6 +728,9 @@ fn main_args(at_args: &[String]) -> MainResult {
         }
     };
 
+    // Set parallel mode early as the error handler will already create locks.
+    interface::set_parallel_mode(&options.unstable_opts);
+
     let diag = core::new_handler(
         options.error_format,
         None,
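
Note on the overall mechanism, for reviewers: instead of building two different compilers via `cfg(parallel_compiler)`, this patch keeps one binary and consults a process-global mode flag that `rustc_interface::set_parallel_mode` sets exactly once from `-Z threads`, before the error handler creates any locks. The synchronization primitives then branch on that flag at runtime. The sketch below is a minimal, self-contained illustration of that pattern, assuming only the `parking_lot` crate; it is not the rustc code itself, and it omits `try_lock`, `ERROR_CHECKING`, the `likely` hint, and the `Sharded` wrapper that the real `rustc_data_structures::sync::Lock` carries.

```rust
// Minimal sketch (assumes the `parking_lot` crate): a global parallel-mode flag
// plus a Lock that is a borrow flag when single-threaded and a mutex otherwise.
use std::cell::{Cell, UnsafeCell};
use std::ops::{Deref, DerefMut};
use std::sync::atomic::{AtomicU8, Ordering};

use parking_lot::lock_api::RawMutex as _;
use parking_lot::RawMutex;

const UNINITIALIZED: u8 = 0;
const INACTIVE: u8 = 1;
const ACTIVE: u8 = 2;

static MODE: AtomicU8 = AtomicU8::new(UNINITIALIZED);

/// Set once at startup; in the patch this happens from `-Z threads`.
pub fn set(parallel: bool) {
    let new = if parallel { ACTIVE } else { INACTIVE };
    let prev = MODE.compare_exchange(UNINITIALIZED, new, Ordering::Relaxed, Ordering::Relaxed);
    // Repeated calls are fine only if they request the same mode.
    assert!(prev.is_ok() || prev == Err(new));
}

pub fn active() -> bool {
    MODE.load(Ordering::Relaxed) == ACTIVE
}

pub struct Lock<T> {
    single_thread: bool,
    borrow: Cell<bool>, // used only on the single-threaded path
    mutex: RawMutex,    // used only on the multi-threaded path
    data: UnsafeCell<T>,
}

// SAFETY (sketch-level, mirroring the patch): a Lock created while the mode is
// single-threaded must never actually be shared across threads.
unsafe impl<T: Send> Send for Lock<T> {}
unsafe impl<T: Send> Sync for Lock<T> {}

impl<T> Lock<T> {
    pub fn new(value: T) -> Self {
        Lock {
            single_thread: !active(),
            borrow: Cell::new(false),
            mutex: RawMutex::INIT,
            data: UnsafeCell::new(value),
        }
    }

    pub fn lock(&self) -> LockGuard<'_, T> {
        if self.single_thread {
            // Cheap path: a re-entrancy check, no atomics.
            assert!(!self.borrow.replace(true), "lock already held");
        } else {
            self.mutex.lock();
        }
        LockGuard { lock: self }
    }
}

pub struct LockGuard<'a, T> {
    lock: &'a Lock<T>,
}

impl<T> Deref for LockGuard<'_, T> {
    type Target = T;
    fn deref(&self) -> &T {
        // SAFETY: the guard proves exclusive access until it is dropped.
        unsafe { &*self.lock.data.get() }
    }
}

impl<T> DerefMut for LockGuard<'_, T> {
    fn deref_mut(&mut self) -> &mut T {
        unsafe { &mut *self.lock.data.get() }
    }
}

impl<'a, T> Drop for LockGuard<'a, T> {
    fn drop(&mut self) {
        if self.lock.single_thread {
            self.lock.borrow.set(false);
        } else {
            // SAFETY: this guard was created by `lock`, which acquired the mutex.
            unsafe { self.lock.mutex.unlock() }
        }
    }
}

fn main() {
    set(false); // what rustc does for -Z threads=1
    let counter = Lock::new(0u32);
    *counter.lock() += 1;
    assert_eq!(*counter.lock(), 1);
}
```

The point of the `single_thread` fast path is that `-Z threads=1` pays only a `Cell` read/write per lock instead of an atomic RMW, which is why the patch guards the hot paths with `likely(self.single_thread)` and pushes the mutex paths out of line with `#[inline(never)]`.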
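
The iterator helpers (`par_map`, `par_for_each_in`) follow the same shape: one generic entry point that dispatches to rayon only when the mode flag is active, and otherwise runs a plain serial loop with the same panic-collection behavior. Below is a concrete, simplified example of that dispatch shape only; the function name and the explicit `parallel` parameter are illustrative, whereas in the patch the decision comes from `sync::active()`, the helpers are generic, and closures are wrapped in `FromDyn` to recover `Send`/`Sync` bounds.

```rust
// Sketch of the runtime serial/parallel dispatch used by `sync::par_map`.
// Assumes the `rayon` crate; panic collection from the real helper is omitted.
use rayon::prelude::*;

fn par_map_squares(input: Vec<u64>, parallel: bool) -> Vec<u64> {
    if parallel {
        // Parallel path: work is distributed over rayon's thread pool.
        input.into_par_iter().map(|x| x * x).collect()
    } else {
        // Serial path: no synchronization overhead when -Z threads=1.
        input.into_iter().map(|x| x * x).collect()
    }
}

fn main() {
    assert_eq!(par_map_squares(vec![1, 2, 3], true), vec![1, 4, 9]);
    assert_eq!(par_map_squares(vec![1, 2, 3], false), vec![1, 4, 9]);
}
```

Both paths visit every element and produce the same result; only the execution order differs between the two modes.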