From 3b61fad895e590e3ee42520cba1bc664e788f2c5 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Thu, 28 Dec 2023 22:00:47 +0800
Subject: [PATCH 01/48] WIP: Add ScheduleCollection when requesting GC

---
 src/mmtk.rs                 |  2 +-
 src/plan/gc_requester.rs    | 29 +++++++++++------------------
 src/scheduler/controller.rs | 16 +---------------
 src/scheduler/scheduler.rs  | 29 ++---------------------------
 4 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/src/mmtk.rs b/src/mmtk.rs
index fa90270637..3d91f23ae8 100644
--- a/src/mmtk.rs
+++ b/src/mmtk.rs
@@ -148,7 +148,7 @@ impl<VM: VMBinding> MMTK<VM> {

         let state = Arc::new(GlobalState::default());

-        let gc_requester = Arc::new(GCRequester::new());
+        let gc_requester = Arc::new(GCRequester::new(scheduler.clone()));

         let gc_trigger = Arc::new(GCTrigger::new(
             options.clone(),
diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs
index 2e5f518cb2..0bae785202 100644
--- a/src/plan/gc_requester.rs
+++ b/src/plan/gc_requester.rs
@@ -1,7 +1,9 @@
+use crate::scheduler::gc_work::ScheduleCollection;
+use crate::scheduler::{GCWorkScheduler, WorkBucketStage};
 use crate::vm::VMBinding;
 use std::marker::PhantomData;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Condvar, Mutex};
+use std::sync::{Arc, Mutex};

 struct RequestSync {
     request_count: isize,
@@ -12,27 +14,20 @@ struct RequestSync {
 /// and the GC coordinator thread waits for GC requests using this object.
 pub struct GCRequester<VM> {
     request_sync: Mutex<RequestSync>,
-    request_condvar: Condvar,
     request_flag: AtomicBool,
+    scheduler: Arc<GCWorkScheduler<VM>>,
     phantom: PhantomData<VM>,
 }

-// Clippy says we need this...
-impl<VM: VMBinding> Default for GCRequester<VM> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
 impl<VM: VMBinding> GCRequester<VM> {
-    pub fn new() -> Self {
+    pub fn new(scheduler: Arc<GCWorkScheduler<VM>>) -> Self {
         GCRequester {
             request_sync: Mutex::new(RequestSync {
                 request_count: 0,
                 last_request_count: -1,
             }),
-            request_condvar: Condvar::new(),
             request_flag: AtomicBool::new(false),
+            scheduler,
             phantom: PhantomData,
         }
     }
@@ -46,7 +41,8 @@ impl<VM: VMBinding> GCRequester<VM> {
         if !self.request_flag.load(Ordering::Relaxed) {
             self.request_flag.store(true, Ordering::Relaxed);
             guard.request_count += 1;
-            self.request_condvar.notify_all();
+
+            self.schedule_collection();
         }
     }

@@ -56,11 +52,8 @@ impl<VM: VMBinding> GCRequester<VM> {
         drop(guard);
     }

-    pub fn wait_for_request(&self) {
-        let mut guard = self.request_sync.lock().unwrap();
-        guard.last_request_count += 1;
-        while guard.last_request_count == guard.request_count {
-            guard = self.request_condvar.wait(guard).unwrap();
-        }
+    fn schedule_collection(&self) {
+        // Add a ScheduleCollection work packet. It is the seed of other work packets.
+        self.scheduler.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
     }
 }
diff --git a/src/scheduler/controller.rs b/src/scheduler/controller.rs
index 3608f1ebfb..acc5426264 100644
--- a/src/scheduler/controller.rs
+++ b/src/scheduler/controller.rs
@@ -27,20 +27,6 @@ pub struct GCController<VM: VMBinding> {
 }

 impl<VM: VMBinding> GCController<VM> {
-    pub(crate) fn new(
-        mmtk: &'static MMTK<VM>,
-        requester: Arc<GCRequester<VM>>,
-        scheduler: Arc<GCWorkScheduler<VM>>,
-        coordinator_worker: GCWorker<VM>,
-    ) -> Box<GCController<VM>> {
-        Box::new(Self {
-            mmtk,
-            requester,
-            scheduler,
-            coordinator_worker,
-        })
-    }
-
     /// The main loop for the GC controller.
     pub fn run(&mut self, tls: VMWorkerThread) -> ! {
         probe!(mmtk, gccontroller_run);
@@ -50,7 +36,7 @@ impl<VM: VMBinding> GCController<VM> {
         loop {
             debug!("[STWController: Waiting for request...]");
-            self.requester.wait_for_request();
+            //self.requester.wait_for_request();
             debug!("[STWController: Request recieved.]");

             self.do_gc_until_completion_traced();
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 5c71296eca..46357e4a46 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -6,9 +6,8 @@ use crate::mmtk::MMTK;
 use crate::util::opaque_pointer::*;
 use crate::util::options::AffinityKind;
 use crate::util::rust_util::array_from_fn;
-use crate::vm::Collection;
-use crate::vm::{GCThreadContext, VMBinding};
-use crossbeam::deque::{self, Steal};
+use crate::vm::VMBinding;
+use crossbeam::deque::Steal;
 use enum_map::{Enum, EnumMap};
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -18,8 +17,6 @@ pub struct GCWorkScheduler<VM: VMBinding> {
     pub work_buckets: EnumMap<WorkBucketStage, WorkBucket<VM>>,
     /// Workers
     pub(crate) worker_group: Arc<WorkerGroup<VM>>,
-    /// The shared part of the GC worker object of the controller thread
-    coordinator_worker_shared: Arc<GCWorkerShared<VM>>,
     /// Condition Variable for worker synchronization
     pub(crate) worker_monitor: Arc<WorkerMonitor>,
     /// How to assign the affinity of each GC thread. Specified by the user.
     affinity: AffinityKind,
@@ -70,7 +67,6 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
         Arc::new(Self {
             work_buckets,
             worker_group,
-            coordinator_worker_shared,
             worker_monitor,
             affinity,
         })
@@ -82,23 +78,6 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {

     /// Create GC threads, including the controller thread and all workers.
     pub fn spawn_gc_threads(self: &Arc<Self>, mmtk: &'static MMTK<VM>, tls: VMThread) {
-        // Spawn the controller thread.
-        let coordinator_worker = GCWorker::new(
-            mmtk,
-            usize::MAX,
-            self.clone(),
-            true,
-            self.coordinator_worker_shared.clone(),
-            deque::Worker::new_fifo(),
-        );
-        let gc_controller = GCController::new(
-            mmtk,
-            mmtk.gc_requester.clone(),
-            self.clone(),
-            coordinator_worker,
-        );
-        VM::VMCollection::spawn_gc_thread(tls, GCThreadContext::<VM>::Controller(gc_controller));
-
         self.worker_group.spawn(mmtk, tls)
     }

@@ -379,8 +358,6 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
             let worker_stat = worker.borrow_stat();
             worker_stat.enable();
         }
-        let coordinator_worker_stat = self.coordinator_worker_shared.borrow_stat();
-        coordinator_worker_stat.enable();
     }

     pub fn statistics(&self) -> HashMap<String, String> {
@@ -389,8 +366,6 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
             let worker_stat = worker.borrow_stat();
             summary.merge(&worker_stat);
         }
-        let coordinator_worker_stat = self.coordinator_worker_shared.borrow_stat();
-        summary.merge(&coordinator_worker_stat);
         summary.harness_stat()
     }

From 18f7bb610f9aac268518951f718c4b19122e12d4 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Thu, 28 Dec 2023 23:05:05 +0800
Subject: [PATCH 02/48] WIP: Rework GC triggering

---
 src/plan/gc_requester.rs | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs
index 0bae785202..43f7c5a5ac 100644
--- a/src/plan/gc_requester.rs
+++ b/src/plan/gc_requester.rs
@@ -6,12 +6,11 @@ use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, Mutex};

 struct RequestSync {
-    request_count: isize,
-    last_request_count: isize,
+    /// Is GC scheduled (but not finished)?
+    gc_scheduled: bool,
 }

-/// GC requester. This object allows other threads to request (trigger) GC,
-/// and the GC coordinator thread waits for GC requests using this object.
+/// This data structure lets mutators trigger GC, and may schedule collection when appropriate.
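+///
+/// Illustrative usage sketch (not part of this patch; `mmtk` and `tls` are assumed to be in
+/// scope). A mutator that fails to allocate can request a GC and then block for it:
+///
+/// ```ignore
+/// mmtk.gc_requester.request();         // idempotent while a request is already pending
+/// VM::VMCollection::block_for_gc(tls); // wait here until the requested GC finishes
+/// ```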
 pub struct GCRequester<VM> {
     request_sync: Mutex<RequestSync>,
     request_flag: AtomicBool,
@@ -23,8 +22,7 @@ impl<VM: VMBinding> GCRequester<VM> {
     pub fn new(scheduler: Arc<GCWorkScheduler<VM>>) -> Self {
         GCRequester {
             request_sync: Mutex::new(RequestSync {
-                request_count: 0,
-                last_request_count: -1,
+                gc_scheduled: true,
             }),
             request_flag: AtomicBool::new(false),
             scheduler,
@@ -38,22 +36,37 @@ impl<VM: VMBinding> GCRequester<VM> {
         }

         let mut guard = self.request_sync.lock().unwrap();
+        // Note: This is the double-checked locking algorithm.
+        // The load has the `Relaxed` order instead of `Acquire` because we only use the flag to
+        // remove successive requests, but we don't use it to synchronize other data fields.
         if !self.request_flag.load(Ordering::Relaxed) {
             self.request_flag.store(true, Ordering::Relaxed);
-            guard.request_count += 1;

-            self.schedule_collection();
+            self.try_schedule_collection(&mut *guard);
         }
     }

     pub fn clear_request(&self) {
-        let guard = self.request_sync.lock().unwrap();
+        let _guard = self.request_sync.lock().unwrap();
         self.request_flag.store(false, Ordering::Relaxed);
-        drop(guard);
     }

-    fn schedule_collection(&self) {
-        // Add a ScheduleCollection work packet. It is the seed of other work packets.
-        self.scheduler.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
+    pub fn on_gc_finished(&self) {
+        let mut guard = self.request_sync.lock().unwrap();
+        guard.gc_scheduled = false;
+
+        self.try_schedule_collection(&mut *guard);
+    }
+
+    fn try_schedule_collection(&self, sync: &mut RequestSync) {
+        if self.request_flag.load(Ordering::Relaxed) && !sync.gc_scheduled {
+            // Add a ScheduleCollection work packet. It is the seed of other work packets.
+            self.scheduler.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
+
+            sync.gc_scheduled = true;
+
+            // Note: We do not clear `request_flag` now. It will be cleared by `clear_request`
+            // after all mutators have stopped.
+        }
     }
 }

From ffe8f33342f167c1662094c22e76198903adac61 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Tue, 2 Jan 2024 19:36:30 +0800
Subject: [PATCH 03/48] Working

---
 src/global_state.rs         |   7 ++
 src/plan/gc_requester.rs    |  30 ++++++--
 src/scheduler/controller.rs |  14 ++--
 src/scheduler/gc_work.rs    |  10 +++
 src/scheduler/scheduler.rs  | 137 ++++++++++++++++++++++++++++++++++--
 src/scheduler/worker.rs     | 114 ++++++++----------------------
 6 files changed, 210 insertions(+), 102 deletions(-)

diff --git a/src/global_state.rs b/src/global_state.rs
index e066811557..2340957eae 100644
--- a/src/global_state.rs
+++ b/src/global_state.rs
@@ -1,5 +1,8 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::Mutex;
+use std::time::Instant;
+
+use atomic_refcell::AtomicRefCell;

 /// This stores some global states for an MMTK instance.
 /// Some MMTK components like plans and allocators may keep an reference to the struct, and can access it.
@@ -40,6 +43,9 @@ pub struct GlobalState {
     pub(crate) stacks_prepared: AtomicBool,
     /// A counter that keeps tracks of the number of bytes allocated since last stress test
     pub(crate) allocation_bytes: AtomicUsize,
+    /// The time when the current GC started. Only accessible in `ScheduleCollection` and
+    /// `GCWorkScheduler::on_gc_finished` which never happen at the same time.
+    pub(crate) gc_start_time: AtomicRefCell<Option<Instant>>,
     /// A counteer that keeps tracks of the number of bytes allocated by malloc
     #[cfg(feature = "malloc_counted_size")]
     pub(crate) malloc_bytes: AtomicUsize,
@@ -214,6 +220,7 @@ impl Default for GlobalState {
             cur_collection_attempts: AtomicUsize::new(0),
             scanned_stacks: AtomicUsize::new(0),
             allocation_bytes: AtomicUsize::new(0),
+            gc_start_time: AtomicRefCell::new(None),
             #[cfg(feature = "malloc_counted_size")]
             malloc_bytes: AtomicUsize::new(0),
             #[cfg(feature = "count_live_bytes_in_gc")]
diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs
index 43f7c5a5ac..53419aa486 100644
--- a/src/plan/gc_requester.rs
+++ b/src/plan/gc_requester.rs
@@ -1,5 +1,4 @@
-use crate::scheduler::gc_work::ScheduleCollection;
-use crate::scheduler::{GCWorkScheduler, WorkBucketStage};
+use crate::scheduler::GCWorkScheduler;
 use crate::vm::VMBinding;
 use std::marker::PhantomData;
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -22,7 +21,7 @@ impl<VM: VMBinding> GCRequester<VM> {
     pub fn new(scheduler: Arc<GCWorkScheduler<VM>>) -> Self {
         GCRequester {
             request_sync: Mutex::new(RequestSync {
-                gc_scheduled: true,
+                gc_scheduled: false,
             }),
             request_flag: AtomicBool::new(false),
             scheduler,
@@ -30,6 +29,8 @@ impl<VM: VMBinding> GCRequester<VM> {
         }
     }

+    /// Request a GC. Called by mutators when polling (during allocation) and when handling user
+    /// GC requests (e.g. `System.gc();` in Java).
     pub fn request(&self) {
         if self.request_flag.load(Ordering::Relaxed) {
             return;
@@ -46,11 +47,28 @@ impl<VM: VMBinding> GCRequester<VM> {
         }
     }

+    /// Returns true if GC has been scheduled.
+    pub fn is_gc_scheduled(&self) -> bool {
+        let guard = self.request_sync.lock().unwrap();
+        guard.gc_scheduled
+    }
+
+    /// Clear the "GC requested" flag so that mutators can trigger the next GC.
+    /// Called by a GC worker when all mutators have come to a stop.
     pub fn clear_request(&self) {
         let _guard = self.request_sync.lock().unwrap();
         self.request_flag.store(false, Ordering::Relaxed);
     }

+    /// Called by a GC worker when a GC has finished.
+    /// This will check the `request_flag` again and schedule the next GC.
+    ///
+    /// Note that this may schedule the next GC immediately if
+    /// 1. The plan is concurrent, and a mutator triggered another GC while the current GC was
+    ///    still running (between `clear_request` and `on_gc_finished`), or
+    /// 2. After the invocation of `resume_mutators`, a mutator runs so fast that it
+    ///    exhausted the heap, or called `handle_user_collection_request`, before this function
+    ///    is called.
     pub fn on_gc_finished(&self) {
         let mut guard = self.request_sync.lock().unwrap();
         guard.gc_scheduled = false;

         self.try_schedule_collection(&mut *guard);
     }

     fn try_schedule_collection(&self, sync: &mut RequestSync) {
+        // Do not schedule collection if a GC is still in progress.
+        // When the GC finishes, a GC worker will call `on_gc_finished` and check `request_flag`
+        // again.
         if self.request_flag.load(Ordering::Relaxed) && !sync.gc_scheduled {
-            // Add a ScheduleCollection work packet. It is the seed of other work packets.
-            self.scheduler.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
+            self.scheduler.schedule_collection();

             sync.gc_scheduled = true;

diff --git a/src/scheduler/controller.rs b/src/scheduler/controller.rs
index acc5426264..125efbbe91 100644
--- a/src/scheduler/controller.rs
+++ b/src/scheduler/controller.rs
@@ -75,17 +75,17 @@ impl<VM: VMBinding> GCController<VM> {
     fn do_gc_until_completion(&mut self) {
         let gc_start = std::time::Instant::now();

-        debug_assert!(
-            self.scheduler.worker_monitor.debug_is_sleeping(),
-            "Workers are still doing work when GC started."
-        );
+        // debug_assert!(
+        //     self.scheduler.worker_monitor.debug_is_sleeping(),
+        //     "Workers are still doing work when GC started."
+        // );

         // Add a ScheduleCollection work packet. It is the seed of other work packets.
         self.scheduler.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);

         // Notify only one worker at this time because there is only one work packet,
         // namely `ScheduleCollection`.
-        self.scheduler.worker_monitor.resume_and_wait(false);
+        // self.scheduler.worker_monitor.resume_and_wait(false);

         // Gradually open more buckets as workers stop each time they drain all open bucket.
         loop {
@@ -102,11 +102,11 @@ impl<VM: VMBinding> GCController<VM> {

             // Notify all workers because there should be many work packets available in the newly
             // opened bucket(s).
-            self.scheduler.worker_monitor.resume_and_wait(true);
+            // self.scheduler.worker_monitor.resume_and_wait(true);
         }

         // All GC workers must have parked by now.
-        debug_assert!(self.scheduler.worker_monitor.debug_is_sleeping());
+        // debug_assert!(self.scheduler.worker_monitor.debug_is_sleeping());
         debug_assert!(!self.scheduler.worker_group.has_designated_work());
         debug_assert!(self.scheduler.all_buckets_empty());

diff --git a/src/scheduler/gc_work.rs b/src/scheduler/gc_work.rs
index 692873eb84..a651437911 100644
--- a/src/scheduler/gc_work.rs
+++ b/src/scheduler/gc_work.rs
@@ -14,6 +14,16 @@ pub struct ScheduleCollection;

 impl<VM: VMBinding> GCWork<VM> for ScheduleCollection {
     fn do_work(&mut self, worker: &mut GCWorker<VM>, mmtk: &'static MMTK<VM>) {
+        // Record the time when GC starts.
+        {
+            let mut guard = mmtk.state.gc_start_time.borrow_mut();
+            let old_time = guard.replace(std::time::Instant::now());
+            debug_assert!(
+                old_time.is_none(),
+                "gc_start_time is still set when GC started: {old_time:?}",
+            );
+        }
+
         // Tell GC trigger that GC started.
         mmtk.gc_trigger.policy.on_gc_start(mmtk);

diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 46357e4a46..3372ff9770 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -1,12 +1,16 @@
+use super::gc_work::ScheduleCollection;
 use super::stat::SchedulerStat;
 use super::work_bucket::*;
-use super::worker::{GCWorker, GCWorkerShared, ThreadId, WorkerGroup, WorkerMonitor};
+use super::worker::{GCWorker, ThreadId, WorkerGroup, WorkerMonitor};
 use super::*;
+use crate::global_state::GcStatus;
 use crate::mmtk::MMTK;
 use crate::util::opaque_pointer::*;
 use crate::util::options::AffinityKind;
 use crate::util::rust_util::array_from_fn;
+use crate::vm::Collection;
 use crate::vm::VMBinding;
+use crate::Plan;
 use crossbeam::deque::Steal;
 use enum_map::{Enum, EnumMap};
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -62,8 +66,6 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
             }
         }

-        let coordinator_worker_shared = Arc::new(GCWorkerShared::<VM>::new(None));
-
         Arc::new(Self {
             work_buckets,
             worker_group,
@@ -86,9 +88,17 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
         self.affinity.resolve_affinity(thread);
     }

+    /// Schedule collection. Called via `GCRequester`.
+    /// Because this function may be called by a mutator thread, we only add a `ScheduleCollection`
+    /// work packet here so that a GC worker can wake up later and actually schedule the work for a
+    /// collection.
+    pub(crate) fn schedule_collection(&self) {
+        // Add a ScheduleCollection work packet. It is the seed of other work packets.
+        self.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
+    }
+
     /// Schedule all the common work packets
     pub fn schedule_common_work<C: GCWorkContext<VM = VM>>(&self, plan: &'static C::PlanType) {
-        use crate::plan::Plan;
         use crate::scheduler::gc_work::*;
         // Stop & scan mutators (mutator scanning can happen before STW)
         self.work_buckets[WorkBucketStage::Unconstrained].add(StopMutators::<C>::new());
@@ -349,8 +359,125 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
                 return work;
             }

-            self.worker_monitor.park_and_wait(worker);
+            self.worker_monitor.park_and_wait(worker, || {
+                // This is the last worker parked.
+
+                // Test whether we are doing GC.
+                if worker.mmtk.gc_requester.is_gc_scheduled() {
+                    trace!("GC is scheduled. Try to find more work to do...");
+                    // We are in the middle of GC, and the last GC worker parked.
+                    // Find more work for workers to do.
+                    let found_more_work = self.find_more_work_for_workers();
+
+                    if !found_more_work {
+                        // GC finished.
+                        self.on_gc_finished(worker);
+                    }
+
+                    found_more_work
+                } else {
+                    trace!("GC is not scheduled. Wait for the first GC.");
+                    // GC is not scheduled. Do nothing.
+                    // Note that when GC worker threads has just been created, they will try to get
+                    // work packets to execute. But since the first GC has not started, yet, there
+                    // is not any work packets to execute, yet. Therefore, all workers will park,
+                    // and the last parked worker will reach here. In that case, we should simply
+                    // let workers wait until the first GC starts, instead of trying to open more
+                    // buckets.
+                    // If a GC worker spuriously wakes up when GC is not scheduled, it should not
+                    // do anything, either.
+                    false
+                }
+            });
         }
     }

+    /// Find more work for workers to do. Return true if more work is available.
+    fn find_more_work_for_workers(&self) -> bool {
+        if self.worker_group.has_designated_work() {
+            return true;
+        }
+
+        // See if any bucket has a sentinel.
+        if self.schedule_sentinels() {
+            return true;
+        }
+
+        // Try to open new buckets.
+        if self.update_buckets() {
+            return true;
+        }
+
+        // If all of the above failed, it means GC has finished.
+        false
+    }
+
+    /// Called when GC has finished, i.e. when all work packets have been executed.
+    fn on_gc_finished(&self, worker: &GCWorker<VM>) {
+        // All GC workers (except this one) must have parked by now.
+        debug_assert!(!self.worker_group.has_designated_work());
+        debug_assert!(self.all_buckets_empty());
+
+        // Deactivate all work buckets to prepare for the next GC.
+        self.deactivate_all();
+        self.debug_assert_all_buckets_deactivated();
+
+        let mmtk = worker.mmtk;
+
+        // Tell GC trigger that GC ended - this happens before we resume mutators.
+        mmtk.gc_trigger.policy.on_gc_end(mmtk);
+
+        // Compute the elapsed time of the GC.
+        let gc_start = {
+            let mut guard = mmtk.state.gc_start_time.borrow_mut();
+            guard.take().expect("gc_start_time was not set")
+        };
+        let elapsed = gc_start.elapsed();
+
+        info!(
+            "End of GC ({}/{} pages, took {} ms)",
+            mmtk.get_plan().get_reserved_pages(),
+            mmtk.get_plan().get_total_pages(),
+            elapsed.as_millis()
+        );
+
+        #[cfg(feature = "count_live_bytes_in_gc")]
+        {
+            let live_bytes = mmtk.state.get_live_bytes_in_last_gc();
+            let used_bytes =
+                mmtk.get_plan().get_used_pages() << crate::util::constants::LOG_BYTES_IN_PAGE;
+            debug_assert!(
+                live_bytes <= used_bytes,
+                "Live bytes of all live objects ({} bytes) is larger than used pages ({} bytes), something is wrong.",
+                live_bytes, used_bytes
+            );
+            info!(
+                "Live objects = {} bytes ({:04.1}% of {} used pages)",
+                live_bytes,
+                live_bytes as f64 * 100.0 / used_bytes as f64,
+                mmtk.get_plan().get_used_pages()
+            );
+        }
+
+        // All other workers are parked, so it is safe to access the Plan instance mutably.
+        let plan_mut: &mut dyn Plan<VM = VM> = unsafe { mmtk.get_plan_mut() };
+        plan_mut.end_of_gc(worker.tls);
+
+        #[cfg(feature = "extreme_assertions")]
+        if crate::util::edge_logger::should_check_duplicate_edges(mmtk.get_plan()) {
+            // reset the logging info at the end of each GC
+            mmtk.edge_logger.reset();
+        }
+
+        // Reset the triggering information.
+        mmtk.state.reset_collection_trigger();
+
+        // Set to NotInGC after everything, and right before resuming mutators.
+        mmtk.set_gc_status(GcStatus::NotInGC);
+        <VM as VMBinding>::VMCollection::resume_mutators(worker.tls);
+
+        // Notify the `GCRequester` that GC has finished.
+        mmtk.gc_requester.on_gc_finished();
     }

     pub fn enable_stat(&self) {
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index e415a14e46..3f483f7200 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -12,7 +12,7 @@ use crossbeam::queue::ArrayQueue;
 #[cfg(feature = "count_live_bytes_in_gc")]
 use std::sync::atomic::AtomicUsize;
 use std::sync::atomic::Ordering;
-use std::sync::{Arc, Condvar, Mutex};
+use std::sync::{Arc, Condvar, Mutex, MutexGuard};

 /// Represents the ID of a GC worker thread.
 pub type ThreadId = usize;
@@ -75,40 +75,14 @@ impl<VM: VMBinding> GCWorkerShared<VM> {
     }
 }

-/// Used to synchronize mutually exclusive operations between workers and controller,
-/// and also waking up workers when more work packets are available.
+/// A data structure for parking and waking up workers.
+/// It keeps track of the number of workers parked, and allows the last parked worker to perform
+/// operations that require all other workers to be parked.
 pub(crate) struct WorkerMonitor {
     /// The synchronized part.
     sync: Mutex<WorkerMonitorSync>,
     /// This is notified when new work is made available for the workers.
-    /// Particularly, it is notified when
-    /// -   `sync.worker_group_state` is transitioned to `Working` because
-    ///     -   some workers still have designated work, or
-    ///     -   some sentinel work packets are added to their drained buckets, or
-    ///     -   some work buckets are opened, or
-    ///     -   any work packet is added to any open bucket.
-    /// Workers wait on this condvar.
     work_available: Condvar,
-    /// This is notified when all workers parked.
-    /// The coordinator waits on this condvar.
-    all_workers_parked: Condvar,
 }

-/// The state of the worker group.
-///
-/// The worker group alternates between the `Sleeping` and the `Working` state. Workers are
-/// allowed to execute work packets in the `Working` state. However, once workers entered the
-/// `Sleeping` state, they will not be allowed to packets from buckets until the coordinator
-/// explicitly transitions the state back to `Working` after it found more work for workers to do.
-#[derive(Clone, Copy, PartialEq, Eq, Debug)]
-enum WorkerGroupState {
-    /// In this state, the coordinator can open new buckets and close buckets,
-    /// but workers cannot execute any packets or get any work packets from any buckets.
-    /// Workers cannot unpark in this state.
-    Sleeping,
-    /// In this state, workers can get work packets from open buckets,
-    /// but no buckets can be opened or closed.
-    Working,
-}
-
 /// The synchronized part of `WorkerMonitor`.
 pub(crate) struct WorkerMonitorSync {
     worker_count: usize,
     /// Number of parked workers.
     parked_workers: usize,
-    /// The worker group state.
-    worker_group_state: WorkerGroupState,
 }

 impl WorkerMonitor {
     pub fn new(worker_count: usize) -> Self {
         Self {
             sync: Mutex::new(WorkerMonitorSync {
                 worker_count,
                 parked_workers: 0,
-                worker_group_state: WorkerGroupState::Sleeping,
             }),
             work_available: Default::default(),
-            all_workers_parked: Default::default(),
         }
     }

     /// Wake up workers when more work packets are made available for workers.
     /// This function is called when adding work packets to buckets.
-    /// This function doesn't change the `work_group_state` variable.
-    /// If workers are in the `Sleeping` state, use `resume_and_wait` to resume workers.
     pub fn notify_work_available(&self, all: bool) {
-        let sync = self.sync.lock().unwrap();
-
-        // Don't notify workers if we are adding packets when workers are sleeping.
-        // This could happen when we add `ScheduleCollection` or schedule sentinels.
-        if sync.worker_group_state == WorkerGroupState::Sleeping {
-            return;
-        }
-
-        if all {
-            self.work_available.notify_all();
-        } else {
-            self.work_available.notify_one();
-        }
+        let mut guard = self.sync.lock().unwrap();
+        self.notify_work_available_inner(all, &mut guard);
     }

-    /// Wake up workers and wait until they transition to `Sleeping` state again.
-    /// This is called by the coordinator.
-    /// If `all` is true, notify all workers; otherwise only notify one worker.
-    pub fn resume_and_wait(&self, all: bool) {
-        let mut sync = self.sync.lock().unwrap();
-        sync.worker_group_state = WorkerGroupState::Working;
+    fn notify_work_available_inner(&self, all: bool, _guard: &mut MutexGuard<WorkerMonitorSync>) {
         if all {
             self.work_available.notify_all();
         } else {
             self.work_available.notify_one();
         }
-        let _sync = self
-            .all_workers_parked
-            .wait_while(sync, |sync| {
-                sync.worker_group_state == WorkerGroupState::Working
-            })
-            .unwrap();
     }

-    /// Test if the worker group is in the `Sleeping` state.
-    pub fn debug_is_sleeping(&self) -> bool {
-        let sync = self.sync.lock().unwrap();
-        sync.worker_group_state == WorkerGroupState::Sleeping
-    }
-
-    /// Park until more work is available.
-    /// The argument `worker` indicates this function can only be called by workers.
-    pub fn park_and_wait<VM: VMBinding>(&self, worker: &GCWorker<VM>) {
+    /// Park a worker until more work is available.
+    /// If it is the last worker parked, `on_last_parked` will be called.
+    /// If `on_last_parked` returns `true`, this function will notify other workers about available
+    /// work before unparking the current worker;
+    /// if `on_last_parked` returns `false`, the current worker will also wait for work packets to
+    /// be added.
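+    ///
+    /// A minimal call-site sketch (illustrative only, not part of this patch; at this point in
+    /// the series the callback returns `bool`, meaning "more work was made available"). This is
+    /// essentially what `GCWorkScheduler::poll_slow` does:
+    ///
+    /// ```ignore
+    /// monitor.park_and_wait(worker, || {
+    ///     // Runs with the `WorkerMonitor` mutex held, while every worker is parked.
+    ///     scheduler.find_more_work_for_workers()
+    /// });
+    /// ```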
+    pub fn park_and_wait<VM, F>(&self, worker: &GCWorker<VM>, on_last_parked: F)
+    where
+        VM: VMBinding,
+        F: FnOnce() -> bool,
+    {
         let mut sync = self.sync.lock().unwrap();

         // Park this worker
         let all_parked = sync.inc_parked_workers();
         trace!("Worker {} parked.", worker.ordinal);

         if all_parked {
-            // If all workers are parked, enter "Sleeping" state and notify controller.
-            sync.worker_group_state = WorkerGroupState::Sleeping;
-            debug!(
-                "Worker {} notifies the coordinator that all workerer parked.",
-                worker.ordinal
-            );
-            self.all_workers_parked.notify_one();
+            debug!("Worker {} is the last worker parked.", worker.ordinal);
+            let more_work_available = on_last_parked();
+            if more_work_available {
+                self.notify_work_available_inner(true, &mut sync);
+            } else {
+                // It didn't make more work available. Keep waiting.
+                // This should happen when
+                // 1. Worker threads have just been created, but GC has not started, yet, or
+                // 2. A GC has just finished.
+                sync = self.work_available.wait(sync).unwrap();
+            }
         } else {
             // Otherwise wait until notified.
             // Note: The condition for this `cond.wait` is "more work is available".
             sync = self.work_available.wait(sync).unwrap();
         }

-        // If we are in the `Sleeping` state, wait until leaving that state.
-        sync = self
-            .work_available
-            .wait_while(sync, |sync| {
-                sync.worker_group_state == WorkerGroupState::Sleeping
-            })
-            .unwrap();
-
         // Unpark this worker.
         sync.dec_parked_workers();
         trace!("Worker {} unparked.", worker.ordinal);
     }
 }

From b64e2a78ecb42e1a60431f9e7473aa1a6f027f37 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Wed, 3 Jan 2024 12:17:20 +0800
Subject: [PATCH 04/48] WIP: Remove Controller

---
 src/memory_manager.rs       |  17 +----
 src/scheduler/controller.rs | 134 ------------------------------------
 src/scheduler/mod.rs        |   3 -
 src/scheduler/scheduler.rs  |  21 +++++-
 src/scheduler/worker.rs     |   8 ++-
 src/vm/collection.rs        |   6 +-
 6 files changed, 29 insertions(+), 160 deletions(-)
 delete mode 100644 src/scheduler/controller.rs

diff --git a/src/memory_manager.rs b/src/memory_manager.rs
index 67135e9678..e5a918233f 100644
--- a/src/memory_manager.rs
+++ b/src/memory_manager.rs
@@ -16,7 +16,7 @@ use crate::mmtk::MMTK;
 use crate::plan::AllocationSemantics;
 use crate::plan::{Mutator, MutatorContext};
 use crate::scheduler::WorkBucketStage;
-use crate::scheduler::{GCController, GCWork, GCWorker};
+use crate::scheduler::{GCWork, GCWorker};
 use crate::util::alloc::allocators::AllocatorSelector;
 use crate::util::constants::{LOG_BYTES_IN_PAGE, MIN_OBJECT_SIZE};
 use crate::util::heap::layout::vm_layout::vm_layout;
@@ -461,21 +461,6 @@ pub fn gc_poll<VM: VMBinding>(mmtk: &MMTK<VM>, tls: VMMutatorThread) {
     }
 }

-/// Run the main loop for the GC controller thread. This method does not return.
-///
-/// Arguments:
-/// * `tls`: The thread that will be used as the GC controller.
-/// * `gc_controller`: The execution context of the GC controller threa.
-///   It is the `GCController` passed to `Collection::spawn_gc_thread`.
-/// * `mmtk`: A reference to an MMTk instance.
-pub fn start_control_collector<VM: VMBinding>(
-    _mmtk: &'static MMTK<VM>,
-    tls: VMWorkerThread,
-    gc_controller: &mut GCController<VM>,
-) {
-    gc_controller.run(tls);
-}
-
 /// Run the main loop of a GC worker. This method does not return.
 ///
 /// Arguments:
diff --git a/src/scheduler/controller.rs b/src/scheduler/controller.rs
deleted file mode 100644
index 125efbbe91..0000000000
--- a/src/scheduler/controller.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-//! The GC controller thread.
-//!
-//! MMTk has many GC threads. There are many GC worker threads and one GC controller thread.
-//! The GC controller thread responds to GC requests and coordinates the workers to perform GC.
-
-use std::sync::Arc;
-
-use crate::plan::gc_requester::GCRequester;
-use crate::scheduler::gc_work::{EndOfGC, ScheduleCollection};
-use crate::scheduler::{GCWork, WorkBucketStage};
-use crate::util::VMWorkerThread;
-use crate::vm::VMBinding;
-use crate::MMTK;
-
-use super::{GCWorkScheduler, GCWorker};
-
-/// The thread local struct for the GC controller, the counterpart of `GCWorker`.
-pub struct GCController<VM: VMBinding> {
-    /// The reference to the MMTk instance.
-    mmtk: &'static MMTK<VM>,
-    /// The reference to the GC requester.
-    requester: Arc<GCRequester<VM>>,
-    /// The reference to the scheduler.
-    scheduler: Arc<GCWorkScheduler<VM>>,
-    /// The `GCWorker` is used to execute packets. The controller is also a `GCWorker`.
-    coordinator_worker: GCWorker<VM>,
-}
-
-impl<VM: VMBinding> GCController<VM> {
-    /// The main loop for the GC controller.
-    pub fn run(&mut self, tls: VMWorkerThread) -> ! {
-        probe!(mmtk, gccontroller_run);
-        // Initialize the GC worker for coordinator. We are not using the run() method from
-        // GCWorker so we manually initialize the worker here.
-        self.coordinator_worker.tls = tls;
-
-        loop {
-            debug!("[STWController: Waiting for request...]");
-            //self.requester.wait_for_request();
-            debug!("[STWController: Request recieved.]");
-
-            self.do_gc_until_completion_traced();
-            debug!("[STWController: Worker threads complete!]");
-        }
-    }
-
-    /// Find more work for workers to do. Return true if more work is available.
-    fn find_more_work_for_workers(&mut self) -> bool {
-        if self.scheduler.worker_group.has_designated_work() {
-            return true;
-        }
-
-        // See if any bucket has a sentinel.
-        if self.scheduler.schedule_sentinels() {
-            return true;
-        }
-
-        // Try to open new buckets.
-        if self.scheduler.update_buckets() {
-            return true;
-        }
-
-        // If all of the above failed, it means GC has finished.
-        false
-    }
-
-    /// A wrapper method for [`do_gc_until_completion`](GCController::do_gc_until_completion) to insert USDT tracepoints.
-    fn do_gc_until_completion_traced(&mut self) {
-        probe!(mmtk, gc_start);
-        self.do_gc_until_completion();
-        probe!(mmtk, gc_end);
-    }
-
-    /// Coordinate workers to perform GC in response to a GC request.
-    fn do_gc_until_completion(&mut self) {
-        let gc_start = std::time::Instant::now();
-
-        // debug_assert!(
-        //     self.scheduler.worker_monitor.debug_is_sleeping(),
-        //     "Workers are still doing work when GC started."
-        // );
-
-        // Add a ScheduleCollection work packet. It is the seed of other work packets.
-        self.scheduler.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
-
-        // Notify only one worker at this time because there is only one work packet,
-        // namely `ScheduleCollection`.
-        // self.scheduler.worker_monitor.resume_and_wait(false);
-
-        // Gradually open more buckets as workers stop each time they drain all open bucket.
-        loop {
-            // Workers should only transition to the `Sleeping` state when all open buckets have
-            // been drained.
-            self.scheduler.assert_all_activated_buckets_are_empty();
-
-            let new_work_available = self.find_more_work_for_workers();
-
-            // GC finishes if there is no new work to do.
-            if !new_work_available {
-                break;
-            }
-
-            // Notify all workers because there should be many work packets available in the newly
-            // opened bucket(s).
-            // self.scheduler.worker_monitor.resume_and_wait(true);
-        }
-
-        // All GC workers must have parked by now.
-        // debug_assert!(self.scheduler.worker_monitor.debug_is_sleeping());
-        debug_assert!(!self.scheduler.worker_group.has_designated_work());
-        debug_assert!(self.scheduler.all_buckets_empty());
-
-        // Deactivate all work buckets to prepare for the next GC.
-        // NOTE: There is no need to hold any lock.
-        // Workers are in the `Sleeping` state.
-        // so they will not wake up while we deactivate buckets.
-        self.scheduler.deactivate_all();
-
-        // Tell GC trigger that GC ended - this happens before EndOfGC where we resume mutators.
-        self.mmtk.gc_trigger.policy.on_gc_end(self.mmtk);
-
-        // Finalization: Resume mutators, reset gc states
-        // Note: Resume-mutators must happen after all work buckets are closed.
-        //       Otherwise, for generational GCs, workers will receive and process
-        //       newly generated remembered-sets from those open buckets.
-        //       But these remsets should be preserved until next GC.
-        let mut end_of_gc = EndOfGC {
-            elapsed: gc_start.elapsed(),
-        };
-        end_of_gc.do_work_with_stat(&mut self.coordinator_worker, self.mmtk);
-
-        self.scheduler.debug_assert_all_buckets_deactivated();
-    }
-}
diff --git a/src/scheduler/mod.rs b/src/scheduler/mod.rs
index 03b7d822bc..45501e9943 100644
--- a/src/scheduler/mod.rs
+++ b/src/scheduler/mod.rs
@@ -20,8 +20,5 @@ mod worker;
 pub(crate) use worker::current_worker_ordinal;
 pub use worker::GCWorker;

-mod controller;
-pub use controller::GCController;
-
 pub(crate) mod gc_work;
 pub use gc_work::ProcessEdgesWork;
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 3372ff9770..9fc8fc18fd 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -276,7 +276,7 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
     }

     /// Check if all the work buckets are empty
-    pub(crate) fn assert_all_activated_buckets_are_empty(&self) {
+    pub(crate) fn assert_all_activated_buckets_are_empty(&self, worker: &GCWorker<VM>) {
         let mut error_example = None;
         for (id, bucket) in self.work_buckets.iter() {
             if bucket.is_activated() && !bucket.is_empty() {
                 // we should show at least one abnormal bucket in the panic message
                 // so that we still have some information for debugging.
                 error_example = Some(id);
+
+                while !bucket.is_empty() {
+                    match bucket.poll(&worker.local_work_buffer) {
+                        Steal::Success(w) => {
+                            error!("  Bucket {:?} has {:?}", id, w.get_type_name());
+                        },
+                        Steal::Retry => continue,
+                        _ => {}
+                    }
+                }
             }
         }
         if let Some(id) = error_example {
@@ -364,8 +374,12 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {

                 // Test whether we are doing GC.
                 if worker.mmtk.gc_requester.is_gc_scheduled() {
-                    trace!("GC is scheduled. Try to find more work to do...");
                     // We are in the middle of GC, and the last GC worker parked.
+                    trace!("GC is scheduled. Try to find more work to do...");
+
+                    // During GC, if all workers parked, all open buckets must have been drained.
+                    self.assert_all_activated_buckets_are_empty(worker);
+
                     // Find more work for workers to do.
                     let found_more_work = self.find_more_work_for_workers();

     /// Find more work for workers to do. Return true if more work is available.
     fn find_more_work_for_workers(&self) -> bool {
         if self.worker_group.has_designated_work() {
+            trace!("Some workers have designated work.");
             return true;
         }

         // See if any bucket has a sentinel.
         if self.schedule_sentinels() {
+            trace!("Some sentinels are scheduled.");
             return true;
         }

         // Try to open new buckets.
         if self.update_buckets() {
+            trace!("Some buckets are opened.");
             return true;
         }

diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index 3f483f7200..dead43a048 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -134,7 +134,13 @@ impl WorkerMonitor {

         // Park this worker
         let all_parked = sync.inc_parked_workers();
-        trace!("Worker {} parked.", worker.ordinal);
+        trace!(
+            "Worker {} parked. parked/total: {}/{}. All parked: {}",
+            worker.ordinal,
+            sync.parked_workers,
+            sync.worker_count,
+            all_parked
+        );

         if all_parked {
             debug!("Worker {} is the last worker parked.", worker.ordinal);
diff --git a/src/vm/collection.rs b/src/vm/collection.rs
index 5420224ab0..7fe17ccdfd 100644
--- a/src/vm/collection.rs
+++ b/src/vm/collection.rs
@@ -4,9 +4,8 @@ use crate::vm::VMBinding;
 use crate::{scheduler::*, Mutator};

 /// Thread context for the spawned GC thread. It is used by spawn_gc_thread.
+/// Currently, `GCWorker` is the only kind of thread that mmtk-core will create.
 pub enum GCThreadContext<VM: VMBinding> {
-    /// The GC thread to spawn is a controller thread. There is only one controller thread.
-    Controller(Box<GCController<VM>>),
     /// The GC thread to spawn is a worker thread. There can be multiple worker threads.
     Worker(Box<GCWorker<VM>>),
 }
@@ -53,10 +52,9 @@ pub trait Collection<VM: VMBinding> {
     /// * `tls`: The thread pointer for the parent thread that we spawn new threads from. This is the same `tls` when the VM
     ///   calls `initialize_collection()` and passes as an argument.
     /// * `ctx`: The context for the GC thread.
-    ///   * If `Controller` is passed, it means spawning a thread to run as the GC controller.
-    ///     The spawned thread shall call `memory_manager::start_control_collector`.
     ///   * If `Worker` is passed, it means spawning a thread to run as a GC worker.
     ///     The spawned thread shall call `memory_manager::start_worker`.
+    ///   Currently `Worker` is the only kind of thread which mmtk-core will create.
     ///   In either case, the `Box` inside should be passed back to the called function.
     fn spawn_gc_thread(tls: VMThread, ctx: GCThreadContext<VM>);

From 4fa8fac5bf7f6a73750157fe076bc0104d7b67d3 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Wed, 3 Jan 2024 17:24:55 +0800
Subject: [PATCH 05/48] Fix deadlock between GCRequester and GC workers.

---
 src/global_state.rs          | 20 +++++++++---
 src/mmtk.rs                  | 14 ++++-----
 src/plan/gc_requester.rs     | 59 +++++++++++++++++++++---------------
 src/scheduler/scheduler.rs   | 45 +++++++++++++++++++++------
 src/scheduler/work_bucket.rs | 22 +++++++++++---
 src/scheduler/worker.rs      | 46 +++++++++++++++++++++-------
 6 files changed, 145 insertions(+), 61 deletions(-)

diff --git a/src/global_state.rs b/src/global_state.rs
index 2340957eae..55dbab7b3b 100644
--- a/src/global_state.rs
+++ b/src/global_state.rs
@@ -1,8 +1,9 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use std::sync::Mutex;
 use std::time::Instant;

+use atomic::Atomic;
 use atomic_refcell::AtomicRefCell;
+use bytemuck::NoUninit;

 /// This stores some global states for an MMTK instance.
 /// Some MMTK components like plans and allocators may keep an reference to the struct, and can access it.
@@ -20,7 +21,7 @@ pub struct GlobalState {
     /// bindings to temporarily disable GC, at which point, we do not trigger GC even if the heap is full.
     pub(crate) trigger_gc_when_heap_is_full: AtomicBool,
     /// The current GC status.
-    pub(crate) gc_status: Mutex<GcStatus>,
+    pub(crate) gc_status: Atomic<GcStatus>,
     /// Is the current GC an emergency collection? Emergency means we may run out of memory soon, and we should
     /// attempt to collect as much as we can.
     pub(crate) emergency_collection: AtomicBool,
@@ -209,7 +210,7 @@ impl Default for GlobalState {
         Self {
             initialized: AtomicBool::new(false),
             trigger_gc_when_heap_is_full: AtomicBool::new(true),
-            gc_status: Mutex::new(GcStatus::NotInGC),
+            gc_status: Atomic::new(GcStatus::NotInGC),
             stacks_prepared: AtomicBool::new(false),
             emergency_collection: AtomicBool::new(false),
             user_triggered_collection: AtomicBool::new(false),
     }
 }

-#[derive(PartialEq)]
+/// GC status, an indicator of whether MMTk is in the progress of a GC or not.
+///
+/// This type was only used for assertions in JikesRVM. After we removed the coordinator (a.k.a.
+/// controller) thread, we use the GC status to decide whether GC workers should open new work
+/// buckets when the last worker parked.
+// FIXME: `GcProper` is inherited from JikesRVM, but it is not used in MMTk core. Consider
+// removing it.
+#[derive(PartialEq, Clone, Copy, NoUninit, Debug)]
+#[repr(u8)]
 pub enum GcStatus {
+    /// Not in GC
     NotInGC,
+    /// GC has started, but not all stacks have been scanned, yet.
     GcPrepare,
+    /// GC has started, and all stacks have been scanned.
     GcProper,
 }
diff --git a/src/mmtk.rs b/src/mmtk.rs
index 3d91f23ae8..9c433fa06d 100644
--- a/src/mmtk.rs
+++ b/src/mmtk.rs
@@ -255,15 +255,15 @@ impl<VM: VMBinding> MMTK<VM> {
         self.inside_sanity.load(Ordering::Relaxed)
     }

-    pub(crate) fn set_gc_status(&self, s: GcStatus) {
-        let mut gc_status = self.state.gc_status.lock().unwrap();
-        if *gc_status == GcStatus::NotInGC {
+    pub(crate) fn set_gc_status(&self, new_status: GcStatus) {
+        let old_status = self.state.gc_status.swap(new_status, Ordering::SeqCst);
+        debug_assert_ne!(old_status, new_status);
+        if old_status == GcStatus::NotInGC {
             self.state.stacks_prepared.store(false, Ordering::SeqCst);
             // FIXME stats
             self.stats.start_gc();
         }
-        *gc_status = s;
-        if *gc_status == GcStatus::NotInGC {
+        if new_status == GcStatus::NotInGC {
             // FIXME stats
             if self.stats.get_gathering_stats() {
                 self.stats.end_gc();
@@ -273,12 +273,12 @@ impl<VM: VMBinding> MMTK<VM> {

     /// Return true if a collection is in progress.
     pub fn gc_in_progress(&self) -> bool {
-        *self.state.gc_status.lock().unwrap() != GcStatus::NotInGC
+        self.state.gc_status.load(Ordering::SeqCst) != GcStatus::NotInGC
     }

     /// Return true if a collection is in progress and past the preparatory stage.
     pub fn gc_in_progress_proper(&self) -> bool {
-        *self.state.gc_status.lock().unwrap() == GcStatus::GcProper
+        self.state.gc_status.load(Ordering::SeqCst) == GcStatus::GcProper
     }

     /// Return true if the current GC is an emergency GC.
diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs
index 53419aa486..03d54889df 100644
--- a/src/plan/gc_requester.rs
+++ b/src/plan/gc_requester.rs
@@ -5,13 +5,17 @@ use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::{Arc, Mutex};

 struct RequestSync {
-    /// Is GC scheduled (but not finished)?
+    /// Has the GCRequester called `GCWorkScheduler::schedule_collection` for the current request?
+    /// This flag exists so that once `GCRequester` called `GCWorkScheduler::schedule_collection`,
+    /// it cannot call it again until the GC it initiated finished.
     gc_scheduled: bool,
 }

 /// This data structure lets mutators trigger GC, and may schedule collection when appropriate.
 pub struct GCRequester<VM> {
     request_sync: Mutex<RequestSync>,
+    /// An atomic flag outside `RequestSync` so that mutators can check if GC has already been
+    /// requested in `poll` without acquiring the mutex.
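+    /// Loading this flag is the fast path of the double-checked locking pattern used in
+    /// `request()` below.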
     request_flag: AtomicBool,
     scheduler: Arc<GCWorkScheduler<VM>>,
     phantom: PhantomData<VM>,
 }
@@ -36,18 +40,23 @@ impl<VM: VMBinding> GCRequester<VM> {
     /// Request a GC. Called by mutators when polling (during allocation) and when handling user
     /// GC requests (e.g. `System.gc();` in Java).
     pub fn request(&self) {
+        // Note: This is the double-checked locking algorithm.
+        // The load has the `Relaxed` order instead of `Acquire` because we are not doing lazy
+        // initialization here. We are only using this flag to remove successive requests.
         if self.request_flag.load(Ordering::Relaxed) {
             return;
         }

         let mut guard = self.request_sync.lock().unwrap();
-        // Note: This is the double-checked locking algorithm.
-        // The load has the `Relaxed` order instead of `Acquire` because we only use the flag to
-        // remove successive requests, but we don't use it to synchronize other data fields.
         if !self.request_flag.load(Ordering::Relaxed) {
             self.request_flag.store(true, Ordering::Relaxed);

-            self.try_schedule_collection(&mut *guard);
+            let should_schedule_gc = self.try_schedule_collection(&mut guard);
+            if should_schedule_gc {
+                self.scheduler.mutator_schedule_collection();

+                // Note: We do not clear `request_flag` now. It will be cleared by `clear_request`
+                // after all mutators have stopped.
+            }
         }
     }

     /// Called by a GC worker when a GC has finished.
-    /// This will check the `request_flag` again and schedule the next GC.
-    ///
-    /// Note that this may schedule the next GC immediately if
-    /// 1. The plan is concurrent, and a mutator triggered another GC while the current GC was
-    ///    still running (between `clear_request` and `on_gc_finished`), or
-    /// 2. After the invocation of `resume_mutators`, a mutator runs so fast that it
-    ///    exhausted the heap, or called `handle_user_collection_request`, before this function
-    ///    is called.
-    pub fn on_gc_finished(&self) {
+    /// This will check the `request_flag` again and check if we should immediately schedule the
+    /// next GC. If we should, `gc_scheduled` will be set back to `true` and this function will
+    /// return `true`.
+    pub fn on_gc_finished(&self) -> bool {
         let mut guard = self.request_sync.lock().unwrap();
         guard.gc_scheduled = false;

-        self.try_schedule_collection(&mut *guard);
+        self.try_schedule_collection(&mut guard)
     }

-    fn try_schedule_collection(&self, sync: &mut RequestSync) {
-        // Do not schedule collection if a GC is still in progress.
-        // When the GC finishes, a GC worker will call `on_gc_finished` and check `request_flag`
-        // again.
+    /// Decide whether we should schedule a new collection. Will transition the state of
+    /// `gc_scheduled` from `false` to `true` if we should schedule a new collection.
+    /// Return `true` if the state transition happens.
+    fn try_schedule_collection(&self, sync: &mut RequestSync) -> bool {
+        // The time to schedule a collection is when `request_flag` is `true` but `gc_scheduled`
+        // is `false`. `gc_scheduled` is `true` if either
+        //
+        // 1. another mutator called `request()` concurrently and scheduled a collection, or
+        // 2. a new GC is requested while the current GC is still in progress.
+        //
+        // If `gc_scheduled` is `true` when GC is requested, we do nothing now. But when the
+        // current GC finishes, a GC worker will call `on_gc_finished` which clears the
+        // `gc_scheduled` flag, and checks the `request_flag` again to trigger the next GC.
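+        //
+        // Summary of (request_flag, gc_scheduled) at this point and the action taken:
+        //   (true,  false) -> schedule the next GC now and set `gc_scheduled` (return true)
+        //   (true,  true ) -> do nothing; `on_gc_finished` will re-check `request_flag` later
+        //   (false, _    ) -> no request pending; do nothing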
         if self.request_flag.load(Ordering::Relaxed) && !sync.gc_scheduled {
             sync.gc_scheduled = true;
+            true
+        } else {
+            false
         }
     }
 }
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 9fc8fc18fd..0f035d7a93 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -5,6 +5,7 @@
 use super::worker::{GCWorker, ThreadId, WorkerGroup, WorkerMonitor};
 use super::*;
 use crate::global_state::GcStatus;
 use crate::mmtk::MMTK;
+use crate::scheduler::worker::LastParkedResult;
 use crate::util::opaque_pointer::*;
 use crate::util::options::AffinityKind;
 use crate::util::rust_util::array_from_fn;
@@ -89,10 +90,11 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
     }

     /// Schedule collection. Called via `GCRequester`.
-    /// Because this function may be called by a mutator thread, we only add a `ScheduleCollection`
+    /// Because this function is called by a mutator thread, we only add a `ScheduleCollection`
     /// work packet here so that a GC worker can wake up later and actually schedule the work for a
     /// collection.
-    pub(crate) fn schedule_collection(&self) {
+    pub(crate) fn mutator_schedule_collection(&self) {
+        debug!("Adding ScheduleCollection work packet upon mutator request.");
         // Add a ScheduleCollection work packet. It is the seed of other work packets.
         self.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
     }
@@ -373,7 +375,7 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
                 // This is the last worker parked.

                 // Test whether we are doing GC.
-                if worker.mmtk.gc_requester.is_gc_scheduled() {
+                if worker.mmtk.gc_in_progress() {
                     // We are in the middle of GC, and the last GC worker parked.
                     trace!("GC is scheduled. Try to find more work to do...");
@@ -383,12 +385,17 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
                     // Find more work for workers to do.
                     let found_more_work = self.find_more_work_for_workers();

-                    if !found_more_work {
+                    if found_more_work {
+                        LastParkedResult::WakeAll
+                    } else {
                         // GC finished.
-                        self.on_gc_finished(worker);
+                        let scheduled_next_gc = self.on_gc_finished(worker);
+                        if scheduled_next_gc {
+                            LastParkedResult::WakeSelf
+                        } else {
+                            LastParkedResult::ParkSelf
+                        }
                     }
-
-                    found_more_work
                 } else {
                     trace!("GC is not scheduled. Wait for the first GC.");
                     // GC is not scheduled. Do nothing.
                     // buckets.
                     // If a GC worker spuriously wakes up when GC is not scheduled, it should not
                     // do anything, either.
-                    false
+                    LastParkedResult::ParkSelf
                 }
             });
         }
@@ -430,7 +437,8 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
     }

     /// Called when GC has finished, i.e. when all work packets have been executed.
-    fn on_gc_finished(&self, worker: &GCWorker<VM>) {
+    /// Return `true` if it scheduled the next GC immediately.
+    fn on_gc_finished(&self, worker: &GCWorker<VM>) -> bool {
         // All GC workers (except this one) must have parked by now.

         mmtk.set_gc_status(GcStatus::NotInGC);
         <VM as VMBinding>::VMCollection::resume_mutators(worker.tls);

         // Notify the `GCRequester` that GC has finished.
-        mmtk.gc_requester.on_gc_finished();
+        let should_schedule_gc_now = mmtk.gc_requester.on_gc_finished();
+        if should_schedule_gc_now {
+            // We should schedule the next GC immediately. This means GC was triggered between
+            // `clear_request` (when stacks were scanned) and `on_gc_finished` (right above). This
+            // can happen if
+            // 1. It is concurrent GC, and a mutator triggered another GC while the current GC was
+            //    still running, or
+            // 2. It is STW GC, but after the invocation of `resume_mutators` above, one mutator
+            //    ran so fast that it triggered a GC before we called `on_gc_finished`.
+            // Note that we are holding the `WorkerMonitor` mutex, and cannot notify workers now.
+            // When this function returns, the current worker should continue to execute the newly
+            // added `ScheduleCollection` work packet.
+            debug!("GC already requested before `on_gc_finished`. Add ScheduleCollection now.");
+            self.work_buckets[WorkBucketStage::Unconstrained].add_no_notify(ScheduleCollection);
+            true
+        } else {
+            false
+        }
     }

     pub fn enable_stat(&self) {
diff --git a/src/scheduler/work_bucket.rs b/src/scheduler/work_bucket.rs
index d6c6cb6545..d7076354eb 100644
--- a/src/scheduler/work_bucket.rs
+++ b/src/scheduler/work_bucket.rs
@@ -139,6 +139,19 @@ impl<VM: VMBinding> WorkBucket<VM> {
         self.notify_one_worker();
     }

+    /// Add a work packet to this bucket, but do not notify any workers.
+    /// This is useful when the current thread is holding the mutex of `WorkerMonitor` which is
+    /// used for notifying workers. This usually happens if the current thread is the last worker
+    /// parked.
+    pub(crate) fn add_no_notify<W: GCWork<VM>>(&self, work: W) {
+        self.queue.push(Box::new(work));
+    }
+
+    /// Like `add_no_notify`, but the work is boxed.
+    pub(crate) fn add_boxed_no_notify(&self, work: Box<dyn GCWork<VM>>) {
+        self.queue.push(work);
+    }
+
     /// Add multiple packets with a higher priority.
     /// Panic if this bucket cannot receive prioritized packets.
     pub fn bulk_add_prioritized(&self, work_vec: Vec<Box<dyn GCWork<VM>>>) {
@@ -210,11 +223,10 @@ impl<VM: VMBinding> WorkBucket<VM> {
             sentinel.take()
         };
         if let Some(work) = maybe_sentinel {
-            // We don't need to call `self.add` because this function is called by the coordinator
-            // when workers are stopped. We don't need to notify the workers because the
-            // coordinator will do that later.
-            // We can just "sneak" the sentinel work packet into the current bucket.
-            self.queue.push(work);
+            // We don't need to notify other workers because this function is called by the last
+            // parked worker. After this function returns, the caller will notify workers because
+            // more work packets become available.
+            self.add_boxed_no_notify(work);
             true
         } else {
             false
         }
     }
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index dead43a048..764f9f649d 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -75,6 +75,17 @@ impl<VM: VMBinding> GCWorkerShared<VM> {
     }
 }

+/// The result type of the `on_last_parked` call-back in `WorkerMonitor::park_and_wait`.
+/// It decides how many workers should wake up after `on_last_parked`.
+pub(crate) enum LastParkedResult {
+    /// The last parked worker should wait, too, until more work packets are added.
+    ParkSelf,
+    /// The last parked worker should unpark and find work packet to do.
+    WakeSelf,
+    /// Wake up all parked GC workers.
+    WakeAll,
+}
+
 /// A data structure for parking and waking up workers.
 /// It keeps track of the number of workers parked, and allows the last parked worker to perform
 /// operations that require all other workers to be parked.
@@ -128,7 +139,7 @@ impl WorkerMonitor { pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F) where VM: VMBinding, - F: FnOnce() -> bool, + F: FnOnce() -> LastParkedResult, { let mut sync = self.sync.lock().unwrap(); @@ -144,15 +155,23 @@ impl WorkerMonitor { if all_parked { debug!("Worker {} is the last worker parked.", worker.ordinal); - let more_work_available = on_last_parked(); - if more_work_available { - self.notify_work_available_inner(true, &mut sync); - } else { - // It didn't make more work available. Keep waiting. - // This should happen when - // 1. Worker threads have just been created, but GC has not started, yet, or - // 2. A GC has just finished. - sync = self.work_available.wait(sync).unwrap(); + let result = on_last_parked(); + match result { + LastParkedResult::ParkSelf => { + // It didn't make more work available. Keep waiting. + // This should happen when + // 1. Worker threads have just been created, but GC has not started, yet, or + // 2. A GC has just finished. + sync = self.work_available.wait(sync).unwrap(); + } + LastParkedResult::WakeSelf => { + // Continue without waiting. + // This should happen when only one work packet is made available. + } + LastParkedResult::WakeAll => { + // Notify all GC workers. + self.notify_work_available_inner(true, &mut sync); + } } } else { // Otherwise wait until notified. @@ -165,7 +184,12 @@ impl WorkerMonitor { // Unpark this worker. sync.dec_parked_workers(); - trace!("Worker {} unparked.", worker.ordinal); + trace!( + "Worker {} unparked. parked/total: {}/{}.", + worker.ordinal, + sync.parked_workers, + sync.worker_count, + ); } } From 8f8c3e008cb99781278d70e5354f7f52f5108c6c Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 4 Jan 2024 12:11:25 +0800 Subject: [PATCH 06/48] Remove comments and add back trace points --- docs/header/mmtk.h | 3 --- src/scheduler/gc_work.rs | 3 +++ src/scheduler/scheduler.rs | 20 +++++++++++--------- src/scheduler/worker.rs | 21 ++------------------- src/util/options.rs | 2 +- src/vm/collection.rs | 3 +-- tools/tracing/README.md | 1 - 7 files changed, 18 insertions(+), 35 deletions(-) diff --git a/docs/header/mmtk.h b/docs/header/mmtk.h index 7fb6949d6b..3985bee06a 100644 --- a/docs/header/mmtk.h +++ b/docs/header/mmtk.h @@ -84,9 +84,6 @@ extern void mmtk_scan_region(); // Request MMTk to trigger a GC. Note that this may not actually trigger a GC extern void mmtk_handle_user_collection_request(void* tls); -// Run the main loop for the GC controller thread. Does not return -extern void mmtk_start_control_collector(void* tls, void* worker); - // Run the main loop for a GC worker. Does not return extern void mmtk_start_worker(void* tls, void* worker); diff --git a/src/scheduler/gc_work.rs b/src/scheduler/gc_work.rs index a651437911..2be49a06eb 100644 --- a/src/scheduler/gc_work.rs +++ b/src/scheduler/gc_work.rs @@ -14,6 +14,9 @@ pub struct ScheduleCollection; impl GCWork for ScheduleCollection { fn do_work(&mut self, worker: &mut GCWorker, mmtk: &'static MMTK) { + // This is officially when a GC starts. + probe!(mmtk, gc_start); + // Record the time when GC starts. { let mut guard = mmtk.state.gc_start_time.borrow_mut(); diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 0f035d7a93..c81ea280ed 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -79,7 +79,7 @@ impl GCWorkScheduler { self.worker_group.as_ref().worker_count() } - /// Create GC threads, including the controller thread and all workers. 
+    /// Create GC threads, including all workers.
     pub fn spawn_gc_threads(self: &Arc, mmtk: &'static MMTK, tls: VMThread) {
         self.worker_group.spawn(mmtk, tls)
     }
@@ -501,6 +501,9 @@ impl GCWorkScheduler {
         mmtk.set_gc_status(GcStatus::NotInGC);
         ::VMCollection::resume_mutators(worker.tls);
 
+        // GC officially ends here.
+        probe!(mmtk, gc_end);
+
         // Notify the `GCRequester` that GC has finished.
         let should_schedule_gc_now = mmtk.gc_requester.on_gc_finished();
         if should_schedule_gc_now {
@@ -542,14 +545,13 @@ impl GCWorkScheduler {
             mmtk.gc_requester.clear_request();
             let first_stw_bucket = &self.work_buckets[WorkBucketStage::first_stw_stage()];
             debug_assert!(!first_stw_bucket.is_activated());
-            // Note: This is the only place where a non-coordinator thread opens a bucket.
-            // If the `StopMutators` is executed by the coordinator thread, it will open
-            // the `Prepare` bucket and let workers start executing packets while the coordinator
-            // can still add more work packets to `Prepare`. However, since `Prepare` is the first STW
-            // bucket and only the coordinator can open any subsequent buckets, workers cannot execute
-            // work packets out of order. This is not generally true if we are not opening the first
-            // STW bucket. In the future, we should redesign the opening condition of work buckets to
-            // make the synchronization more robust,
+            // Note: This is the only place where a bucket is opened without having all workers parked.
+            // We usually require all workers to park before opening new buckets because otherwise
+            // packets will be executed out of order. However, since `Prepare` is the first STW
+            // bucket, and all subsequent buckets require all workers to park before opening, workers
+            // cannot execute work packets out of order. This is not generally true if we are not
+            // opening the first STW bucket. In the future, we should redesign the opening condition
+            // of work buckets to make the synchronization more robust.
             first_stw_bucket.activate();
             self.worker_monitor.notify_work_available(true);
         }
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index 764f9f649d..bd5367f987 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -44,10 +44,6 @@ pub struct GCWorkerShared {
     #[cfg(feature = "count_live_bytes_in_gc")]
     live_bytes: AtomicUsize,
     /// A queue of GCWork that can only be processed by the owned thread.
-    ///
-    /// Note: Currently, designated work cannot be added from the GC controller thread, or
-    /// there will be synchronization problems. If it is necessary to do so, we need to
-    /// update the code in `GCWorkScheduler::poll_slow` for proper synchornization.
     pub designated_work: ArrayQueue>>,
     /// Handle for stealing packets from the current worker
     pub stealer: Option>>>,
@@ -218,12 +214,10 @@ impl WorkerMonitorSync {
 }
 
 /// A GC worker. This part is privately owned by a worker thread.
-/// The GC controller also has an embedded `GCWorker` because it may also execute work packets.
 pub struct GCWorker {
     /// The VM-specific thread-local state of the GC thread.
     pub tls: VMWorkerThread,
-    /// The ordinal of the worker, numbered from 0 to the number of workers minus one. The ordinal
-    /// is usize::MAX if it is the embedded worker of the GC controller thread.
+    /// The ordinal of the worker, numbered from 0 to the number of workers minus one.
     pub ordinal: ThreadId,
     /// The reference to the scheduler.
     scheduler: Arc>,
     /// The copy context, used to implement copying GC.
     copy: GCWorkerCopyContext,
     /// The reference to the MMTk instance.
pub mmtk: &'static MMTK, - /// True if this struct is the embedded GCWorker of the controller thread. - /// False if this struct belongs to a standalone GCWorker thread. - is_coordinator: bool, /// Reference to the shared part of the GC worker. It is used for synchronization. pub shared: Arc>, /// Local work packet queue. @@ -262,7 +253,6 @@ impl GCWorker { mmtk: &'static MMTK, ordinal: ThreadId, scheduler: Arc>, - is_coordinator: bool, shared: Arc>, local_work_buffer: deque::Worker>>, ) -> Self { @@ -273,7 +263,6 @@ impl GCWorker { copy: GCWorkerCopyContext::new_non_copy(), scheduler, mmtk, - is_coordinator, shared, local_work_buffer, } @@ -307,11 +296,6 @@ impl GCWorker { self.local_work_buffer.push(Box::new(work)); } - /// Is this worker a coordinator or a normal GC worker? - pub fn is_coordinator(&self) -> bool { - self.is_coordinator - } - /// Get the scheduler. There is only one scheduler per MMTk instance. pub fn scheduler(&self) -> &GCWorkScheduler { &self.scheduler @@ -364,7 +348,7 @@ impl GCWorker { } } -/// A worker group to manage all the GC workers (except the coordinator worker). +/// A worker group to manage all the GC workers. pub(crate) struct WorkerGroup { /// Shared worker data pub workers_shared: Vec>>, @@ -401,7 +385,6 @@ impl WorkerGroup { mmtk, ordinal, mmtk.scheduler.clone(), - false, shared.clone(), unspawned_local_work_queues.pop().unwrap(), )); diff --git a/src/util/options.rs b/src/util/options.rs index 11a1b329ae..eb6513e1c3 100644 --- a/src/util/options.rs +++ b/src/util/options.rs @@ -705,7 +705,7 @@ mod gc_trigger_tests { options! { /// The GC plan to use. plan: PlanSelector [env_var: true, command_line: true] [always_valid] = PlanSelector::GenImmix, - /// Number of GC worker threads. (There is always one GC controller thread besides the GC workers) + /// Number of GC worker threads. // FIXME: Currently we create GCWorkScheduler when MMTK is created, which is usually static. // To allow this as a command-line option, we need to refactor the creation fo the `MMTK` instance. // See: https://github.com/mmtk/mmtk-core/issues/532 diff --git a/src/vm/collection.rs b/src/vm/collection.rs index 7fe17ccdfd..b3b172ded0 100644 --- a/src/vm/collection.rs +++ b/src/vm/collection.rs @@ -30,8 +30,7 @@ pub trait Collection { /// This method may not be called by the same GC thread that called `stop_all_mutators`. /// /// Arguments: - /// * `tls`: The thread pointer for the GC worker. Currently it is the tls of the embedded `GCWorker` instance - /// of the coordinator thread, but it is subject to change, and should not be depended on. + /// * `tls`: The thread pointer for the GC worker. fn resume_mutators(tls: VMWorkerThread); /// Block the current thread for GC. This is called when an allocation request cannot be fulfilled and a GC diff --git a/tools/tracing/README.md b/tools/tracing/README.md index 23a633dd21..1c8598abe0 100644 --- a/tools/tracing/README.md +++ b/tools/tracing/README.md @@ -15,7 +15,6 @@ Currently, the core provides the following tracepoints. 
- `mmtk:collection_initialized()`: GC is enabled
 - `mmtk:harness_begin()`: the timing iteration of a benchmark begins
 - `mmtk:harness_end()`: the timing iteration of a benchmark ends
-- `mmtk:gccontroller_run()`: the GC controller thread enters its work loop
 - `mmtk:gcworker_run()`: a GC worker thread enters its work loop
 - `mmtk:gc_start()`: a collection epoch starts
 - `mmtk:gc_end()`: a collection epoch ends

From 845eea6f0d5ca1d1732071ab82fc13d6deb46b69 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Fri, 5 Jan 2024 00:05:20 +0800
Subject: [PATCH 07/48] WIP: Updating comments and minor code change.

Found a case where workers may sleep forever. This needs to be fixed later.

---
 Cargo.toml                 |  1 +
 src/global_state.rs        |  7 +++--
 src/scheduler/scheduler.rs | 54 ++++++++++++++------------------
 src/scheduler/worker.rs    | 63 ++++++++++++++++++++++++++------------
 src/util/options.rs        |  3 --
 5 files changed, 73 insertions(+), 55 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 558f820701..3461c25835 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -50,6 +50,7 @@ static_assertions = "1.1.0"
 strum = "0.25"
 strum_macros = "0.25"
 sysinfo = "0.29"
+rand = "0.8.5"
 
 [dev-dependencies]
 paste = "1.0.8"
diff --git a/src/global_state.rs b/src/global_state.rs
index 55dbab7b3b..4d57fd2e23 100644
--- a/src/global_state.rs
+++ b/src/global_state.rs
@@ -44,8 +44,11 @@ pub struct GlobalState {
     pub(crate) stacks_prepared: AtomicBool,
     /// A counter that keeps tracks of the number of bytes allocated since last stress test
     pub(crate) allocation_bytes: AtomicUsize,
-    /// The time when the current GC started. Only accessible in `ScheduleCollection` and
-    /// `GCWorkScheduler::on_gc_finished` which never happen at the same time.
+    /// The time when the current GC started. Currently only used for logging.
+    /// Note that some (but not all) `GCTriggerPolicy` implementations do their own time tracking
+    /// independently for their own needs.
+    /// This field is only accessible in `ScheduleCollection` and `GCWorkScheduler::on_gc_finished`
+    /// which never happen at the same time, so `AtomicRefCell` is enough.
     pub(crate) gc_start_time: AtomicRefCell>,
     /// A counteer that keeps tracks of the number of bytes allocated by malloc
     #[cfg(feature = "malloc_counted_size")]
     pub(crate) malloc_bytes: AtomicUsize,
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index c81ea280ed..1c44e8c736 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -22,7 +22,7 @@ pub struct GCWorkScheduler {
     pub work_buckets: EnumMap>,
     /// Workers
     pub(crate) worker_group: Arc>,
-    /// Condition Variable for worker synchronization
+    /// Monitor for worker synchronization, including a mutex and condition variables.
     pub(crate) worker_monitor: Arc,
     /// How to assign the affinity of each GC thread. Specified by the user.
     affinity: AffinityKind,
@@ -48,14 +48,16 @@ impl GCWorkScheduler {
 
         // Set the open condition of each bucket.
         {
-            // Unconstrained is always open. Prepare will be opened at the beginning of a GC.
-            // This vec will grow for each stage we call with open_next()
             let first_stw_stage = WorkBucketStage::first_stw_stage();
             let mut open_stages: Vec = vec![first_stw_stage];
-            // The rest will open after the previous stage is done.
             let stages = (0..WorkBucketStage::LENGTH).map(WorkBucketStage::from_usize);
             for stage in stages {
+                // Unconstrained is always open.
+                // The first STW stage (Prepare) will be opened when the world has stopped
+                // (i.e. when all mutators are suspended).
if stage != WorkBucketStage::Unconstrained && stage != first_stw_stage {
+                    // Other work buckets will be opened after previous stages are done
+                    // (i.e. their buckets are drained and all workers have parked).
                     let cur_stages = open_stages.clone();
                     work_buckets[stage].set_open_condition(
                         move |scheduler: &GCWorkScheduler| {
@@ -278,7 +280,7 @@ impl GCWorkScheduler {
     }
 
     /// Check if all the work buckets are empty
-    pub(crate) fn assert_all_activated_buckets_are_empty(&self, worker: &GCWorker) {
+    pub(crate) fn assert_all_activated_buckets_are_empty(&self) {
         let mut error_example = None;
         for (id, bucket) in self.work_buckets.iter() {
             if bucket.is_activated() && !bucket.is_empty() {
@@ -288,16 +290,6 @@ impl GCWorkScheduler {
                 // we should show at least one abnormal bucket in the panic message
                 // so that we still have some information for debugging.
                 error_example = Some(id);
-
-                while !bucket.is_empty() {
-                    match bucket.poll(&worker.local_work_buffer) {
-                        Steal::Success(w) => {
-                            error!("  Bucket {:?} has {:?}", id, w.get_type_name());
-                        },
-                        Steal::Retry => continue,
-                        _ => {}
-                    }
-                }
             }
         }
         if let Some(id) = error_example {
@@ -377,10 +369,10 @@ impl GCWorkScheduler {
             // Test whether we are doing GC.
             if worker.mmtk.gc_in_progress() {
                 // We are in the middle of GC, and the last GC worker parked.
-                trace!("GC is scheduled. Try to find more work to do...");
+                trace!("The last worker parked during GC. Try to find more work to do...");
 
                 // During GC, if all workers parked, all open buckets must have been drained.
-                self.assert_all_activated_buckets_are_empty(worker);
+                self.assert_all_activated_buckets_are_empty();
 
                 // Find more work for workers to do.
                 let found_more_work = self.find_more_work_for_workers();
@@ -397,16 +389,16 @@ impl GCWorkScheduler {
                 }
             }
         } else {
-            trace!("GC is not scheduled. Wait for the first GC.");
-            // GC is not scheduled. Do nothing.
-            // Note that when GC worker threads has just been created, they will try to get
-            // work packets to execute. But since the first GC has not started, yet, there
-            // is not any work packets to execute, yet. Therefore, all workers will park,
-            // and the last parked worker will reach here. In that case, we should simply
-            // let workers wait until the first GC starts, instead of trying to open more
-            // buckets.
-            // If a GC worker spuriously wakes up when GC is not scheduled, it should not
-            // do anything, either.
+            trace!("The last worker parked while not in GC. Wait for GC to start.");
+            // GC has not started, yet. Do not try to open work buckets.
+            //
+            // This branch is usually reached when `initialize_collection` has just been
+            // called and GC worker threads have just been created. In that case, there are
+            // no work packets to execute, and workers should park and wait for the first
+            // GC.
+            //
+            // This branch can also be reached if a GC worker spuriously wakes up while not
+            // in GC.
             LastParkedResult::ParkSelf
         }
     });
@@ -439,7 +431,7 @@ impl GCWorkScheduler {
     /// Called when GC has finished, i.e. when all work packets have been executed.
     /// Return `true` if it scheduled the next GC immediately.
     fn on_gc_finished(&self, worker: &GCWorker) -> bool {
-        // All GC workers (except this one) must have parked by now.
+        // All GC workers must have parked by now.
         debug_assert!(!self.worker_group.has_designated_work());
         debug_assert!(self.all_buckets_empty());
@@ -466,6 +458,9 @@ impl GCWorkScheduler {
             elapsed.as_millis()
         );
 
+        // USDT tracepoint for the end of GC.
+        probe!(mmtk, gc_end);
+
         #[cfg(feature = "count_live_bytes_in_gc")]
         {
             let live_bytes = mmtk.state.get_live_bytes_in_last_gc();
@@ -501,9 +496,6 @@ impl GCWorkScheduler {
         mmtk.set_gc_status(GcStatus::NotInGC);
         ::VMCollection::resume_mutators(worker.tls);
 
-        // GC officially ends here.
-        probe!(mmtk, gc_end);
-
         // Notify the `GCRequester` that GC has finished.
         let should_schedule_gc_now = mmtk.gc_requester.on_gc_finished();
         if should_schedule_gc_now {
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index bd5367f987..754d50b0a3 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -84,7 +84,7 @@ pub(crate) enum LastParkedResult {
 
 /// A data structure for parking and waking up workers.
 /// It keeps track of the number of workers parked, and allow the last parked worker to perform
-/// operations that require all other workers to be parked.
+/// operations that can only be performed when all workers have parked.
 pub(crate) struct WorkerMonitor {
     /// The synchronized part.
     sync: Mutex,
@@ -118,6 +118,8 @@ impl WorkerMonitor {
         self.notify_work_available_inner(all, &mut guard);
     }
 
+    /// Like `notify_work_available` but the current thread must have already acquired the
+    /// mutex of `WorkerMonitorSync`.
     fn notify_work_available_inner(&self, all: bool, _guard: &mut MutexGuard) {
         if all {
             self.work_available.notify_all();
@@ -126,17 +128,19 @@ impl WorkerMonitor {
         }
     }
 
-    /// Park a worker until more work is available.
+    /// Park a worker until work packets are available.
     /// If it is the last worker parked, `on_last_parked` will be called.
-    /// If `on_last_parked` returns `true`, this function will notify other workers about available
-    /// work before unparking the current worker;
-    /// if `on_last_parked` returns `false`, the current worker will also wait for work packets to
-    /// be added.
+    /// The return value of `on_last_parked` will determine whether this worker will block and
+    /// wait, too, and whether other workers will be woken up.
     pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F)
     where
         VM: VMBinding,
         F: FnOnce() -> LastParkedResult,
     {
+        warn!("Sleep before acquiring self.sync...");
+        std::thread::sleep(std::time::Duration::from_millis(30));
+        warn!("Slept. Now acquire self.sync...");
+
         let mut sync = self.sync.lock().unwrap();
 
         // Park this worker
@@ -149,33 +153,54 @@ impl WorkerMonitor {
             all_parked
         );
 
+        let mut should_wait = false;
+
         if all_parked {
             debug!("Worker {} is the last worker parked.", worker.ordinal);
             let result = on_last_parked();
             match result {
                 LastParkedResult::ParkSelf => {
-                    // It didn't make more work available. Keep waiting.
-                    // This should happen when
-                    // 1. Worker threads have just been created, but GC has not started, yet, or
-                    // 2. A GC has just finished.
-                    sync = self.work_available.wait(sync).unwrap();
+                    should_wait = true;
                 }
                 LastParkedResult::WakeSelf => {
                     // Continue without waiting.
-                    // This should happen when only one work packet is made available.
                 }
                 LastParkedResult::WakeAll => {
-                    // Notify all GC workers.
                     self.notify_work_available_inner(true, &mut sync);
                 }
             }
         } else {
-            // Otherwise wait until notified.
-            // Note: The condition for this `cond.wait` is "more work is available".
-            // If this worker spuriously wakes up, then in the next loop iteration, the
-            // `poll_schedulable_work` invocation above will fail, and the worker will reach
-            // here and wait again.
-            sync = self.work_available.wait(sync).unwrap();
+            should_wait = true;
+        }
+
+        if should_wait {
+            // Note:
+            // 1. 
The condition variable `work_available` is guarded by `self.sync`. Because the
+            //    last parked worker is holding the mutex `self.sync` when executing
+            //    `on_last_parked`, no workers can unpark (even if they spuriously wake up) during
+            //    `on_last_parked` because they cannot re-acquire the mutex `self.sync`.
+            // 2. Workers may spuriously wake up and unpark when `on_last_parked` is not being
+            //    executed (including the case when the last parked worker is waiting here, too).
+            //    Spurious wake-up is safe because the actual condition for this `cond.wait` is
+            //    "more work is available". If a worker spuriously wakes up, then in the next
+            //    loop iteration, it will call `poll_schedulable_work`, and find no work packets
+            //    to execute. Then the worker will reach here again and wait.
+            // 3. Mutators may add a `ScheduleCollection` work packet via `GCRequester` to trigger
+            //    GC. It is the only case where a work packet is added by a thread that is not a
+            //    GC worker. Because the mutator must hold the mutex `self.sync` to notify GC
+            //    workers when adding a work packet to a bucket, either of the two can happen:
+            //    1. The mutator called `work_available.notify_one` after the worker has called
+            //       `wait`, in which case one worker (not necessarily the last parked worker)
+            //       will be woken up.
+            //    2. The mutator notified when the last worker has just entered this function.
+            //       In that case, ... Oh no! All workers sleep forever!
+            warn!("Now wait...");
+            if rand::random::<bool>() {
+                sync = self.work_available.wait(sync).unwrap();
+                warn!("Out from wait.");
+            } else {
+                warn!("Emulated spurious wakeup.");
+            }
         }
 
         // Unpark this worker.
diff --git a/src/util/options.rs b/src/util/options.rs
index eb6513e1c3..fb51d0fc65 100644
--- a/src/util/options.rs
+++ b/src/util/options.rs
@@ -706,9 +706,6 @@ options! {
     /// The GC plan to use.
     plan: PlanSelector [env_var: true, command_line: true] [always_valid] = PlanSelector::GenImmix,
     /// Number of GC worker threads.
-    // FIXME: Currently we create GCWorkScheduler when MMTK is created, which is usually static.
-    // To allow this as a command-line option, we need to refactor the creation fo the `MMTK` instance.
-    // See: https://github.com/mmtk/mmtk-core/issues/532
     threads: usize [env_var: true, command_line: true] [|v: &usize| *v > 0] = num_cpus::get(),
     /// Enable an optimization that only scans the part of the stack that has changed since the last GC (not supported)
     use_short_stack_scans: bool [env_var: true, command_line: true] [always_valid] = false,

From 2d52f00d5c30268ae6cfced27a1c5a214162e969 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Fri, 5 Jan 2024 03:18:22 +0800
Subject: [PATCH 08/48] Fix a case where workers wait forever

Mutators no longer trigger GC by adding ScheduleCollection directly.
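
The lost-wakeup hazard and the fix can be condensed into a freestanding
sketch. The types below are hypothetical stand-ins for `WorkerMonitor` and
`WorkerMonitorSync` (the real code also manages work buckets and polls for
packets); the point is only that the request flag lives inside the same mutex
that guards the condition variable, so a mutator's request can never slip in
between the last parked worker's check and its wait:

    use std::sync::{Condvar, Mutex};

    #[derive(Default)]
    struct MonitorSync {
        parked: usize,
        should_schedule_gc: bool, // the "one-element message queue"
    }

    #[derive(Default)]
    struct Monitor {
        sync: Mutex<MonitorSync>,
        work_available: Condvar,
    }

    impl Monitor {
        /// Mutator side: set the flag and notify, all under the mutex.
        fn request_schedule_collection(&self) {
            let mut sync = self.sync.lock().unwrap();
            sync.should_schedule_gc = true;
            self.work_available.notify_one();
        }

        /// Worker side: park when out of work. Returns true if this worker
        /// should schedule a GC instead of going to sleep.
        fn park_and_wait(&self, worker_count: usize) -> bool {
            let mut sync = self.sync.lock().unwrap();
            sync.parked += 1;
            if sync.parked == worker_count {
                // The last parked worker polls the flag while still holding
                // the mutex, so it cannot miss a pending request.
                if std::mem::take(&mut sync.should_schedule_gc) {
                    sync.parked -= 1;
                    return true; // add ScheduleCollection and continue
                }
            }
            sync = self.work_available.wait(sync).unwrap();
            sync.parked -= 1;
            false
        }
    }

    fn main() {
        let m = Monitor::default();
        m.request_schedule_collection();
        assert!(m.park_and_wait(1)); // the request is observed, not lost
    }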
---
 Cargo.toml                 |   1 -
 src/plan/gc_requester.rs   |   2 +-
 src/scheduler/scheduler.rs | 114 +++++++++++++++++++++----------------------
 src/scheduler/worker.rs    | 103 +++++++++++++++++++++++++++-----------
 4 files changed, 139 insertions(+), 81 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 3461c25835..558f820701 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -50,7 +50,6 @@ static_assertions = "1.1.0"
 strum = "0.25"
 strum_macros = "0.25"
 sysinfo = "0.29"
-rand = "0.8.5"
 
 [dev-dependencies]
 paste = "1.0.8"
diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs
index 03d54889df..234f74b423 100644
--- a/src/plan/gc_requester.rs
+++ b/src/plan/gc_requester.rs
@@ -49,7 +49,7 @@ impl GCRequester {
 
         let should_schedule_gc = self.try_schedule_collection(&mut guard);
         if should_schedule_gc {
-            self.scheduler.mutator_schedule_collection();
+            self.scheduler.request_schedule_collection();
             // Note: We do not clear `request_flag` now. It will be cleared by `clear_request`
             // after all mutators have stopped.
         }
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 1c44e8c736..151bdcc8c8 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -91,14 +91,16 @@ impl GCWorkScheduler {
         self.affinity.resolve_affinity(thread);
     }
 
-    /// Schedule collection. Called via `GCRequester`.
-    /// Because this function is called by a mutator thread, we only add a `ScheduleCollection`
-    /// work packet here so that a GC worker can wake up later and actually schedule the work for a
-    /// collection.
-    pub(crate) fn mutator_schedule_collection(&self) {
-        debug!("Adding ScheduleCollection work packet upon mutator request.");
-        // Add a ScheduleCollection work packet. It is the seed of other work packets.
-        self.work_buckets[WorkBucketStage::Unconstrained].add(ScheduleCollection);
+    /// Request a GC to be scheduled. Called by mutator via `GCRequester`.
+    pub(crate) fn request_schedule_collection(&self) {
+        debug!("A mutator is sending GC-scheduling request to workers...");
+        self.worker_monitor.request_schedule_collection();
+    }
+
+    /// Add the `ScheduleCollection` packet. Called by the last parked worker.
+    fn add_schedule_collection_packet(&self) {
+        // We are still holding the mutex `WorkerMonitor::sync`. Do not notify now.
+        self.work_buckets[WorkBucketStage::Unconstrained].add_no_notify(ScheduleCollection);
     }
 
     /// Schedule all the common work packets
@@ -363,45 +365,57 @@ impl GCWorkScheduler {
             return work;
         }
 
-        self.worker_monitor.park_and_wait(worker, || {
-            // This is the last worker parked.
-
-            // Test whether we are doing GC.
-            if worker.mmtk.gc_in_progress() {
-                // We are in the middle of GC, and the last GC worker parked.
-                trace!("The last worker parked during GC. Try to find more work to do...");
-
-                // During GC, if all workers parked, all open buckets must have been drained.
-                self.assert_all_activated_buckets_are_empty();
-
-                // Find more work for workers to do.
-                let found_more_work = self.find_more_work_for_workers();
-
-                if found_more_work {
-                    LastParkedResult::WakeAll
-                } else {
-                    // GC finished.
-                    let scheduled_next_gc = self.on_gc_finished(worker);
-                    if scheduled_next_gc {
-                        LastParkedResult::WakeSelf
-                    } else {
-                        LastParkedResult::ParkSelf
-                    }
-                }
-            } else {
-                trace!("The last worker parked while not in GC. Wait for GC to start.");
-                // GC has not started, yet. Do not try to open work buckets.
-                //
-                // This branch is usually reached when `initialize_collection` has just been
-                // called and GC worker threads have just been created. In that case, there are
-                // no work packets to execute, and workers should park and wait for the first
-                // GC.
-                //
-                // This branch can also be reached if a GC worker spuriously wakes up while not
-                // in GC.
-                LastParkedResult::ParkSelf
-            }
-        });
+        self.worker_monitor
+            .park_and_wait(worker, |should_schedule_gc| {
+                self.on_last_parked(worker, should_schedule_gc)
+            });
+    }
+
+    /// Called when the last worker parked.
+    /// `should_schedule_gc` is true if a mutator requested a GC.
+ fn on_last_parked(&self, worker: &GCWorker, should_schedule_gc: bool) -> LastParkedResult { + // Test whether this is happening during GC. + if worker.mmtk.gc_in_progress() { + assert!( + !should_schedule_gc, + "GC request sent to WorkerMonitor while GC is still in progress." + ); + + // We are in the middle of GC, and the last GC worker parked. + trace!("The last worker parked during GC. Try to find more work to do..."); + + // During GC, if all workers parked, all open buckets must have been drained. + self.assert_all_activated_buckets_are_empty(); + + // Find more work for workers to do. + let found_more_work = self.find_more_work_for_workers(); + + if found_more_work { + LastParkedResult::WakeAll + } else { + // GC finished. + let scheduled_next_gc = self.on_gc_finished(worker); + if scheduled_next_gc { + LastParkedResult::WakeSelf } else { - trace!("The last worker parked while not in GC. Wait for GC to start."); - // GC has not started, yet. Do not try to open work buckets. - // - // This branch is usually reached when `initialize_colection` has just been - // called and GC worker threads have just been created. In that case, there is - // no work packets to execute, and workers should park and wait for the first - // GC. - // - // This branch can also be reached if a GC worker spuriously wakes up while not - // in GC. LastParkedResult::ParkSelf } - }); + } + } else { + trace!( + "The last worker parked while not in GC. should_schedule_gc: {}", + should_schedule_gc + ); + + if should_schedule_gc { + // A mutator requested a GC to be scheduled. + self.add_schedule_collection_packet(); + LastParkedResult::WakeSelf + } else { + // Wait until GC is requested. + LastParkedResult::ParkSelf + } } } @@ -506,13 +520,15 @@ impl GCWorkScheduler { // still running, or // 2. It is STW GC, but after the invocation of `resume_mutators` above, one mutator // ran so fast that it triggered a GC before we called `on_gc_finished`. - // Note that we are holding the `WorkerMonitor` mutex, and cannot notify workers now. - // When this function returns, the current worker should continue to execute the newly - // added `ScheduleCollection` work packet. - debug!("GC already requested before `on_gc_finished`. Add ScheduleCollection now."); - self.work_buckets[WorkBucketStage::Unconstrained].add_no_notify(ScheduleCollection); + debug!("GC already requested before `on_gc_finished`. Schedule GC now."); + self.add_schedule_collection_packet(); true } else { + // Note that if a mutator attempts to request GC after `on_gc_finished`, it will call + // `request_schedule_collection`, but will block on the mutex `WorkerMonitor::sync` + // because the current GC worker is holding it. After the current worker calls + // `work_available.wait()`, the mutator will continue and wake up a GC worker (not + // necessarily this one) to schedule the GC. false } } diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 754d50b0a3..74351bedc1 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -98,6 +98,8 @@ pub(crate) struct WorkerMonitorSync { worker_count: usize, /// Number of parked workers. parked_workers: usize, + /// True if a mutator has requested the workers to schedule a GC. + should_schedule_gc: bool, } impl WorkerMonitor { @@ -106,12 +108,26 @@ impl WorkerMonitor { sync: Mutex::new(WorkerMonitorSync { worker_count, parked_workers: 0, + should_schedule_gc: false, }), work_available: Default::default(), } } - /// Wake up workers when more work packets are made available for workers. 
+    /// Request a GC worker to schedule the next GC.
+    /// Callable from mutator threads.
+    pub fn request_schedule_collection(&self) {
+        let mut guard = self.sync.lock().unwrap();
+        assert!(
+            !guard.should_schedule_gc,
+            "should_schedule_gc is already set"
+        );
+        guard.should_schedule_gc = true;
+        self.notify_work_available_inner(false, &mut guard);
+    }
+
+    /// Wake up workers when more work packets are made available for workers,
+    /// or a mutator has requested the GC workers to schedule a GC.
     /// This function is called when adding work packets to buckets.
     pub fn notify_work_available(&self, all: bool) {
         let mut guard = self.sync.lock().unwrap();
         self.notify_work_available_inner(all, &mut guard);
@@ -144,12 +150,12 @@ impl WorkerMonitor {
         }
     }
 
     /// Park a worker until work packets are available.
     /// If it is the last worker parked, `on_last_parked` will be called.
-    /// The return value of `on_last_parked` will determine whether this worker will block and
-    /// wait, too, and whether other workers will be woken up.
+    /// The argument is true if `sync.should_schedule_gc` is `true`,
+    /// and `sync.should_schedule_gc` will be cleared to `false` regardless of its value.
+    /// The return value of `on_last_parked` will determine whether this worker will block and
+    /// wait, too, and whether other workers will be woken up.
     pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F)
     where
         VM: VMBinding,
-        F: FnOnce() -> LastParkedResult,
+        F: FnOnce(bool) -> LastParkedResult,
     {
-        warn!("Sleep before acquiring self.sync...");
-        std::thread::sleep(std::time::Duration::from_millis(30));
-        warn!("Slept. Now acquire self.sync...");
-
         let mut sync = self.sync.lock().unwrap();
 
         // Park this worker
@@ -157,7 +171,8 @@ impl WorkerMonitor {
 
         if all_parked {
             debug!("Worker {} is the last worker parked.", worker.ordinal);
-            let result = on_last_parked();
+            let should_schedule_gc = std::mem::replace(&mut sync.should_schedule_gc, false);
+            let result = on_last_parked(should_schedule_gc);
             match result {
                 LastParkedResult::ParkSelf => {
                     should_wait = true;
@@ -189,33 +204,61 @@ impl WorkerMonitor {
         }
 
         if should_wait {
-            // Note:
+            // Notes on CondVar usage:
+            //
+            // Condition variables may spuriously wake up. Therefore, they are usually tested in a
+            // loop while holding a mutex
+            //
+            //      lock();
+            //      while condition() {
+            //          condvar.wait();
+            //      }
+            //      unlock();
+            //
+            // The actual condition for this `self.work_available.wait(sync)` is:
+            //
+            // 1. any work packet is available, or
+            // 2. a request for scheduling GC is submitted.
+            //
+            // But it is not used like the typical use pattern shown above, mainly because work
+            // packets can be added without holding the mutex `self.sync`. This means one worker
+            // can add a new work packet (no mutex needed) right after another worker finds no work
+            // packets are available and then parks. In other words, condition (1) can suddenly
+            // become true after a worker sees it is false but before the worker blocks waiting on
+            // the CondVar. If this happens, the last parked worker will block forever and never
+            // get notified. This may happen if mutators or the previously existing "coordinator
+            // thread" can add work packets.
+            //
+            // However, after the "coordinator thread" was removed, only GC worker threads can add
+            // work packets during GC. Parked workers (except the last parked worker) cannot make
+            // more work packets available (by adding new packets or opening buckets). 
For this
+            // reason, the **last** parked worker can be sure that after it finds no packets
+            // available, no other workers can add another work packet (because they all parked).
+            // So the **last** parked worker can open more buckets or declare GC finished.
+            //
+            // Condition (2), i.e. `sync.should_schedule_gc`, is guarded by the mutex `self.sync`.
+            // When set (by a mutator via `request_schedule_collection`), it will notify a
+            // worker; and the last parked worker always checks it before waiting. So this
+            // condition will not be set without any worker noticing.
+            //
+            // Note that generational barriers may add `ProcessModBuf` work packets when not in GC.
+            // This is benign because those work packets are not executed immediately, and are
+            // guaranteed to be executed in the next GC.
+            //
+            // Notes on spurious wake-up:
+            //
             // 1. The condition variable `work_available` is guarded by `self.sync`. Because the
             //    last parked worker is holding the mutex `self.sync` when executing
             //    `on_last_parked`, no workers can unpark (even if they spuriously wake up) during
             //    `on_last_parked` because they cannot re-acquire the mutex `self.sync`.
+            //
             // 2. Workers may spuriously wake up and unpark when `on_last_parked` is not being
             //    executed (including the case when the last parked worker is waiting here, too).
-            //    Spurious wake-up is safe because the actual condition for this `cond.wait` is
-            //    "more work is available". If a worker spuriously wakes up, then in the next
-            //    loop iteration, it will call `poll_schedulable_work`, and find no work packets
-            //    to execute. Then the worker will reach here again and wait.
-            // 3. Mutators may add a `ScheduleCollection` work packet via `GCRequester` to trigger
-            //    GC. It is the only case where a work packet is added by a thread that is not a
-            //    GC worker. Because the mutator must hold the mutex `self.sync` to notify GC
-            //    workers when adding a work packet to a bucket, either of the two can happen:
-            //    1. The mutator called `work_available.notify_one` after the worker has called
-            //       `wait`, in which case one worker (not necessarily the last parked worker)
-            //       will be woken up.
-            //    2. The mutator notified when the last worker has just entered this function.
-            //       In that case, ... Oh no! All workers sleep forever!
-            warn!("Now wait...");
-            if rand::random::<bool>() {
-                sync = self.work_available.wait(sync).unwrap();
-                warn!("Out from wait.");
-            } else {
-                warn!("Emulated spurious wakeup.");
-            }
+            //    If one or more GC workers spuriously wake up, they will check for work packets,
+            //    and park again if not available. The last parked worker will ensure the two
+            //    conditions listed above are both false before blocking. If either condition is
+            //    true, the last parked worker will take action.
+            sync = self.work_available.wait(sync).unwrap();
         }
 
         // Unpark this worker.

From fa2aa5012281ae9744be77b89cd442d4407b8b87 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Fri, 5 Jan 2024 18:04:15 +0800
Subject: [PATCH 09/48] Added comments and removed redundant code.
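
The comments added in this patch describe the `request_flag` / `gc_scheduled`
handshake. The following is a minimal sketch of that state machine under
simplified, hypothetical names (the real `GCRequester` is generic over the VM
binding and sends the message to the workers via the scheduler instead of
returning a boolean):

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::Mutex;

    #[derive(Default)]
    struct RequestSync {
        gc_scheduled: bool,
    }

    #[derive(Default)]
    struct Requester {
        request_sync: Mutex<RequestSync>,
        request_flag: AtomicBool,
    }

    impl Requester {
        /// Mutator side. The fast path is one relaxed load; only the first
        /// requester takes the mutex. Returns true if the caller should send
        /// the "schedule collection" message to the workers.
        fn request(&self) -> bool {
            if self.request_flag.load(Ordering::Relaxed) {
                return false; // this GC has already been requested
            }
            let mut sync = self.request_sync.lock().unwrap();
            if self.request_flag.load(Ordering::Relaxed) {
                return false;
            }
            self.request_flag.store(true, Ordering::Relaxed);
            self.try_schedule(&mut sync)
        }

        /// Worker side: called when all mutators have stopped.
        fn clear_request(&self) {
            let _sync = self.request_sync.lock().unwrap();
            self.request_flag.store(false, Ordering::Relaxed);
        }

        /// Worker side: called when GC finishes. Returns true if a request
        /// arrived during this GC, so the next GC is scheduled immediately.
        fn on_gc_finished(&self) -> bool {
            let mut sync = self.request_sync.lock().unwrap();
            sync.gc_scheduled = false;
            self.try_schedule(&mut sync)
        }

        /// Send the message only if requested and not already scheduled.
        fn try_schedule(&self, sync: &mut RequestSync) -> bool {
            if self.request_flag.load(Ordering::Relaxed) && !sync.gc_scheduled {
                sync.gc_scheduled = true;
                true
            } else {
                false
            }
        }
    }

    fn main() {
        let r = Requester::default();
        assert!(r.request());        // first request schedules a GC
        assert!(!r.request());       // duplicates are coalesced
        r.clear_request();           // mutators stopped; GC still running
        assert!(!r.request());       // remembered, but no message during GC
        assert!(r.on_gc_finished()); // pending request schedules the next GC
    }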
--- src/global_state.rs | 19 ++++++++++++------ src/mmtk.rs | 2 ++ src/plan/gc_requester.rs | 42 +++++++++++++++++++--------------------- src/scheduler/worker.rs | 31 ++++++++++++++++------------- 4 files changed, 53 insertions(+), 41 deletions(-) diff --git a/src/global_state.rs b/src/global_state.rs index 4d57fd2e23..468efb9f37 100644 --- a/src/global_state.rs +++ b/src/global_state.rs @@ -233,13 +233,20 @@ impl Default for GlobalState { } } -/// GC status, an indicator of whether MMTk is in the progress of a GC or not. +/// GC status. /// -/// This type was only used for assertions in JikesRVM. After we removed the coordinator (a.k.a. -/// controller) thread, we use the GC status to decide whether GC workers should open new work -/// buckets when the last worker parked. -// FIXME: `GcProper` is inherited from JikesRVM, but it is not used in MMTk core. Consider -// removing it. +/// - It starts in the `NotInGC` state. +/// - It enters `GcPrepare` when GC starts (i.e. when `ScheduleCollection` is executed). +/// - It enters `GcProper` when all mutators have stopped. +/// - It returns to `NotInGC` when GC finishes. +/// +/// All states except `NotInGC` are considered "in GC". +/// +/// The status is checked by GC workers. The last parked worker only tries to open new buckets +/// during GC. +/// +/// The distinction between `GcPrepare` and `GcProper` is inherited from JikesRVM. Several +/// assertions in JikesRVM involve those two states. #[derive(PartialEq, Clone, Copy, NoUninit, Debug)] #[repr(u8)] pub enum GcStatus { diff --git a/src/mmtk.rs b/src/mmtk.rs index 9c433fa06d..44ef399083 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -344,6 +344,8 @@ impl MMTK { self.state .internal_triggered_collection .store(true, Ordering::Relaxed); + // TODO: The current `GCRequester::request()` is probably incorrect for internally triggered GC. + // Consider removing functions related to "internal triggered collection". self.gc_requester.request(); } diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs index 234f74b423..43ae8e2abd 100644 --- a/src/plan/gc_requester.rs +++ b/src/plan/gc_requester.rs @@ -1,24 +1,33 @@ use crate::scheduler::GCWorkScheduler; use crate::vm::VMBinding; -use std::marker::PhantomData; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; struct RequestSync { - /// Has the GCRequester called `GCWorkScheduler::schedule_collection` for the current request? - /// This flag exists so that once `GCRequester` called `GCWorkScheduler::schedule_collection`, - /// it cannot call it again until the GC it initiated finished. + /// Are the GC workers already aware that we requested a GC? + /// + /// Mutators call `GCRequester::request()` to trigger GC. It communicates with workers by + /// calling `GCWorkScheduler::request_schedule_collection`. Under the hood, it sets a + /// synchronized variable and notifies one worker. Conceptually, it is sending a message to GC + /// workers. + /// + /// The purpose of this variable is preventing the message above from being sent while GC + /// workers are still doing GC. This is mainly significant for concurrent GC. In the current + /// design (inherited from JikesRVM), `request_flag` is cleared when all mutators are + /// suspended, at which time GC is still in progress. If `GCRequester::request()` is called + /// after `request_flag` is cleared but before `on_gc_finished` is called, the mutator will not + /// send the message. But GC workers will check `request_flag` when GC finishes, and schedule + /// the next GC. 
gc_scheduled: bool,
 }
 
-/// This data structure lets mutators trigger GC, and may schedule collection when appropriate.
+/// This data structure lets mutators trigger GC.
 pub struct GCRequester {
     request_sync: Mutex,
     /// An atomic flag outside `RequestSync` so that mutators can check if GC has already been
     /// requested in `poll` without acquiring the mutex.
     request_flag: AtomicBool,
     scheduler: Arc>,
-    phantom: PhantomData,
 }
 
 impl GCRequester {
@@ -29,12 +38,11 @@ impl GCRequester {
             }),
             request_flag: AtomicBool::new(false),
             scheduler,
-            phantom: PhantomData,
         }
     }
 
     /// Request a GC. Called by mutators when polling (during allocation) and when handling user
-    /// GC requests (e.g. `System.gc();` in Java);
+    /// GC requests (e.g. `System.gc();` in Java).
     pub fn request(&self) {
@@ -56,12 +64,6 @@ impl GCRequester {
         }
     }
 
-    /// Returns true if GC has been scheduled.
-    pub fn is_gc_scheduled(&self) -> bool {
-        let guard = self.request_sync.lock().unwrap();
-        guard.gc_scheduled
-    }
-
     /// Clear the "GC requested" flag so that mutators can trigger the next GC.
     /// Called by a GC worker when all mutators have come to a stop.
     pub fn clear_request(&self) {
@@ -82,16 +84,12 @@ impl GCRequester {
 
     /// Decide whether we should schedule a new collection. Will transition the state of
     /// `gc_scheduled` from `false` to `true` if we should schedule a new collection.
+    ///
     /// Return `true` if the state transition happens.
     fn try_schedule_collection(&self, sync: &mut RequestSync) -> bool {
-        // The time to schedule a collection is when `request_flag` is `true` but `gc_scheduled`
-        // is `false`. `gc_scheduled` is `true` if either
-        //
-        // 1. another mutator called `request()` concurrently and scheduled a collection, or
-        // 2. a new GC is requested while the current GC is still in progress.
-        //
-        // If `gc_scheduled` is `true` when GC is requested, we do nothing now. But when the
-        // currrent GC finishes, a GC worker will call `on_gc_finished` which clears the
+        // The time to schedule a collection is when `request_flag` is `true` but `gc_scheduled` is
+        // `false`. If `gc_scheduled` is `true` when GC is requested, we do nothing now. But when
+        // the current GC finishes, a GC worker will call `on_gc_finished` which clears the
         // `gc_scheduled` flag, and checks the `request_flag` again to trigger the next GC.
         if self.request_flag.load(Ordering::Relaxed) && !sync.gc_scheduled {
             sync.gc_scheduled = true;
             true
         } else {
             false
         }
     }
 }
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index 74351bedc1..302bc9d239 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -82,13 +82,16 @@ pub(crate) enum LastParkedResult {
     WakeAll,
 }
 
-/// A data structure for parking and waking up workers.
-/// It keeps track of the number of workers parked, and allow the last parked worker to perform
-/// operations that can only be performed when all workers have parked.
+/// A data structure for synchronizing workers.
+///
+/// - It allows workers to park and unpark. It keeps track of the number of workers parked, and
+///   allows the last parked worker to perform operations that can only be performed when all
+///   workers have parked.
+/// - It allows mutators to notify workers to schedule a GC.
 pub(crate) struct WorkerMonitor {
     /// The synchronized part.
     sync: Mutex,
-    /// This is notified when new work is made available for the workers.
+    /// This is notified when new work packets are available, or a mutator has requested GC.
     work_available: Condvar,
 }
 
@@ -99,6 +102,10 @@ pub(crate) struct WorkerMonitorSync {
     /// Number of parked workers.
     parked_workers: usize,
     /// True if a mutator has requested the workers to schedule a GC.
+    ///
+    /// Consider this as a one-element message queue. The last parked worker will poll this
+    /// "queue" by reading this field and setting it to `false`. The last parked worker will
+    /// schedule a GC whenever seeing `true` in this field.
     should_schedule_gc: bool,
 }
 
@@ -128,7 +135,6 @@ impl WorkerMonitor {
 
     /// Wake up workers when more work packets are made available for workers,
     /// or a mutator has requested the GC workers to schedule a GC.
-    /// This function is called when adding work packets to buckets.
     pub fn notify_work_available(&self, all: bool) {
         let mut guard = self.sync.lock().unwrap();
         self.notify_work_available_inner(all, &mut guard);
@@ -144,12 +150,12 @@ impl WorkerMonitor {
         }
     }
 
-    /// Park a worker until work packets are available.
+    /// Park a worker and wait on the CondVar `work_available`.
+    ///
     /// If it is the last worker parked, `on_last_parked` will be called.
-    /// The argument is true if `sync.should_schedule_gc` is `true`,
-    /// and `sync.should_schedule_gc` will be cleared to `false` regardless of its value.
-    /// The return value of `on_last_parked` will determine whether this worker will block and
-    /// wait, too, and whether other workers will be woken up.
+    /// The argument of `on_last_parked` is true if `sync.should_schedule_gc` is `true`.
+    /// The return value of `on_last_parked` will determine whether this worker and other workers
+    /// will wake up or block waiting.
     pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F)
     where
         VM: VMBinding,
@@ -191,8 +197,7 @@ impl WorkerMonitor {
         if should_wait {
             // Notes on CondVar usage:
             //
-            // Condition variables may spuriously wake up. Therefore, they are usually tested in a
-            // loop while holding a mutex
+            // Condition variables are usually tested in a loop while holding a mutex
             //
             //      lock();
             //      while condition() {
             //          condvar.wait();
             //      }
             //      unlock();
@@ -229,7 +234,7 @@ impl WorkerMonitor {
             // Note that generational barriers may add `ProcessModBuf` work packets when not in GC.
             // This is benign because those work packets are not executed immediately, and are
             // guaranteed to be executed in the next GC.
-            //
+
             // Notes on spurious wake-up:

From 5bda9ee3362887a2b0499eabb347191f5d887cb6 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Thu, 11 Jan 2024 16:22:20 +0800
Subject: [PATCH 10/48] Move the gc_start trace point.

---
 src/scheduler/gc_work.rs   | 3 ---
 src/scheduler/scheduler.rs | 4 ++++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/scheduler/gc_work.rs b/src/scheduler/gc_work.rs
index 2be49a06eb..a651437911 100644
--- a/src/scheduler/gc_work.rs
+++ b/src/scheduler/gc_work.rs
@@ -14,9 +14,6 @@ pub struct ScheduleCollection;
 
 impl GCWork for ScheduleCollection {
     fn do_work(&mut self, worker: &mut GCWorker, mmtk: &'static MMTK) {
-        // This is officially when a GC starts.
-        probe!(mmtk, gc_start);
-
         // Record the time when GC starts.
         {
             let mut guard = mmtk.state.gc_start_time.borrow_mut();
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 151bdcc8c8..927dd8fbd0 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -99,6 +99,10 @@ impl GCWorkScheduler {
 
     /// Add the `ScheduleCollection` packet. 
Called by the last parked worker. fn add_schedule_collection_packet(&self) { + // We set the eBPF trace point here so that bpftrace scripts can start recording work + // packet events before the `ScheduleCollection` work packet starts. + probe!(mmtk, gc_start); + // We are still holding the mutex `WorkerMonitor::sync`. Do not notify now. self.work_buckets[WorkBucketStage::Unconstrained].add_no_notify(ScheduleCollection); } From 72e6eb93f0c119c7fe44dbf1b148921e32c3219c Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Fri, 12 Jan 2024 14:27:57 +0800 Subject: [PATCH 11/48] Remove the EndOfGC work packet. --- src/scheduler/gc_work.rs | 51 ---------------------------------------- 1 file changed, 51 deletions(-) diff --git a/src/scheduler/gc_work.rs b/src/scheduler/gc_work.rs index a651437911..1084cffc57 100644 --- a/src/scheduler/gc_work.rs +++ b/src/scheduler/gc_work.rs @@ -222,57 +222,6 @@ impl GCWork for StopMutators { } } -#[derive(Default)] -pub struct EndOfGC { - pub elapsed: std::time::Duration, -} - -impl GCWork for EndOfGC { - fn do_work(&mut self, worker: &mut GCWorker, mmtk: &'static MMTK) { - info!( - "End of GC ({}/{} pages, took {} ms)", - mmtk.get_plan().get_reserved_pages(), - mmtk.get_plan().get_total_pages(), - self.elapsed.as_millis() - ); - - #[cfg(feature = "count_live_bytes_in_gc")] - { - let live_bytes = mmtk.state.get_live_bytes_in_last_gc(); - let used_bytes = - mmtk.get_plan().get_used_pages() << crate::util::constants::LOG_BYTES_IN_PAGE; - debug_assert!( - live_bytes <= used_bytes, - "Live bytes of all live objects ({} bytes) is larger than used pages ({} bytes), something is wrong.", - live_bytes, used_bytes - ); - info!( - "Live objects = {} bytes ({:04.1}% of {} used pages)", - live_bytes, - live_bytes as f64 * 100.0 / used_bytes as f64, - mmtk.get_plan().get_used_pages() - ); - } - - // We assume this is the only running work packet that accesses plan at the point of execution - let plan_mut: &mut dyn Plan = unsafe { mmtk.get_plan_mut() }; - plan_mut.end_of_gc(worker.tls); - - #[cfg(feature = "extreme_assertions")] - if crate::util::edge_logger::should_check_duplicate_edges(mmtk.get_plan()) { - // reset the logging info at the end of each GC - mmtk.edge_logger.reset(); - } - - // Reset the triggering information. - mmtk.state.reset_collection_trigger(); - - // Set to NotInGC after everything, and right before resuming mutators. - mmtk.set_gc_status(GcStatus::NotInGC); - ::VMCollection::resume_mutators(worker.tls); - } -} - /// This implements `ObjectTracer` by forwarding the `trace_object` calls to the wrapped /// `ProcessEdgesWork` instance. pub(crate) struct ProcessEdgesWorkTracer { From 742c3a22ec572bc99bc4c8bdcddacd8b8b5283fe Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Fri, 12 Jan 2024 18:55:56 +0800 Subject: [PATCH 12/48] WIP: Refactor to support forking --- src/global_state.rs | 9 --- src/plan/gc_requester.rs | 68 +----------------- src/scheduler/gc_work.rs | 10 --- src/scheduler/scheduler.rs | 139 ++++++++++++++++++------------------- src/scheduler/worker.rs | 66 ++++++++++++------ 5 files changed, 115 insertions(+), 177 deletions(-) diff --git a/src/global_state.rs b/src/global_state.rs index 468efb9f37..1c7b7ee627 100644 --- a/src/global_state.rs +++ b/src/global_state.rs @@ -1,8 +1,6 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::time::Instant; use atomic::Atomic; -use atomic_refcell::AtomicRefCell; use bytemuck::NoUninit; /// This stores some global states for an MMTK instance. 
@@ -44,12 +42,6 @@ pub struct GlobalState {
     pub(crate) stacks_prepared: AtomicBool,
     /// A counter that keeps tracks of the number of bytes allocated since last stress test
     pub(crate) allocation_bytes: AtomicUsize,
-    /// The time when the current GC started. Currently only used for logging.
-    /// Note that some (but not all) `GCTriggerPolicy` implementations do their own time tracking
-    /// independently for their own needs.
-    /// This field is only accessible in `ScheduleCollection` and `GCWorkScheduler::on_gc_finished`
-    /// which never happen at the same time, so `AtomicRefCell` is enough.
-    pub(crate) gc_start_time: AtomicRefCell>,
     /// A counteer that keeps tracks of the number of bytes allocated by malloc
     #[cfg(feature = "malloc_counted_size")]
     pub(crate) malloc_bytes: AtomicUsize,
@@ -224,7 +216,6 @@ impl Default for GlobalState {
             cur_collection_attempts: AtomicUsize::new(0),
             scanned_stacks: AtomicUsize::new(0),
             allocation_bytes: AtomicUsize::new(0),
-            gc_start_time: AtomicRefCell::new(None),
             #[cfg(feature = "malloc_counted_size")]
             malloc_bytes: AtomicUsize::new(0),
             #[cfg(feature = "count_live_bytes_in_gc")]
diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs
index 43ae8e2abd..ff2aa7e0b8 100644
--- a/src/plan/gc_requester.rs
+++ b/src/plan/gc_requester.rs
@@ -1,29 +1,10 @@
 use crate::scheduler::GCWorkScheduler;
 use crate::vm::VMBinding;
 use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Arc, Mutex};
-
-struct RequestSync {
-    /// Are the GC workers already aware that we requested a GC?
-    ///
-    /// Mutators call `GCRequester::request()` to trigger GC. It communicates with workers by
-    /// calling `GCWorkScheduler::request_schedule_collection`. Under the hood, it sets a
-    /// synchronized variable and notifies one worker. Conceptually, it is sending a message to GC
-    /// workers.
-    ///
-    /// The purpose of this variable is preventing the message above from being sent while GC
-    /// workers are still doing GC. This is mainly significant for concurrent GC. In the current
-    /// design (inherited from JikesRVM), `request_flag` is cleared when all mutators are
-    /// suspended, at which time GC is still in progress. If `GCRequester::request()` is called
-    /// after `request_flag` is cleared but before `on_gc_finished` is called, the mutator will not
-    /// send the message. But GC workers will check `request_flag` when GC finishes, and schedule
-    /// the next GC.
-    gc_scheduled: bool,
-}
+use std::sync::Arc;
 
 /// This data structure lets mutators trigger GC.
 pub struct GCRequester {
-    request_sync: Mutex,
     /// An atomic flag outside `RequestSync` so that mutators can check if GC has already been
     /// requested in `poll` without acquiring the mutex.
     request_flag: AtomicBool,
     scheduler: Arc>,
 }
 
 impl GCRequester {
     pub fn new(scheduler: Arc>) -> Self {
         GCRequester {
-            request_sync: Mutex::new(RequestSync {
-                gc_scheduled: false,
-            }),
             request_flag: AtomicBool::new(false),
             scheduler,
         }
     }
 
     /// Request a GC. Called by mutators when polling (during allocation) and when handling user
     /// GC requests (e.g. `System.gc();` in Java).
     pub fn request(&self) {
-        // Note: This is the double-checked locking algorithm.
-        // The load has the `Relaxed` order instead of `Acquire` because we are not doing lazy
-        // initialization here. We are only using this flag to remove successive requests. 
if self.request_flag.load(Ordering::Relaxed) {
             return;
         }
 
-        let mut guard = self.request_sync.lock().unwrap();
-        if !self.request_flag.load(Ordering::Relaxed) {
-            self.request_flag.store(true, Ordering::Relaxed);
-
-            let should_schedule_gc = self.try_schedule_collection(&mut guard);
-            if should_schedule_gc {
-                self.scheduler.request_schedule_collection();
-                // Note: We do not clear `request_flag` now. It will be cleared by `clear_request`
-                // after all mutators have stopped.
-            }
+        if !self.request_flag.swap(true, Ordering::Relaxed) {
+            self.scheduler.request_schedule_collection();
         }
     }
 
     /// Clear the "GC requested" flag so that mutators can trigger the next GC.
     /// Called by a GC worker when all mutators have come to a stop.
     pub fn clear_request(&self) {
-        let _guard = self.request_sync.lock().unwrap();
         self.request_flag.store(false, Ordering::Relaxed);
     }
-
-    /// Called by a GC worker when a GC has finished.
-    /// This will check the `request_flag` again and check if we should immediately schedule the
-    /// next GC. If we should, `gc_scheduled` will be set back to `true` and this function will
-    /// return `true`.
-    pub fn on_gc_finished(&self) -> bool {
-        let mut guard = self.request_sync.lock().unwrap();
-        guard.gc_scheduled = false;
-
-        self.try_schedule_collection(&mut guard)
-    }
-
-    /// Decide whether we should schedule a new collection. Will transition the state of
-    /// `gc_scheduled` from `false` to `true` if we should schedule a new collection.
-    ///
-    /// Return `true` if the state transition happens.
-    fn try_schedule_collection(&self, sync: &mut RequestSync) -> bool {
-        // The time to schedule a collection is when `request_flag` is `true` but `gc_scheduled` is
-        // `false`. If `gc_scheduled` is `true` when GC is requested, we do nothing now. But when
-        // the current GC finishes, a GC worker will call `on_gc_finished` which clears the
-        // `gc_scheduled` flag, and checks the `request_flag` again to trigger the next GC.
-        if self.request_flag.load(Ordering::Relaxed) && !sync.gc_scheduled {
-            sync.gc_scheduled = true;
-            true
-        } else {
-            false
-        }
-    }
 }
diff --git a/src/scheduler/gc_work.rs b/src/scheduler/gc_work.rs
index 1084cffc57..8563693cdc 100644
--- a/src/scheduler/gc_work.rs
+++ b/src/scheduler/gc_work.rs
@@ -14,16 +14,6 @@ pub struct ScheduleCollection;
 
 impl GCWork for ScheduleCollection {
     fn do_work(&mut self, worker: &mut GCWorker, mmtk: &'static MMTK) {
-        // Record the time when GC starts.
-        {
-            let mut guard = mmtk.state.gc_start_time.borrow_mut();
-            let old_time = guard.replace(std::time::Instant::now());
-            debug_assert!(
-                old_time.is_none(),
-                "gc_start_time is still set when GC started: {old_time:?}",
-            );
-        }
-
         // Tell GC trigger that GC started. 
mmtk.gc_trigger.policy.on_gc_start(mmtk); diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 927dd8fbd0..46bd5c4d43 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -1,11 +1,11 @@ use super::gc_work::ScheduleCollection; use super::stat::SchedulerStat; use super::work_bucket::*; -use super::worker::{GCWorker, ThreadId, WorkerGroup, WorkerMonitor}; +use super::worker::{GCWorker, ThreadId, WorkerGoals, WorkerGroup, WorkerMonitor}; use super::*; use crate::global_state::GcStatus; use crate::mmtk::MMTK; -use crate::scheduler::worker::LastParkedResult; +use crate::scheduler::worker::{LastParkedResult, WorkerGoal}; use crate::util::opaque_pointer::*; use crate::util::options::AffinityKind; use crate::util::rust_util::array_from_fn; @@ -16,6 +16,7 @@ use crossbeam::deque::Steal; use enum_map::{Enum, EnumMap}; use std::collections::HashMap; use std::sync::Arc; +use std::time::Instant; pub struct GCWorkScheduler { /// Work buckets @@ -99,10 +100,6 @@ impl GCWorkScheduler { /// Add the `ScheduleCollection` packet. Called by the last parked worker. fn add_schedule_collection_packet(&self) { - // We set the eBPF trace point here so that bpftrace scripts can start recording work - // packet events before the `ScheduleCollection` work packet starts. - probe!(mmtk, gc_start); - // We are still holding the mutex `WorkerMonitor::sync`. Do not notify now. self.work_buckets[WorkBucketStage::Unconstrained].add_no_notify(ScheduleCollection); } @@ -370,59 +367,85 @@ impl GCWorkScheduler { } self.worker_monitor - .park_and_wait(worker, |should_schedule_gc| { - self.on_last_parked(worker, should_schedule_gc) - }); + .park_and_wait(worker, |goals| self.on_last_parked(worker, goals)); } } /// Called when the last worker parked. /// `should_schedule_gc` is true if a mutator requested a GC. - fn on_last_parked(&self, worker: &GCWorker, should_schedule_gc: bool) -> LastParkedResult { - // Test whether this is happening during GC. - if worker.mmtk.gc_in_progress() { - assert!( - !should_schedule_gc, - "GC request sent to WorkerMonitor while GC is still in progress." - ); + fn on_last_parked(&self, worker: &GCWorker, goals: &mut WorkerGoals) -> LastParkedResult { + let Some(ref current_goal) = goals.current else { + // There is no goal. Find a request to respond to. + return self.respond_to_requests(goals); + }; + + match current_goal { + worker::WorkerGoal::Gc { start_time } => { + // We are in the progress of GC. - // We are in the middle of GC, and the last GC worker parked. - trace!("The last worker parked during GC. Try to find more work to do..."); + // In stop-the-world GC, mutators cannot request for GC while GC is in progress. + // When we support concurrent GC, we should remove this assertion. + assert!( + !goals.requests.gc, + "GC request sent to WorkerMonitor while GC is still in progress." + ); - // During GC, if all workers parked, all open buckets must have been drained. - self.assert_all_activated_buckets_are_empty(); + // We are in the middle of GC, and the last GC worker parked. + trace!("The last worker parked during GC. Try to find more work to do..."); - // Find more work for workers to do. - let found_more_work = self.find_more_work_for_workers(); + // During GC, if all workers parked, all open buckets must have been drained. + self.assert_all_activated_buckets_are_empty(); - if found_more_work { - LastParkedResult::WakeAll - } else { - // GC finished. 
-                let scheduled_next_gc = self.on_gc_finished(worker);
-                if scheduled_next_gc {
-                    LastParkedResult::WakeSelf
+                // Find more work for workers to do.
+                let found_more_work = self.find_more_work_for_workers();
+
+                if found_more_work {
+                    LastParkedResult::WakeAll
                 } else {
-                    LastParkedResult::ParkSelf
+                    // GC finished.
+                    self.on_gc_finished(worker, start_time);
+
+                    // Clear the current goal
+                    goals.current = None;
+                    self.respond_to_requests(goals)
                 }
             }
-        } else {
-            trace!(
-                "The last worker parked while not in GC. should_schedule_gc: {}",
-                should_schedule_gc
-            );
-
-            if should_schedule_gc {
-                // A mutator requested a GC to be scheduled.
-                self.add_schedule_collection_packet();
-                LastParkedResult::WakeSelf
-            } else {
-                // Wait until GC is requested.
-                LastParkedResult::ParkSelf
+            worker::WorkerGoal::StopForFork => {
+                // A worker parked again when it is asked to exit.
+                unimplemented!()
             }
         }
     }
 
+    /// Respond to a worker request.
+    fn respond_to_requests(&self, goals: &mut WorkerGoals) -> LastParkedResult {
+        assert!(goals.current.is_none());
+
+        if goals.requests.gc {
+            // A mutator requested a GC to be scheduled.
+            goals.requests.gc = false;
+
+            // We set the eBPF trace point here so that bpftrace scripts can start recording work
+            // packet events before the `ScheduleCollection` work packet starts.
+            probe!(mmtk, gc_start);
+
+            goals.current = Some(WorkerGoal::Gc {
+                start_time: Instant::now(),
+            });
+
+            self.add_schedule_collection_packet();
+            return LastParkedResult::WakeSelf;
+        }
+
+        if goals.requests.stop_for_fork {
+            // The VM wants to fork. GC threads should exit.
+            unimplemented!()
+        }
+
+        // No requests. Park this worker, too.
+        return LastParkedResult::ParkSelf;
+    }
+
     /// Find more work for workers to do. Return true if more work is available.
     fn find_more_work_for_workers(&self) -> bool {
         if self.worker_group.has_designated_work() {
@@ -448,7 +471,7 @@ impl GCWorkScheduler {
 
     /// Called when GC has finished, i.e. when all work packets have been executed.
-    /// Return `true` if it scheduled the next GC immediately.
-    fn on_gc_finished(&self, worker: &GCWorker) -> bool {
+    fn on_gc_finished(&self, worker: &GCWorker, start_time: &Instant) {
         // All GC workers must have parked by now.
         debug_assert!(!self.worker_group.has_designated_work());
         debug_assert!(self.all_buckets_empty());
@@ -463,11 +486,7 @@ impl GCWorkScheduler {
         mmtk.gc_trigger.policy.on_gc_end(mmtk);
 
         // Compute the elapsed time of the GC.
-        let gc_start = {
-            let mut guard = mmtk.state.gc_start_time.borrow_mut();
-            guard.take().expect("gc_start_time was not set")
-        };
-        let elapsed = gc_start.elapsed();
+        let elapsed = start_time.elapsed();
 
         info!(
             "End of GC ({}/{} pages, took {} ms)",
@@ -513,28 +532,6 @@ impl GCWorkScheduler {
         // Set to NotInGC after everything, and right before resuming mutators.
         mmtk.set_gc_status(GcStatus::NotInGC);
         ::VMCollection::resume_mutators(worker.tls);
-
-        // Notify the `GCRequester` that GC has finished.
-        let should_schedule_gc_now = mmtk.gc_requester.on_gc_finished();
-        if should_schedule_gc_now {
-            // We should schedule the next GC immediately. This means GC was triggered between
-            // `clear_request` (when stacks were scanned) and `on_gc_finished` (right above). This
-            // can happen if
-            // 1. It is concurrent GC, and a mutator triggered another GC while the current GC was
-            //    still running, or
-            // 2. It is STW GC, but after the invocation of `resume_mutators` above, one mutator
-            //    ran so fast that it triggered a GC before we called `on_gc_finished`. 
- debug!("GC already requested before `on_gc_finished`. Schedule GC now."); - self.add_schedule_collection_packet(); - true - } else { - // Note that if a mutator attempts to request GC after `on_gc_finished`, it will call - // `request_schedule_collection`, but will block on the mutex `WorkerMonitor::sync` - // because the current GC worker is holding it. After the current worker calls - // `work_available.wait()`, the mutator will continue and wake up a GC worker (not - // necessarily this one) to schedule the GC. - false - } } pub fn enable_stat(&self) { diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 98806c561f..10e061fb89 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -91,8 +91,9 @@ pub(crate) enum LastParkedResult { pub(crate) struct WorkerMonitor { /// The synchronized part. sync: Mutex, - /// This is notified when new work packets are available, or a mutator has requested GC. - work_available: Condvar, + /// Notified if workers have anything to do. That include any work packets available, and any + /// field in `sync.goals.requests` set to true. + have_anything_to_do: Condvar, } /// The synchronized part of `WorkerMonitor`. @@ -101,12 +102,36 @@ pub(crate) struct WorkerMonitorSync { worker_count: usize, /// Number of parked workers. parked_workers: usize, - /// True if a mutator has requested the workers to schedule a GC. - /// - /// Consider this as a one-element message queue. The last parked worker will poll this - /// "queue" by reading this field and setting it to `false`. The last parked worker will - /// schedule a GC whenever seeing `true` in this field. - should_schedule_gc: bool, + /// Current and requested goals. + goals: WorkerGoals, +} + +#[derive(Default)] +pub(crate) struct WorkerGoals { + /// What are the workers doing now? + pub(crate) current: Option, + /// Requests received from mutators. + pub(crate) requests: WorkerRequests, +} + +/// The thing workers are currently doing. This affects several things, such as what the last +/// parked worker will do, and whether workers will stop themselves. +pub(crate) enum WorkerGoal { + Gc { start_time: std::time::Instant }, + StopForFork, +} + +/// Reqeusts received from mutators. Workers respond to those requests when they do not have a +/// current goal. Multiple things can be requested at the same time, and workers respond to the +/// thing with the highest priority. +/// +/// The fields of this structs are ordered with decreasing priority. +#[derive(Default)] // All fields should be false by default. +pub(crate) struct WorkerRequests { + /// The VM needs to fork. Workers should save their contexts and exit. + pub(crate) stop_for_fork: bool, + /// GC is requested. Workers should schedule a GC. + pub(crate) gc: bool, } impl WorkerMonitor { @@ -115,9 +140,9 @@ impl WorkerMonitor { sync: Mutex::new(WorkerMonitorSync { worker_count, parked_workers: 0, - should_schedule_gc: false, + goals: Default::default(), }), - work_available: Default::default(), + have_anything_to_do: Default::default(), } } @@ -125,12 +150,10 @@ impl WorkerMonitor { /// Callable from mutator threads. 
pub fn request_schedule_collection(&self) {
         let mut guard = self.sync.lock().unwrap();
-        assert!(
-            !guard.should_schedule_gc,
-            "should_schedule_gc is already set"
-        );
-        guard.should_schedule_gc = true;
-        self.notify_work_available_inner(false, &mut guard);
+        if guard.goals.requests.gc == false {
+            guard.goals.requests.gc = true;
+            self.notify_work_available_inner(false, &mut guard);
+        }
     }
 
     /// Wake up workers when more work packets are made available for workers,
@@ -144,9 +167,9 @@ impl WorkerMonitor {
     /// mutex of `WorkerMonitorSync`.
     fn notify_work_available_inner(&self, all: bool, _guard: &mut MutexGuard) {
         if all {
-            self.work_available.notify_all();
+            self.have_anything_to_do.notify_all();
         } else {
-            self.work_available.notify_one();
+            self.have_anything_to_do.notify_one();
        }
     }
 
@@ -159,7 +182,7 @@ impl WorkerMonitor {
     pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F)
     where
         VM: VMBinding,
-        F: FnOnce(bool) -> LastParkedResult,
+        F: FnOnce(&mut WorkerGoals) -> LastParkedResult,
     {
         let mut sync = self.sync.lock().unwrap();
 
@@ -177,8 +200,7 @@ impl WorkerMonitor {
 
         if all_parked {
             debug!("Worker {} is the last worker parked.", worker.ordinal);
-            let should_schedule_gc = std::mem::replace(&mut sync.should_schedule_gc, false);
-            let result = on_last_parked(should_schedule_gc);
+            let result = on_last_parked(&mut sync.goals);
             match result {
                 LastParkedResult::ParkSelf => {
                     should_wait = true;
@@ -248,7 +270,7 @@ impl WorkerMonitor {
             // and park again if not available. The last parked worker will ensure the two
             // conditions listed above are both false before blocking. If either condition is
             // true, the last parked worker will take action.
-            sync = self.work_available.wait(sync).unwrap();
+            sync = self.have_anything_to_do.wait(sync).unwrap();
         }
 
         // Unpark this worker.

From 90cb6df46af05f0beb087667911a435858a18bf5 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Fri, 12 Jan 2024 19:28:04 +0800
Subject: [PATCH 13/48] Coding style change

---
 src/scheduler/scheduler.rs | 2 +-
 src/scheduler/worker.rs    | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 46bd5c4d43..408ef35bc6 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -443,7 +443,7 @@ impl GCWorkScheduler {
         }
 
         // No requests. Park this worker, too.
-        return LastParkedResult::ParkSelf;
+        LastParkedResult::ParkSelf
     }
 
     /// Find more work for workers to do. Return true if more work is available.
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index 10e061fb89..310edf2aa8 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -117,7 +117,10 @@ pub(crate) struct WorkerGoals {
 /// The thing workers are currently doing. This affects several things, such as what the last
 /// parked worker will do, and whether workers will stop themselves.
 pub(crate) enum WorkerGoal {
-    Gc { start_time: std::time::Instant },
+    Gc {
+        start_time: std::time::Instant,
+    },
+    #[allow(unused)] // TODO: Implement forking support later.
     StopForFork,
 }
pub fn request_schedule_collection(&self) {
         let mut guard = self.sync.lock().unwrap();
-        if guard.goals.requests.gc == false {
+        if !guard.goals.requests.gc {
             guard.goals.requests.gc = true;
             self.notify_work_available_inner(false, &mut guard);
         }

From 2a25331731ca17ea80ca9b3718db278b69e0d055 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Sat, 13 Jan 2024 15:38:38 +0800
Subject: [PATCH 14/48] Split WorkerGoals and WorkerMonitor

They are now too complex and should be put in dedicated files.
---
 src/plan/gc_requester.rs        |   7 +-
 src/scheduler/mod.rs            |   2 +
 src/scheduler/scheduler.rs      |  18 ++-
 src/scheduler/work_bucket.rs    |   2 +-
 src/scheduler/worker.rs         | 247 +------------------------------
 src/scheduler/worker_goals.rs   |  47 ++++++
 src/scheduler/worker_monitor.rs | 238 ++++++++++++++++++++++++++++++
 7 files changed, 310 insertions(+), 251 deletions(-)
 create mode 100644 src/scheduler/worker_goals.rs
 create mode 100644 src/scheduler/worker_monitor.rs

diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs
index ff2aa7e0b8..49ae9ce2c9 100644
--- a/src/plan/gc_requester.rs
+++ b/src/plan/gc_requester.rs
@@ -5,8 +5,8 @@ use std::sync::Arc;
 
 /// This data structure lets mutators trigger GC.
 pub struct GCRequester {
-    /// An atomic flag outside `RequestSync` so that mutators can check if GC has already been
-    /// requested in `poll` without acquiring the mutex.
+    /// Set by mutators to trigger GC. It is atomic so that mutators can check if GC has already
+    /// been requested efficiently in `poll` without acquiring any mutex.
     request_flag: AtomicBool,
     scheduler: Arc>,
 }
@@ -27,6 +27,9 @@ impl GCRequester {
         }
 
         if !self.request_flag.swap(true, Ordering::Relaxed) {
+            // `GCWorkScheduler::request_schedule_collection` needs to hold a mutex to communicate
+            // with GC workers, so only the first mutator that changed the `request_flag` to `true`
+            // shall do it.
             self.scheduler.request_schedule_collection();
         }
     }
diff --git a/src/scheduler/mod.rs b/src/scheduler/mod.rs
index 45501e9943..b4d79adeb8 100644
--- a/src/scheduler/mod.rs
+++ b/src/scheduler/mod.rs
@@ -17,6 +17,8 @@ mod work_bucket;
 pub use work_bucket::WorkBucketStage;
 
 mod worker;
+mod worker_monitor;
+mod worker_goals;
 pub(crate) use worker::current_worker_ordinal;
 pub use worker::GCWorker;
 
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 408ef35bc6..4b5d933448 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -1,11 +1,12 @@
 use super::gc_work::ScheduleCollection;
 use super::stat::SchedulerStat;
 use super::work_bucket::*;
-use super::worker::{GCWorker, ThreadId, WorkerGoals, WorkerGroup, WorkerMonitor};
+use super::worker::{GCWorker, ThreadId, WorkerGroup};
+use super::worker_goals::{WorkerGoal, WorkerGoals};
+use super::worker_monitor::{LastParkedResult, WorkerMonitor};
 use super::*;
 use crate::global_state::GcStatus;
 use crate::mmtk::MMTK;
-use crate::scheduler::worker::{LastParkedResult, WorkerGoal};
 use crate::util::opaque_pointer::*;
 use crate::util::options::AffinityKind;
 use crate::util::rust_util::array_from_fn;
@@ -95,7 +96,14 @@ impl GCWorkScheduler {
     /// Request a GC to be scheduled.  Called by mutator via `GCRequester`.
     pub(crate) fn request_schedule_collection(&self) {
         debug!("A mutator is sending GC-scheduling request to workers...");
-        self.worker_monitor.request_schedule_collection();
+        self.worker_monitor.make_request(|requests| {
+            if !requests.gc {
+                requests.gc = true;
+                true
+            } else {
+                false
+            }
+        });
     }
 
     /// Add the `ScheduleCollection` packet. 
Called by the last parked worker. @@ -380,7 +388,7 @@ impl GCWorkScheduler { }; match current_goal { - worker::WorkerGoal::Gc { start_time } => { + WorkerGoal::Gc { start_time } => { // We are in the progress of GC. // In stop-the-world GC, mutators cannot request for GC while GC is in progress. @@ -410,7 +418,7 @@ impl GCWorkScheduler { self.respond_to_requests(goals) } } - worker::WorkerGoal::StopForFork => { + WorkerGoal::StopForFork => { // A worker parked again when it is asked to exit. unimplemented!() } diff --git a/src/scheduler/work_bucket.rs b/src/scheduler/work_bucket.rs index d7076354eb..4386832b76 100644 --- a/src/scheduler/work_bucket.rs +++ b/src/scheduler/work_bucket.rs @@ -1,4 +1,4 @@ -use super::worker::WorkerMonitor; +use super::worker_monitor::WorkerMonitor; use super::*; use crate::vm::VMBinding; use crossbeam::deque::{Injector, Steal, Worker}; diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 310edf2aa8..19255c19be 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -12,7 +12,7 @@ use crossbeam::queue::ArrayQueue; #[cfg(feature = "count_live_bytes_in_gc")] use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering; -use std::sync::{Arc, Condvar, Mutex, MutexGuard}; +use std::sync::{Arc, Mutex}; /// Represents the ID of a GC worker thread. pub type ThreadId = usize; @@ -33,8 +33,9 @@ pub fn current_worker_ordinal() -> ThreadId { ordinal } -/// The part shared between a GCWorker and the scheduler. -/// This structure is used for communication, e.g. adding new work packets. +/// The struct has one instance per worker, but is shared between workers via the scheduler +/// instance. This structure is used for communication between workers, e.g. adding designated +/// work packets, stealing work packets from other workers, and collecting per-worker statistics. pub struct GCWorkerShared { /// Worker-local statistics data. stat: AtomicRefCell>, @@ -71,246 +72,6 @@ impl GCWorkerShared { } } -/// The result type of the `on_last_parked` call-back in `WorkMonitor::park_and_wait`. -/// It decides how many workers should wake up after `on_last_parked`. -pub(crate) enum LastParkedResult { - /// The last parked worker should wait, too, until more work packets are added. - ParkSelf, - /// The last parked worker should unpark and find work packet to do. - WakeSelf, - /// Wake up all parked GC workers. - WakeAll, -} - -/// A data structure for synchronizing workers. -/// -/// - It allows workers to park and unpark. It keeps track of the number of workers parked, and -/// allows the last parked worker to perform operations that can only be performed when all -/// workers have parked. -/// - It allows mutators to notify workers to schedule a GC. -pub(crate) struct WorkerMonitor { - /// The synchronized part. - sync: Mutex, - /// Notified if workers have anything to do. That include any work packets available, and any - /// field in `sync.goals.requests` set to true. - have_anything_to_do: Condvar, -} - -/// The synchronized part of `WorkerMonitor`. -pub(crate) struct WorkerMonitorSync { - /// The total number of workers. - worker_count: usize, - /// Number of parked workers. - parked_workers: usize, - /// Current and requested goals. - goals: WorkerGoals, -} - -#[derive(Default)] -pub(crate) struct WorkerGoals { - /// What are the workers doing now? - pub(crate) current: Option, - /// Requests received from mutators. - pub(crate) requests: WorkerRequests, -} - -/// The thing workers are currently doing. 
This affects several things, such as what the last -/// parked worker will do, and whether workers will stop themselves. -pub(crate) enum WorkerGoal { - Gc { - start_time: std::time::Instant, - }, - #[allow(unused)] // TODO: Implement forking support later. - StopForFork, -} - -/// Reqeusts received from mutators. Workers respond to those requests when they do not have a -/// current goal. Multiple things can be requested at the same time, and workers respond to the -/// thing with the highest priority. -/// -/// The fields of this structs are ordered with decreasing priority. -#[derive(Default)] // All fields should be false by default. -pub(crate) struct WorkerRequests { - /// The VM needs to fork. Workers should save their contexts and exit. - pub(crate) stop_for_fork: bool, - /// GC is requested. Workers should schedule a GC. - pub(crate) gc: bool, -} - -impl WorkerMonitor { - pub fn new(worker_count: usize) -> Self { - Self { - sync: Mutex::new(WorkerMonitorSync { - worker_count, - parked_workers: 0, - goals: Default::default(), - }), - have_anything_to_do: Default::default(), - } - } - - /// Request a GC worker to schedule the next GC. - /// Callable from mutator threads. - pub fn request_schedule_collection(&self) { - let mut guard = self.sync.lock().unwrap(); - if !guard.goals.requests.gc { - guard.goals.requests.gc = true; - self.notify_work_available_inner(false, &mut guard); - } - } - - /// Wake up workers when more work packets are made available for workers, - /// or a mutator has requested the GC workers to schedule a GC. - pub fn notify_work_available(&self, all: bool) { - let mut guard = self.sync.lock().unwrap(); - self.notify_work_available_inner(all, &mut guard); - } - - /// Like `notify_work_available` but the current thread must have already acquired the - /// mutex of `WorkerMonitorSync`. - fn notify_work_available_inner(&self, all: bool, _guard: &mut MutexGuard) { - if all { - self.have_anything_to_do.notify_all(); - } else { - self.have_anything_to_do.notify_one(); - } - } - - /// Park a worker and wait on the CondVar `work_available`. - /// - /// If it is the last worker parked, `on_last_parked` will be called. - /// The argument of `on_last_parked` is true if `sync.gc_requested` is `true`. - /// The return value of `on_last_parked` will determine whether this worker and other workers - /// will wake up or block waiting. - pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F) - where - VM: VMBinding, - F: FnOnce(&mut WorkerGoals) -> LastParkedResult, - { - let mut sync = self.sync.lock().unwrap(); - - // Park this worker - let all_parked = sync.inc_parked_workers(); - trace!( - "Worker {} parked. parked/total: {}/{}. All parked: {}", - worker.ordinal, - sync.parked_workers, - sync.worker_count, - all_parked - ); - - let mut should_wait = false; - - if all_parked { - debug!("Worker {} is the last worker parked.", worker.ordinal); - let result = on_last_parked(&mut sync.goals); - match result { - LastParkedResult::ParkSelf => { - should_wait = true; - } - LastParkedResult::WakeSelf => { - // Continue without waiting. - } - LastParkedResult::WakeAll => { - self.notify_work_available_inner(true, &mut sync); - } - } - } else { - should_wait = true; - } - - if should_wait { - // Notes on CondVar usage: - // - // Conditional variables are usually tested in a loop while holding a mutex - // - // lock(); - // while condition() { - // condvar.wait(); - // } - // unlock(); - // - // The actual condition for this `self.work_available.wait(sync)` is: - // - // 1. 
any work packet is available, or - // 2. a request for scheduling GC is submitted. - // - // But it is not used like the typical use pattern shown above, mainly because work - // packets can be added without holding the mutex `self.sync`. This means one worker - // can add a new work packet (no mutex needed) right after another worker finds no work - // packets are available and then park. In other words, condition (1) can suddenly - // become true after a worker sees it is false but before the worker blocks waiting on - // the CondVar. If this happens, the last parked worker will block forever and never - // get notified. This may happen if mutators or the previously existing "coordinator - // thread" can add work packets. - // - // However, after the "coordinator thread" was removed, only GC worker threads can add - // work packets during GC. Parked workers (except the last parked worker) cannot make - // more work packets availble (by adding new packets or opening buckets). For this - // reason, the **last** parked worker can be sure that after it finds no packets - // available, no other workers can add another work packet (because they all parked). - // So the **last** parked worker can open more buckets or declare GC finished. - // - // Condition (2), i.e. `sync.should_schedule_gc` is guarded by the mutator `sync`. - // When set (by a mutator via `request_schedule_collection`), it will notify a - // worker; and the last parked worker always checks it before waiting. So this - // condition will not be set without any worker noticing. - // - // Note that generational barriers may add `ProcessModBuf` work packets when not in GC. - // This is benign because those work packets are not executed immediately, and are - // guaranteed to be executed in the next GC. - - // Notes on spurious wake-up: - // - // 1. The condition variable `work_available` is guarded by `self.sync`. Because the - // last parked worker is holding the mutex `self.sync` when executing - // `on_last_parked`, no workers can unpark (even if they spuriously wake up) during - // `on_last_parked` because they cannot re-acquire the mutex `self.sync`. - // - // 2. Workers may spuriously wake up and unpark when `on_last_parked` is not being - // executed (including the case when the last parked worker is waiting here, too). - // If one or more GC workers spuriously wake up, they will check for work packets, - // and park again if not available. The last parked worker will ensure the two - // conditions listed above are both false before blocking. If either condition is - // true, the last parked worker will take action. - sync = self.have_anything_to_do.wait(sync).unwrap(); - } - - // Unpark this worker. - sync.dec_parked_workers(); - trace!( - "Worker {} unparked. parked/total: {}/{}.", - worker.ordinal, - sync.parked_workers, - sync.worker_count, - ); - } -} - -impl WorkerMonitorSync { - /// Increase the packed-workers counter. - /// Called before a worker is parked. - /// - /// Return true if all the workers are parked. - fn inc_parked_workers(&mut self) -> bool { - let old = self.parked_workers; - debug_assert!(old < self.worker_count); - let new = old + 1; - self.parked_workers = new; - new == self.worker_count - } - - /// Decrease the packed-workers counter. - /// Called after a worker is resumed from the parked state. 
-    fn dec_parked_workers(&mut self) {
-        let old = self.parked_workers;
-        debug_assert!(old <= self.worker_count);
-        debug_assert!(old > 0);
-        let new = old - 1;
-        self.parked_workers = new;
-    }
-}
-
 /// A GC worker. This part is privately owned by a worker thread.
 pub struct GCWorker {
     /// The VM-specific thread-local state of the GC thread.
diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs
new file mode 100644
index 0000000000..ae9a36763a
--- /dev/null
+++ b/src/scheduler/worker_goals.rs
@@ -0,0 +1,47 @@
+//! This module contains "goals" which are larger than work packets, and describes what workers are
+//! working towards on a high level.
+//!
+//! A "goal" is represented by a `WorkerGoal`. All workers work towards a single goal at a time.
+//! The current goal influences the behavior of GC workers, especially the last parked worker.
+//! For example,
+//!
+//! - When a GC is in progress, the last parked worker will try to open buckets or announce
+//!   that the GC has finished.
+//! - When stopping for fork, every worker should exit when woken.
+//!
+//! The struct `WorkerRequests` keeps a list of requests from mutators, such as requests for GC
+//! and requests for forking. But the GC workers will only respond to one request at a time.

+use std::time::Instant;
+
+/// The current and requested goals.
+#[derive(Default)]
+pub(crate) struct WorkerGoals {
+    /// What are the workers doing now?
+    pub(crate) current: Option,
+    /// Requests received from mutators.
+    pub(crate) requests: WorkerRequests,
+}
+
+/// The thing workers are currently doing. This affects several things, such as what the last
+/// parked worker will do, and whether workers will stop themselves.
+pub(crate) enum WorkerGoal {
+    Gc {
+        start_time: Instant,
+    },
+    #[allow(unused)] // TODO: Implement forking support later.
+    StopForFork,
+}
+
+/// Requests received from mutators. Workers respond to those requests when they do not have a
+/// current goal. Multiple things can be requested at the same time, and workers respond to the
+/// thing with the highest priority.
+///
+/// The fields of this struct are ordered by decreasing priority.
+#[derive(Default)] // All fields should be false by default.
+pub(crate) struct WorkerRequests {
+    /// The VM needs to fork. Workers should save their contexts and exit.
+    pub(crate) stop_for_fork: bool,
+    /// GC is requested. Workers should schedule a GC.
+    pub(crate) gc: bool,
+}
diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs
new file mode 100644
index 0000000000..aba804f274
--- /dev/null
+++ b/src/scheduler/worker_monitor.rs
@@ -0,0 +1,238 @@
+//! This module contains `WorkerMonitor` and related types. Its purposes include:
+//!
+//! - allowing workers to park,
+//! - letting the last parked worker take action, and
+//! - letting workers and mutators notify workers when workers are given things to do.
+
+use std::sync::{Condvar, Mutex};
+
+use crate::vm::VMBinding;
+
+use super::{
+    worker_goals::{WorkerGoals, WorkerRequests},
+    GCWorker,
+};
+
+/// The result type of the `on_last_parked` call-back in `WorkerMonitor::park_and_wait`.
+/// It decides how many workers should wake up after `on_last_parked`.
+pub(crate) enum LastParkedResult {
+    /// The last parked worker should wait, too, until more work packets are added.
+    ParkSelf,
+    /// The last parked worker should unpark and find a work packet to do.
+    WakeSelf,
+    /// Wake up all parked GC workers. 
+    WakeAll,
+}
+
+/// A data structure for synchronizing workers with each other and with mutators.
+///
+/// Unlike `GCWorkerShared`, there is only one instance of `WorkerMonitor`.
+///
+/// -   It allows workers to park and unpark.
+/// -   It allows mutators to notify workers to schedule a GC.
+pub(crate) struct WorkerMonitor {
+    /// The synchronized part.
+    sync: Mutex,
+    /// Workers wait on this when idle. Notified if workers have things to do. That includes:
+    /// -   any work packets available, and
+    /// -   any field in `sync.goals.requests` set to true.
+    workers_have_anything_to_do: Condvar,
+}
+
+/// The synchronized part of `WorkerMonitor`.
+struct WorkerMonitorSync {
+    /// Count parked workers.
+    parker: WorkerParker,
+    /// Current and requested goals.
+    goals: WorkerGoals,
+}
+
+/// This struct counts the number of workers parked and identifies the last parked worker.
+struct WorkerParker {
+    /// The total number of workers.
+    worker_count: usize,
+    /// Number of parked workers.
+    parked_workers: usize,
+}
+
+impl WorkerParker {
+    fn new(worker_count: usize) -> Self {
+        Self {
+            worker_count,
+            parked_workers: 0,
+        }
+    }
+
+    /// Increase the parked-workers counter.
+    /// Called before a worker is parked.
+    ///
+    /// Return true if all the workers are parked.
+    fn inc_parked_workers(&mut self) -> bool {
+        let old = self.parked_workers;
+        debug_assert!(old < self.worker_count);
+        let new = old + 1;
+        self.parked_workers = new;
+        new == self.worker_count
+    }
+
+    /// Decrease the parked-workers counter.
+    /// Called after a worker is resumed from the parked state.
+    fn dec_parked_workers(&mut self) {
+        let old = self.parked_workers;
+        debug_assert!(old <= self.worker_count);
+        debug_assert!(old > 0);
+        let new = old - 1;
+        self.parked_workers = new;
+    }
+}
+
+impl WorkerMonitor {
+    pub fn new(worker_count: usize) -> Self {
+        Self {
+            sync: Mutex::new(WorkerMonitorSync {
+                parker: WorkerParker::new(worker_count),
+                goals: Default::default(),
+            }),
+            workers_have_anything_to_do: Default::default(),
+        }
+    }
+
+    /// Make a request. The `callback` will be called while holding the mutex `self.sync` so that
+    /// the caller can set some of its fields to true.
+    ///
+    /// The `callback` has one parameter:
+    ///
+    /// -   a mutable reference to the `WorkerRequests` instance.
+    ///
+    /// The `callback` should return `true` if it set any field of `WorkerRequests` to true.
+    pub fn make_request(&self, callback: Callback)
+    where
+        Callback: FnOnce(&mut WorkerRequests) -> bool,
+    {
+        let mut guard = self.sync.lock().unwrap();
+        let set_any_fields = callback(&mut guard.goals.requests);
+        if set_any_fields {
+            self.notify_work_available(false);
+        }
+    }
+
+    /// Wake up workers when more work packets are made available for workers,
+    /// or a mutator has requested the GC workers to schedule a GC.
+    pub fn notify_work_available(&self, all: bool) {
+        if all {
+            self.workers_have_anything_to_do.notify_all();
+        } else {
+            self.workers_have_anything_to_do.notify_one();
+        }
+    }
+
+    /// Park a worker and wait on the CondVar `workers_have_anything_to_do`.
+    ///
+    /// If it is the last worker parked, `on_last_parked` will be called.
+    /// The argument of `on_last_parked` is the current and requested goals of the workers.
+    /// The return value of `on_last_parked` will determine whether this worker and other workers
+    /// will wake up or block waiting. 
+    pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F)
+    where
+        VM: VMBinding,
+        F: FnOnce(&mut WorkerGoals) -> LastParkedResult,
+    {
+        let mut sync = self.sync.lock().unwrap();
+
+        // Park this worker
+        let all_parked = sync.parker.inc_parked_workers();
+        trace!(
+            "Worker {} parked. parked/total: {}/{}. All parked: {}",
+            worker.ordinal,
+            sync.parker.parked_workers,
+            sync.parker.worker_count,
+            all_parked
+        );
+
+        let mut should_wait = false;
+
+        if all_parked {
+            debug!("Worker {} is the last worker parked.", worker.ordinal);
+            let result = on_last_parked(&mut sync.goals);
+            match result {
+                LastParkedResult::ParkSelf => {
+                    should_wait = true;
+                }
+                LastParkedResult::WakeSelf => {
+                    // Continue without waiting.
+                }
+                LastParkedResult::WakeAll => {
+                    self.notify_work_available(true);
+                }
+            }
+        } else {
+            should_wait = true;
+        }
+
+        if should_wait {
+            // Notes on CondVar usage:
+            //
+            // Condition variables are usually tested in a loop while holding a mutex
+            //
+            //     lock();
+            //     while condition() {
+            //         condvar.wait();
+            //     }
+            //     unlock();
+            //
+            // The actual condition for this `self.workers_have_anything_to_do.wait(sync)` is:
+            //
+            // 1. any work packet is available, or
+            // 2. a request for scheduling GC is submitted.
+            //
+            // But it is not used like the typical use pattern shown above, mainly because work
+            // packets can be added without holding the mutex `self.sync`. This means one worker
+            // can add a new work packet (no mutex needed) right after another worker finds no work
+            // packets are available and then park. In other words, condition (1) can suddenly
+            // become true after a worker sees it is false but before the worker blocks waiting on
+            // the CondVar. If this happens, the last parked worker will block forever and never
+            // get notified. This may happen if mutators or the previously existing "coordinator
+            // thread" can add work packets.
+            //
+            // However, after the "coordinator thread" was removed, only GC worker threads can add
+            // work packets during GC. Parked workers (except the last parked worker) cannot make
+            // more work packets available (by adding new packets or opening buckets). For this
+            // reason, the **last** parked worker can be sure that after it finds no packets
+            // available, no other workers can add another work packet (because they all parked).
+            // So the **last** parked worker can open more buckets or declare GC finished.
+            //
+            // Condition (2), i.e. `sync.goals.requests`, is guarded by the mutex `self.sync`.
+            // When set (by a mutator via `request_schedule_collection`), it will notify a
+            // worker; and the last parked worker always checks it before waiting. So this
+            // condition will not be set without any worker noticing.
+            //
+            // Note that generational barriers may add `ProcessModBuf` work packets when not in GC.
+            // This is benign because those work packets are not executed immediately, and are
+            // guaranteed to be executed in the next GC.
+
+            // Notes on spurious wake-up:
+            //
+            // 1. The condition variable `workers_have_anything_to_do` is guarded by `self.sync`.
+            //    Because the last parked worker is holding the mutex `self.sync` when executing
+            //    `on_last_parked`, no workers can unpark (even if they spuriously wake up) during
+            //    `on_last_parked` because they cannot re-acquire the mutex `self.sync`.
+            //
+            // 2. Workers may spuriously wake up and unpark when `on_last_parked` is not being
+            //    executed (including the case when the last parked worker is waiting here, too). 
+            //    If one or more GC workers spuriously wake up, they will check for work packets,
+            //    and park again if not available. The last parked worker will ensure the two
+            //    conditions listed above are both false before blocking. If either condition is
+            //    true, the last parked worker will take action.
+            sync = self.workers_have_anything_to_do.wait(sync).unwrap();
+        }
+
+        // Unpark this worker.
+        sync.parker.dec_parked_workers();
+        trace!(
+            "Worker {} unparked. parked/total: {}/{}.",
+            worker.ordinal,
+            sync.parker.parked_workers,
+            sync.parker.worker_count,
+        );
+    }
+}

From 1f9d9053a9ef75ac5ac69426f63f936c11836172 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Wed, 17 Jan 2024 16:35:38 +0800
Subject: [PATCH 15/48] Create worker structs lazily

---
 src/memory_manager.rs   | 12 ++++++++
 src/scheduler/worker.rs | 65 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 6 deletions(-)

diff --git a/src/memory_manager.rs b/src/memory_manager.rs
index e5a918233f..8f0b5d9403 100644
--- a/src/memory_manager.rs
+++ b/src/memory_manager.rs
@@ -495,6 +495,18 @@ pub fn initialize_collection(mmtk: &'static MMTK, tls: VMThre
     probe!(mmtk, collection_initialized);
 }
 
+pub fn uninitialize_collection(mmtk: &'static MMTK, tls: VMThread) {
+    assert!(
+        mmtk.state.is_initialized(),
+        "MMTk collection has not been initialized, yet (was initialize_collection() called before?)"
+    );
+    //mmtk.scheduler.stop_gc_threads_for_forking(mmtk, tls);
+    mmtk.state.initialized.store(false, Ordering::SeqCst);
+    //probe!(mmtk, collection_initialized);
+    unimplemented!()
+}
+
+
 /// Allow MMTk to trigger garbage collection when heap is full. This should only be used in pair with disable_collection().
 /// See the comments on disable_collection(). If disable_collection() is not used, there is no need to call this function at all.
 /// Note this call is not thread safe, only one VM thread should call this.
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index 19255c19be..e3e707a7da 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -214,13 +214,37 @@ impl GCWorker {
     }
 }
 
+enum WorkerCreationState {
+    NotCreated {
+        /// The local work queues for to-be-created workers.
+        unspawned_local_work_queues: Vec>>>,
+    },
+    Created {
+        /// `Worker` instances for worker threads that have not been created yet, or worker thread that
+        /// are suspended (e.g. for forking).
+        suspended_workers: Vec>>,
+    },
+}
+
+impl WorkerCreationState {
+    pub fn is_created(&self) -> bool {
+        matches!(self, WorkerCreationState::Created { .. })
+    }
+}
+
 /// A worker group to manage all the GC workers.
 pub(crate) struct WorkerGroup {
     /// Shared worker data
     pub workers_shared: Vec>>,
-    unspawned_local_work_queues: Mutex>>>>,
+    /// The stateful part
+    state: Mutex>,
 }
 
+/// We have to persuade Rust that `WorkerGroup` is safe to share because it thinks one worker can
+/// refer to another worker via the path "worker -> scheduler -> worker_group -> suspended_workers
+/// -> worker" which it thinks is a cyclic reference and unsafe. 
+unsafe impl Sync for WorkerGroup {} + impl WorkerGroup { /// Create a WorkerGroup pub fn new(num_workers: usize) -> Arc { @@ -238,13 +262,21 @@ impl WorkerGroup { Arc::new(Self { workers_shared, - unspawned_local_work_queues: Mutex::new(unspawned_local_work_queues), + state: Mutex::new(WorkerCreationState::NotCreated { + unspawned_local_work_queues, + }), }) } - /// Spawn all the worker threads - pub fn spawn(&self, mmtk: &'static MMTK, tls: VMThread) { - let mut unspawned_local_work_queues = self.unspawned_local_work_queues.lock().unwrap(); + /// Create `GCWorker` instances, but do not create threads, yet. + fn create_worker_structs(&self, state: &mut WorkerCreationState, mmtk: &'static MMTK) { + let WorkerCreationState::NotCreated { unspawned_local_work_queues } = state else { + panic!("GCWorker structs have already been created"); + }; + + assert_eq!(self.workers_shared.len(), unspawned_local_work_queues.len()); + let mut suspended_workers = Vec::with_capacity(self.workers_shared.len()); + // Spawn each worker thread. for (ordinal, shared) in self.workers_shared.iter().enumerate() { let worker = Box::new(GCWorker::new( @@ -254,9 +286,30 @@ impl WorkerGroup { shared.clone(), unspawned_local_work_queues.pop().unwrap(), )); + suspended_workers.push(worker); + } + + *state = WorkerCreationState::Created { suspended_workers }; + } + + /// Spawn all the worker threads + pub fn spawn(&self, mmtk: &'static MMTK, tls: VMThread) { + let mut guard = self.state.lock().unwrap(); + + // Create worker structs lazily. If we are spawning workers after fork, worker structs + // will have been created already, so we can skip the creation. + if !guard.is_created() { + self.create_worker_structs(&mut *guard, mmtk); + } + + let WorkerCreationState::Created { ref mut suspended_workers } = *guard else { + panic!("GCWorker structs have not been created, yet."); + }; + + // Drain the queue. We transfer the ownership of each `GCWorker` instance to a GC thread. + for worker in suspended_workers.drain(..) { VM::VMCollection::spawn_gc_thread(tls, GCThreadContext::::Worker(worker)); } - debug_assert!(unspawned_local_work_queues.is_empty()); } /// Get the number of workers in the group From 029bfdae06be6d27bfdc3aa4f7a29dced6e65db3 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 18 Jan 2024 13:22:27 +0800 Subject: [PATCH 16/48] Allow the loop in GCWorker::run to break --- src/memory_manager.rs | 5 ++++- src/scheduler/scheduler.rs | 20 ++++++++++++++------ src/scheduler/worker.rs | 32 +++++++++++++++++++++++++------- src/scheduler/worker_monitor.rs | 7 +++++-- 4 files changed, 48 insertions(+), 16 deletions(-) diff --git a/src/memory_manager.rs b/src/memory_manager.rs index 8f0b5d9403..961b7e2745 100644 --- a/src/memory_manager.rs +++ b/src/memory_manager.rs @@ -461,7 +461,10 @@ pub fn gc_poll(mmtk: &MMTK, tls: VMMutatorThread) { } } -/// Run the main loop of a GC worker. This method does not return. +/// Run the main loop of a GC worker. +/// +/// This method runs until the worker exits. Currently a worker may exit after +/// `uninitialize_collection` is called. /// /// Arguments: /// * `tls`: The thread that will be used as the GC worker. 
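```rust
// Aside (editor's sketch): patch 16 above changes the worker's main loop so that
// polling can fail with a "should exit" error instead of blocking forever. Below
// is a distilled sketch of that loop shape. `ShouldExit` and `worker_loop` are
// illustrative names for this note; `ShouldExit` mirrors the role of the
// `WorkerShouldExit` type introduced in the diff that follows.
struct ShouldExit;

/// Keep polling and running work packets until `poll` signals that the worker
/// should exit (e.g. because the VM wants to fork).
fn worker_loop(mut poll: impl FnMut() -> Result<Box<dyn FnOnce()>, ShouldExit>) {
    loop {
        let Ok(work) = poll() else {
            // Asked to exit: break out of the loop instead of blocking forever.
            break;
        };
        work(); // execute one work packet
    }
    // After the loop, the real code hands the worker struct back to the worker
    // group so the thread can be re-spawned later (see patch 17).
}
```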
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 4b5d933448..37873a5159 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -1,3 +1,5 @@
+use self::worker::{PollResult, WorkerShouldExit};
+
 use super::gc_work::ScheduleCollection;
 use super::stat::SchedulerStat;
 use super::work_bucket::*;
@@ -362,20 +364,26 @@ impl GCWorkScheduler {
 
     /// Called by workers to get a schedulable work packet.
     /// Park the worker if there're no available packets.
-    pub fn poll(&self, worker: &GCWorker) -> Box> {
-        self.poll_schedulable_work(worker)
-            .unwrap_or_else(|| self.poll_slow(worker))
+    pub(crate) fn poll(&self, worker: &GCWorker) -> PollResult {
+        if let Some(work) = self.poll_schedulable_work(worker) {
+            return Ok(work);
+        }
+        self.poll_slow(worker)
     }
 
-    fn poll_slow(&self, worker: &GCWorker) -> Box> {
+    fn poll_slow(&self, worker: &GCWorker) -> PollResult {
         loop {
             // Retry polling
             if let Some(work) = self.poll_schedulable_work(worker) {
-                return work;
+                return Ok(work);
             }
 
-            self.worker_monitor
+            let should_exit = self.worker_monitor
                 .park_and_wait(worker, |goals| self.on_last_parked(worker, goals));
+
+            if should_exit {
+                return Err(WorkerShouldExit);
+            }
         }
     }
 
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index e3e707a7da..52217908bf 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -107,6 +107,16 @@ impl GCWorkerShared {
     }
 }
 
+/// A special error type that indicates a worker should exit.
+/// This may happen if the VM needs to fork and asks workers to exit.
+pub(crate) struct WorkerShouldExit;
+
+/// The result type of `GCWorker::poll`.
+/// Too many functions return `Option>>`. In most cases, when `None` is
+/// returned, the caller should try getting work packets from another place. To avoid confusion,
+/// we use `Err(WorkerShouldExit)` to clearly indicate that the worker should exit immediately.
+pub(crate) type PollResult = Result>, WorkerShouldExit>;
+
 impl GCWorker {
     pub(crate) fn new(
         mmtk: &'static MMTK,
@@ -171,12 +181,16 @@ impl GCWorker {
     /// 2. Poll from the local work queue.
     /// 3. Poll from activated global work-buckets
     /// 4. Steal from other workers
-    fn poll(&self) -> Box> {
-        self.shared
-            .designated_work
-            .pop()
-            .or_else(|| self.local_work_buffer.pop())
-            .unwrap_or_else(|| self.scheduler().poll(self))
+    fn poll(&mut self) -> PollResult {
+        if let Some(work) = self.shared.designated_work.pop() {
+            return Ok(work);
+        }
+
+        if let Some(work) = self.local_work_buffer.pop() {
+            return Ok(work);
+        }
+
+        self.scheduler().poll(self)
     }
 
     /// Entry of the worker thread. Resolve thread affinity, if it has been specified by the user.
@@ -197,7 +211,10 @@ impl GCWorker {
             // If we have work_start and work_end, we cannot measure the first
             // poll.
             probe!(mmtk, work_poll);
-            let mut work = self.poll();
+            let Ok(mut work) = self.poll() else {
+                // The worker is asked to exit. Break from the loop.
+                break;
+            };
 
             // probe! 
expands to an empty block on unsupported platforms
             #[allow(unused_variables)]
             let typename = work.get_type_name();
@@ -211,6 +228,7 @@ impl GCWorker {
             probe!(mmtk, work, typename.as_ptr(), typename.len());
             work.do_work_with_stat(self, mmtk);
         }
+        probe!(mmtk, gcworker_exit);
     }
 }
 
diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs
index aba804f274..8b39dd727a 100644
--- a/src/scheduler/worker_monitor.rs
+++ b/src/scheduler/worker_monitor.rs
@@ -9,7 +9,7 @@ use std::sync::{Condvar, Mutex};
 use crate::vm::VMBinding;
 
 use super::{
-    worker_goals::{WorkerGoals, WorkerRequests},
+    worker_goals::{WorkerGoal, WorkerGoals, WorkerRequests},
     GCWorker,
 };
 
@@ -132,7 +132,7 @@ impl WorkerMonitor {
     /// The argument of `on_last_parked` is the current and requested goals of the workers.
     /// The return value of `on_last_parked` will determine whether this worker and other workers
     /// will wake up or block waiting.
-    pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F)
+    pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F) -> bool
     where
         VM: VMBinding,
         F: FnOnce(&mut WorkerGoals) -> LastParkedResult,
@@ -234,5 +234,8 @@ impl WorkerMonitor {
             sync.parker.parked_workers,
             sync.parker.worker_count,
         );
+
+        // If the current goal is `StopForFork`, return true so that the worker thread will exit.
+        matches!(sync.goals.current, Some(WorkerGoal::StopForFork))
     }
 }

From d32885901b96bbb4f18a0cf4e19970b443f4930f Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Thu, 18 Jan 2024 16:17:45 +0800
Subject: [PATCH 17/48] Stop and restart GC workers for forking

---
 src/memory_manager.rs           | 20 +++++++++++------
 src/scheduler/scheduler.rs      | 28 ++++++++++++++++++++++--
 src/scheduler/worker.rs         | 38 ++++++++++++++++++++++++++++++++-
 src/scheduler/worker_goals.rs   |  5 +++--
 src/scheduler/worker_monitor.rs |  7 ++++++
 5 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/src/memory_manager.rs b/src/memory_manager.rs
index 961b7e2745..964d55270a 100644
--- a/src/memory_manager.rs
+++ b/src/memory_manager.rs
@@ -474,15 +474,18 @@ pub fn gc_poll(mmtk: &MMTK, tls: VMMutatorThread) {
 pub fn start_worker(
     mmtk: &'static MMTK,
     tls: VMWorkerThread,
-    worker: &mut GCWorker,
+    mut worker: Box>,
 ) {
     worker.run(tls, mmtk);
+    mmtk.scheduler.worker_group.surrender_gc_worker(worker);
 }
 
 /// Initialize the scheduler and GC workers that are required for doing garbage collections.
 /// This is a mandatory call for a VM during its boot process once its thread system
-/// is ready. This should only be called once. This call will invoke Collection::spawn_gc_thread()
-/// to create GC threads.
+/// is ready. This call will invoke Collection::spawn_gc_thread() to create GC threads.
+///
+/// This function can also be called after `uninitialize_collection` is called in order to re-spawn
+/// the GC threads.
 ///
 /// Arguments:
 /// * `mmtk`: A reference to an MMTk instance.
@@ -498,15 +501,18 @@ pub fn initialize_collection(mmtk: &'static MMTK, tls: VMThre
     probe!(mmtk, collection_initialized);
 }
 
-pub fn uninitialize_collection(mmtk: &'static MMTK, tls: VMThread) {
+/// Ask GC worker threads to exit. Currently, this function is for the sole purpose of forking.
+///
+/// Arguments:
+/// * `mmtk`: A reference to an MMTk instance. 
+pub fn uninitialize_collection(mmtk: &'static MMTK) { assert!( mmtk.state.is_initialized(), "MMTk collection has not been initialized, yet (was initialize_collection() called before?)" ); - //mmtk.scheduler.stop_gc_threads_for_forking(mmtk, tls); + mmtk.scheduler.stop_gc_threads_for_forking(); mmtk.state.initialized.store(false, Ordering::SeqCst); - //probe!(mmtk, collection_initialized); - unimplemented!() + probe!(mmtk, collection_uninitialized); } diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 37873a5159..ab08d6c390 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -90,6 +90,22 @@ impl GCWorkScheduler { self.worker_group.spawn(mmtk, tls) } + /// Ask all GC workers to exit for forking, and wait until all workers exited. + pub fn stop_gc_threads_for_forking(self: &Arc) { + self.worker_monitor.make_request(|requests| { + if !requests.stop_for_fork { + requests.stop_for_fork = true; + true + } else { + false + } + }); + + self.worker_group.wait_until_worker_exited(); + + self.worker_monitor.on_all_workers_exited(); + } + /// Resolve the affinity of a thread. pub fn resolve_affinity(&self, thread: ThreadId) { self.affinity.resolve_affinity(thread); @@ -101,6 +117,8 @@ impl GCWorkScheduler { self.worker_monitor.make_request(|requests| { if !requests.gc { requests.gc = true; + warn!("GC requested."); + dbg!(requests); true } else { false @@ -449,13 +467,19 @@ impl GCWorkScheduler { start_time: Instant::now(), }); + warn!("Responded to GC request."); + dbg!(goals); + self.add_schedule_collection_packet(); return LastParkedResult::WakeSelf; } if goals.requests.stop_for_fork { - // The VM wants to fork. GC threads should exit. - unimplemented!() + // A mutator wanted to fork. + goals.requests.stop_for_fork = false; + + goals.current = Some(WorkerGoal::StopForFork); + return LastParkedResult::WakeAll; } // No reqeusts. Park this worker, too. diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 52217908bf..2e65407b56 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -12,7 +12,7 @@ use crossbeam::queue::ArrayQueue; #[cfg(feature = "count_live_bytes_in_gc")] use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Condvar, Mutex}; /// Represents the ID of a GC worker thread. pub type ThreadId = usize; @@ -197,6 +197,7 @@ impl GCWorker { /// Each worker will keep polling and executing work packets in a loop. pub fn run(&mut self, tls: VMWorkerThread, mmtk: &'static MMTK) { probe!(mmtk, gcworker_run); + warn!("Worker {} started.", self.ordinal); WORKER_ORDINAL.with(|x| x.store(self.ordinal, Ordering::SeqCst)); self.scheduler.resolve_affinity(self.ordinal); self.tls = tls; @@ -256,6 +257,9 @@ pub(crate) struct WorkerGroup { pub workers_shared: Vec>>, /// The stateful part state: Mutex>, + /// The condition of "all workers exited", i.e. the number of suspended workers is equal to the + /// number of workers. + cond_all_workers_exited: Condvar, } /// We have to persuade Rust that `WorkerGroup` is safe to share because it thinks one worker can @@ -283,6 +287,7 @@ impl WorkerGroup { state: Mutex::new(WorkerCreationState::NotCreated { unspawned_local_work_queues, }), + cond_all_workers_exited: Default::default(), }) } @@ -330,6 +335,37 @@ impl WorkerGroup { } } + /// Surrender the `GCWorker` struct when a GC worker exits. 
+    pub fn surrender_gc_worker(&self, worker: Box>) {
+        let mut state = self.state.lock().unwrap();
+        let WorkerCreationState::Created { ref mut suspended_workers } = *state else {
+            panic!("GCWorker structs have not been created, yet.");
+        };
+        let ordinal = worker.ordinal;
+        suspended_workers.push(worker);
+        info!(
+            "Worker {} surrendered. ({}/{})",
+            ordinal,
+            suspended_workers.len(),
+            self.worker_count()
+        );
+        if suspended_workers.len() == self.worker_count() {
+            info!("All {} workers surrendered.", self.worker_count());
+            self.cond_all_workers_exited.notify_all();
+        }
+    }
+
+    /// Wait until all workers have exited.
+    pub fn wait_until_worker_exited(&self) {
+        let guard = self.state.lock().unwrap();
+        let _guard = self.cond_all_workers_exited.wait_while(guard, |state| {
+            let WorkerCreationState::Created { ref suspended_workers } = *state else {
+                panic!("GCWorker structs have not been created, yet.");
+            };
+            suspended_workers.len() != self.worker_count()
+        });
+    }
+
     /// Get the number of workers in the group
     pub fn worker_count(&self) -> usize {
         self.workers_shared.len()
diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs
index ae9a36763a..0dc991f9f0 100644
--- a/src/scheduler/worker_goals.rs
+++ b/src/scheduler/worker_goals.rs
@@ -15,7 +15,7 @@ use std::time::Instant;
 
 /// The current and requested goals.
-#[derive(Default)]
+#[derive(Default, Debug)]
 pub(crate) struct WorkerGoals {
     /// What are the workers doing now?
     pub(crate) current: Option,
@@ -25,6 +25,7 @@ pub(crate) struct WorkerGoals {
 
 /// The thing workers are currently doing. This affects several things, such as what the last
 /// parked worker will do, and whether workers will stop themselves.
+#[derive(Debug)]
 pub(crate) enum WorkerGoal {
     Gc {
         start_time: Instant,
@@ -38,7 +39,7 @@ pub(crate) enum WorkerGoal {
 /// thing with the highest priority.
 ///
 /// The fields of this struct are ordered by decreasing priority.
-#[derive(Default)] // All fields should be false by default.
+#[derive(Default, Debug)] // All fields should be false by default.
 pub(crate) struct WorkerRequests {
     /// The VM needs to fork. Workers should save their contexts and exit.
     pub(crate) stop_for_fork: bool,
diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs
index 8b39dd727a..893d17353f 100644
--- a/src/scheduler/worker_monitor.rs
+++ b/src/scheduler/worker_monitor.rs
@@ -152,6 +152,7 @@ impl WorkerMonitor {
 
         if all_parked {
             debug!("Worker {} is the last worker parked.", worker.ordinal);
+            dbg!(&sync.goals);
             let result = on_last_parked(&mut sync.goals);
             match result {
                 LastParkedResult::ParkSelf => {
@@ -238,4 +239,10 @@ impl WorkerMonitor {
         // If the current goal is `StopForFork`, return true so that the worker thread will exit.
         matches!(sync.goals.current, Some(WorkerGoal::StopForFork))
     }
+
+    /// Called when all workers have exited. 
+ pub fn on_all_workers_exited(&self) { + let mut sync = self.sync.try_lock().unwrap(); + sync.goals.current = None; + } } From 0d79d8d362d48daf7ea74c9f562d7b5cc47ca7e3 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Fri, 19 Jan 2024 17:08:27 +0800 Subject: [PATCH 18/48] Adjust logging --- src/scheduler/scheduler.rs | 10 +++------- src/scheduler/worker.rs | 11 ++++++++--- src/scheduler/worker_monitor.rs | 3 +-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index ab08d6c390..7a30007273 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -92,6 +92,7 @@ impl GCWorkScheduler { /// Ask all GC workers to exit for forking, and wait until all workers exited. pub fn stop_gc_threads_for_forking(self: &Arc) { + debug!("A mutator is requesting GC threads to stop for forking..."); self.worker_monitor.make_request(|requests| { if !requests.stop_for_fork { requests.stop_for_fork = true; @@ -117,8 +118,6 @@ impl GCWorkScheduler { self.worker_monitor.make_request(|requests| { if !requests.gc { requests.gc = true; - warn!("GC requested."); - dbg!(requests); true } else { false @@ -456,7 +455,7 @@ impl GCWorkScheduler { assert!(goals.current.is_none()); if goals.requests.gc { - // A mutator requested a GC to be scheduled. + trace!("A mutator requested a GC to be scheduled."); goals.requests.gc = false; // We set the eBPF trace point here so that bpftrace scripts can start recording work @@ -467,15 +466,12 @@ impl GCWorkScheduler { start_time: Instant::now(), }); - warn!("Responded to GC request."); - dbg!(goals); - self.add_schedule_collection_packet(); return LastParkedResult::WakeSelf; } if goals.requests.stop_for_fork { - // A mutator wanted to fork. + trace!("A mutator wanted to fork."); goals.requests.stop_for_fork = false; goals.current = Some(WorkerGoal::StopForFork); diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 2e65407b56..eb7b78b6a9 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -197,7 +197,7 @@ impl GCWorker { /// Each worker will keep polling and executing work packets in a loop. pub fn run(&mut self, tls: VMWorkerThread, mmtk: &'static MMTK) { probe!(mmtk, gcworker_run); - warn!("Worker {} started.", self.ordinal); + trace!("Worker {} started.", self.ordinal); WORKER_ORDINAL.with(|x| x.store(self.ordinal, Ordering::SeqCst)); self.scheduler.resolve_affinity(self.ordinal); self.tls = tls; @@ -229,6 +229,7 @@ impl GCWorker { probe!(mmtk, work, typename.as_ptr(), typename.len()); work.do_work_with_stat(self, mmtk); } + trace!("Worker {} exiting...", self.ordinal); probe!(mmtk, gcworker_exit); } } @@ -333,6 +334,8 @@ impl WorkerGroup { for worker in suspended_workers.drain(..) { VM::VMCollection::spawn_gc_thread(tls, GCThreadContext::::Worker(worker)); } + + debug!("Spawned {} worker threads.", self.worker_count()); } /// Surrender the `GCWorker` struct when a GC worker exits. @@ -343,14 +346,14 @@ impl WorkerGroup { }; let ordinal = worker.ordinal; suspended_workers.push(worker); - info!( + trace!( "Worker {} surrendered. 
({}/{})",
             ordinal,
             suspended_workers.len(),
             self.worker_count()
         );
         if suspended_workers.len() == self.worker_count() {
-            info!("All {} workers surrendered.", self.worker_count());
+            debug!("All {} workers surrendered.", self.worker_count());
             self.cond_all_workers_exited.notify_all();
         }
     }
@@ -364,6 +367,8 @@ impl WorkerGroup {
             };
             suspended_workers.len() != self.worker_count()
         });
+
+        debug!("All workers have exited.");
     }
 
     /// Get the number of workers in the group
diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs
index 893d17353f..cba34c6928 100644
--- a/src/scheduler/worker_monitor.rs
+++ b/src/scheduler/worker_monitor.rs
@@ -152,8 +152,7 @@ impl WorkerMonitor {
 
         if all_parked {
-            debug!("Worker {} is the last worker parked.", worker.ordinal);
-            dbg!(&sync.goals);
+            trace!("Worker {} is the last worker parked.", worker.ordinal);
             let result = on_last_parked(&mut sync.goals);
             match result {
                 LastParkedResult::ParkSelf => {

From 44f22ba37d654cecae29f6dd7a6468465d8c5fad Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Fri, 19 Jan 2024 17:27:42 +0800
Subject: [PATCH 19/48] Update eBPF tracing document

---
 tools/tracing/README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/tracing/README.md b/tools/tracing/README.md
index 9082f47942..96ff3e6a6d 100644
--- a/tools/tracing/README.md
+++ b/tools/tracing/README.md
@@ -16,10 +16,13 @@ shipped with the MMTk release you use.
 
 Currently, the core provides the following tracepoints.
 
-- `mmtk:collection_initialized()`: GC is enabled
+- `mmtk:collection_initialized()`: All GC worker threads are spawned
+- `mmtk:collection_uninitialized()`: All GC worker threads have exited. This may happen during
+  execution, for example, if the VM needs to fork and requests GC workers to exit.
 - `mmtk:harness_begin()`: the timing iteration of a benchmark begins
 - `mmtk:harness_end()`: the timing iteration of a benchmark ends
 - `mmtk:gcworker_run()`: a GC worker thread enters its work loop
+- `mmtk:gcworker_exit()`: a GC worker thread exits its work loop
 - `mmtk:gc_start()`: a collection epoch starts
 - `mmtk:gc_end()`: a collection epoch ends
 - `mmtk:process_edges(num_edges: int, is_roots: bool)`: a invocation of the `process_edges`

From 54f586b92e6e4410ebc5db2040fc0f1e9e0ab556 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Wed, 31 Jan 2024 17:13:59 +0800
Subject: [PATCH 20/48] Revert the change on `GcStatus`.

The change is unnecessary because GC workers now use the active goal to
determine what to do when the last worker parked.
---
 src/global_state.rs | 28 ++++------------------------
 src/mmtk.rs         | 14 +++++++-------
 2 files changed, 11 insertions(+), 31 deletions(-)

diff --git a/src/global_state.rs b/src/global_state.rs
index 1c7b7ee627..e066811557 100644
--- a/src/global_state.rs
+++ b/src/global_state.rs
@@ -1,7 +1,5 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-
-use atomic::Atomic;
-use bytemuck::NoUninit;
+use std::sync::Mutex;
 
 /// This stores some global states for an MMTK instance.
 /// Some MMTK components like plans and allocators may keep an reference to the struct, and can access it.
@@ -19,7 +17,7 @@ pub struct GlobalState {
     /// bindings to temporarily disable GC, at which point, we do not trigger GC even if the heap is full.
     pub(crate) trigger_gc_when_heap_is_full: AtomicBool,
     /// The current GC status.
-    pub(crate) gc_status: Atomic,
+    pub(crate) gc_status: Mutex,
     /// Is the current GC an emergency collection? 
Emergency means we may run out of memory soon, and we should /// attempt to collect as much as we can. pub(crate) emergency_collection: AtomicBool, @@ -205,7 +203,7 @@ impl Default for GlobalState { Self { initialized: AtomicBool::new(false), trigger_gc_when_heap_is_full: AtomicBool::new(true), - gc_status: Atomic::new(GcStatus::NotInGC), + gc_status: Mutex::new(GcStatus::NotInGC), stacks_prepared: AtomicBool::new(false), emergency_collection: AtomicBool::new(false), user_triggered_collection: AtomicBool::new(false), @@ -224,27 +222,9 @@ impl Default for GlobalState { } } -/// GC status. -/// -/// - It starts in the `NotInGC` state. -/// - It enters `GcPrepare` when GC starts (i.e. when `ScheduleCollection` is executed). -/// - It enters `GcProper` when all mutators have stopped. -/// - It returns to `NotInGC` when GC finishes. -/// -/// All states except `NotInGC` are considered "in GC". -/// -/// The status is checked by GC workers. The last parked worker only tries to open new buckets -/// during GC. -/// -/// The distinction between `GcPrepare` and `GcProper` is inherited from JikesRVM. Several -/// assertions in JikesRVM involve those two states. -#[derive(PartialEq, Clone, Copy, NoUninit, Debug)] -#[repr(u8)] +#[derive(PartialEq)] pub enum GcStatus { - /// Not in GC NotInGC, - /// GC has started, but not all stacks have been scanned, yet. GcPrepare, - /// GC has started, and all stacks have been scanned. GcProper, } diff --git a/src/mmtk.rs b/src/mmtk.rs index 44ef399083..d6d4b27c02 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -255,15 +255,15 @@ impl MMTK { self.inside_sanity.load(Ordering::Relaxed) } - pub(crate) fn set_gc_status(&self, new_status: GcStatus) { - let old_status = self.state.gc_status.swap(new_status, Ordering::SeqCst); - debug_assert_ne!(old_status, new_status); - if old_status == GcStatus::NotInGC { + pub(crate) fn set_gc_status(&self, s: GcStatus) { + let mut gc_status = self.state.gc_status.lock().unwrap(); + if *gc_status == GcStatus::NotInGC { self.state.stacks_prepared.store(false, Ordering::SeqCst); // FIXME stats self.stats.start_gc(); } - if new_status == GcStatus::NotInGC { + *gc_status = s; + if *gc_status == GcStatus::NotInGC { // FIXME stats if self.stats.get_gathering_stats() { self.stats.end_gc(); @@ -273,12 +273,12 @@ impl MMTK { /// Return true if a collection is in progress. pub fn gc_in_progress(&self) -> bool { - self.state.gc_status.load(Ordering::SeqCst) != GcStatus::NotInGC + *self.state.gc_status.lock().unwrap() != GcStatus::NotInGC } /// Return true if a collection is in progress and past the preparatory stage. pub fn gc_in_progress_proper(&self) -> bool { - self.state.gc_status.load(Ordering::SeqCst) == GcStatus::GcProper + *self.state.gc_status.lock().unwrap() == GcStatus::GcProper } /// Return true if the current GC is an emergency GC. From 822f6f5825f778c135936d0b041a2bd541e4752e Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 1 Feb 2024 12:17:36 +0800 Subject: [PATCH 21/48] Fix clippy warning --- src/scheduler/worker.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index eb7b78b6a9..5f6838b408 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -240,8 +240,12 @@ enum WorkerCreationState { unspawned_local_work_queues: Vec>>>, }, Created { - /// `Worker` instances for worker threads that have not been created yet, or worker thread that - /// are suspended (e.g. for forking). 
+        /// `Worker` instances for worker threads that have not been created yet, or worker thread
+        /// that are suspended (e.g. for forking).
+        // Note: Clippy warns about `Vec>` because `Vec` is already in the heap.
+        // However, the purpose of this `Vec` is allowing GC worker threads to give their
+        // `Box>` instances back to this pool. Therefore, the `Box` is necessary.
+        #[allow(clippy::vec_box)]
         suspended_workers: Vec>>,
     },
 }

From 7cb0b2c12be341c084e03f2cd943fad8ac088f83 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Thu, 1 Feb 2024 12:17:54 +0800
Subject: [PATCH 22/48] Formatting

---
 src/memory_manager.rs      | 1 -
 src/scheduler/mod.rs       | 2 +-
 src/scheduler/scheduler.rs | 3 ++-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/memory_manager.rs b/src/memory_manager.rs
index 964d55270a..364ceff616 100644
--- a/src/memory_manager.rs
+++ b/src/memory_manager.rs
@@ -515,7 +515,6 @@ pub fn uninitialize_collection(mmtk: &'static MMTK) {
     probe!(mmtk, collection_uninitialized);
 }
 
-
 /// Allow MMTk to trigger garbage collection when heap is full. This should only be used in pair with disable_collection().
 /// See the comments on disable_collection(). If disable_collection() is not used, there is no need to call this function at all.
 /// Note this call is not thread safe, only one VM thread should call this.
diff --git a/src/scheduler/mod.rs b/src/scheduler/mod.rs
index b4d79adeb8..6cc3b4da94 100644
--- a/src/scheduler/mod.rs
+++ b/src/scheduler/mod.rs
@@ -17,8 +17,8 @@ mod work_bucket;
 pub use work_bucket::WorkBucketStage;
 
 mod worker;
-mod worker_monitor;
 mod worker_goals;
+mod worker_monitor;
 pub(crate) use worker::current_worker_ordinal;
 pub use worker::GCWorker;
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 7a30007273..79972a9edb 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -395,7 +395,8 @@ impl GCWorkScheduler {
             return Ok(work);
         }
 
-        let should_exit = self.worker_monitor
+        let should_exit = self
+            .worker_monitor
             .park_and_wait(worker, |goals| self.on_last_parked(worker, goals));
 
         if should_exit {

From 176ad475b0ba05d6c1024b2c345ed203dcd02448 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Mon, 5 Feb 2024 17:28:02 +0800
Subject: [PATCH 23/48] Replace uninitialize_collection with prepare/after fork.

---
 src/memory_manager.rs      | 33 +++++----------
 src/mmtk.rs                | 50 +++++++++++++++++++++++++
 src/scheduler/scheduler.rs | 12 +++++-
 src/scheduler/worker.rs    | 75 +++++++++++++++++++++++++------------
 4 files changed, 115 insertions(+), 55 deletions(-)

diff --git a/src/memory_manager.rs b/src/memory_manager.rs
index 364ceff616..f30eae8b40 100644
--- a/src/memory_manager.rs
+++ b/src/memory_manager.rs
@@ -480,39 +480,16 @@ pub fn start_worker(
     mmtk.scheduler.worker_group.surrender_gc_worker(worker);
 }
 
-/// Initialize the scheduler and GC workers that are required for doing garbage collections.
+/// Initialize the GC worker threads that are required for doing garbage collections.
 /// This is a mandatory call for a VM during its boot process once its thread system
-/// is ready. This call will invoke Collection::spawn_gc_thread() to create GC threads.
-///
-/// This function can also be called after `uninitialize_collection` is called in order to re-spawn
-/// the GC thtreads.
+/// is ready. This call will invoke `Collection::spawn_gc_thread()` to create GC threads.
 ///
 /// Arguments:
 /// * `mmtk`: A reference to an MMTk instance.
-/// * `tls`: The thread that wants to enable the collection. 
This value will be passed back to the VM in -/// Collection::spawn_gc_thread() so that the VM knows the context. +/// * `tls`: The thread that wants to enable the collection. This value will be passed back to +/// the VM in `Collection::spawn_gc_thread()` so that the VM knows the context. pub fn initialize_collection(mmtk: &'static MMTK, tls: VMThread) { - assert!( - !mmtk.state.is_initialized(), - "MMTk collection has been initialized (was initialize_collection() already called before?)" - ); - mmtk.scheduler.spawn_gc_threads(mmtk, tls); - mmtk.state.initialized.store(true, Ordering::SeqCst); - probe!(mmtk, collection_initialized); -} - -/// Ask GC worker threads to exit. Currently, this function is for the sole purpose of forking. -/// -/// Arguments: -/// * `mmtk`: A reference to an MMTk instance. -pub fn uninitialize_collection(mmtk: &'static MMTK) { - assert!( - mmtk.state.is_initialized(), - "MMTk collection has not been initialized, yet (was initialize_collection() called before?)" - ); - mmtk.scheduler.stop_gc_threads_for_forking(); - mmtk.state.initialized.store(false, Ordering::SeqCst); - probe!(mmtk, collection_uninitialized); + mmtk.initialize_collection(tls); } /// Allow MMTk to trigger garbage collection when heap is full. This should only be used in pair with disable_collection(). diff --git a/src/mmtk.rs b/src/mmtk.rs index d6d4b27c02..7e665c212e 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -220,6 +220,56 @@ impl MMTK { } } + /// Initialize the GC worker threads that are required for doing garbage collections. + /// This is a mandatory call for a VM during its boot process once its thread system + /// is ready. This call will invoke `Collection::spawn_gc_thread()`` to create GC threads. + /// + /// # Arguments + /// + /// * `tls`: The thread that wants to enable the collection. This value will be passed back + /// to the VM in `Collection::spawn_gc_thread()` so that the VM knows the context. + pub fn initialize_collection(&'static self, tls: VMThread) { + assert!( + !self.state.is_initialized(), + "MMTk collection has been initialized (was initialize_collection() already called before?)" + ); + self.scheduler.spawn_gc_threads(self, tls); + self.state.initialized.store(true, Ordering::SeqCst); + probe!(mmtk, collection_initialized); + } + + /// Prepare an MMTk instance for calling the `fork()` system call. + /// + /// This function makes all MMTk threads (currently including GC worker threads) save their + /// contexts and stop. A subsequent call to `MMTK::after_fork()` will re-spawn the threads + /// using the saved contexts. + /// + /// This function returns when all MMTK threads stopped. + pub fn prepare_to_fork(&'static self) { + assert!( + self.state.is_initialized(), + "MMTk collection has not been initialized, yet (was initialize_collection() called before?)" + ); + self.scheduler.stop_gc_threads_for_forking(); + } + + /// Call this function after the VM called the `fork()` system call. + /// + /// This function will re-spawn MMTk threads from saved contexts. + /// + /// # Arguments + /// + /// * `tls`: The thread that wants to respawn MMTk threads after forking. This value will be + /// passed back to the VM in `Collection::spawn_gc_thread()` so that the VM knows the + /// context. 
+ pub fn after_fork(&'static self, tls: VMThread) { + assert!( + self.state.is_initialized(), + "MMTk collection has not been initialized, yet (was initialize_collection() called before?)" + ); + self.scheduler.respawn_gc_threads_after_forking(tls); + } + /// Generic hook to allow benchmarks to be harnessed. MMTk will trigger a GC /// to clear any residual garbage and start collecting statistics for the benchmark. /// This is usually called by the benchmark harness as its last step before the actual benchmark. diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 79972a9edb..960fca80e8 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -85,13 +85,16 @@ impl GCWorkScheduler { self.worker_group.as_ref().worker_count() } - /// Create GC threads, including all workers. + /// Create GC threads, including all workers, for the first time. pub fn spawn_gc_threads(self: &Arc, mmtk: &'static MMTK, tls: VMThread) { - self.worker_group.spawn(mmtk, tls) + self.worker_group.create_workers(mmtk); + self.worker_group.spawn(tls) } /// Ask all GC workers to exit for forking, and wait until all workers exited. pub fn stop_gc_threads_for_forking(self: &Arc) { + self.worker_group.prepare_surrender_buffer(); + debug!("A mutator is requesting GC threads to stop for forking..."); self.worker_monitor.make_request(|requests| { if !requests.stop_for_fork { @@ -107,6 +110,11 @@ impl GCWorkScheduler { self.worker_monitor.on_all_workers_exited(); } + /// Respawn GC threads after forking. + pub fn respawn_gc_threads_after_forking(self: &Arc, tls: VMThread) { + self.worker_group.spawn(tls) + } + /// Resolve the affinity of a thread. pub fn resolve_affinity(&self, thread: ThreadId) { self.affinity.resolve_affinity(thread); diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 5f6838b408..5d19d003ac 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -197,7 +197,12 @@ impl GCWorker { /// Each worker will keep polling and executing work packets in a loop. pub fn run(&mut self, tls: VMWorkerThread, mmtk: &'static MMTK) { probe!(mmtk, gcworker_run); - trace!("Worker {} started.", self.ordinal); + debug!( + "Worker started. PID: {}, tid: {}, ordinal: {}", + unsafe { libc::getpid() }, + unsafe { libc::gettid() }, + self.ordinal + ); WORKER_ORDINAL.with(|x| x.store(self.ordinal, Ordering::SeqCst)); self.scheduler.resolve_affinity(self.ordinal); self.tls = tls; @@ -229,17 +234,25 @@ impl GCWorker { probe!(mmtk, work, typename.as_ptr(), typename.len()); work.do_work_with_stat(self, mmtk); } - trace!("Worker {} exiting...", self.ordinal); + debug!( + "Worker exiting. PID: {}, tid: {}, ordinal: {}", + unsafe { libc::getpid() }, + unsafe { libc::gettid() }, + self.ordinal + ); probe!(mmtk, gcworker_exit); } } enum WorkerCreationState { + /// `GCWorker`` structs have not been created yet. NotCreated { /// The local work queues for to-be-created workers. unspawned_local_work_queues: Vec>>>, }, - Created { + /// `GCWorker`` structs are created, but worker threads are either not been spawn, yet, or in + /// the progress of stopping for forking. + Resting { /// `Worker` instances for worker threads that have not been created yet, or worker thread /// that are suspended (e.g. for forking). // Note: Clippy warns about `Vec>` because `Vec` is already in the heap. 
@@ -248,12 +261,9 @@ enum WorkerCreationState { #[allow(clippy::vec_box)] suspended_workers: Vec>>, }, -} - -impl WorkerCreationState { - pub fn is_created(&self) -> bool { - matches!(self, WorkerCreationState::Created { .. }) - } + /// All `GCWorker`` structs have been transferred to worker threads, and worker threads are + /// running. + Spawned, } /// A worker group to manage all the GC workers. @@ -297,8 +307,11 @@ impl WorkerGroup { } /// Create `GCWorker` instances, but do not create threads, yet. - fn create_worker_structs(&self, state: &mut WorkerCreationState, mmtk: &'static MMTK) { - let WorkerCreationState::NotCreated { unspawned_local_work_queues } = state else { + pub fn create_workers(&self, mmtk: &'static MMTK) { + debug!("Creating GCWorker instances..."); + let mut state = self.state.lock().unwrap(); + + let WorkerCreationState::NotCreated { ref mut unspawned_local_work_queues } = *state else { panic!("GCWorker structs have already been created"); }; @@ -317,20 +330,16 @@ impl WorkerGroup { suspended_workers.push(worker); } - *state = WorkerCreationState::Created { suspended_workers }; + *state = WorkerCreationState::Resting { suspended_workers }; + debug!("GCWorker instances created."); } /// Spawn all the worker threads - pub fn spawn(&self, mmtk: &'static MMTK, tls: VMThread) { - let mut guard = self.state.lock().unwrap(); - - // Create worker structs lazily. If we are spawning workers after fork, worker structs - // will have been created already, so we can skip the creation. - if !guard.is_created() { - self.create_worker_structs(&mut *guard, mmtk); - } + pub fn spawn(&self, tls: VMThread) { + debug!("Spawning GC workers. PID: {}", unsafe { libc::getpid() }); + let mut state = self.state.lock().unwrap(); - let WorkerCreationState::Created { ref mut suspended_workers } = *guard else { + let WorkerCreationState::Resting { ref mut suspended_workers } = *state else { panic!("GCWorker structs have not been created, yet."); }; @@ -339,13 +348,27 @@ impl WorkerGroup { VM::VMCollection::spawn_gc_thread(tls, GCThreadContext::::Worker(worker)); } - debug!("Spawned {} worker threads.", self.worker_count()); + *state = WorkerCreationState::Spawned; + + debug!( + "Spawned {} worker threads. PID: {}", + self.worker_count(), + unsafe { libc::getpid() } + ); + } + + /// Prepare the buffer for workers to surrender their `GCWorker` structs. + pub fn prepare_surrender_buffer(&self) { + let mut state = self.state.lock().unwrap(); + *state = WorkerCreationState::Resting { + suspended_workers: Vec::with_capacity(self.worker_count()), + } } /// Surrender the `GCWorker` struct when a GC worker exits. pub fn surrender_gc_worker(&self, worker: Box>) { let mut state = self.state.lock().unwrap(); - let WorkerCreationState::Created { ref mut suspended_workers } = *state else { + let WorkerCreationState::Resting { ref mut suspended_workers } = *state else { panic!("GCWorker structs have not been created, yet."); }; let ordinal = worker.ordinal; @@ -366,13 +389,15 @@ impl WorkerGroup { pub fn wait_until_worker_exited(&self) { let guard = self.state.lock().unwrap(); let _guard = self.cond_all_workers_exited.wait_while(guard, |state| { - let WorkerCreationState::Created { ref suspended_workers } = *state else { + let WorkerCreationState::Resting { ref suspended_workers } = *state else { panic!("GCWorker structs have not been created, yet."); }; suspended_workers.len() != self.worker_count() }); - debug!("All workers have exited."); + debug!("All workers have exited. 
PID: {}", unsafe { + libc::getpid() + }); } /// Get the number of workers in the group From d3ddf7d73b29d5459261fd4eeffc0d649a783f89 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Wed, 7 Feb 2024 16:14:19 +0800 Subject: [PATCH 24/48] Fix unused warning --- src/memory_manager.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/memory_manager.rs b/src/memory_manager.rs index 53b8d55f6b..86468bc370 100644 --- a/src/memory_manager.rs +++ b/src/memory_manager.rs @@ -25,7 +25,7 @@ use crate::util::{Address, ObjectReference}; use crate::vm::edge_shape::MemorySlice; use crate::vm::ReferenceGlue; use crate::vm::VMBinding; -use std::sync::atomic::Ordering; + /// Initialize an MMTk instance. A VM should call this method after creating an [`crate::MMTK`] /// instance but before using any of the methods provided in MMTk (except `process()` and `process_bulk()`). /// @@ -438,6 +438,7 @@ pub fn free_with_size(mmtk: &MMTK, addr: Address, old_size: u /// Get the current active malloc'd bytes. Here MMTk only accounts for bytes that are done through those 'counted malloc' functions. #[cfg(feature = "malloc_counted_size")] pub fn get_malloc_bytes(mmtk: &MMTK) -> usize { + use std::sync::atomic::Ordering; mmtk.state.malloc_bytes.load(Ordering::SeqCst) } @@ -536,6 +537,7 @@ pub fn free_bytes(mmtk: &MMTK) -> usize { /// to call this method is at the end of a GC (e.g. when the runtime is about to resume threads). #[cfg(feature = "count_live_bytes_in_gc")] pub fn live_bytes_in_last_gc(mmtk: &MMTK) -> usize { + use std::sync::atomic::Ordering; mmtk.state.live_bytes_in_last_gc.load(Ordering::SeqCst) } From 5c125fbf4f0199398a682026bacc64e868a9aba3 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Wed, 7 Feb 2024 19:43:55 +0800 Subject: [PATCH 25/48] Do not wait for all workers to give in their structs --- src/memory_manager.rs | 2 +- src/mmtk.rs | 25 +++++++++++++++++++++---- src/scheduler/scheduler.rs | 10 ++++++++-- src/scheduler/worker.rs | 31 +++++-------------------------- 4 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/memory_manager.rs b/src/memory_manager.rs index 86468bc370..e3cf6b52be 100644 --- a/src/memory_manager.rs +++ b/src/memory_manager.rs @@ -477,7 +477,7 @@ pub fn start_worker( mut worker: Box>, ) { worker.run(tls, mmtk); - mmtk.scheduler.worker_group.surrender_gc_worker(worker); + mmtk.scheduler.surrender_gc_worker(worker); } /// Initialize the GC worker threads that are required for doing garbage collections. diff --git a/src/mmtk.rs b/src/mmtk.rs index d95e23ee88..7c96b9a1d3 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -240,11 +240,28 @@ impl MMTK { /// Prepare an MMTk instance for calling the `fork()` system call. /// - /// This function makes all MMTk threads (currently including GC worker threads) save their - /// contexts and stop. A subsequent call to `MMTK::after_fork()` will re-spawn the threads - /// using the saved contexts. + /// The `fork()` system call is available on Linux and some UNIX variants, and may be emulated + /// on other platforms by libraries such as Cygwin. If `fork()` is called when the process has + /// multiple threads, it will only duplicate the current thread into the child process, and the + /// child process can only call async-signal-safe functions, notably `exec()`. For VMs that + /// that use multi-process concurrency, it is imperative that when calling `fork()`, only one + /// thread may exist in the process. /// - /// This function returns when all MMTK threads stopped. 
+ /// This function helps VMs that use `fork()` for multi-process concurrency. It instructs all + /// GC threads to save their contexts and return from their entry-point functions. + /// + /// Currently, such threads only include GC workers, and the entry point is + /// [`crate::memory_manager::start_worker`]. + /// + /// A subsequent call to `MMTK::after_fork()` will re-spawn the threads using their saved + /// contexts. + /// + /// # Caution! + /// + /// This function sends an asynchronous message to GC threads and returns immediately, but it + /// is only safe for the VM to call `fork()` after the underlying **native threads** of the GC + /// threads have exited. After calling this function, the VM should wait for their underlying + /// native threads to exit in VM-specific manner before calling `fork()`. pub fn prepare_to_fork(&'static self) { assert!( self.state.is_initialized(), diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 960fca80e8..925fb11ed3 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -104,10 +104,16 @@ impl GCWorkScheduler { false } }); + } - self.worker_group.wait_until_worker_exited(); + /// Surrender the `GCWorker` struct of a GC worker when it exits. + pub fn surrender_gc_worker(&self, worker: Box>) { + let all_surrendered = self.worker_group.return_gc_worker_struct(worker); - self.worker_monitor.on_all_workers_exited(); + if all_surrendered { + debug!("All {} workers surrendered.", self.worker_group.worker_count()); + self.worker_monitor.on_all_workers_exited(); + } } /// Respawn GC threads after forking. diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 5d19d003ac..9de6e6c045 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -12,7 +12,7 @@ use crossbeam::queue::ArrayQueue; #[cfg(feature = "count_live_bytes_in_gc")] use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::{Arc, Mutex}; /// Represents the ID of a GC worker thread. pub type ThreadId = usize; @@ -272,9 +272,6 @@ pub(crate) struct WorkerGroup { pub workers_shared: Vec>>, /// The stateful part state: Mutex>, - /// The condition of "all workers exited", i.e. the number of suspended workers is equal to the - /// number of workers. - cond_all_workers_exited: Condvar, } /// We have to persuade Rust that `WorkerGroup` is safe to share because it thinks one worker can @@ -302,7 +299,6 @@ impl WorkerGroup { state: Mutex::new(WorkerCreationState::NotCreated { unspawned_local_work_queues, }), - cond_all_workers_exited: Default::default(), }) } @@ -365,8 +361,9 @@ impl WorkerGroup { } } - /// Surrender the `GCWorker` struct when a GC worker exits. - pub fn surrender_gc_worker(&self, worker: Box>) { + /// Return the `GCWorker` struct to the worker group. + /// This function returns `true` if all workers returned their `GCWorker` structs. + pub fn return_gc_worker_struct(&self, worker: Box>) -> bool { let mut state = self.state.lock().unwrap(); let WorkerCreationState::Resting { ref mut suspended_workers } = *state else { panic!("GCWorker structs have not been created, yet."); @@ -379,25 +376,7 @@ impl WorkerGroup { suspended_workers.len(), self.worker_count() ); - if suspended_workers.len() == self.worker_count() { - debug!("All {} workers surrendered.", self.worker_count()); - self.cond_all_workers_exited.notify_all(); - } - } - - /// Wait until all workers exited. 
- pub fn wait_until_worker_exited(&self) { - let guard = self.state.lock().unwrap(); - let _guard = self.cond_all_workers_exited.wait_while(guard, |state| { - let WorkerCreationState::Resting { ref suspended_workers } = *state else { - panic!("GCWorker structs have not been created, yet."); - }; - suspended_workers.len() != self.worker_count() - }); - - debug!("All workers have exited. PID: {}", unsafe { - libc::getpid() - }); + suspended_workers.len() == self.worker_count() } /// Get the number of workers in the group From 41ee28cdc3c1c2f3f10f8e95ff21d8af7ddf9aa3 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 8 Feb 2024 11:05:01 +0800 Subject: [PATCH 26/48] Fix comment --- src/memory_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memory_manager.rs b/src/memory_manager.rs index e3cf6b52be..7842acd2fa 100644 --- a/src/memory_manager.rs +++ b/src/memory_manager.rs @@ -464,7 +464,7 @@ pub fn gc_poll(mmtk: &MMTK, tls: VMMutatorThread) { /// Run the main loop of a GC worker. /// /// This method runs until the worker exits. Currently a worker may exit after -/// `uninitialize_collection` is called. +/// [`crate::mmtk::MMTK::prepare_to_fork`] is called. /// /// Arguments: /// * `tls`: The thread that will be used as the GC worker. From c4ab8b4c9324b1432f234e23b186bf426f538b2b Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 8 Feb 2024 11:34:27 +0800 Subject: [PATCH 27/48] Only print TID on linux. --- src/scheduler/worker.rs | 23 ++++++++++++----------- src/util/rust_util/mod.rs | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 9de6e6c045..730a525602 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -198,10 +198,9 @@ impl GCWorker { pub fn run(&mut self, tls: VMWorkerThread, mmtk: &'static MMTK) { probe!(mmtk, gcworker_run); debug!( - "Worker started. PID: {}, tid: {}, ordinal: {}", - unsafe { libc::getpid() }, - unsafe { libc::gettid() }, - self.ordinal + "Worker started. ordinal: {}, {}", + self.ordinal, + crate::util::rust_util::debug_process_thread_id(), ); WORKER_ORDINAL.with(|x| x.store(self.ordinal, Ordering::SeqCst)); self.scheduler.resolve_affinity(self.ordinal); @@ -235,10 +234,9 @@ impl GCWorker { work.do_work_with_stat(self, mmtk); } debug!( - "Worker exiting. PID: {}, tid: {}, ordinal: {}", - unsafe { libc::getpid() }, - unsafe { libc::gettid() }, - self.ordinal + "Worker exiting. ordinal: {}, {}", + self.ordinal, + crate::util::rust_util::debug_process_thread_id(), ); probe!(mmtk, gcworker_exit); } @@ -332,7 +330,10 @@ impl WorkerGroup { /// Spawn all the worker threads pub fn spawn(&self, tls: VMThread) { - debug!("Spawning GC workers. PID: {}", unsafe { libc::getpid() }); + debug!( + "Spawning GC workers. {}", + crate::util::rust_util::debug_process_thread_id(), + ); let mut state = self.state.lock().unwrap(); let WorkerCreationState::Resting { ref mut suspended_workers } = *state else { @@ -347,9 +348,9 @@ impl WorkerGroup { *state = WorkerCreationState::Spawned; debug!( - "Spawned {} worker threads. PID: {}", + "Spawned {} worker threads. 
{}", self.worker_count(), - unsafe { libc::getpid() } + crate::util::rust_util::debug_process_thread_id(), ); } diff --git a/src/util/rust_util/mod.rs b/src/util/rust_util/mod.rs index efd7224881..062df368f4 100644 --- a/src/util/rust_util/mod.rs +++ b/src/util/rust_util/mod.rs @@ -161,3 +161,17 @@ mod initialize_once_tests { assert_eq!(INITIALIZE_COUNT.load(Ordering::SeqCst), 1); } } + +/// Create a formatted string that makes the best effort idenfying the current process and thread. +pub fn debug_process_thread_id() -> String { + let pid = unsafe { libc::getpid() }; + if cfg!(target_os = "linux") { + // `gettid()` is Linux-specific. + let tid = unsafe { libc::gettid() }; + format!("PID: {}, TID: {}", pid, tid) + } else { + // TODO: When we support other platforms, use platform-specific methods to get thread + // identifiers. + format!("PID: {}", pid) + } +} From d9b451a3d8e9d157293cc90c5fcf5e421faf8465 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 8 Feb 2024 12:13:03 +0800 Subject: [PATCH 28/48] Fix comments --- src/mmtk.rs | 6 +++++- src/scheduler/scheduler.rs | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/mmtk.rs b/src/mmtk.rs index 7c96b9a1d3..56bbf7a9c9 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -222,7 +222,11 @@ impl MMTK { /// Initialize the GC worker threads that are required for doing garbage collections. /// This is a mandatory call for a VM during its boot process once its thread system - /// is ready. This call will invoke `Collection::spawn_gc_thread()`` to create GC threads. + /// is ready. This function should be called exactly once, and should be called before + /// allocating objects in the MMTk heap. + /// + /// Internally, this function will spawn the GC worker threads, and the `tls` argument is used + /// for this purpose. /// /// # Arguments /// diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 925fb11ed3..a9c1ffc1c1 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -111,7 +111,10 @@ impl GCWorkScheduler { let all_surrendered = self.worker_group.return_gc_worker_struct(worker); if all_surrendered { - debug!("All {} workers surrendered.", self.worker_group.worker_count()); + debug!( + "All {} workers surrendered.", + self.worker_group.worker_count() + ); self.worker_monitor.on_all_workers_exited(); } } From 7a2074ea7c9d29cfa7e686c0c0d2e832b1f66232 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 8 Feb 2024 15:10:12 +0800 Subject: [PATCH 29/48] Refactor WorkerRequest and fix comments Use WorkerRequest to abstract out common access patterns. --- src/mmtk.rs | 3 ++- src/plan/gc_requester.rs | 4 ++-- src/scheduler/scheduler.rs | 40 ++++++++++++-------------------- src/scheduler/worker_goals.rs | 43 ++++++++++++++++++++++++++++++++--- 4 files changed, 59 insertions(+), 31 deletions(-) diff --git a/src/mmtk.rs b/src/mmtk.rs index 56bbf7a9c9..a1ac528c90 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -258,7 +258,8 @@ impl MMTK { /// [`crate::memory_manager::start_worker`]. /// /// A subsequent call to `MMTK::after_fork()` will re-spawn the threads using their saved - /// contexts. + /// contexts. The VM must not allocate objects in the MMTk heap before calling + /// `MMTK::after_fork()`. /// /// # Caution! 
/// diff --git a/src/plan/gc_requester.rs b/src/plan/gc_requester.rs index 49ae9ce2c9..e3a8462f96 100644 --- a/src/plan/gc_requester.rs +++ b/src/plan/gc_requester.rs @@ -28,8 +28,8 @@ impl GCRequester { if !self.request_flag.swap(true, Ordering::Relaxed) { // `GCWorkScheduler::request_schedule_collection` needs to hold a mutex to communicate - // with GC workers, so only the first mutator that changed the `request_flag` to `true` - // shall do it. + // with GC workers, which is expensive for functions like `poll`. We use the atomic + // flag `request_flag` to elide the need to acquire the mutex in subsequent calls. self.scheduler.request_schedule_collection(); } } diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index a9c1ffc1c1..4dd1d53e7b 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -26,7 +26,7 @@ pub struct GCWorkScheduler { pub work_buckets: EnumMap>, /// Workers pub(crate) worker_group: Arc>, - /// Monitor for worker synchronization, including a mutex and conditional variables. + /// For synchronized communication between workers and with mutators. pub(crate) worker_monitor: Arc, /// How to assign the affinity of each GC thread. Specified by the user. affinity: AffinityKind, @@ -85,25 +85,22 @@ impl GCWorkScheduler { self.worker_group.as_ref().worker_count() } - /// Create GC threads, including all workers, for the first time. + /// Create GC threads for the first time. It will also create the `GCWorker` instances. + /// + /// Currently GC threads only include worker threads, and we currently have only one worker + /// group. We may add more worker groups in the future. pub fn spawn_gc_threads(self: &Arc, mmtk: &'static MMTK, tls: VMThread) { self.worker_group.create_workers(mmtk); self.worker_group.spawn(tls) } - /// Ask all GC workers to exit for forking, and wait until all workers exited. + /// Ask all GC workers to exit for forking. pub fn stop_gc_threads_for_forking(self: &Arc) { self.worker_group.prepare_surrender_buffer(); debug!("A mutator is requesting GC threads to stop for forking..."); - self.worker_monitor.make_request(|requests| { - if !requests.stop_for_fork { - requests.stop_for_fork = true; - true - } else { - false - } - }); + self.worker_monitor + .make_request(|requests| requests.stop_for_fork.set()); } /// Surrender the `GCWorker` struct of a GC worker when it exits. @@ -119,7 +116,8 @@ impl GCWorkScheduler { } } - /// Respawn GC threads after forking. + /// Respawn GC threads after forking. This will reuse the `GCWorker` instances of stopped + /// workers. pub fn respawn_gc_threads_after_forking(self: &Arc, tls: VMThread) { self.worker_group.spawn(tls) } @@ -132,14 +130,8 @@ impl GCWorkScheduler { /// Request a GC to be scheduled. Called by mutator via `GCRequester`. pub(crate) fn request_schedule_collection(&self) { debug!("A mutator is sending GC-scheduling request to workers..."); - self.worker_monitor.make_request(|requests| { - if !requests.gc { - requests.gc = true; - true - } else { - false - } - }); + self.worker_monitor + .make_request(|requests| requests.gc.set()); } /// Add the `ScheduleCollection` packet. Called by the last parked worker. @@ -437,7 +429,7 @@ impl GCWorkScheduler { // In stop-the-world GC, mutators cannot request for GC while GC is in progress. // When we support concurrent GC, we should remove this assertion. assert!( - !goals.requests.gc, + !goals.requests.gc.debug_is_set(), "GC request sent to WorkerMonitor while GC is still in progress." 
        );
 
@@ -472,9 +464,8 @@ impl GCWorkScheduler {
     fn respond_to_requests(&self, goals: &mut WorkerGoals) -> LastParkedResult {
         assert!(goals.current.is_none());
 
-        if goals.requests.gc {
+        if goals.requests.gc.poll() {
             trace!("A mutator requested a GC to be scheduled.");
-            goals.requests.gc = false;
 
             // We set the eBPF trace point here so that bpftrace scripts can start recording work
             // packet events before the `ScheduleCollection` work packet starts.
@@ -488,9 +479,8 @@ impl GCWorkScheduler {
             return LastParkedResult::WakeSelf;
         }
 
-        if goals.requests.stop_for_fork {
+        if goals.requests.stop_for_fork.poll() {
             trace!("A mutator wanted to fork.");
-            goals.requests.stop_for_fork = false;
 
             goals.current = Some(WorkerGoal::StopForFork);
             return LastParkedResult::WakeAll;
diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs
index 0dc991f9f0..73d7a3f17e 100644
--- a/src/scheduler/worker_goals.rs
+++ b/src/scheduler/worker_goals.rs
@@ -39,10 +39,47 @@ pub(crate) enum WorkerGoal {
 /// thing with the highest priority.
 ///
 /// The fields of this struct are ordered with decreasing priority.
-#[derive(Default, Debug)] // All fields should be false by default.
+#[derive(Default, Debug)]
 pub(crate) struct WorkerRequests {
     /// The VM needs to fork. Workers should save their contexts and exit.
-    pub(crate) stop_for_fork: bool,
+    pub(crate) stop_for_fork: WorkerRequest,
     /// GC is requested. Workers should schedule a GC.
-    pub(crate) gc: bool,
+    pub(crate) gc: WorkerRequest,
+}
+
+/// To record whether a specific goal has been requested.
+/// It is basically a wrapper of `bool`, but forces it to be accessed in a particular way.
+#[derive(Default, Debug)] // Default: False by default.
+pub(crate) struct WorkerRequest {
+    /// True if the goal has been requested.
+    requested: bool,
+}
+
+impl WorkerRequest {
+    /// Set the goal as requested. Return `true` if its requested state changed from `false` to
+    /// `true`.
+    pub fn set(&mut self) -> bool {
+        if !self.requested {
+            self.requested = true;
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Get the requested state and clear it. Return `true` if the requested state was `true`.
+    pub fn poll(&mut self) -> bool {
+        if self.requested {
+            self.requested = false;
+            true
+        } else {
+            false
+        }
+    }
+
+    /// Test if the request is set. For debug only. The last parked worker should use `poll` to
+    /// get the state and clear it.
+    pub fn debug_is_set(&self) -> bool {
+        self.requested
+    }
+}
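Patch 29 above layers two flags: `GCRequester`'s relaxed atomic `request_flag` lets repeated triggers skip the mutex, while the mutex-protected `WorkerRequest::set`/`poll` pair is what the last parked worker actually consumes and clears. A minimal, self-contained sketch of that two-level handshake follows; `FastRequester` and `Coordinator` are illustrative names only, not mmtk-core types.

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::{Condvar, Mutex};

    /// Stand-in for the mutex-protected request flag consumed by the last parked worker.
    struct Coordinator {
        requested: Mutex<bool>,
        cond: Condvar,
    }

    /// Stand-in for `GCRequester`: an atomic fast path elides the mutex on repeated triggers.
    struct FastRequester {
        request_flag: AtomicBool,
    }

    impl FastRequester {
        fn request(&self, coord: &Coordinator) {
            // Only the caller that flips the flag from false to true takes the mutex;
            // concurrent and subsequent callers return immediately until a worker
            // clears `request_flag` when it takes the request.
            if !self.request_flag.swap(true, Ordering::Relaxed) {
                let mut requested = coord.requested.lock().unwrap();
                if !*requested {
                    *requested = true;
                    coord.cond.notify_all(); // wake parked workers
                }
            }
        }
    }

    fn main() {
        let coord = Coordinator { requested: Mutex::new(false), cond: Condvar::new() };
        let requester = FastRequester { request_flag: AtomicBool::new(false) };
        requester.request(&coord);
        assert!(*coord.requested.lock().unwrap());
    }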
From eb1d8dfd8fd1bb41e618884052cab536c27d378d Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Thu, 8 Feb 2024 20:44:21 +0800
Subject: [PATCH 30/48] Use EnumMap for requested goals

---
 src/global_state.rs             |  6 ++
 src/scheduler/scheduler.rs      | 77 ++++++++++++++-----------
 src/scheduler/worker_goals.rs   | 99 +++++++++++++++------------------
 src/scheduler/worker_monitor.rs | 25 +++------
 4 files changed, 103 insertions(+), 104 deletions(-)

diff --git a/src/global_state.rs b/src/global_state.rs
index 4823119851..f355e697f2 100644
--- a/src/global_state.rs
+++ b/src/global_state.rs
@@ -1,5 +1,8 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::Mutex;
+use std::time::Instant;
+
+use atomic_refcell::AtomicRefCell;
 
 /// This stores some global states for an MMTK instance.
 /// Some MMTK components like plans and allocators may keep a reference to the struct, and can access it.
@@ -15,6 +18,8 @@ pub struct GlobalState {
     pub(crate) initialized: AtomicBool,
     /// The current GC status.
     pub(crate) gc_status: Mutex,
+    /// When did the last GC start? Only accessed by the last parked worker.
+    pub(crate) gc_start_time: AtomicRefCell>,
     /// Is the current GC an emergency collection? Emergency means we may run out of memory soon, and we should
     /// attempt to collect as much as we can.
     pub(crate) emergency_collection: AtomicBool,
@@ -195,6 +200,7 @@ impl Default for GlobalState {
         Self {
             initialized: AtomicBool::new(false),
             gc_status: Mutex::new(GcStatus::NotInGC),
+            gc_start_time: AtomicRefCell::new(None),
             stacks_prepared: AtomicBool::new(false),
             emergency_collection: AtomicBool::new(false),
             user_triggered_collection: AtomicBool::new(false),
diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 4dd1d53e7b..9ed448eed8 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -99,8 +99,7 @@ impl GCWorkScheduler {
         self.worker_group.prepare_surrender_buffer();
 
         debug!("A mutator is requesting GC threads to stop for forking...");
-        self.worker_monitor
-            .make_request(|requests| requests.stop_for_fork.set());
+        self.worker_monitor.make_request(WorkerGoal::StopForFork);
     }
 
     /// Surrender the `GCWorker` struct of a GC worker when it exits.
@@ -130,8 +129,7 @@ impl GCWorkScheduler {
     /// Request a GC to be scheduled. Called by mutator via `GCRequester`.
     pub(crate) fn request_schedule_collection(&self) {
         debug!("A mutator is sending GC-scheduling request to workers...");
-        self.worker_monitor
-            .make_request(|requests| requests.gc.set());
+        self.worker_monitor.make_request(WorkerGoal::Gc);
     }
 
     /// Add the `ScheduleCollection` packet. Called by the last parked worker.
@@ -417,19 +415,19 @@ impl GCWorkScheduler {
     /// Called when the last worker parked.
     /// `should_schedule_gc` is true if a mutator requested a GC.
     fn on_last_parked(&self, worker: &GCWorker, goals: &mut WorkerGoals) -> LastParkedResult {
-        let Some(ref current_goal) = goals.current else {
+        let Some(ref current_goal) = goals.current() else {
             // There is no goal. Find a request to respond to.
-            return self.respond_to_requests(goals);
+            return self.respond_to_requests(worker, goals);
         };
 
         match current_goal {
-            WorkerGoal::Gc { start_time } => {
+            WorkerGoal::Gc => {
                 // We are in the progress of GC.
 
                 // In stop-the-world GC, mutators cannot request for GC while GC is in progress.
                 // When we support concurrent GC, we should remove this assertion.
                 assert!(
-                    !goals.requests.gc.debug_is_set(),
+                    !goals.debug_is_requested(WorkerGoal::Gc),
                     "GC request sent to WorkerMonitor while GC is still in progress."
                 );
 
@@ -446,11 +444,11 @@ impl GCWorkScheduler {
                     LastParkedResult::WakeAll
                 } else {
                     // GC finished.
-                    self.on_gc_finished(worker, start_time);
+                    self.on_gc_finished(worker);
 
                     // Clear the current goal
-                    goals.current = None;
-                    self.respond_to_requests(goals)
+                    goals.on_current_goal_completed();
+                    self.respond_to_requests(worker, goals)
                 }
             }
             WorkerGoal::StopForFork => {
@@ -461,33 +459,40 @@ impl GCWorkScheduler {
         }
     }
 
     /// Respond to a worker request.
-    fn respond_to_requests(&self, goals: &mut WorkerGoals) -> LastParkedResult {
-        assert!(goals.current.is_none());
-
-        if goals.requests.gc.poll() {
-            trace!("A mutator requested a GC to be scheduled.");
-
-            // We set the eBPF trace point here so that bpftrace scripts can start recording
-            // work packet events before the `ScheduleCollection` work packet starts.
-            probe!(mmtk, gc_start);
-
-            goals.current = Some(WorkerGoal::Gc {
-                start_time: Instant::now(),
-            });
-
-            self.add_schedule_collection_packet();
-            return LastParkedResult::WakeSelf;
-        }
-
-        if goals.requests.stop_for_fork.poll() {
-            trace!("A mutator wanted to fork.");
-
-            goals.current = Some(WorkerGoal::StopForFork);
-            return LastParkedResult::WakeAll;
-        }
-
-        // No requests. Park this worker, too.
-        LastParkedResult::ParkSelf
+    fn respond_to_requests(
+        &self,
+        worker: &GCWorker,
+        goals: &mut WorkerGoals,
+    ) -> LastParkedResult {
+        assert!(goals.current().is_none());
+
+        let Some(goal) = goals.poll_next_goal() else {
+            // No requests. Park this worker, too. 
+            return LastParkedResult::ParkSelf;
+        };
+
+        match goal {
+            WorkerGoal::Gc => {
+                trace!("A mutator requested a GC to be scheduled.");
+
+                // We set the eBPF trace point here so that bpftrace scripts can start recording
+                // work packet events before the `ScheduleCollection` work packet starts.
+                probe!(mmtk, gc_start);
+
+                {
+                    let mut gc_start_time = worker.mmtk.state.gc_start_time.borrow_mut();
+                    assert!(gc_start_time.is_none(), "GC already started?");
+                    *gc_start_time = Some(Instant::now());
+                }
+
+                self.add_schedule_collection_packet();
+                LastParkedResult::WakeSelf
+            }
+            WorkerGoal::StopForFork => {
+                trace!("A mutator wanted to fork.");
+                LastParkedResult::WakeAll
+            }
+        }
     }
 
     /// Find more work for workers to do. Return true if more work is available.
@@ -515,7 +520,7 @@ impl GCWorkScheduler {
 
     /// Called when GC has finished, i.e. when all work packets have been executed.
     /// Return `true` if it scheduled the next GC immediately.
-    fn on_gc_finished(&self, worker: &GCWorker, start_time: &Instant) {
+    fn on_gc_finished(&self, worker: &GCWorker) {
         // All GC workers must have parked by now.
         debug_assert!(!self.worker_group.has_designated_work());
         debug_assert!(self.all_buckets_empty());
@@ -530,6 +535,10 @@ impl GCWorkScheduler {
         mmtk.gc_trigger.policy.on_gc_end(mmtk);
 
         // Compute the elapsed time of the GC.
+        let start_time = {
+            let mut gc_start_time = worker.mmtk.state.gc_start_time.borrow_mut();
+            gc_start_time.take().expect("GC not started yet?")
+        };
         let elapsed = start_time.elapsed();
 
         info!(
diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs
index 73d7a3f17e..3bf367d8cf 100644
--- a/src/scheduler/worker_goals.rs
+++ b/src/scheduler/worker_goals.rs
@@ -7,79 +7,72 @@
 //!
 //! - When in the progress of GC, the last parker will try to open buckets or announce the GC
 //!   has finished.
-//! - When stopping for fork, every worker should exit when waken.
+//! - When stopping for fork, every awakened worker should save its thread state (in the
+//!   `GCWorker` struct) and exit.
 //!
-//! The struct `WorkerRequests` keeps a list of requests from mutators, such as requests for GC
-//! and requests for forking. But the GC workers will only respond to one request at a time.
+//! The struct `WorkerGoals` keeps the set of goals requested by mutators, but GC workers will only
+//! respond to one request at a time, and will favor higher-priority goals.
 
-use std::time::Instant;
+use enum_map::{Enum, EnumMap};
 
 /// The current and requested goals.
 #[derive(Default, Debug)]
 pub(crate) struct WorkerGoals {
-    /// What are the workers doing now?
-    pub(crate) current: Option,
-    /// Requests received from mutators.
-    pub(crate) requests: WorkerRequests,
+    /// The current goal.
+    current: Option,
+    /// Requests received from mutators. `requests[goal]` is true if the `goal` is requested.
+    requests: EnumMap,
 }
 
-/// The thing workers are currently doing. This affects several things, such as what the last
-/// parked worker will do, and whether workers will stop themselves.
-#[derive(Debug)]
+/// A goal, i.e. something that workers should work together to achieve. 
+///
+/// Members of this `enum` should be listed from the highest priority to the lowest priority.
+#[derive(Debug, Enum, Clone, Copy)]
 pub(crate) enum WorkerGoal {
-    Gc {
-        start_time: Instant,
-    },
-    #[allow(unused)] // TODO: Implement forking support later.
+    /// Do a garbage collection.
+    Gc,
+    /// Stop all GC threads so that the VM can call `fork()`.
     StopForFork,
 }
 
-/// Requests received from mutators. Workers respond to those requests when they do not have a
-/// current goal. Multiple things can be requested at the same time, and workers respond to the
-/// thing with the highest priority.
-///
-/// The fields of this struct are ordered with decreasing priority.
-#[derive(Default, Debug)]
-pub(crate) struct WorkerRequests {
-    /// The VM needs to fork. Workers should save their contexts and exit.
-    pub(crate) stop_for_fork: WorkerRequest,
-    /// GC is requested. Workers should schedule a GC.
-    pub(crate) gc: WorkerRequest,
-}
-
-/// To record whether a specific goal has been requested.
-/// It is basically a wrapper of `bool`, but forces it to be accessed in a particular way.
-#[derive(Default, Debug)] // Default: False by default.
-pub(crate) struct WorkerRequest {
-    /// True if the goal has been requested.
-    requested: bool,
-}
-
-impl WorkerRequest {
-    /// Set the goal as requested. Return `true` if its requested state changed from `false` to
-    /// `true`.
-    pub fn set(&mut self) -> bool {
-        if !self.requested {
-            self.requested = true;
+impl WorkerGoals {
+    /// Set the `goal` as requested. Return `true` if the requested state of the `goal` changed
+    /// from `false` to `true`.
+    pub fn set_request(&mut self, goal: WorkerGoal) -> bool {
+        if !self.requests[goal] {
+            self.requests[goal] = true;
             true
         } else {
             false
         }
     }
 
-    /// Get the requested state and clear it. Return `true` if the requested state was `true`.
-    pub fn poll(&mut self) -> bool {
-        if self.requested {
-            self.requested = false;
-            true
-        } else {
-            false
+    /// Move the highest priority goal from the pending requests to the current request. Return
+    /// that goal, or `None` if no goal has been requested.
+    pub fn poll_next_goal(&mut self) -> Option {
+        for (goal, requested) in self.requests.iter_mut() {
+            if *requested {
+                *requested = false;
+                self.current = Some(goal);
+                return Some(goal);
+            }
         }
+        None
+    }
+
+    /// Get the current goal if exists.
+    pub fn current(&self) -> Option {
+        self.current
+    }
+
+    /// Called when the current goal is completed. This will clear the current goal.
+    pub fn on_current_goal_completed(&mut self) {
+        self.current = None
     }
 
-    /// Test if the request is set. For debug only. The last parked worker should use `poll` to
-    /// get the state and clear it.
-    pub fn debug_is_set(&self) -> bool {
-        self.requested
+    /// Test if the given `goal` is requested. Used for debug purpose, only. The workers always
+    /// respond to the request of the highest priority first.
+    pub fn debug_is_requested(&self, goal: WorkerGoal) -> bool {
+        self.requests[goal]
     }
 }
diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs
index cba34c6928..70aabce4e4 100644
--- a/src/scheduler/worker_monitor.rs
+++ b/src/scheduler/worker_monitor.rs
@@ -9,7 +9,7 @@ use std::sync::{Condvar, Mutex};
 use crate::vm::VMBinding;
 
 use super::{
-    worker_goals::{WorkerGoal, WorkerGoals, WorkerRequests},
+    worker_goals::{WorkerGoal, WorkerGoals},
     GCWorker,
 };
 
@@ -97,21 +97,12 @@ impl WorkerMonitor {
         }
     }
 
-    /// Make a request. 
The `callback` will be called while holding the mutex `self.sync` so that
-    /// the caller can set some of its fields to true.
-    ///
-    /// The `callback` has one parameter:
-    ///
-    /// - a mutable reference to the `WorkerRequests` instance.
-    ///
-    /// The `callback` should return `true` if it set any field of `WorkerRequests` to true.
-    pub fn make_request(&self, callback: Callback)
-    where
-        Callback: FnOnce(&mut WorkerRequests) -> bool,
-    {
+    /// Make a request. Can be called by a mutator to request the workers to work towards the
+    /// given `goal`.
+    pub fn make_request(&self, goal: WorkerGoal) {
         let mut guard = self.sync.lock().unwrap();
-        let set_any_fields = callback(&mut guard.goals.requests);
-        if set_any_fields {
+        let newly_requested = guard.goals.set_request(goal);
+        if newly_requested {
             self.notify_work_available(false);
         }
     }
@@ -236,12 +227,12 @@ impl WorkerMonitor {
         );
 
         // If the current goal is `StopForFork`, return true so that the worker thread will exit.
-        matches!(sync.goals.current, Some(WorkerGoal::StopForFork))
+        matches!(sync.goals.current(), Some(WorkerGoal::StopForFork))
     }
 
     /// Called when all workers have exited.
     pub fn on_all_workers_exited(&self) {
         let mut sync = self.sync.try_lock().unwrap();
-        sync.goals.current = None;
+        sync.goals.on_current_goal_completed();
     }
 }
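Patch 30 above makes `EnumMap` double as a fixed priority queue: `enum-map` iterates keys in declaration order, so listing `WorkerGoal` variants from highest to lowest priority makes `poll_next_goal` hand back the most urgent pending request first. The following minimal sketch shows just that mechanism, assuming the `enum-map` crate; `Goal` and `Goals` are illustrative names, not the mmtk-core types.

    use enum_map::{Enum, EnumMap};

    // Declaration order is the priority order: `Fork` outranks `Gc`.
    #[derive(Debug, Enum, Clone, Copy, PartialEq)]
    enum Goal {
        Fork,
        Gc,
    }

    #[derive(Default)]
    struct Goals {
        requests: EnumMap<Goal, bool>,
    }

    impl Goals {
        /// Take the highest-priority pending request, clearing its flag.
        fn poll_next(&mut self) -> Option<Goal> {
            for (goal, requested) in self.requests.iter_mut() {
                if *requested {
                    *requested = false;
                    return Some(goal);
                }
            }
            None
        }
    }

    fn main() {
        let mut goals = Goals::default();
        goals.requests[Goal::Gc] = true;
        goals.requests[Goal::Fork] = true;
        // `Fork` wins although `Gc` was requested first.
        assert_eq!(goals.poll_next(), Some(Goal::Fork));
        assert_eq!(goals.poll_next(), Some(Goal::Gc));
        assert_eq!(goals.poll_next(), None);
    }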
From 54d798a06d4dfa9381ae1867ebee7fdd476a6b22 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Thu, 8 Feb 2024 21:37:23 +0800
Subject: [PATCH 31/48] Fix style for stable Rust

---
 src/util/rust_util/mod.rs | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/util/rust_util/mod.rs b/src/util/rust_util/mod.rs
index 062df368f4..23da0c00f3 100644
--- a/src/util/rust_util/mod.rs
+++ b/src/util/rust_util/mod.rs
@@ -125,6 +125,20 @@ where
     unsafe { result_array.assume_init() }
 }
 
+/// Create a formatted string that makes the best effort identifying the current process and thread.
+pub fn debug_process_thread_id() -> String {
+    let pid = unsafe { libc::getpid() };
+    if cfg!(target_os = "linux") {
+        // `gettid()` is Linux-specific.
+        let tid = unsafe { libc::gettid() };
+        format!("PID: {}, TID: {}", pid, tid)
+    } else {
+        // TODO: When we support other platforms, use platform-specific methods to get thread
+        // identifiers.
+        format!("PID: {}", pid)
+    }
+}
+
 #[cfg(test)]
 mod initialize_once_tests {
     use super::*;
@@ -161,17 +175,3 @@ mod initialize_once_tests {
         assert_eq!(INITIALIZE_COUNT.load(Ordering::SeqCst), 1);
     }
 }
-
-/// Create a formatted string that makes the best effort idenfying the current process and thread.
-pub fn debug_process_thread_id() -> String {
-    let pid = unsafe { libc::getpid() };
-    if cfg!(target_os = "linux") {
-        // `gettid()` is Linux-specific.
-        let tid = unsafe { libc::gettid() };
-        format!("PID: {}, TID: {}", pid, tid)
-    } else {
-        // TODO: When we support other platforms, use platform-specific methods to get thread
-        // identifiers.
-        format!("PID: {}", pid)
-    }
-}

From f0e2755f7fba0b352a0cfdf42ef0e25eb5f0c818 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Thu, 8 Feb 2024 21:39:07 +0800
Subject: [PATCH 32/48] Fix MacOS build

---
 src/util/rust_util/mod.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/util/rust_util/mod.rs b/src/util/rust_util/mod.rs
index 23da0c00f3..f8a4b3c7c2 100644
--- a/src/util/rust_util/mod.rs
+++ b/src/util/rust_util/mod.rs
@@ -128,11 +128,14 @@ where
 /// Create a formatted string that makes the best effort identifying the current process and thread.
 pub fn debug_process_thread_id() -> String {
     let pid = unsafe { libc::getpid() };
-    if cfg!(target_os = "linux") {
+    #[cfg(target_os = "linux")]
+    {
         // `gettid()` is Linux-specific.
         let tid = unsafe { libc::gettid() };
         format!("PID: {}, TID: {}", pid, tid)
-    } else {
+    }
+    #[cfg(not(target_os = "linux"))]
+    {
         // TODO: When we support other platforms, use platform-specific methods to get thread
         // identifiers.
         format!("PID: {}", pid)
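The MacOS breakage fixed by patch 32 above is a general `cfg!` pitfall: `cfg!(...)` expands to a boolean constant, so both arms of the `if` must still type-check on every target, and the Linux-only `libc::gettid()` call broke non-Linux builds even though the branch was dead. `#[cfg(...)]` removes the unselected block from compilation entirely. A reduced sketch with a hypothetical `platform_tag` function (not from the MMTk sources):

    // With `if cfg!(target_os = "linux")`, a Linux-only call inside the branch is
    // still type-checked on every platform and can break the build. With `#[cfg]`,
    // the unselected block below does not exist on other targets at all.
    fn platform_tag() -> &'static str {
        #[cfg(target_os = "linux")]
        {
            "linux"
        }
        #[cfg(not(target_os = "linux"))]
        {
            "other"
        }
    }

    fn main() {
        println!("running on: {}", platform_tag());
    }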
From ea53ce10ba19aba6a7727efbc1f51c59350eef99 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Tue, 19 Mar 2024 12:10:46 +0800
Subject: [PATCH 33/48] Minor fix.

If a worker parked again when it is asked to exit, it should panic. It
is not "unimplemented".
---
 src/scheduler/scheduler.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 9ed448eed8..4357685116 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -452,8 +452,10 @@ impl GCWorkScheduler {
                 }
             }
             WorkerGoal::StopForFork => {
-                // A worker parked again when it is asked to exit.
-                unimplemented!()
+                panic!(
+                    "Worker {} parked again when it is asked to exit.",
+                    worker.ordinal
+                )
             }
         }
     }

From 2c15ecdb7a0eaff3eb51cc11182e63620764cfc6 Mon Sep 17 00:00:00 2001
From: Kunshan Wang 
Date: Tue, 19 Mar 2024 14:06:32 +0800
Subject: [PATCH 34/48] Update comment

---
 src/mmtk.rs | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/mmtk.rs b/src/mmtk.rs
index a827313543..24339543b4 100644
--- a/src/mmtk.rs
+++ b/src/mmtk.rs
@@ -245,21 +245,33 @@ impl MMTK {
     /// Prepare an MMTk instance for calling the `fork()` system call.
     ///
     /// The `fork()` system call is available on Linux and some UNIX variants, and may be emulated
-    /// on other platforms by libraries such as Cygwin. If `fork()` is called when the process has
-    /// multiple threads, it will only duplicate the current thread into the child process, and the
-    /// child process can only call async-signal-safe functions, notably `exec()`. For VMs that
-    /// that use multi-process concurrency, it is imperative that when calling `fork()`, only one
-    /// thread may exist in the process.
+    /// on other platforms by libraries such as Cygwin. The properties of the `fork()` system call
+    /// require the users to do some preparation before calling it.
+    ///
+    /// - **Multi-threading**: If `fork()` is called when the process has multiple threads, it
+    ///   will only duplicate the current thread into the child process, and the child process can
+    ///   only call async-signal-safe functions, notably `exec()`. For VMs that use
+    ///   multi-process concurrency, it is imperative that when calling `fork()`, only one thread
+    ///   may exist in the process.
+    ///
+    /// - **File descriptors**: The child process inherits copies of the parent's set of open
+    ///   file descriptors. This may or may not be desired depending on use cases.
     ///
     /// This function helps VMs that use `fork()` for multi-process concurrency. It instructs all
-    /// GC threads to save their contexts and return from their entry-point functions. Currently,
-    /// such threads only include GC workers, and the entry point is
-    /// [`crate::memory_manager::start_worker`].
-    ///
-    /// A subsequent call to `MMTK::after_fork()` will re-spawn the threads using their saved
-    /// contexts. The VM must not allocate objects in the MMTk heap before calling
-    /// `MMTK::after_fork()`.
+    /// GC threads to save their contexts and return from their entry-point functions. Currently,
+    /// such threads only include GC workers, and the entry point is
+    /// [`crate::memory_manager::start_worker`]. A subsequent call to `MMTK::after_fork()` will
+    /// re-spawn the threads using their saved contexts. The VM must not allocate objects in the
+    /// MMTk heap before calling `MMTK::after_fork()`.
+    ///
+    /// TODO: Currently, the MMTk core does not keep any files open for a long time. In the
+    /// future, this function and the `after_fork` function may be used for handling open file
+    /// descriptors across invocations of `fork()`. One possible use case is logging GC activities
+    /// and statistics to files, such as performing heap dumps across multiple GCs.
+    ///
+    /// If a VM intends to execute another program by calling `fork()` and immediately calling
+    /// `exec`, it may skip this function because the state of the MMTk instance will be irrelevant
+    /// in that case.
     ///
     /// # Caution!
     ///
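Patch 34's comment above implies a concrete calling sequence for a binding that forks: ask MMTk to stop its GC threads, wait for the underlying native threads to exit, call `fork()`, then re-spawn workers in whichever process keeps the heap alive. Below is a hedged sketch of the parent side; `vm_join_gc_threads` stands in for the VM's own thread bookkeeping (it is not an MMTk API), and the `mmtk` calls are left as comments because the concrete `MMTK` instance and `tls` argument are binding-specific.

    use libc::{fork, waitpid};

    /// Stand-in for VM-specific bookkeeping of GC thread handles.
    fn vm_join_gc_threads() {
        // Join the native handles saved when the VM spawned the GC threads.
    }

    fn fork_with_mmtk() {
        // mmtk.prepare_to_fork();   // asynchronous: workers save state and exit
        vm_join_gc_threads();        // fork() is only safe once the native threads are gone

        let pid = unsafe { fork() };
        if pid == 0 {
            // Child: typically exec(); if it keeps the MMTk heap, it must call
            // mmtk.after_fork(tls) before touching the heap again.
            std::process::exit(0);
        } else {
            // Parent: re-spawn GC workers from their saved contexts.
            // mmtk.after_fork(tls);
            let mut status = 0;
            unsafe { waitpid(pid, &mut status, 0) };
        }
    }

    fn main() {
        fork_with_mmtk();
    }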
+    /// `GCWorker` structs have been transferred to worker threads.
     Spawned,
 }
 
@@ -272,9 +274,9 @@ pub(crate) struct WorkerGroup<VM: VMBinding> {
     state: Mutex<WorkerCreationState<VM>>,
 }
 
-/// We have to persuade Rust that `WorkerGroup` is safe to share because it thinks one worker can
-/// refer to another worker via the path "worker -> scheduler -> worker_group -> suspended_workers
-/// -> worker" which it thinks is cyclic reference and unsafe.
+/// We have to persuade Rust that `WorkerGroup` is safe to share because the compiler thinks one
+/// worker can refer to another worker via the path "worker -> scheduler -> worker_group ->
+/// `Resting::workers` -> worker", which is a cyclic reference and therefore unsafe.
 unsafe impl<VM: VMBinding> Sync for WorkerGroup<VM> {}
 
 impl<VM: VMBinding> WorkerGroup<VM> {
@@ -295,7 +297,7 @@ impl<VM: VMBinding> WorkerGroup<VM> {
         Arc::new(Self {
             workers_shared,
             state: Mutex::new(WorkerCreationState::NotCreated {
-                unspawned_local_work_queues,
+                local_work_queues: unspawned_local_work_queues,
             }),
         })
     }
@@ -305,12 +307,12 @@ impl<VM: VMBinding> WorkerGroup<VM> {
     /// Create `GCWorker` instances, but do not create threads, yet.
     pub fn create_workers(&self, mmtk: &'static MMTK<VM>) {
         debug!("Creating GCWorker instances...");
         let mut state = self.state.lock().unwrap();
 
-        let WorkerCreationState::NotCreated { ref mut unspawned_local_work_queues } = *state else {
+        let WorkerCreationState::NotCreated { ref mut local_work_queues } = *state else {
             panic!("GCWorker structs have already been created");
         };
 
-        assert_eq!(self.workers_shared.len(), unspawned_local_work_queues.len());
-        let mut suspended_workers = Vec::with_capacity(self.workers_shared.len());
+        assert_eq!(self.workers_shared.len(), local_work_queues.len());
+        let mut workers = Vec::with_capacity(self.workers_shared.len());
 
         // Spawn each worker thread.
         for (ordinal, shared) in self.workers_shared.iter().enumerate() {
@@ -319,12 +321,12 @@ impl<VM: VMBinding> WorkerGroup<VM> {
                 ordinal,
                 mmtk.scheduler.clone(),
                 shared.clone(),
-                unspawned_local_work_queues.pop().unwrap(),
+                local_work_queues.pop().unwrap(),
             ));
-            suspended_workers.push(worker);
+            workers.push(worker);
         }
 
-        *state = WorkerCreationState::Resting { suspended_workers };
+        *state = WorkerCreationState::Resting { workers };
         debug!("GCWorker instances created.");
     }
@@ -336,12 +338,12 @@ impl<VM: VMBinding> WorkerGroup<VM> {
         );
         let mut state = self.state.lock().unwrap();
 
-        let WorkerCreationState::Resting { ref mut suspended_workers } = *state else {
+        let WorkerCreationState::Resting { ref mut workers } = *state else {
             panic!("GCWorker structs have not been created, yet.");
         };
 
         // Drain the queue. We transfer the ownership of each `GCWorker` instance to a GC thread.
-        for worker in suspended_workers.drain(..) {
+        for worker in workers.drain(..) {
             VM::VMCollection::spawn_gc_thread(tls, GCThreadContext::<VM>::Worker(worker));
         }
 
@@ -358,26 +360,26 @@ impl<VM: VMBinding> WorkerGroup<VM> {
     pub fn prepare_surrender_buffer(&self) {
         let mut state = self.state.lock().unwrap();
         *state = WorkerCreationState::Resting {
-            suspended_workers: Vec::with_capacity(self.worker_count()),
+            workers: Vec::with_capacity(self.worker_count()),
         }
     }
 
     /// Return the `GCWorker` struct to the worker group.
     /// This function returns `true` if all workers returned their `GCWorker` structs.
-    pub fn return_gc_worker_struct(&self, worker: Box<GCWorker<VM>>) -> bool {
+    pub fn surrender_gc_worker(&self, worker: Box<GCWorker<VM>>) -> bool {
         let mut state = self.state.lock().unwrap();
-        let WorkerCreationState::Resting { ref mut suspended_workers } = *state else {
+        let WorkerCreationState::Resting { ref mut workers } = *state else {
             panic!("GCWorker structs have not been created, yet.");
        };
         let ordinal = worker.ordinal;
-        suspended_workers.push(worker);
+        workers.push(worker);
         trace!(
             "Worker {} surrendered.
({}/{})", ordinal, - suspended_workers.len(), + workers.len(), self.worker_count() ); - suspended_workers.len() == self.worker_count() + workers.len() == self.worker_count() } /// Get the number of workers in the group From 351d9821b4d57b86f9dade0d8cd7b47c07edbb4e Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 21 Mar 2024 13:16:49 +0800 Subject: [PATCH 36/48] Align GCWorkers with GCWorkerShared The index of `worker_group.workers_shared` should represent the ordinal --- src/scheduler/worker.rs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index bb3dd83c36..1a7c7b4a7f 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -282,23 +282,21 @@ unsafe impl Sync for WorkerGroup {} impl WorkerGroup { /// Create a WorkerGroup pub fn new(num_workers: usize) -> Arc { - let unspawned_local_work_queues = (0..num_workers) + let local_work_queues = (0..num_workers) .map(|_| deque::Worker::new_fifo()) .collect::>(); let workers_shared = (0..num_workers) .map(|i| { Arc::new(GCWorkerShared::::new(Some( - unspawned_local_work_queues[i].stealer(), + local_work_queues[i].stealer(), ))) }) .collect::>(); Arc::new(Self { workers_shared, - state: Mutex::new(WorkerCreationState::NotCreated { - local_work_queues: unspawned_local_work_queues, - }), + state: Mutex::new(WorkerCreationState::NotCreated { local_work_queues }), }) } @@ -312,19 +310,21 @@ impl WorkerGroup { }; assert_eq!(self.workers_shared.len(), local_work_queues.len()); - let mut workers = Vec::with_capacity(self.workers_shared.len()); // Spawn each worker thread. - for (ordinal, shared) in self.workers_shared.iter().enumerate() { - let worker = Box::new(GCWorker::new( - mmtk, - ordinal, - mmtk.scheduler.clone(), - shared.clone(), - local_work_queues.pop().unwrap(), - )); - workers.push(worker); - } + let workers = (local_work_queues.drain(..)) + .zip(self.workers_shared.iter()) + .enumerate() + .map(|(ordinal, (queue, shared))| { + Box::new(GCWorker::new( + mmtk, + ordinal, + mmtk.scheduler.clone(), + shared.clone(), + queue, + )) + }) + .collect::>(); *state = WorkerCreationState::Resting { workers }; debug!("GCWorker instances created."); From 9f743a89baf32684551d3b4e8017ed152f1790e1 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Fri, 22 Mar 2024 09:24:25 +0800 Subject: [PATCH 37/48] Remove comment sentence It is not our intention to require calling `initialize_collection` before any allocation. --- src/mmtk.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mmtk.rs b/src/mmtk.rs index 24339543b4..4aa9bd2d7b 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -222,8 +222,7 @@ impl MMTK { /// Initialize the GC worker threads that are required for doing garbage collections. /// This is a mandatory call for a VM during its boot process once its thread system - /// is ready. This function should be called exactly once, and should be called before - /// allocating objects in the MMTk heap. + /// is ready. This function should be called exactly once. /// /// Internally, this function will spawn the GC worker threads, and the `tls` argument is used /// for this purpose. From fd2f9c3d9fccd0c4b8bcc056c34f158c790f8c52 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Mon, 25 Mar 2024 13:53:47 +0800 Subject: [PATCH 38/48] Adjust eBPF trace points and their docs. 
---
 src/mmtk.rs                   | 2 ++
 src/scheduler/worker_goals.rs | 2 ++
 tools/tracing/README.md       | 6 ++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mmtk.rs b/src/mmtk.rs
index 4aa9bd2d7b..60b1475e69 100644
--- a/src/mmtk.rs
+++ b/src/mmtk.rs
@@ -283,6 +283,7 @@ impl<VM: VMBinding> MMTK<VM> {
             self.state.is_initialized(),
             "MMTk collection has not been initialized, yet (was initialize_collection() called before?)"
         );
+        probe!(mmtk, prepare_to_fork);
         self.scheduler.stop_gc_threads_for_forking();
     }
 
@@ -300,6 +301,7 @@ impl<VM: VMBinding> MMTK<VM> {
             self.state.is_initialized(),
             "MMTk collection has not been initialized, yet (was initialize_collection() called before?)"
         );
+        probe!(mmtk, after_fork);
         self.scheduler.respawn_gc_threads_after_forking(tls);
     }
 
diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs
index 3bf367d8cf..009d3ce563 100644
--- a/src/scheduler/worker_goals.rs
+++ b/src/scheduler/worker_goals.rs
@@ -54,6 +54,7 @@ impl WorkerGoals {
             if *requested {
                 *requested = false;
                 self.current = Some(goal);
+                probe!(mmtk, goal_set, goal);
                 return Some(goal);
             }
         }
@@ -67,6 +68,7 @@ impl WorkerGoals {
 
     /// Called when the current goal is completed. This will clear the current goal.
     pub fn on_current_goal_completed(&mut self) {
+        probe!(mmtk, goal_complete);
         self.current = None
     }
 
diff --git a/tools/tracing/README.md b/tools/tracing/README.md
index 96ff3e6a6d..0dc05b3210 100644
--- a/tools/tracing/README.md
+++ b/tools/tracing/README.md
@@ -17,8 +17,10 @@ shipped with the MMTk release you use.
 Currently, the core provides the following tracepoints.
 
 - `mmtk:collection_initialized()`: All GC worker threads are spawned
-- `mmtk:collection_uninitialized()`: All GC worker threads exited. This may happend during
-  execution, for example, if the VM needs to fork and requests GC workers to exit.
+- `mmtk:prepare_to_fork()`: The VM requests MMTk core to prepare for calling `fork()`.
+- `mmtk:after_fork()`: The VM notifies MMTk core that it has finished calling `fork()`.
+- `mmtk:goal_set(goal: int)`: GC workers have started working on a goal.
+- `mmtk:goal_complete(goal: int)`: GC workers have finished working on a goal.
 - `mmtk:harness_begin()`: the timing iteration of a benchmark begins
 - `mmtk:harness_end()`: the timing iteration of a benchmark ends
 - `mmtk:gcworker_run()`: a GC worker thread enters its work loop

From 991314ac56eaef035f71f37a8f30c983225a3564 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Mon, 25 Mar 2024 14:30:20 +0800
Subject: [PATCH 39/48] Update comments and clean up simple wrapper functions

This commit also does some minor refactoring so that simple wrapper
functions in `memory_manager` do not duplicate the comments of the
functions they wrap.

---
 src/memory_manager.rs         | 23 +++--------------------
 src/mmtk.rs                   | 10 ++++++----
 src/scheduler/scheduler.rs    |  3 ++-
 src/scheduler/worker.rs       | 19 +++++++++++++++----
 src/util/test_util/mock_vm.rs |  2 +-
 src/vm/scanning.rs            |  2 +-
 6 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/src/memory_manager.rs b/src/memory_manager.rs
index 7842acd2fa..cabaa93ead 100644
--- a/src/memory_manager.rs
+++ b/src/memory_manager.rs
@@ -461,33 +461,16 @@ pub fn gc_poll<VM: VMBinding>(mmtk: &MMTK<VM>, tls: VMMutatorThread) {
     }
 }
 
-/// Run the main loop of a GC worker.
-///
-/// This method runs until the worker exits. Currently a worker may exit after
-/// [`crate::mmtk::MMTK::prepare_to_fork`] is called.
-///
-/// Arguments:
-/// * `tls`: The thread that will be used as the GC worker.
-/// * `worker`: The execution context of the GC worker thread.
-/// It is the `GCWorker` passed to `Collection::spawn_gc_thread`. -/// * `mmtk`: A reference to an MMTk instance. +/// Wrapper for [`crate::scheduler::GCWorker::run`]. pub fn start_worker( mmtk: &'static MMTK, tls: VMWorkerThread, - mut worker: Box>, + worker: Box>, ) { worker.run(tls, mmtk); - mmtk.scheduler.surrender_gc_worker(worker); } -/// Initialize the GC worker threads that are required for doing garbage collections. -/// This is a mandatory call for a VM during its boot process once its thread system -/// is ready. This call will invoke `Collection::spawn_gc_thread()`` to create GC threads. -/// -/// Arguments: -/// * `mmtk`: A reference to an MMTk instance. -/// * `tls`: The thread that wants to enable the collection. This value will be passed back to -/// the VM in `Collection::spawn_gc_thread()` so that the VM knows the context. +/// Wrapper for [`crate::mmtk::MMTK::initialize_collection`]. pub fn initialize_collection(mmtk: &'static MMTK, tls: VMThread) { mmtk.initialize_collection(tls); } diff --git a/src/mmtk.rs b/src/mmtk.rs index 60b1475e69..86c9973954 100644 --- a/src/mmtk.rs +++ b/src/mmtk.rs @@ -222,15 +222,17 @@ impl MMTK { /// Initialize the GC worker threads that are required for doing garbage collections. /// This is a mandatory call for a VM during its boot process once its thread system - /// is ready. This function should be called exactly once. + /// is ready. /// - /// Internally, this function will spawn the GC worker threads, and the `tls` argument is used - /// for this purpose. + /// Internally, this function will invoke [`Collection::spawn_gc_thread()`] to spawn GC worker + /// threads. /// /// # Arguments /// /// * `tls`: The thread that wants to enable the collection. This value will be passed back - /// to the VM in `Collection::spawn_gc_thread()` so that the VM knows the context. + /// to the VM in [`Collection::spawn_gc_thread()`] so that the VM knows the context. + /// + /// [`Collection::spawn_gc_thread()`]: crate::vm::Collection::spawn_gc_thread() pub fn initialize_collection(&'static self, tls: VMThread) { assert!( !self.state.is_initialized(), diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 42872d4931..5d72484d5c 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -116,7 +116,8 @@ impl GCWorkScheduler { } /// Respawn GC threads after forking. This will reuse the `GCWorker` instances of stopped - /// workers. + /// workers. `tls` is the VM thread that requests GC threads to be re-spawn, and will be + /// passed down to [`crate::vm::Collection::spawn_gc_thread`]. pub fn respawn_gc_threads_after_forking(self: &Arc, tls: VMThread) { self.worker_group.spawn(tls) } diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 1a7c7b4a7f..d3b063c820 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -193,9 +193,18 @@ impl GCWorker { self.scheduler().poll(self) } - /// Entry of the worker thread. Resolve thread affinity, if it has been specified by the user. - /// Each worker will keep polling and executing work packets in a loop. - pub fn run(&mut self, tls: VMWorkerThread, mmtk: &'static MMTK) { + /// Entry point of the worker thread. + /// + /// This function will resolve thread affinity, if it has been specified by the user. + /// + /// Each worker will keep polling and executing work packets in a loop. It runs until the + /// worker is requested to exit. Currently a worker may exit after + /// [`crate::mmtk::MMTK::prepare_to_fork`] is called. 
+    ///
+    /// Arguments:
+    /// * `tls`: The VM-specific thread-local storage for this GC worker thread.
+    /// * `mmtk`: A reference to an MMTk instance.
+    pub fn run(mut self: Box<Self>, tls: VMWorkerThread, mmtk: &'static MMTK<VM>) {
         probe!(mmtk, gcworker_run);
         debug!(
             "Worker started. ordinal: {}, {}",
@@ -231,7 +240,7 @@ impl<VM: VMBinding> GCWorker<VM> {
             std::hint::black_box(unsafe { *(typename.as_ptr()) });
 
             probe!(mmtk, work, typename.as_ptr(), typename.len());
-            work.do_work_with_stat(self, mmtk);
+            work.do_work_with_stat(&mut self, mmtk);
         }
         debug!(
             "Worker exiting. ordinal: {}, {}",
@@ -239,6 +248,8 @@ impl<VM: VMBinding> GCWorker<VM> {
             crate::util::rust_util::debug_process_thread_id(),
         );
         probe!(mmtk, gcworker_exit);
+
+        mmtk.scheduler.surrender_gc_worker(self);
     }
 }
 
diff --git a/src/util/test_util/mock_vm.rs b/src/util/test_util/mock_vm.rs
index 4ac620ac80..9f5d0c7122 100644
--- a/src/util/test_util/mock_vm.rs
+++ b/src/util/test_util/mock_vm.rs
@@ -82,7 +82,7 @@ where
     func(&mut lock)
 }
 
-/// A test that uses `MockVM`` should use this method to wrap the entire test
+/// A test that uses `MockVM` should use this method to wrap the entire test
 /// that may use `MockVM`.
 ///
 /// # Arguments
diff --git a/src/vm/scanning.rs b/src/vm/scanning.rs
index 21768f4337..df6a0c6607 100644
--- a/src/vm/scanning.rs
+++ b/src/vm/scanning.rs
@@ -124,7 +124,7 @@ pub trait RootsWorkFactory<ES: Edge>: Clone + Send + 'static {
     /// Create work packets to handle transitively pinning (TP) roots.
     ///
     /// Similar to `create_process_pinning_roots_work`, this work packet will not move objects in `nodes`.
-    /// Unlike ``create_process_pinning_roots_work`, no objects in the transitive closure of `nodes` will be moved, either.
+    /// Unlike `create_process_pinning_roots_work`, no objects in the transitive closure of `nodes` will be moved, either.
     ///
     /// Arguments:
     /// * `nodes`: A vector of references to objects pointed by root edges.

From eaa00bfcacaa07a5e8c92f1303aeeb58b57d4639 Mon Sep 17 00:00:00 2001
From: Kunshan Wang
Date: Mon, 25 Mar 2024 16:07:00 +0800
Subject: [PATCH 40/48] Refactored state transition.

When spawning GC workers for the first time, the worker group
transitions directly from the `Initial` state to the `Spawned` state.

---
 src/scheduler/scheduler.rs |  5 +-
 src/scheduler/worker.rs    | 95 +++++++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs
index 5d72484d5c..91c2d86fac 100644
--- a/src/scheduler/scheduler.rs
+++ b/src/scheduler/scheduler.rs
@@ -90,8 +90,7 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
     /// Currently GC threads only include worker threads, and we currently have only one worker
     /// group. We may add more worker groups in the future.
     pub fn spawn_gc_threads(self: &Arc<Self>, mmtk: &'static MMTK<VM>, tls: VMThread) {
-        self.worker_group.create_workers(mmtk);
-        self.worker_group.spawn(tls)
+        self.worker_group.initial_spawn(tls, mmtk);
     }
 
     /// Ask all GC workers to exit for forking.
@@ -119,7 +118,7 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
     /// workers. `tls` is the VM thread that requests GC threads to be re-spawn, and will be
     /// passed down to [`crate::vm::Collection::spawn_gc_thread`].
     pub fn respawn_gc_threads_after_forking(self: &Arc<Self>, tls: VMThread) {
-        self.worker_group.spawn(tls)
+        self.worker_group.respawn(tls)
     }
 
     /// Resolve the affinity of a thread.
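
The worker-group lifecycle after this change can be summarized by the
following self-contained sketch. It is an illustration only: `State` and
`Event` mirror `WorkerCreationState` and the `WorkerGroup` methods named in
this patch, but the `GCWorker` payloads, locking, and the `Option` used
during transitions are elided.

    #[derive(Debug, PartialEq)]
    enum State {
        Initial,     // no `GCWorker` structs, no threads
        Spawned,     // threads running; they own their `GCWorker` structs
        Surrendered, // threads stopped for forking; `GCWorker` structs pooled
    }

    #[derive(Debug)]
    enum Event {
        InitialSpawn, // `WorkerGroup::initial_spawn`
        Surrender,    // all threads returned via `surrender_gc_worker`
        Respawn,      // `WorkerGroup::respawn`
    }

    fn transition(state: State, event: Event) -> State {
        match (state, event) {
            (State::Initial, Event::InitialSpawn) => State::Spawned,
            (State::Spawned, Event::Surrender) => State::Surrendered,
            (State::Surrendered, Event::Respawn) => State::Spawned,
            (s, e) => panic!("invalid transition: {:?} on {:?}", s, e),
        }
    }

    fn main() {
        let s = transition(State::Initial, Event::InitialSpawn);
        let s = transition(s, Event::Surrender);
        assert_eq!(transition(s, Event::Respawn), State::Spawned);
    }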
diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs
index d3b063c820..5105799acb 100644
--- a/src/scheduler/worker.rs
+++ b/src/scheduler/worker.rs
@@ -253,41 +253,41 @@ impl<VM: VMBinding> GCWorker<VM> {
     }
 }
 
-/// Data that are owned by a worker group but depend on whether the `GCWorker` structs and the
-/// actual GC worker threads have been created or not.
+/// Stateful part of [`WorkerGroup`].
 enum WorkerCreationState<VM: VMBinding> {
-    /// `GCWorker` structs have not been created yet.
-    NotCreated {
+    /// The initial state. `GCWorker` structs have not been created and GC worker threads have
+    /// not been spawned.
+    Initial {
         /// The local work queues for to-be-created workers.
         local_work_queues: Vec<deque::Worker<Box<dyn GCWork<VM>>>>,
     },
-    /// `GCWorker` structs are created, but worker threads are either not yet spawned, or in the
-    /// process of stopping for forking.
-    Resting {
+    /// All worker threads are spawned and running. `GCWorker` structs have been transferred to
+    /// worker threads.
+    Spawned,
+    /// Worker threads are stopping, or have already stopped, for forking. Instances of `GCWorker`
+    /// structs are collected here to be reused when GC workers are respawned.
+    Surrendered {
         /// `GCWorker` instances not currently owned by active GC worker threads. Once GC workers
-        /// are (re-)spawned, they will take ownership of these `GCWorker` instances.
+        /// are respawned, they will take ownership of these `GCWorker` instances.
         // Note: Clippy warns about `Vec<Box<T>>` because `Box<T>` is already in the heap.
         // However, the purpose of this `Vec` is allowing GC worker threads to give their
         // `Box<GCWorker<VM>>` instances back to this pool. Therefore, the `Box` is necessary.
         #[allow(clippy::vec_box)]
         workers: Vec<Box<GCWorker<VM>>>,
     },
-    /// All worker threads are spawned and running. `GCWorker` structs have been transferred to
-    /// worker threads.
-    Spawned,
 }
 
 /// A worker group to manage all the GC workers.
 pub(crate) struct WorkerGroup<VM: VMBinding> {
     /// Shared worker data
     pub workers_shared: Vec<Arc<GCWorkerShared<VM>>>,
-    /// The stateful part
-    state: Mutex<WorkerCreationState<VM>>,
+    /// The stateful part. `None` means a state transition is underway.
+    state: Mutex<Option<WorkerCreationState<VM>>>,
 }
 
 /// We have to persuade Rust that `WorkerGroup` is safe to share because the compiler thinks one
 /// worker can refer to another worker via the path "worker -> scheduler -> worker_group ->
-/// `Resting::workers` -> worker", which is a cyclic reference and therefore unsafe.
+/// `Surrendered::workers` -> worker", which is a cyclic reference and therefore unsafe.
 unsafe impl<VM: VMBinding> Sync for WorkerGroup<VM> {}
 
 impl<VM: VMBinding> WorkerGroup<VM> {
@@ -307,23 +307,50 @@ impl<VM: VMBinding> WorkerGroup<VM> {
 
         Arc::new(Self {
             workers_shared,
-            state: Mutex::new(WorkerCreationState::NotCreated { local_work_queues }),
+            state: Mutex::new(Some(WorkerCreationState::Initial { local_work_queues })),
         })
     }
 
-    /// Create `GCWorker` instances, but do not create threads, yet.
-    pub fn create_workers(&self, mmtk: &'static MMTK<VM>) {
-        debug!("Creating GCWorker instances...");
+    /// Spawn GC worker threads for the first time.
+    pub fn initial_spawn(&self, tls: VMThread, mmtk: &'static MMTK<VM>) {
         let mut state = self.state.lock().unwrap();
 
-        let WorkerCreationState::NotCreated { ref mut local_work_queues } = *state else {
+        let WorkerCreationState::Initial { local_work_queues } = state.take().unwrap() else {
             panic!("GCWorker structs have already been created");
         };
 
+        let workers = self.create_workers(local_work_queues, mmtk);
+        self.spawn(workers, tls);
+
+        *state = Some(WorkerCreationState::Spawned);
+    }
+
+    /// Respawn GC threads after stopping for forking.
+ pub fn respawn(&self, tls: VMThread) { + let mut state = self.state.lock().unwrap(); + + let WorkerCreationState::Surrendered { workers } = state.take().unwrap() else { + panic!("GCWorker structs have not been created, yet."); + }; + + self.spawn(workers, tls); + + *state = Some(WorkerCreationState::Spawned) + } + + /// Create `GCWorker` instances. + #[allow(clippy::vec_box)] // See `WorkerCreationState::Surrendered`. + fn create_workers( + &self, + local_work_queues: Vec>>>, + mmtk: &'static MMTK, + ) -> Vec>> { + debug!("Creating GCWorker instances..."); + assert_eq!(self.workers_shared.len(), local_work_queues.len()); - // Spawn each worker thread. - let workers = (local_work_queues.drain(..)) + // Each `GCWorker` instance corresponds to a `GCWorkerShared` at the same index. + let workers = (local_work_queues.into_iter()) .zip(self.workers_shared.iter()) .enumerate() .map(|(ordinal, (queue, shared))| { @@ -337,29 +364,23 @@ impl WorkerGroup { }) .collect::>(); - *state = WorkerCreationState::Resting { workers }; - debug!("GCWorker instances created."); + debug!("Created {} GCWorker instances.", workers.len()); + workers } /// Spawn all the worker threads - pub fn spawn(&self, tls: VMThread) { + #[allow(clippy::vec_box)] // See `WorkerCreationState::Surrendered`. + fn spawn(&self, workers: Vec>>, tls: VMThread) { debug!( "Spawning GC workers. {}", crate::util::rust_util::debug_process_thread_id(), ); - let mut state = self.state.lock().unwrap(); - let WorkerCreationState::Resting { ref mut workers } = *state else { - panic!("GCWorker structs have not been created, yet."); - }; - - // Drain the queue. We transfer the ownership of each `GCWorker` instance to a GC thread. - for worker in workers.drain(..) { + // We transfer the ownership of each `GCWorker` instance to a GC thread. + for worker in workers { VM::VMCollection::spawn_gc_thread(tls, GCThreadContext::::Worker(worker)); } - *state = WorkerCreationState::Spawned; - debug!( "Spawned {} worker threads. {}", self.worker_count(), @@ -370,16 +391,18 @@ impl WorkerGroup { /// Prepare the buffer for workers to surrender their `GCWorker` structs. pub fn prepare_surrender_buffer(&self) { let mut state = self.state.lock().unwrap(); - *state = WorkerCreationState::Resting { + assert!(matches!(*state, Some(WorkerCreationState::Spawned))); + + *state = Some(WorkerCreationState::Surrendered { workers: Vec::with_capacity(self.worker_count()), - } + }) } /// Return the `GCWorker` struct to the worker group. /// This function returns `true` if all workers returned their `GCWorker` structs. 
pub fn surrender_gc_worker(&self, worker: Box>) -> bool { let mut state = self.state.lock().unwrap(); - let WorkerCreationState::Resting { ref mut workers } = *state else { + let WorkerCreationState::Surrendered { ref mut workers } = state.as_mut().unwrap() else { panic!("GCWorker structs have not been created, yet."); }; let ordinal = worker.ordinal; From b791aa7c7384088906cf2fae64d813cf4c51e065 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Mon, 25 Mar 2024 16:47:00 +0800 Subject: [PATCH 41/48] Minor changes --- src/scheduler/scheduler.rs | 17 ++++++----------- src/scheduler/worker_goals.rs | 2 +- src/scheduler/worker_monitor.rs | 18 +++++++++++++++--- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 91c2d86fac..f11351b76f 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -1,4 +1,4 @@ -use self::worker::{PollResult, WorkerShouldExit}; +use self::worker::PollResult; use super::gc_work::ScheduleCollection; use super::stat::SchedulerStat; @@ -402,18 +402,13 @@ impl GCWorkScheduler { return Ok(work); } - let should_exit = self - .worker_monitor - .park_and_wait(worker, |goals| self.on_last_parked(worker, goals)); - - if should_exit { - return Err(WorkerShouldExit); - } + self.worker_monitor + .park_and_wait(worker, |goals| self.on_last_parked(worker, goals))?; } } - /// Called when the last worker parked. - /// `should_schedule_gc` is true if a mutator requested a GC. + /// Called when the last worker parked. `goal` allows this function to inspect and change the + /// current goal. fn on_last_parked(&self, worker: &GCWorker, goals: &mut WorkerGoals) -> LastParkedResult { let Some(ref current_goal) = goals.current() else { // There is no goal. Find a request to respond to. @@ -469,7 +464,7 @@ impl GCWorkScheduler { assert!(goals.current().is_none()); let Some(goal) = goals.poll_next_goal() else { - // No reqeusts. Park this worker, too. + // No requests. Park this worker, too. return LastParkedResult::ParkSelf; }; diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs index 009d3ce563..eea70f5924 100644 --- a/src/scheduler/worker_goals.rs +++ b/src/scheduler/worker_goals.rs @@ -2,7 +2,7 @@ //! working towards on a high level. //! //! A "goal" is represented by a `WorkerGoal`. All workers work towards a single goal at a time. -//! THe current goal influences the behavior of GC workers, especially the last parked worker. +//! The current goal influences the behavior of GC workers, especially the last parked worker. //! For example, //! //! - When in the progress of GC, the last parker will try to open buckets or announce the GC diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs index 70aabce4e4..6694a2f404 100644 --- a/src/scheduler/worker_monitor.rs +++ b/src/scheduler/worker_monitor.rs @@ -9,6 +9,7 @@ use std::sync::{Condvar, Mutex}; use crate::vm::VMBinding; use super::{ + worker::WorkerShouldExit, worker_goals::{WorkerGoal, WorkerGoals}, GCWorker, }; @@ -123,7 +124,14 @@ impl WorkerMonitor { /// The argument of `on_last_parked` is true if `sync.gc_requested` is `true`. /// The return value of `on_last_parked` will determine whether this worker and other workers /// will wake up or block waiting. - pub fn park_and_wait(&self, worker: &GCWorker, on_last_parked: F) -> bool + /// + /// This function returns `Ok(())` if the current worker should continue working, + /// or `Err(WorkerShouldExit)` if the current worker should exit now. 
+ pub fn park_and_wait( + &self, + worker: &GCWorker, + on_last_parked: F, + ) -> Result<(), WorkerShouldExit> where VM: VMBinding, F: FnOnce(&mut WorkerGoals) -> LastParkedResult, @@ -226,8 +234,12 @@ impl WorkerMonitor { sync.parker.worker_count, ); - // If the current goal is `StopForFork`, return true so that the worker thread will exit. - matches!(sync.goals.current(), Some(WorkerGoal::StopForFork)) + // If the current goal is `StopForFork`, the worker thread should exit. + if matches!(sync.goals.current(), Some(WorkerGoal::StopForFork)) { + return Err(WorkerShouldExit); + } + + Ok(()) } /// Called when all workers have exited. From f049f73ebe8e7ee6bbfdad13576455a09e1b5f68 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Mon, 1 Apr 2024 19:59:56 +0800 Subject: [PATCH 42/48] Add a mock test for forking support. --- .../tests/mock_tests/mock_test_init_fork.rs | 197 ++++++++++++++++++ src/vm/tests/mock_tests/mod.rs | 1 + 2 files changed, 198 insertions(+) create mode 100644 src/vm/tests/mock_tests/mock_test_init_fork.rs diff --git a/src/vm/tests/mock_tests/mock_test_init_fork.rs b/src/vm/tests/mock_tests/mock_test_init_fork.rs new file mode 100644 index 0000000000..589348bfb7 --- /dev/null +++ b/src/vm/tests/mock_tests/mock_test_init_fork.rs @@ -0,0 +1,197 @@ +use std::{ + sync::{Condvar, Mutex, MutexGuard}, + thread::JoinHandle, + time::Duration, +}; + +use super::mock_test_prelude::*; +use crate::{ + util::{options::GCTriggerSelector, Address, OpaquePointer, VMThread, VMWorkerThread}, + MMTKBuilder, MMTK, +}; + +#[derive(Default)] +struct ForkTestShared { + sync: Mutex, + all_threads_spawn: Condvar, + all_threads_exited: Condvar, + all_threads_running: Condvar, +} + +#[derive(Default)] +struct ForkTestSync { + join_handles: Vec>, + /// Number of threads spawn. + spawn_threds: usize, + /// Number of threads that have actually entered our entry-point function. + running_threads: usize, + /// Number of threads that have returned from `memory_manager::start_worker`. + exited_threads: usize, +} + +lazy_static! { + static ref SHARED: ForkTestShared = ForkTestShared::default(); +} + +// We fix the number of threads so that we can assert the number of GC threads spawn. +const NUM_WORKER_THREADS: usize = 4; + +// Don't block the CI. +const TIMEOUT: Duration = Duration::from_secs(5); + +/// A convenient wrapper that panics on timeout. +fn wait_timeout_while<'a, 'c, T, F>( + guard: MutexGuard<'a, T>, + condvar: &'c Condvar, + condition: F, +) -> MutexGuard<'a, T> +where + F: FnMut(&mut T) -> bool, +{ + let (guard, timeout_result) = condvar + .wait_timeout_while(guard, TIMEOUT, condition) + .unwrap(); + assert!(!timeout_result.timed_out()); + guard +} + +fn simple_spawn_gc_thread( + _vm_thread: VMThread, + context: GCThreadContext, + mmtk: &'static MMTK, +) { + let GCThreadContext::Worker(worker) = context; + let join_handle = std::thread::spawn(move || { + let ordinal = worker.ordinal; + println!("GC thread starting. 
Ordinal: {ordinal}");
+
+        {
+            let mut sync = SHARED.sync.lock().unwrap();
+            sync.running_threads += 1;
+            if sync.running_threads == NUM_WORKER_THREADS {
+                SHARED.all_threads_running.notify_all();
+            }
+        }
+
+        let gc_thread_tls = VMWorkerThread(VMThread(OpaquePointer::from_address(Address::ZERO)));
+        memory_manager::start_worker(mmtk, gc_thread_tls, worker);
+
+        {
+            let mut sync = SHARED.sync.lock().unwrap();
+            sync.running_threads -= 1;
+            sync.exited_threads += 1;
+            if sync.exited_threads == NUM_WORKER_THREADS {
+                SHARED.all_threads_exited.notify_all();
+            }
+        }
+
+        println!("GC thread stopped. Ordinal: {ordinal}");
+    });
+
+    {
+        let mut sync = SHARED.sync.lock().unwrap();
+        sync.join_handles.push(join_handle);
+        sync.spawn_threds += 1;
+        if sync.spawn_threds == NUM_WORKER_THREADS {
+            SHARED.all_threads_spawn.notify_all();
+        }
+    }
+}
+
+/// Test the `initial_collection` function with actual running GC threads, and the functions for
+/// supporting forking.
+#[test]
+pub fn test_initialize_collection_and_fork() {
+    // We don't use fixtures or `with_mockvm` because we want to precisely control the
+    // initialization process.
+    let mut builder = MMTKBuilder::new();
+    // The exact heap size doesn't matter because we don't even allocate.
+    let trigger = GCTriggerSelector::FixedHeapSize(1024 * 1024);
+    builder.options.gc_trigger.set(trigger);
+    builder.options.threads.set(NUM_WORKER_THREADS);
+    let mmtk: &'static mut MMTK<MockVM> = Box::leak(Box::new(builder.build::<MockVM>()));
+
+    let mock_vm = MockVM {
+        spawn_gc_thread: MockMethod::new_fixed(Box::new(|(vm_thread, context)| {
+            simple_spawn_gc_thread(vm_thread, context, mmtk)
+        })),
+        ..Default::default()
+    };
+    write_mockvm(move |mock_vm_ref| *mock_vm_ref = mock_vm);
+
+    let test_thread_tls = VMThread(OpaquePointer::from_address(Address::ZERO));
+
+    // Initialize collection. This will spawn GC worker threads.
+    mmtk.initialize_collection(test_thread_tls);
+
+    // Wait for GC workers to be spawned, and get their join handles.
+    let join_handles = {
+        println!("Waiting for GC worker threads to be spawned");
+        let sync = SHARED.sync.lock().unwrap();
+
+        // We wait for `all_threads_spawn` instead of `all_threads_running`. It is not necessary
+        // to wait for all GC worker threads to enter `memory_manager::start_worker` before
+        // calling `prepare_to_fork`. The implementation of `initialize_collection` and
+        // `prepare_to_fork` should be robust against GC worker threads that start slower than
+        // usual.
+        let mut sync = wait_timeout_while(sync, &SHARED.all_threads_spawn, |sync| {
+            sync.spawn_threds < NUM_WORKER_THREADS
+        });
+
+        // Take join handles out of `SHARED.sync` so that the main thread can join them without
+        // holding the Mutex, and GC workers can acquire the mutex and mutate `SHARED.sync`.
+        std::mem::take(&mut sync.join_handles)
+    };
+
+    assert_eq!(join_handles.len(), NUM_WORKER_THREADS);
+
+    // Now we prepare to fork. GC worker threads should go down.
+    mmtk.prepare_to_fork();
+
+    println!("Waiting for GC worker threads to stop");
+
+    {
+        // In theory, we can just join the join handles, and it is unnecessary to wait for
+        // `SHARED.all_threads_exited`. This is a workaround for the fact that
+        // `JoinHandle::join()` does not have a variant that supports timeout. We use
+        // `wait_timeout_while` so that it won't hang the CI. When the condvar
+        // `all_threads_exited` is notified, all GC workers will have returned from
+        // `memory_manager::start_worker`, and it is unlikely that the `join_handle.join()` below
+        // will block for too long.
+ let sync = SHARED.sync.lock().unwrap(); + let _sync = wait_timeout_while(sync, &SHARED.all_threads_exited, |sync| { + sync.exited_threads < NUM_WORKER_THREADS + }); + } + + for join_handle in join_handles { + // TODO: PThread has `pthread_timedjoin_np`, but the `JoinHandle` in the Rust standard + // library doesn't have a variant that supports timeout. Let's wait for the Rust library + // to update. + join_handle.join().unwrap(); + } + + println!("All GC worker threads stopped"); + + { + let mut sync = SHARED.sync.lock().unwrap(); + assert_eq!(sync.running_threads, 0); + + // Reset counters. + sync.spawn_threds = 0; + sync.exited_threads = 0; + } + + // We don't actually call `fork()` in this test, but we pretend we have called `fork()`. + // We now try to resume GC workers. + mmtk.after_fork(test_thread_tls); + + { + println!("Waiting for GC worker threads to be running after calling `after_fork`"); + let sync = SHARED.sync.lock().unwrap(); + let _sync = wait_timeout_while(sync, &SHARED.all_threads_running, |sync| { + sync.running_threads < NUM_WORKER_THREADS + }); + } + + println!("All GC worker threads are up and running."); +} diff --git a/src/vm/tests/mock_tests/mod.rs b/src/vm/tests/mock_tests/mod.rs index 51a085cec8..5bf880fbb6 100644 --- a/src/vm/tests/mock_tests/mod.rs +++ b/src/vm/tests/mock_tests/mod.rs @@ -33,6 +33,7 @@ mod mock_test_edges; #[cfg(target_os = "linux")] mod mock_test_handle_mmap_conflict; mod mock_test_handle_mmap_oom; +mod mock_test_init_fork; mod mock_test_is_in_mmtk_spaces; mod mock_test_issue139_allocate_non_multiple_of_min_alignment; mod mock_test_issue867_allocate_unrealistically_large_object; From a6d834ae7a3f088c540e3543fb7e8ff9fbc507b9 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Tue, 2 Apr 2024 15:52:51 +0800 Subject: [PATCH 43/48] Fix clippy warning --- src/vm/tests/mock_tests/mock_test_init_fork.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vm/tests/mock_tests/mock_test_init_fork.rs b/src/vm/tests/mock_tests/mock_test_init_fork.rs index 589348bfb7..9605d4c71e 100644 --- a/src/vm/tests/mock_tests/mock_test_init_fork.rs +++ b/src/vm/tests/mock_tests/mock_test_init_fork.rs @@ -40,9 +40,9 @@ const NUM_WORKER_THREADS: usize = 4; const TIMEOUT: Duration = Duration::from_secs(5); /// A convenient wrapper that panics on timeout. -fn wait_timeout_while<'a, 'c, T, F>( +fn wait_timeout_while<'a, T, F>( guard: MutexGuard<'a, T>, - condvar: &'c Condvar, + condvar: &Condvar, condition: F, ) -> MutexGuard<'a, T> where From e9da4c2b763955180e34b324c630211e148ef79f Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Tue, 2 Apr 2024 21:18:01 +0800 Subject: [PATCH 44/48] Fix stale comments --- src/scheduler/scheduler.rs | 1 - src/scheduler/worker_monitor.rs | 16 ++++++++-------- src/vm/collection.rs | 10 +++++----- src/vm/tests/mock_tests/mock_test_init_fork.rs | 2 +- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index f11351b76f..836476bebc 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -516,7 +516,6 @@ impl GCWorkScheduler { } /// Called when GC has finished, i.e. when all work packets have been executed. - /// Return `true` if it scheduled the next GC immediately. fn on_gc_finished(&self, worker: &GCWorker) { // All GC workers must have parked by now. 
debug_assert!(!self.worker_group.has_designated_work()); diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs index 6694a2f404..573134ffaf 100644 --- a/src/scheduler/worker_monitor.rs +++ b/src/scheduler/worker_monitor.rs @@ -2,7 +2,7 @@ //! //! - allowing workers to park, //! - letting the last parked worker take action, and -//! - letting workers and mutators to notify workers when workers are given things to do. +//! - letting workers and mutators notify workers when workers are given things to do. use std::sync::{Condvar, Mutex}; @@ -118,7 +118,7 @@ impl WorkerMonitor { } } - /// Park a worker and wait on the CondVar `work_available`. + /// Park a worker and wait on the CondVar `workers_have_anything_to_do`. /// /// If it is the last worker parked, `on_last_parked` will be called. /// The argument of `on_last_parked` is true if `sync.gc_requested` is `true`. @@ -179,10 +179,10 @@ impl WorkerMonitor { // } // unlock(); // - // The actual condition for this `self.work_available.wait(sync)` is: + // The actual condition for this `self.workers_have_anything_to_do.wait(sync)` is: // // 1. any work packet is available, or - // 2. a request for scheduling GC is submitted. + // 2. a goal (such as doing GC) is requested // // But it is not used like the typical use pattern shown above, mainly because work // packets can be added without holding the mutex `self.sync`. This means one worker @@ -200,8 +200,8 @@ impl WorkerMonitor { // available, no other workers can add another work packet (because they all parked). // So the **last** parked worker can open more buckets or declare GC finished. // - // Condition (2), i.e. `sync.should_schedule_gc` is guarded by the mutator `sync`. - // When set (by a mutator via `request_schedule_collection`), it will notify a + // Condition (2), i.e. goals added to `sync.goals`, is guarded by the monitor `sync`. + // When a mutator adds a goal via `WorkerMonitor::make_request`, it will notify a // worker; and the last parked worker always checks it before waiting. So this // condition will not be set without any worker noticing. // @@ -211,8 +211,8 @@ impl WorkerMonitor { // Notes on spurious wake-up: // - // 1. The condition variable `work_available` is guarded by `self.sync`. Because the - // last parked worker is holding the mutex `self.sync` when executing + // 1. The condition variable `workers_have_anything_to_do` is guarded by `self.sync`. + // Because the last parked worker is holding the mutex `self.sync` when executing // `on_last_parked`, no workers can unpark (even if they spuriously wake up) during // `on_last_parked` because they cannot re-acquire the mutex `self.sync`. // diff --git a/src/vm/collection.rs b/src/vm/collection.rs index baf6642f26..16e87eebe0 100644 --- a/src/vm/collection.rs +++ b/src/vm/collection.rs @@ -4,7 +4,7 @@ use crate::util::opaque_pointer::*; use crate::vm::VMBinding; use crate::{scheduler::*, Mutator}; -/// Thread context for the spawned GC thread. It is used by spawn_gc_thread. +/// Thread context for the spawned GC thread. It is used by `spawn_gc_thread`. /// Currently, `GCWorker` is the only kind of thread that mmtk-core will create. pub enum GCThreadContext { /// The GC thread to spawn is a worker thread. There can be multiple worker threads. @@ -46,16 +46,16 @@ pub trait Collection { /// Ask the VM to spawn a GC thread for MMTk. A GC thread may later call into the VM through these VM traits. Some VMs /// have assumptions that those calls needs to be within VM internal threads. 
/// As a result, MMTk does not spawn GC threads itself to avoid breaking this kind of assumptions. - /// MMTk calls this method to spawn GC threads during [`initialize_collection()`](../memory_manager/fn.initialize_collection.html). + /// MMTk calls this method to spawn GC threads during [`crate::mmtk::MMTK::initialize_collection`] + /// and [`crate::mmtk::MMTK::after_fork`]. /// /// Arguments: /// * `tls`: The thread pointer for the parent thread that we spawn new threads from. This is the same `tls` when the VM /// calls `initialize_collection()` and passes as an argument. /// * `ctx`: The context for the GC thread. - /// * If `Worker` is passed, it means spawning a thread to run as a GC worker. - /// The spawned thread shall call `memory_manager::start_worker`. + /// * If [`GCThreadContext::Worker`] is passed, it means spawning a thread to run as a GC worker. + /// The spawned thread shall call the entry point function `GCWorker::run`. /// Currently `Worker` is the only kind of thread which mmtk-core will create. - /// In either case, the `Box` inside should be passed back to the called function. fn spawn_gc_thread(tls: VMThread, ctx: GCThreadContext); /// Inform the VM of an out-of-memory error. The binding should hook into the VM's error diff --git a/src/vm/tests/mock_tests/mock_test_init_fork.rs b/src/vm/tests/mock_tests/mock_test_init_fork.rs index 9605d4c71e..596ce6fe77 100644 --- a/src/vm/tests/mock_tests/mock_test_init_fork.rs +++ b/src/vm/tests/mock_tests/mock_test_init_fork.rs @@ -98,7 +98,7 @@ fn simple_spawn_gc_thread( } } -/// Test the `initial_collection` function with actual running GC threads, and the functions for +/// Test the `initialize_collection` function with actual running GC threads, and the functions for /// supporting forking. #[test] pub fn test_initialize_collection_and_fork() { From d293c04f2a736097dcd92a172db2a1ae157e174d Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Tue, 2 Apr 2024 22:16:11 +0800 Subject: [PATCH 45/48] Added unit tests Also made `park_and_wait` method more testable. --- src/scheduler/scheduler.rs | 3 +- src/scheduler/worker.rs | 1 + src/scheduler/worker_goals.rs | 36 +++++++++++ src/scheduler/worker_monitor.rs | 111 +++++++++++++++++++++++++++++--- 4 files changed, 141 insertions(+), 10 deletions(-) diff --git a/src/scheduler/scheduler.rs b/src/scheduler/scheduler.rs index 836476bebc..bc310bf482 100644 --- a/src/scheduler/scheduler.rs +++ b/src/scheduler/scheduler.rs @@ -402,8 +402,9 @@ impl GCWorkScheduler { return Ok(work); } + let ordinal = worker.ordinal; self.worker_monitor - .park_and_wait(worker, |goals| self.on_last_parked(worker, goals))?; + .park_and_wait(ordinal, |goals| self.on_last_parked(worker, goals))?; } } diff --git a/src/scheduler/worker.rs b/src/scheduler/worker.rs index 5105799acb..1e823c7417 100644 --- a/src/scheduler/worker.rs +++ b/src/scheduler/worker.rs @@ -109,6 +109,7 @@ impl GCWorkerShared { /// A special error type that indicate a worker should exit. /// This may happen if the VM needs to fork and asks workers to exit. +#[derive(Debug)] pub(crate) struct WorkerShouldExit; /// The result type of `GCWorker::pool`. 
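
The reworked signature also makes the exit path a plain error propagation.
The following self-contained sketch (illustrative only; `park_and_wait` and
`poll_once` here are stand-ins for the real methods) shows the pattern that
the scheduler's poll loop now uses with the `?` operator:

    #[derive(Debug)]
    struct WorkerShouldExit;

    fn park_and_wait(asked_to_exit: bool) -> Result<(), WorkerShouldExit> {
        // The real method blocks on a condvar; here we only model the result.
        if asked_to_exit {
            Err(WorkerShouldExit)
        } else {
            Ok(())
        }
    }

    fn poll_once(asked_to_exit: bool) -> Result<&'static str, WorkerShouldExit> {
        // `?` turns `Err(WorkerShouldExit)` into an early return, so the
        // caller's loop ends without a separate boolean flag.
        park_and_wait(asked_to_exit)?;
        Ok("work packet")
    }

    fn main() {
        assert!(poll_once(false).is_ok());
        assert!(poll_once(true).is_err());
    }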
diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs index eea70f5924..d11e4680f6 100644 --- a/src/scheduler/worker_goals.rs +++ b/src/scheduler/worker_goals.rs @@ -78,3 +78,39 @@ impl WorkerGoals { self.requests[goal] } } + +#[cfg(test)] +mod tests { + use super::{WorkerGoal, WorkerGoals}; + + #[test] + fn test_poll_none() { + let mut goals = WorkerGoals::default(); + let next_goal = goals.poll_next_goal(); + + assert!(matches!(next_goal, None)); + assert!(matches!(goals.current(), None)); + } + + #[test] + fn test_poll_one() { + let mut goals = WorkerGoals::default(); + goals.set_request(WorkerGoal::StopForFork); + let next_goal = goals.poll_next_goal(); + + assert!(matches!(next_goal, Some(WorkerGoal::StopForFork))); + assert!(matches!(goals.current(), Some(WorkerGoal::StopForFork))); + } + + #[test] + fn test_goals_priority() { + let mut goals = WorkerGoals::default(); + goals.set_request(WorkerGoal::StopForFork); + goals.set_request(WorkerGoal::Gc); + + let next_goal = goals.poll_next_goal(); + + assert!(matches!(next_goal, Some(WorkerGoal::Gc))); + assert!(matches!(goals.current(), Some(WorkerGoal::Gc))); + } +} diff --git a/src/scheduler/worker_monitor.rs b/src/scheduler/worker_monitor.rs index 573134ffaf..c84dedfcd3 100644 --- a/src/scheduler/worker_monitor.rs +++ b/src/scheduler/worker_monitor.rs @@ -6,12 +6,9 @@ use std::sync::{Condvar, Mutex}; -use crate::vm::VMBinding; - use super::{ worker::WorkerShouldExit, worker_goals::{WorkerGoal, WorkerGoals}, - GCWorker, }; /// The result type of the `on_last_parked` call-back in `WorkMonitor::park_and_wait`. @@ -127,13 +124,12 @@ impl WorkerMonitor { /// /// This function returns `Ok(())` if the current worker should continue working, /// or `Err(WorkerShouldExit)` if the current worker should exit now. - pub fn park_and_wait( + pub fn park_and_wait( &self, - worker: &GCWorker, + ordinal: usize, on_last_parked: F, ) -> Result<(), WorkerShouldExit> where - VM: VMBinding, F: FnOnce(&mut WorkerGoals) -> LastParkedResult, { let mut sync = self.sync.lock().unwrap(); @@ -142,7 +138,7 @@ impl WorkerMonitor { let all_parked = sync.parker.inc_parked_workers(); trace!( "Worker {} parked. parked/total: {}/{}. All parked: {}", - worker.ordinal, + ordinal, sync.parker.parked_workers, sync.parker.worker_count, all_parked @@ -151,7 +147,7 @@ impl WorkerMonitor { let mut should_wait = false; if all_parked { - trace!("Worker {} is the last worker parked.", worker.ordinal); + trace!("Worker {} is the last worker parked.", ordinal); let result = on_last_parked(&mut sync.goals); match result { LastParkedResult::ParkSelf => { @@ -229,7 +225,7 @@ impl WorkerMonitor { sync.parker.dec_parked_workers(); trace!( "Worker {} unparked. parked/total: {}/{}.", - worker.ordinal, + ordinal, sync.parker.parked_workers, sync.parker.worker_count, ); @@ -248,3 +244,100 @@ impl WorkerMonitor { sync.goals.on_current_goal_completed(); } } + +#[cfg(test)] +mod tests { + use std::sync::{ + atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, + }; + + use super::WorkerMonitor; + + /// Test if the `WorkerMonitor::park_and_wait` method calls the `on_last_parked` callback + /// properly. 
+ #[test] + fn test_last_worker_park_wake_all() { + let number_threads = 4; + let worker_monitor = Arc::new(WorkerMonitor::new(number_threads)); + let on_last_parked_called = AtomicUsize::new(0); + let should_unpark = AtomicBool::new(false); + + std::thread::scope(|scope| { + for ordinal in 0..number_threads { + let worker_monitor = worker_monitor.clone(); + let on_last_parked_called = &on_last_parked_called; + let should_unpark = &should_unpark; + scope.spawn(move || { + // This emulates the use pattern in the scheduler, i.e. checking the condition + // ("Is there any work packets available") without holding a mutex. + while !should_unpark.load(Ordering::SeqCst) { + println!("Thread {} parking...", ordinal); + worker_monitor + .park_and_wait(ordinal, |_goals| { + println!("Thread {} is the last thread parked.", ordinal); + on_last_parked_called.fetch_add(1, Ordering::SeqCst); + should_unpark.store(true, Ordering::SeqCst); + super::LastParkedResult::WakeAll + }) + .unwrap(); + println!("Thread {} unparked.", ordinal); + } + }); + } + }); + + // `on_last_parked` should only be called once. + assert_eq!(on_last_parked_called.load(Ordering::SeqCst), 1); + } + + /// Like `test_last_worker_park_wake_all`, but only wake up the last parked worker when it + /// parked. + #[test] + fn test_last_worker_park_wake_self() { + let number_threads = 4; + let worker_monitor = Arc::new(WorkerMonitor::new(number_threads)); + let on_last_parked_called = AtomicUsize::new(0); + let threads_running = AtomicUsize::new(0); + let should_unpark = AtomicBool::new(false); + + std::thread::scope(|scope| { + for ordinal in 0..number_threads { + let worker_monitor = worker_monitor.clone(); + let on_last_parked_called = &on_last_parked_called; + let threads_running = &threads_running; + let should_unpark = &should_unpark; + scope.spawn(move || { + let mut i_am_the_last_parked_worker = false; + // Record the number of threads entering the following `while` loop. + threads_running.fetch_add(1, Ordering::SeqCst); + while !should_unpark.load(Ordering::SeqCst) { + println!("Thread {} parking...", ordinal); + worker_monitor + .park_and_wait(ordinal, |_goals| { + println!("Thread {} is the last thread parked.", ordinal); + on_last_parked_called.fetch_add(1, Ordering::SeqCst); + should_unpark.store(true, Ordering::SeqCst); + i_am_the_last_parked_worker = true; + super::LastParkedResult::WakeSelf + }) + .unwrap(); + println!("Thread {} unparked.", ordinal); + } + threads_running.fetch_sub(1, Ordering::SeqCst); + + if i_am_the_last_parked_worker { + println!("The last parked worker woke up"); + // Only the current worker should wake and leave the `while` loop above. + assert_eq!(threads_running.load(Ordering::SeqCst), number_threads - 1); + should_unpark.store(true, Ordering::SeqCst); + worker_monitor.notify_work_available(true); + } + }); + } + }); + + // `on_last_parked` should only be called once. 
+ assert_eq!(on_last_parked_called.load(Ordering::SeqCst), 1); + } +} From d9ef6a36f76d24695efabc2ef7aeacc27bda5dbc Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Wed, 3 Apr 2024 10:30:00 +0800 Subject: [PATCH 46/48] Style fix --- src/scheduler/worker_goals.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scheduler/worker_goals.rs b/src/scheduler/worker_goals.rs index d11e4680f6..695471f7b2 100644 --- a/src/scheduler/worker_goals.rs +++ b/src/scheduler/worker_goals.rs @@ -88,8 +88,8 @@ mod tests { let mut goals = WorkerGoals::default(); let next_goal = goals.poll_next_goal(); - assert!(matches!(next_goal, None)); - assert!(matches!(goals.current(), None)); + assert!(next_goal.is_none()); + assert!(goals.current().is_none()); } #[test] From 31a660cf74245a8469dd2d55e13fbdf51c0450d9 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 4 Apr 2024 10:21:35 +0800 Subject: [PATCH 47/48] Fix typo --- src/scheduler/work_bucket.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/work_bucket.rs b/src/scheduler/work_bucket.rs index 4386832b76..b2923b565a 100644 --- a/src/scheduler/work_bucket.rs +++ b/src/scheduler/work_bucket.rs @@ -139,7 +139,7 @@ impl WorkBucket { self.notify_one_worker(); } - /// Add a work packet ot this bucket, but do not notify any workers. + /// Add a work packet to this bucket, but do not notify any workers. /// This is useful when the current thread is holding the mutex of `WorkerMonitor` which is /// used for notifying workers. This usually happens if the current thread is the last worker /// parked. From 49c979e18dbc6ac225e6f78f199867c131a3bcc0 Mon Sep 17 00:00:00 2001 From: Kunshan Wang Date: Thu, 4 Apr 2024 10:26:36 +0800 Subject: [PATCH 48/48] Add hyperlink --- src/scheduler/work_bucket.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduler/work_bucket.rs b/src/scheduler/work_bucket.rs index b2923b565a..ab55093bad 100644 --- a/src/scheduler/work_bucket.rs +++ b/src/scheduler/work_bucket.rs @@ -147,7 +147,7 @@ impl WorkBucket { self.queue.push(Box::new(work)); } - /// Like `add_no_notify`, but the work is boxed. + /// Like [`WorkBucket::add_no_notify`], but the work is boxed. pub(crate) fn add_boxed_no_notify(&self, work: Box>) { self.queue.push(work); }
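
Taken together, the series leaves the embedder-facing fork protocol looking
roughly like the sketch below. This is a hedged illustration based on the doc
comments in patches 34 and 39, not code from the series: `fork_vm` is a
made-up helper, and error handling is elided.

    use mmtk::util::VMThread;
    use mmtk::vm::VMBinding;
    use mmtk::MMTK;

    fn fork_vm<VM: VMBinding>(mmtk: &'static MMTK<VM>, tls: VMThread) -> libc::pid_t {
        // Ask all GC workers to save their contexts and exit.
        mmtk.prepare_to_fork();
        let pid = unsafe { libc::fork() };
        // Whichever process continues to use MMTk must respawn the GC worker
        // threads before allocating into the MMTk heap again. (This sketch
        // assumes both processes keep running the VM; a child that
        // immediately calls `exec` would skip the fork preparation entirely,
        // as patch 34 notes.)
        mmtk.after_fork(tls);
        pid
    }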