From 296eeb4432e34f73eaf3c959f47f17993d728f7c Mon Sep 17 00:00:00 2001 From: d-netto Date: Fri, 20 Sep 2024 12:23:05 -0300 Subject: [PATCH] per-thread start-time; use a proper macro/API to increment metrics; split GC time into safepoint time and proper GC time; introduce internal spinlock time; introduce feature flag to report metrics in terms of CPU time --- base/locks-mt.jl | 2 -- src/gc.c | 19 ++++++++++++++----- src/init.c | 2 ++ src/julia_internal.h | 1 + src/julia_locks.h | 4 ++++ src/julia_threads.h | 10 ++++++++-- src/partr.c | 29 +++++++++++++++++++---------- src/safepoint.c | 15 +++++++++------ src/sys.c | 22 ++++++++++++++++++++++ src/threading.c | 26 +++++++------------------- src/threading.h | 1 - 11 files changed, 86 insertions(+), 45 deletions(-) diff --git a/base/locks-mt.jl b/base/locks-mt.jl index 5c5db758787ef8..5d355b9ed200c2 100644 --- a/base/locks-mt.jl +++ b/base/locks-mt.jl @@ -39,10 +39,8 @@ end Base.assert_havelock(l::SpinLock) = islocked(l) ? nothing : Base.concurrency_violation() function lock(l::SpinLock) - # t0 = time_ns() while true if @inline trylock(l) - # ccall(:jl_record_lock_spin_time, Cvoid, (Culonglong,), time_ns() - t0) return end ccall(:jl_cpu_suspend, Cvoid, ()) diff --git a/src/gc.c b/src/gc.c index 18b12d84bcc684..3ae7061194e0a7 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3752,7 +3752,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) int8_t old_state = jl_atomic_load_relaxed(&ptls->gc_state); jl_atomic_store_release(&ptls->gc_state, JL_GC_STATE_WAITING); // `jl_safepoint_start_gc()` makes sure only one thread can run the GC. - uint64_t t0 = jl_hrtime(); // time we entered GC + + // time we started the GC stop-the-world phase + uint64_t safepoint_start_wall_time = jl_hrtime(); + uint64_t safepoint_start_for_tls_timing = jl_record_time_for_tls_metric(); + if (!jl_safepoint_start_gc()) { // either another thread is running GC, or the GC got disabled just now. 
jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); @@ -3781,8 +3785,11 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_gc_wait_for_the_world(gc_all_tls_states, gc_n_threads); JL_PROBE_GC_STOP_THE_WORLD(); - uint64_t t1 = jl_hrtime(); // time user code stopped running - uint64_t duration = t1 - t0; + // time we ended the GC stop-the-world phase + uint64_t safepoint_end_wall_time = jl_hrtime(); + uint64_t safepoint_end_for_tls_timing = jl_record_time_for_tls_metric(); + + uint64_t duration = safepoint_end_wall_time - safepoint_start_wall_time; if (duration > gc_num.max_time_to_safepoint) gc_num.max_time_to_safepoint = duration; gc_num.time_to_safepoint = duration; @@ -3809,8 +3816,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) jl_safepoint_end_gc(); jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); JL_PROBE_GC_END(); - // Time how long GC took. - ptls->timing_tls.gc_time += jl_hrtime() - t1; + + // Measure how long safepoint (stop-the-world) and GC took + jl_increment_timing_tls_metric(ptls, safepoint_time, safepoint_end_for_tls_timing - safepoint_start_for_tls_timing); + jl_increment_timing_tls_metric(ptls, gc_time, jl_record_time_for_tls_metric() - safepoint_end_for_tls_timing); // Only disable finalizers on current thread // Doing this on all threads is racy (it's impossible to check diff --git a/src/init.c b/src/init.c index e482f8b77ee9ba..93b8681df6a3c9 100644 --- a/src/init.c +++ b/src/init.c @@ -903,6 +903,8 @@ static NOINLINE void _finish_julia_init(JL_IMAGE_SEARCH rel, jl_ptls_t ptls, jl_ jl_start_threads(); jl_start_gc_threads(); uv_barrier_wait(&thread_init_done); + // `start_time` is initialized to 0, so it's fine to use an increment call here + jl_increment_timing_tls_metric(ptls, start_time, jl_record_time_for_tls_metric()); uv_mutex_init(&array_to_string_print_lock); diff --git a/src/julia_internal.h b/src/julia_internal.h index 94b1f85112d7df..b6c0f6fda90c63 100644 --- a/src/julia_internal.h +++ 
b/src/julia_internal.h @@ -192,6 +192,7 @@ int jl_running_under_rr(int recheck) JL_NOTSAFEPOINT; // timers // Returns time in nanosec JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT; +JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_set_peek_cond(uintptr_t); JL_DLLEXPORT double jl_get_profile_peek_duration(void); diff --git a/src/julia_locks.h b/src/julia_locks.h index 47e258f69aab2f..e43f6130a2c6df 100644 --- a/src/julia_locks.h +++ b/src/julia_locks.h @@ -22,10 +22,14 @@ JL_DLLEXPORT int _jl_mutex_trylock_nogc(jl_task_t *self, jl_mutex_t *lock) JL_NO JL_DLLEXPORT int _jl_mutex_trylock(jl_task_t *self, jl_mutex_t *lock); JL_DLLEXPORT void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock); JL_DLLEXPORT void _jl_mutex_unlock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT; +// Unfortunately we can't include `julia_internal.h` here, so we need to forward declare this +JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT; static inline void jl_mutex_wait(jl_mutex_t *lock, int safepoint) { + uint64_t t0 = jl_record_time_for_tls_metric(); _jl_mutex_wait(jl_current_task, lock, safepoint); + jl_increment_timing_tls_metric(jl_current_task->ptls, internal_spinlock_time, jl_record_time_for_tls_metric() - t0); } static inline void jl_mutex_lock_nogc(jl_mutex_t *lock) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER diff --git a/src/julia_threads.h b/src/julia_threads.h index 63f0e457b71c7d..577991dace3267 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -126,12 +126,18 @@ struct _jl_bt_element_t; typedef struct { uint64_t start_time; - uint64_t sleep_time; + uint64_t internal_spinlock_time; uint64_t scheduler_time; - /* uint64_t lock_spin_time; */ + uint64_t safepoint_time; uint64_t gc_time; } jl_timing_tls_states_t; +#define jl_increment_timing_tls_metric(ptls, metric, dt) do { \ + if (ptls) { \ + ptls->timing_tls.metric += dt; \ + } \ +} while (0) + // This includes all the thread local states we care 
about for a thread. // Changes to TLS field types must be reflected in codegen. #define JL_MAX_BT_SIZE 80000 diff --git a/src/partr.c b/src/partr.c index b4b0b520c3f86f..e72c39e895cd24 100644 --- a/src/partr.c +++ b/src/partr.c @@ -136,6 +136,8 @@ void jl_parallel_gc_threadfun(void *arg) // free the thread argument here free(targ); + // `start_time` is initialized to 0, so it's fine to use an increment call here + jl_increment_timing_tls_metric(ptls, start_time, jl_record_time_for_tls_metric()); while (1) { uv_mutex_lock(&gc_threads_lock); @@ -171,6 +173,8 @@ void jl_concurrent_gc_threadfun(void *arg) // free the thread argument here free(targ); + // `start_time` is initialized to 0, so it's fine to use an increment call here + jl_increment_timing_tls_metric(ptls, start_time, jl_record_time_for_tls_metric()); while (1) { assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_CONCURRENT_COLLECTOR_THREAD); @@ -198,6 +202,8 @@ void jl_threadfun(void *arg) // free the thread argument here free(targ); + // `start_time` is initialized to 0, so it's fine to use an increment call here + jl_increment_timing_tls_metric(ptls, start_time, jl_record_time_for_tls_metric()); (void)jl_gc_unsafe_enter(ptls); jl_finish_task(ct); // noreturn @@ -385,18 +391,16 @@ static int may_sleep(jl_ptls_t ptls) JL_NOTSAFEPOINT extern _Atomic(unsigned) _threadedregion; -JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty) +static jl_task_t *_jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty) { jl_task_t *ct = jl_current_task; uint64_t start_cycles = 0; - uint64_t t0 = jl_hrtime(); + while (1) { - jl_ptls_t ptls = ct->ptls; jl_task_t *task = get_next_task(trypoptask, q); - if (task) { - ptls->timing_tls.scheduler_time += jl_hrtime() - t0; + if (task) return task; - } + // quick, race-y check to see if there seems to be any stuff in there jl_cpu_pause(); if (!check_empty(checkempty)) { @@ -405,6 +409,7 @@ JL_DLLEXPORT
jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, } jl_cpu_pause(); + jl_ptls_t ptls = ct->ptls; if (sleep_check_after_threshold(&start_cycles) || (ptls->tid == 0 && (!jl_atomic_load_relaxed(&_threadedregion) || wait_empty))) { // acquire sleep-check lock jl_atomic_store_relaxed(&ptls->sleep_check_state, sleeping); @@ -426,7 +431,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls); } if (task) - ptls->timing_tls.scheduler_time += jl_hrtime() - t0; return task; continue; } @@ -435,7 +439,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_atomic_store_relaxed(&ptls->sleep_check_state, not_sleeping); // let other threads know they don't need to wake us JL_PROBE_RT_SLEEP_CHECK_TASK_WAKE(ptls); } - ptls->timing_tls.scheduler_time += jl_hrtime() - t0; return task; } @@ -510,7 +513,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, // the other threads will just wait for an individual wake signal to resume JULIA_DEBUG_SLEEPWAKE( ptls->sleep_enter = cycleclock() ); - uint64_t tsleep0 = jl_hrtime(); int8_t gc_state = jl_gc_safe_enter(ptls); uv_mutex_lock(&ptls->sleep_lock); while (may_sleep(ptls)) { @@ -527,7 +529,6 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, assert(jl_atomic_load_relaxed(&ptls->sleep_check_state) == not_sleeping); uv_mutex_unlock(&ptls->sleep_lock); JULIA_DEBUG_SLEEPWAKE( ptls->sleep_leave = cycleclock() ); - ptls->timing_tls.sleep_time += jl_hrtime() - tsleep0; jl_gc_safe_leave(ptls, gc_state); // contains jl_gc_safepoint start_cycles = 0; if (task) { @@ -543,6 +544,14 @@ JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, } } +JL_DLLEXPORT jl_task_t *jl_task_get_next(jl_value_t *trypoptask, jl_value_t *q, jl_value_t *checkempty) +{ + uint64_t t0 = jl_record_time_for_tls_metric(); + jl_task_t *task = _jl_task_get_next(trypoptask, 
q, checkempty); + jl_increment_timing_tls_metric(jl_current_task->ptls, scheduler_time, jl_record_time_for_tls_metric() - t0); + return task; +} + #ifdef __cplusplus } #endif diff --git a/src/safepoint.c b/src/safepoint.c index 17fdd8c5263c10..c05ca035c50e08 100644 --- a/src/safepoint.c +++ b/src/safepoint.c @@ -156,15 +156,12 @@ void jl_safepoint_end_gc(void) uv_cond_broadcast(&safepoint_cond); } -void jl_safepoint_wait_gc(void) +void _jl_safepoint_wait_gc(void) { jl_task_t *ct = jl_current_task; (void)ct; JL_TIMING_SUSPEND_TASK(GC_SAFEPOINT, ct); // The thread should have set this is already - jl_ptls_t ptls = ct->ptls; - assert(jl_atomic_load_relaxed(&ptls->gc_state) != 0); - // Time how long this thread is stopped while GC is running. - uint64_t t0 = jl_hrtime(); + assert(jl_atomic_load_relaxed(&ct->ptls->gc_state) != 0); // Use normal volatile load in the loop for speed until GC finishes. // Then use an acquire load to make sure the GC result is visible on this thread. while (jl_atomic_load_relaxed(&jl_gc_running) || jl_atomic_load_acquire(&jl_gc_running)) { @@ -176,7 +173,13 @@ void jl_safepoint_wait_gc(void) uv_cond_wait(&safepoint_cond, &safepoint_lock); uv_mutex_unlock(&safepoint_lock); } - ptls->timing_tls.gc_time = jl_hrtime() - t0; +} + +void jl_safepoint_wait_gc(void) +{ + uint64_t t0 = jl_record_time_for_tls_metric(); + _jl_safepoint_wait_gc(); + jl_increment_timing_tls_metric(jl_current_task->ptls, gc_time, jl_record_time_for_tls_metric() - t0); } void jl_safepoint_enable_sigint(void) diff --git a/src/sys.c b/src/sys.c index 2de4bc61a20b81..8fb7fc46284b65 100644 --- a/src/sys.c +++ b/src/sys.c @@ -464,6 +464,28 @@ JL_DLLEXPORT uint64_t jl_hrtime(void) JL_NOTSAFEPOINT return uv_hrtime(); } +JL_DLLEXPORT uint64_t jl_cpu_time(void) JL_NOTSAFEPOINT +{ +#if defined(_OS_LINUX_) + struct rusage usage; + if (getrusage(RUSAGE_THREAD, &usage) == 0) { + return (uint64_t)usage.ru_utime.tv_sec * 1000000000 + (uint64_t)usage.ru_utime.tv_usec * 1000; + } + return 
0; +#else + return 0; +#endif +} + +JL_DLLEXPORT uint64_t jl_record_time_for_tls_metric(void) JL_NOTSAFEPOINT +{ +#ifdef USE_CPU_TIMING_FOR_TLS_METRIC + return jl_cpu_time(); +#else + return jl_hrtime(); +#endif +} + // -- iterating the environment -- #ifdef __APPLE__ diff --git a/src/threading.c b/src/threading.c index ee8c753b23a2b6..72d6168352b068 100644 --- a/src/threading.c +++ b/src/threading.c @@ -49,22 +49,19 @@ JL_DLLEXPORT _Atomic(uint8_t) jl_measure_compile_time_enabled = 0; JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_compile_time = 0; JL_DLLEXPORT _Atomic(uint64_t) jl_cumulative_recompile_time = 0; -static uint64_t jl_thread_start_time; -void jl_set_thread_start_time(void) -{ - jl_thread_start_time = jl_hrtime(); -} - -// TODO: not just current thread JL_DLLEXPORT uint64_t jl_thread_up_time(void) { - return jl_hrtime() - jl_thread_start_time; + jl_ptls_t ptls = jl_current_task->ptls; + return jl_record_time_for_tls_metric() - ptls->timing_tls.start_time; } + JL_DLLEXPORT uint64_t jl_thread_user_time(uint8_t tid) { jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid]; jl_timing_tls_states_t *timing = &ptls->timing_tls; - return jl_thread_up_time() - timing->gc_time - timing->scheduler_time; + return jl_record_time_for_tls_metric() - timing->start_time - + timing->internal_spinlock_time - timing->scheduler_time - + timing->safepoint_time - timing->gc_time; } JL_DLLEXPORT void *jl_get_ptls_states(void) @@ -783,7 +780,6 @@ void jl_start_threads(void) } uv_thread_detach(&uvtid); } - jl_set_thread_start_time(); } _Atomic(unsigned) _threadedregion; // HACK: keep track of whether to prioritize IO or threading @@ -813,13 +809,6 @@ JL_DLLEXPORT void jl_exit_threaded_region(void) // Profiling stubs -/* JL_DLLEXPORT void jl_record_lock_spin_time(uint64_t time) JL_NOTSAFEPOINT */ -/* { */ -/* jl_task_t *ct = jl_current_task; */ -/* jl_ptls_t ptls = ct->ptls; */ -/* ptls->timing_tls.lock_spin_time += time; */ -/* } */ - void _jl_mutex_init(jl_mutex_t 
*lock, const char *name) JL_NOTSAFEPOINT { jl_atomic_store_relaxed(&lock->owner, (jl_task_t*)NULL); @@ -842,12 +831,10 @@ void _jl_mutex_wait(jl_task_t *self, jl_mutex_t *lock, int safepoint) return; } JL_TIMING(LOCK_SPIN, LOCK_SPIN); - /* uint64_t t0 = jl_hrtime(); */ while (1) { if (owner == NULL && jl_atomic_cmpswap(&lock->owner, &owner, self)) { lock->count = 1; jl_profile_lock_acquired(lock); - /* jl_record_lock_spin_time(jl_hrtime() - t0); */ return; } if (safepoint) { @@ -947,6 +934,7 @@ void _jl_mutex_unlock(jl_task_t *self, jl_mutex_t *lock) } } + // Make gc alignment available for threading // see threads.jl alignment JL_DLLEXPORT int jl_alignment(size_t sz) diff --git a/src/threading.h b/src/threading.h index b6dfe95e089330..cb265376997132 100644 --- a/src/threading.h +++ b/src/threading.h @@ -30,7 +30,6 @@ void jl_init_threadinginfra(void); void jl_parallel_gc_threadfun(void *arg); void jl_concurrent_gc_threadfun(void *arg); void jl_threadfun(void *arg); -void jl_set_thread_start_time(void); #ifdef __cplusplus }