Skip to content

Commit

Permalink
Back out "Roctracer: Implement reliable data flushing on stop" (#829)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #829

Revert due to profiling breakage on amd;

`while (s_flush.correlationId_ != 0) ` loops indefinitely, and I don't see `s_flush.correlationId_` being decremented anywhere so that is likely the issue.

Reviewed By: leitian, aaronenyeshi, xw285cornell

Differential Revision: D51185531

fbshipit-source-id: 1ec3f41f8089af3d48f9b03847bbfcbe44778b79
  • Loading branch information
nmacchioni authored and facebook-github-bot committed Nov 10, 2023
1 parent 8d99b09 commit 61d9111
Show file tree
Hide file tree
Showing 3 changed files with 2 additions and 58 deletions.
1 change: 1 addition & 0 deletions libkineto/src/RoctracerActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ void RoctracerActivityApi::clearActivities() {
kernelNames_.clear();
}


void RoctracerActivityApi::enableActivities(
const std::set<ActivityType>& selected_activities) {
#ifdef HAS_ROCTRACER
Expand Down
58 changes: 1 addition & 57 deletions libkineto/src/RoctracerLogger.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
#include <cstring>
#include <chrono>
#include <time.h>
#include <mutex>
#include <unistd.h>

#include "ThreadUtil.h"

Expand All @@ -26,21 +24,6 @@ using namespace std::chrono;

constexpr size_t kBufSize(2 * 1024 * 1024);

class Flush
{
public:
std::atomic<bool> doFlush_ {false};
std::mutex mutex_;
std::atomic<uint64_t> maxCorrelationId_;
uint64_t correlationId_ {0};
void reportCorrelation(const uint64_t &cid) {
uint64_t prev = maxCorrelationId_;
while (prev < cid && !maxCorrelationId_.compare_exchange_weak(prev, cid))
{}
}
};
static Flush s_flush;

RoctracerLogger& RoctracerLogger::singleton() {
static RoctracerLogger instance;
return instance;
Expand Down Expand Up @@ -108,7 +91,6 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca
case HIP_API_ID_hipExtLaunchKernel:
case HIP_API_ID_hipLaunchCooperativeKernel: // Should work here
{
s_flush.reportCorrelation(data->correlation_id);
auto &args = data->args.hipLaunchKernel;
dis->kernelRows_.emplace_back(data->correlation_id,
domain,
Expand All @@ -134,7 +116,6 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca
case HIP_API_ID_hipModuleLaunchKernel:
case HIP_API_ID_hipExtModuleLaunchKernel:
{
s_flush.reportCorrelation(data->correlation_id);
auto &args = data->args.hipModuleLaunchKernel;
dis->kernelRows_.emplace_back(data->correlation_id,
domain,
Expand Down Expand Up @@ -227,7 +208,6 @@ void RoctracerLogger::api_callback(uint32_t domain, uint32_t cid, const void* ca
case HIP_API_ID_hipMemcpyAsync:
case HIP_API_ID_hipMemcpyWithStream:
{
s_flush.reportCorrelation(data->correlation_id);
auto &args = data->args.hipMemcpyAsync;
dis->copyRows_.emplace_back(data->correlation_id,
domain,
Expand Down Expand Up @@ -272,22 +252,6 @@ void RoctracerLogger::activity_callback(const char* begin, const char* end, void
auto &gpuTraceBuffers = singleton().gpuTraceBuffers_;
memcpy(buffer, begin, size);
gpuTraceBuffers->emplace_back(buffer, size);

// If we are stopping the tracer, implement reliable flushing
if (s_flush.doFlush_) {
std::unique_lock<std::mutex> lock(s_flush.mutex_);
// scan the records looking for the final correlation id
const roctracer_record_t* record = (const roctracer_record_t*)(begin);
const roctracer_record_t* end_record = (const roctracer_record_t*)(end);

while (record < end_record) {
if (record->correlation_id == s_flush.correlationId_) {
s_flush.correlationId_ = 0; // Clear id to signal we found it
break;
}
roctracer_next_record(record, &record);
}
}
}

void RoctracerLogger::startLogging() {
Expand Down Expand Up @@ -348,32 +312,12 @@ void RoctracerLogger::startLogging() {
}

externalCorrelationEnabled_ = true;
logging_ = true;
roctracer_start();
}

void RoctracerLogger::stopLogging() {
if (logging_ == false)
return;

// If we are stopping the tracer, implement reliable flushing
std::unique_lock<std::mutex> lock(s_flush.mutex_);

s_flush.doFlush_ = true;
s_flush.correlationId_ = s_flush.maxCorrelationId_; // load ending id from the running max

// Poll on the worker finding the record and clearing s_flush.correlationId_
while (s_flush.correlationId_ != 0) {
lock.unlock();
roctracer_flush_activity_expl(hccPool_);
usleep(1000);
lock.lock();
}

s_flush.doFlush_ = false;

roctracer_stop();
logging_ = false;
roctracer_flush_activity_expl(hccPool_);
}

void RoctracerLogger::endTracing() {
Expand Down
1 change: 0 additions & 1 deletion libkineto/src/RoctracerLogger.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ class RoctracerLogger {

std::unique_ptr<std::list<RoctracerActivityBuffer>> gpuTraceBuffers_;
bool externalCorrelationEnabled_{true};
bool logging_{false};

friend class onnxruntime::profiling::RocmProfiler;
friend class libkineto::RoctracerActivityApi;
Expand Down

0 comments on commit 61d9111

Please sign in to comment.