diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h
index 670bb3c3a..783ec6e76 100644
--- a/include/flexflow/request_manager.h
+++ b/include/flexflow/request_manager.h
@@ -248,6 +248,7 @@ struct NewProfileInfo {
   int num_generated_tokens;
   long long speculation_start_timestamp;
   long long speculation_end_timestamp;
+  long long suffix_tree_update_time;
 };
 struct RequestProfileInfo {
   int llm_prefilling_steps = 0;
diff --git a/inference/suffix_decoding/suffix_decoding.cc b/inference/suffix_decoding/suffix_decoding.cc
index a5a3d16f8..de8233373 100644
--- a/inference/suffix_decoding/suffix_decoding.cc
+++ b/inference/suffix_decoding/suffix_decoding.cc
@@ -719,7 +719,7 @@ void FlexFlow::top_level_task(Task const *task,
   }
   */
 
-  std::string header = "llm,partition,max_tree_depth,online_tree_update,matching_strategy,max_requests_per_batch,max_tokens_per_batch,request_guid,request_step_idx,timestamp,num_speculated_tokens,num_accepted_tokens,prefix_length,speculation_score,num_generated_tokens";
+  std::string header = "llm,partition,max_tree_depth,online_tree_update,matching_strategy,max_requests_per_batch,max_tokens_per_batch,request_guid,request_step_idx,timestamp,speculation_start_timestamp,speculation_end_timestamp,tree_update_time,num_speculated_tokens,num_accepted_tokens,prefix_length,speculation_score,num_generated_tokens";
 
   // csv filepath
   // create csv filepath and add header if it doesn't exist
@@ -755,6 +755,9 @@ void FlexFlow::top_level_task(Task const *task,
         file << info.request_guid << ","
              << info.request_step_idx << ","
              << info.timestamp << ","
+             << info.speculation_start_timestamp << ","
+             << info.speculation_end_timestamp << ","
+             << info.suffix_tree_update_time << ","
              << info.num_speculated_tokens << ","
              << info.num_accepted_tokens << ","
              << info.prefix_length << ","
diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc
index 59a2f616f..f3de55135 100644
--- a/src/runtime/request_manager.cc
+++ b/src/runtime/request_manager.cc
@@ -2177,6 +2177,7 @@ bool RequestManager::update_llm_suffix_decoding_results(
   bool request_completed = false;
 
   // Iterate over the requests
+  long long tree_update_time = 0;
   for (int request_index = 0; request_index < get_max_requests_per_batch();
        ++request_index) {
     if (!request_available[request_index]) {
@@ -2213,7 +2214,10 @@ bool RequestManager::update_llm_suffix_decoding_results(
       if (eos_token_found or request.decode_length() >= get_max_output_length() or
           request.tokens.size() >= get_max_sequence_length()) {
         if (get_suffix_tree_online_tree_update()) {
+          long long int start = Realm::Clock::current_time_in_microseconds();
           insert_completed_request_into_suffix_tree(request_index);
+          long long int end = Realm::Clock::current_time_in_microseconds();
+          tree_update_time += (end - start);
         }
         // Request is completed
         request_update_attainment(request_index, attained);
@@ -2230,6 +2234,24 @@ bool RequestManager::update_llm_suffix_decoding_results(
     }
   }
 
+  int idx = 0;
+  for (int request_index = 0; request_index < get_max_requests_per_batch(); ++request_index) {
+    if (!request_available[request_index]) {
+      // Request in this slot is unavailable
+      continue;
+    }
+    int guid = guid_of_requests[request_index];
+
+    // check that new_profiling_info[new_profiling_info.size() - nb_requests_decoded + idx] belongs to this request
+    assert((int)new_profiling_info.size() - nb_requests_decoded + idx >= 0);
+    assert(new_profiling_info.size() - nb_requests_decoded + idx < new_profiling_info.size());
+    assert(new_profiling_info[new_profiling_info.size() - nb_requests_decoded + idx].request_guid == guid);
+
+    new_profiling_info[new_profiling_info.size() - nb_requests_decoded + idx].suffix_tree_update_time = tree_update_time;
+
+    idx++;
+  }
+
   // Some requests may be completed after appending the verified tokens.
   // If there is a request completed, return true.
   return request_completed;
@@ -2937,6 +2959,9 @@ void RequestManager::get_verify_results_suffix_decoding(
     new_profile_info.timestamp = current_time;
     new_profile_info.request_guid = guid;
     new_profile_info.request_step_idx = profiling_requests[guid].llm_decoding_steps-1;
+    new_profile_info.speculation_start_timestamp = profiling_requests[guid].speculation_start_timestamp;
+    new_profile_info.speculation_end_timestamp = profiling_requests[guid].speculation_end_timestamp;
+    new_profile_info.suffix_tree_update_time = 0;
     new_profile_info.num_speculated_tokens = (int)request.suffix_decoding_best_token_ids.size();
     new_profile_info.num_accepted_tokens = accepted_tokens;
     new_profile_info.prefix_length = request.suffix_decoding_best_prefix_length;
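
For reference, the standalone sketch below illustrates the accumulate-and-backfill pattern this patch adds to update_llm_suffix_decoding_results(): time each suffix-tree insertion, sum the cost for the step, then write the total into the profile entries appended for that step. It is an approximation, not FlexFlow code: ProfileEntry, now_us(), and fake_insert() are hypothetical stand-ins for NewProfileInfo, Realm::Clock::current_time_in_microseconds(), and insert_completed_request_into_suffix_tree(), used only so the example compiles and runs on its own.

#include <cassert>
#include <chrono>
#include <cstdio>
#include <vector>

// Stand-in for NewProfileInfo (illustrative only).
struct ProfileEntry {
  int request_guid = -1;
  long long suffix_tree_update_time = 0; // microseconds
};

// Stand-in for Realm::Clock::current_time_in_microseconds().
static long long now_us() {
  using namespace std::chrono;
  return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

// Placeholder for the real suffix-tree insertion.
static void fake_insert() {}

int main() {
  // Pretend this decoding step appended profile entries for guids 7 and 9.
  std::vector<ProfileEntry> new_profiling_info = {{7, 0}, {9, 0}};
  int nb_requests_decoded = 2;

  // Accumulate the time spent updating the suffix tree during this step.
  long long tree_update_time = 0;
  for (int i = 0; i < 3; ++i) { // e.g. three requests completed this step
    long long start = now_us();
    fake_insert();
    long long end = now_us();
    tree_update_time += (end - start);
  }

  // Back-fill the last nb_requests_decoded entries with the accumulated cost,
  // mirroring the bounds checks in the patch.
  for (int idx = 0; idx < nb_requests_decoded; ++idx) {
    size_t pos = new_profiling_info.size() - nb_requests_decoded + idx;
    assert(pos < new_profiling_info.size());
    new_profiling_info[pos].suffix_tree_update_time = tree_update_time;
  }

  std::printf("tree_update_time = %lld us\n", tree_update_time);
  return 0;
}

The accumulated cost is shared by all requests decoded in the step rather than attributed per request, which matches how the patch records a single tree_update_time per batch iteration.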