Skip to content

Commit

Permalink
Merge releases/2024/4 into master (openvinotoolkit#854)
Browse files Browse the repository at this point in the history
Co-authored-by: Alina Kladieva <alina.kladieva@intel.com>
Co-authored-by: Zlobin Vladimir <vladimir.zlobin@intel.com>
Co-authored-by: mzegla <milosz.zeglarski@intel.com>
Co-authored-by: Pavel Esir <pavel.esir@intel.com>
Co-authored-by: Pavel Esir <pavel.esir@gmail.com>
Co-authored-by: Artur Paniukov <chgk1101@gmail.com>
Co-authored-by: Ekaterina Aidova <ekaterina.aidova@intel.com>
Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
Co-authored-by: Mikhail Ryzhov <mikhail.ryzhov@intel.com>
Co-authored-by: Trawinski, Dariusz <dariusz.trawinski@intel.com>
Co-authored-by: TolyaTalamanov <anatoliy.talamanov@intel.com>
Co-authored-by: Andrei Kochin <andrei.kochin@intel.com>
  • Loading branch information
13 people authored Sep 12, 2024
1 parent 1bceaca commit 8887760
Show file tree
Hide file tree
Showing 21 changed files with 496 additions and 118 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/causal_lm_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ jobs:
| diff ./pred_greedy.txt -
cpp-chat_sample-ubuntu:
runs-on: ubuntu-20.04
runs-on: ubuntu-24.04
defaults:
run:
shell: bash
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lcm_dreamshaper_cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ concurrency:

jobs:
lcm_dreamshaper_v7_cpp-linux:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
defaults:
run:
shell: bash -l {0}
Expand Down
14 changes: 14 additions & 0 deletions src/cpp/include/openvino/genai/generation_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* @param ignore_eos if set to true, then generation will not stop even if <eos> token is met.
* @param eos_token_id token_id of <eos> (end of sentence)
* @param min_new_tokens set 0 probability for eos_token_id for the first min_new_tokens generated tokens. Ignored for non continuous batching.
* @param stop_strings set of strings that will cause the pipeline to stop generating further tokens. Ignored for non continuous batching.
* @param include_stop_str_in_output if set to true, the stop string that matched the generation will be included in the generation output (default: false)
* @param stop_token_ids set of token ids that will cause the pipeline to stop generating further tokens. Ignored for non continuous batching.
*
* Beam search specific parameters:
* @param num_beams number of beams for beam search. 1 disables beam search.
Expand Down Expand Up @@ -61,7 +64,9 @@ enum class StopCriteria { EARLY, HEURISTIC, NEVER };
* @param frequency_penalty reduces absolute log prob as many times as the token was generated. Ignored for non continuous batching.
* @param rng_seed initializes random generator. Ignored for non continuous batching.
*/

class OPENVINO_GENAI_EXPORTS GenerationConfig {

public:
GenerationConfig() = default;
explicit GenerationConfig(const std::string& json_path);
Expand All @@ -71,6 +76,11 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
size_t max_length = SIZE_MAX;
bool ignore_eos = false;
size_t min_new_tokens = 0;

std::set<std::string> stop_strings;
// Default setting in vLLM (and OpenAI API) is not to include stop string in the output
bool include_stop_str_in_output = false;
std::set<int64_t> stop_token_ids;

// Beam search specific
size_t num_beam_groups = 1;
Expand Down Expand Up @@ -99,6 +109,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig {
*/
void set_eos_token_id(size_t tokenizer_eos_token_id);
size_t get_max_new_tokens(size_t prompt_length = 0) const;

bool is_greedy_decoding() const;
bool is_beam_search() const;
bool is_multinomial() const;
Expand All @@ -123,6 +134,9 @@ static constexpr ov::Property<size_t> max_new_tokens{"max_new_tokens"};
static constexpr ov::Property<size_t> max_length{"max_length"};
static constexpr ov::Property<bool> ignore_eos{"ignore_eos"};
static constexpr ov::Property<size_t> min_new_tokens{"min_new_tokens"};
static constexpr ov::Property<std::vector<std::string>> stop_strings{"stop_strings"};
static constexpr ov::Property<bool> include_stop_str_in_output{"include_stop_str_in_output"};
static constexpr ov::Property<std::vector<std::vector<int64_t>>> stop_token_ids{"stop_token_ids"};

static constexpr ov::Property<size_t> num_beam_groups{"num_beam_groups"};
static constexpr ov::Property<size_t> num_beams{"num_beams"};
Expand Down
3 changes: 2 additions & 1 deletion src/cpp/include/openvino/genai/generation_handle.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ struct GenerationResult {
};

struct GenerationOutput {
std::vector<int64_t> generated_token_ids;
std::vector<int64_t> generated_ids;
std::vector<float> generated_log_probs;
float score;
GenerationFinishReason finish_reason;
};
Expand Down
8 changes: 4 additions & 4 deletions src/cpp/src/continuous_batching_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ class ContinuousBatchingPipeline::Impl {
m_scheduler = std::make_shared<Scheduler>(updated_config);
// and finally create model runner
m_model_runner = std::make_shared<ModelRunner>(infer_request, updated_config);
m_sampler = std::make_shared<Sampler>();
m_sampler = std::make_shared<Sampler>(m_tokenizer);
m_sampler->set_seed(m_generation_config.rng_seed);

// read default generation config
Expand Down Expand Up @@ -299,8 +299,8 @@ class ContinuousBatchingPipeline::Impl {
if (streamer_ptr) {
std::unordered_map<uint64_t, GenerationOutput> token = generations.at(0).get()->back();
OPENVINO_ASSERT(1 == token.size());
OPENVINO_ASSERT(1 == token.begin()->second.generated_token_ids.size());
continue_generation = !streamer_ptr->put(token.begin()->second.generated_token_ids.at(0));
OPENVINO_ASSERT(1 == token.begin()->second.generated_ids.size());
continue_generation = !streamer_ptr->put(token.begin()->second.generated_ids.at(0));
}
}
if (streamer_ptr) {
Expand All @@ -319,7 +319,7 @@ class ContinuousBatchingPipeline::Impl {
auto num_outputs = std::min(sampling_params[generation_idx].num_return_sequences, generation_outputs.size());
for (size_t generation_output_idx = 0; generation_output_idx < num_outputs; ++generation_output_idx) {
const auto& generation_output = generation_outputs[generation_output_idx];
result.m_generation_ids.push_back(std::move(generation_output.generated_token_ids));
result.m_generation_ids.push_back(std::move(generation_output.generated_ids));
result.m_scores.push_back(generation_output.score);
}
result.m_status = generation->get_status();
Expand Down
13 changes: 13 additions & 0 deletions src/cpp/src/generation_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ GenerationConfig::GenerationConfig(const std::string& json_path) {
read_json_param(data, "max_new_tokens", max_new_tokens);
read_json_param(data, "max_length", max_length);
// note that ignore_eos is not present in HF GenerationConfig
read_json_param(data, "ignore_eos", ignore_eos);
read_json_param(data, "min_new_tokens", min_new_tokens);
read_json_param(data, "stop_strings", stop_strings);
// note that include_stop_str_in_output is not present in HF GenerationConfig
read_json_param(data, "include_stop_str_in_output", include_stop_str_in_output);
// note that stop_token_ids is not present in HF GenerationConfig
read_json_param(data, "stop_token_ids", stop_token_ids);
read_json_param(data, "num_beam_groups", num_beam_groups);
read_json_param(data, "num_beams", num_beams);
read_json_param(data, "diversity_penalty", diversity_penalty);
Expand Down Expand Up @@ -57,6 +64,8 @@ void GenerationConfig::set_eos_token_id(size_t tokenizer_eos_token_id) {
"EOS token ID is different in generation config (", eos_token_id, ") and tokenizer (",
tokenizer_eos_token_id, ")");
}
// Merge user defined stop tokens with model EOS token
stop_token_ids.insert(eos_token_id);
}

void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) {
Expand All @@ -65,6 +74,10 @@ void GenerationConfig::update_generation_config(const ov::AnyMap& config_map) {
read_anymap_param(config_map, "max_new_tokens", max_new_tokens);
read_anymap_param(config_map, "max_length", max_length);
read_anymap_param(config_map, "ignore_eos", ignore_eos);
read_anymap_param(config_map, "min_new_tokens", min_new_tokens);
read_anymap_param(config_map, "stop_strings", stop_strings);
read_anymap_param(config_map, "include_stop_str_in_output", include_stop_str_in_output);
read_anymap_param(config_map, "stop_token_ids", stop_token_ids);
read_anymap_param(config_map, "num_beam_groups", num_beam_groups);
read_anymap_param(config_map, "num_beams", num_beams);
read_anymap_param(config_map, "diversity_penalty", diversity_penalty);
Expand Down
3 changes: 2 additions & 1 deletion src/cpp/src/generation_handle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ void add_partial_result(std::unordered_map<uint64_t, GenerationOutput>& partial_
if (partial_result_iter == partial_results.end()) {
partial_results.emplace(iteration_result.first, iteration_result.second);
} else {
partial_result_iter->second.generated_token_ids.push_back(iteration_result.second.generated_token_ids[0]);
partial_result_iter->second.generated_ids.push_back(iteration_result.second.generated_ids[0]);
partial_result_iter->second.generated_log_probs.push_back(iteration_result.second.generated_log_probs[0]);
partial_result_iter->second.score = iteration_result.second.score;
partial_result_iter->second.finish_reason = iteration_result.second.finish_reason;
}
Expand Down
11 changes: 6 additions & 5 deletions src/cpp/src/logit_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,13 +229,14 @@ class RepetitionPenaltyTransform : public IPenaltyTransformer {

class EOSPenaltyTransform : public ILogitTransformer {
public:
EOSPenaltyTransform(size_t eos_token_id, size_t min_generated_tokens) :
m_eos_token_id(eos_token_id), m_applicable_tensor_len(min_generated_tokens) {}
EOSPenaltyTransform(const std::set<int64_t>& stop_token_ids, size_t min_generated_tokens) :
m_stop_token_ids(stop_token_ids), m_applicable_tensor_len(min_generated_tokens) {}

void apply(Logits& logits) override {
// Since the EOS penalty is applied early, the token vector is not initialized yet
// and we can assume that element order matches token ids.
logits.m_data[m_eos_token_id] = 0.f;
for (auto stop_token_id: m_stop_token_ids)
logits.m_data[stop_token_id] = 0.f;
}


Expand All @@ -245,7 +246,7 @@ class EOSPenaltyTransform : public ILogitTransformer {

protected:
size_t m_applicable_tensor_len = std::numeric_limits<size_t>::max();
size_t m_eos_token_id;
std::set<int64_t> m_stop_token_ids;
};

class FrequencyPenaltyTransform : public IPenaltyTransformer {
Expand Down Expand Up @@ -317,7 +318,7 @@ class LogitProcessor {

if (sampling_params.min_new_tokens > 0) {
m_logit_transformers.emplace_back(
new LogitTransformers::EOSPenaltyTransform(sampling_params.eos_token_id, sampling_params.min_new_tokens)
new LogitTransformers::EOSPenaltyTransform(sampling_params.stop_token_ids, sampling_params.min_new_tokens)
);
}

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/perf_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace {

ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) {
if (durations.size() == 0) {
return {-1.0f, -1.0f};
return {-1, -1};
}
// Accepts time durations in microseconds and returns mean and standard deviation in milliseconds.
float mean = std::accumulate(durations.begin(), durations.end(), 0.0f,
Expand Down
Loading

0 comments on commit 8887760

Please sign in to comment.