-
Notifications
You must be signed in to change notification settings - Fork 526
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
schedule pendding online chunkserver #252
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,33 +32,97 @@ namespace curve { | |
namespace mds { | ||
namespace schedule { | ||
int CopySetScheduler::Schedule() { | ||
LOG(INFO) << "copysetScheduler begin"; | ||
LOG(INFO) << "schedule: copysetScheduler begin"; | ||
|
||
int res = 0; | ||
int oneRoundGenOp = 0; | ||
for (auto lid : topo_->GetLogicalpools()) { | ||
res = DoCopySetSchedule(lid); | ||
oneRoundGenOp += DoCopySetSchedule(lid); | ||
} | ||
return res; | ||
|
||
LOG(INFO) << "schedule: copysetScheduler end, generate operator num " | ||
<< oneRoundGenOp; | ||
return oneRoundGenOp; | ||
} | ||
|
||
int CopySetScheduler::DoCopySetSchedule(PoolIdType lid) { | ||
// 1. collect the chunkserver list and copyset list of the cluster, then | ||
// collect copyset on every online chunkserver | ||
auto copysetList = topo_->GetCopySetInfosInLogicalPool(lid); | ||
auto chunkserverList = topo_->GetChunkServersInLogicalPool(lid); | ||
std::map<ChunkServerIdType, std::vector<CopySetInfo>> distribute; | ||
SchedulerHelper::CopySetDistributionInOnlineChunkServer( | ||
copysetList, chunkserverList, &distribute); | ||
if (distribute.empty()) { | ||
LOG(WARNING) << "no not-retired chunkserver in topology"; | ||
return UNINTIALIZE_ID; | ||
int CopySetScheduler::PenddingCopySetSchedule(const std::map<ChunkServerIdType, | ||
std::vector<CopySetInfo>> &distribute) { | ||
int oneRoundGenOp = 0; | ||
// for every chunkserver, find one copyset to migrate out | ||
for (auto it = distribute.begin(); it != distribute.end(); it++) { | ||
ChunkServerIdType source = it->first; | ||
int copysetNum = it->second.size(); | ||
if (copysetNum == 0) { | ||
continue; | ||
} | ||
|
||
// find one copyset to migrate out from source chunkserver | ||
for (auto info : it->second) { | ||
// does not meet the basic conditions | ||
if (!CopySetSatisfiyBasicMigrationCond(info)) { | ||
continue; | ||
} | ||
|
||
auto target = SelectBestPlacementChunkServer(info, source); | ||
if (target == UNINTIALIZE_ID) { | ||
LOG(WARNING) << "copysetScheduler can not select chunkServer " | ||
"to migrate " << info.CopySetInfoStr() | ||
<< ", which replica: " << source << " is pendding"; | ||
continue; | ||
} | ||
|
||
Operator op = operatorFactory.CreateChangePeerOperator( | ||
info, source, target, OperatorPriority::HighPriority); | ||
op.timeLimit = std::chrono::seconds(changeTimeSec_); | ||
|
||
if (AddOperatorAndCreateCopyset(op, info, target)) { | ||
oneRoundGenOp++; | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How can I tell whether the migration is complete? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The function int StatusTool::ChunkServerListCmd() in status_tool.cpp has been modified so that it prints the pendding status of each chunkserver. |
||
} | ||
|
||
if (oneRoundGenOp != 0) { | ||
LOG(INFO) << "pendding copyset scheduler migrate " << oneRoundGenOp | ||
<< " copyset at this round"; | ||
} | ||
|
||
return oneRoundGenOp; | ||
} | ||
|
||
bool CopySetScheduler::AddOperatorAndCreateCopyset(const Operator &op, | ||
const CopySetInfo &choose, | ||
const ChunkServerIdType &target) { | ||
// add operator | ||
if (!opController_->AddOperator(op)) { | ||
LOG(INFO) << "copysetSchduler add op " << op.OpToString() | ||
<< " fail, copyset has already has operator" | ||
<< " or operator num exceeds the limit."; | ||
return false; | ||
} | ||
|
||
// create copyset | ||
if (!topo_->CreateCopySetAtChunkServer(choose.id, target)) { | ||
LOG(ERROR) << "copysetScheduler create " << choose.CopySetInfoStr() | ||
<< " on chunkServer: " << target | ||
<< " error, delete operator" << op.OpToString(); | ||
opController_->RemoveOperator(choose.id); | ||
return false; | ||
} | ||
|
||
LOG(INFO) << "copysetScheduler create " << choose.CopySetInfoStr() | ||
<< "on chunkserver:" << target | ||
<< " success. generator op: " | ||
<< op.OpToString() << "success"; | ||
return true; | ||
} | ||
|
||
int CopySetScheduler::NormalCopySetSchedule(const std::map<ChunkServerIdType, | ||
std::vector<CopySetInfo>> &distribute) { | ||
// 2. measure the average, range and standard deviation of number of copyset | ||
// on chunkservers | ||
float avg; | ||
int range; | ||
float stdvariance; | ||
int oneRoundGenOp = 0; | ||
StatsCopysetDistribute(distribute, &avg, &range, &stdvariance); | ||
/** | ||
* 3. Set migration condition | ||
|
@@ -83,37 +147,53 @@ int CopySetScheduler::DoCopySetSchedule(PoolIdType lid) { | |
**/ | ||
ChunkServerIdType source = UNINTIALIZE_ID; | ||
if (range <= avg * copysetNumRangePercent_) { | ||
return source; | ||
return oneRoundGenOp; | ||
} | ||
|
||
Operator op; | ||
ChunkServerIdType target = UNINTIALIZE_ID; | ||
CopySetInfo choose; | ||
// this function call will select the source, target and the copyset | ||
if (CopySetMigration(distribute, &op, &source, &target, &choose)) { | ||
// add operator | ||
if (!opController_->AddOperator(op)) { | ||
LOG(INFO) << "copysetSchduler add op " << op.OpToString() | ||
<< " fail, copyset has already has operator"; | ||
if (AddOperatorAndCreateCopyset(op, choose, target)) { | ||
oneRoundGenOp++; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This piece of code overlaps with the code above: if (CopySetMigration(distribute, &op, &source, &target, &choose)) { There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Most of the processing logic is different. |
||
} | ||
} | ||
|
||
return oneRoundGenOp; | ||
} | ||
|
||
// create copyset | ||
if (!topo_->CreateCopySetAtChunkServer(choose.id, target)) { | ||
LOG(ERROR) << "copysetScheduler create " << choose.CopySetInfoStr() | ||
<< " on chunkServer: " << target | ||
<< " error, delete operator" << op.OpToString(); | ||
opController_->RemoveOperator(choose.id); | ||
} else { | ||
LOG(INFO) << "copysetScheduler create " << choose.CopySetInfoStr() | ||
<< "on chunkserver:" << target | ||
<< " success. generator op: " | ||
<< op.OpToString() << "success"; | ||
int CopySetScheduler::DoCopySetSchedule(PoolIdType lid) { | ||
// 1. collect the chunkserver list and copyset list of the cluster, then | ||
// collect copyset on every online chunkserver | ||
auto copysetList = topo_->GetCopySetInfosInLogicalPool(lid); | ||
auto chunkserverList = topo_->GetChunkServersInLogicalPool(lid); | ||
|
||
std::map<ChunkServerIdType, std::vector<CopySetInfo>> penddingDistribute; | ||
SchedulerHelper::GetCopySetDistributionInOnlineChunkServer( | ||
copysetList, chunkserverList, &penddingDistribute); | ||
SchedulerHelper::FilterCopySetDistributions(ChunkServerStatus::PENDDING, | ||
chunkserverList, &penddingDistribute); | ||
if (!penddingDistribute.empty()) { | ||
int oneRoundGenOp = PenddingCopySetSchedule(penddingDistribute); | ||
// If generate pendding copy set schedule, return here. | ||
if (oneRoundGenOp != 0) { | ||
return oneRoundGenOp; | ||
} | ||
} | ||
|
||
LOG_EVERY_N(INFO, 20) << "copysetScheduler is continually adjusting"; | ||
LOG(INFO) << "copysetScheduler end."; | ||
return static_cast<int>(source); | ||
// If no pendding copyset schedule operator generated, | ||
// run NormalCopySetSchedule | ||
std::map<ChunkServerIdType, std::vector<CopySetInfo>> normalDistribute; | ||
SchedulerHelper::GetCopySetDistributionInOnlineChunkServer( | ||
copysetList, chunkserverList, &normalDistribute); | ||
SchedulerHelper::FilterCopySetDistributions(ChunkServerStatus::READWRITE, | ||
chunkserverList, &normalDistribute); | ||
if (normalDistribute.empty()) { | ||
LOG(WARNING) << "no not-retired chunkserver in topology"; | ||
return 0; | ||
} | ||
return NormalCopySetSchedule(normalDistribute); | ||
} | ||
|
||
void CopySetScheduler::StatsCopysetDistribute( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,12 +34,15 @@ namespace curve { | |
namespace mds { | ||
namespace schedule { | ||
int LeaderScheduler::Schedule() { | ||
LOG(INFO) << "leaderScheduler begin."; | ||
|
||
LOG(INFO) << "schedule: leaderScheduler begin."; | ||
int oneRoundGenOp = 0; | ||
for (auto lid : topo_->GetLogicalpools()) { | ||
DoLeaderSchedule(lid); | ||
oneRoundGenOp += DoLeaderSchedule(lid); | ||
} | ||
return 1; | ||
|
||
LOG(INFO) << "schedule: leaderScheduler end, generate operator num " | ||
<< oneRoundGenOp; | ||
return oneRoundGenOp; | ||
} | ||
|
||
int LeaderScheduler::DoLeaderSchedule(PoolIdType lid) { | ||
|
@@ -57,7 +60,8 @@ int LeaderScheduler::DoLeaderSchedule(PoolIdType lid) { | |
std::shuffle(csInfos.begin(), csInfos.end(), g); | ||
|
||
for (auto csInfo : csInfos) { | ||
if (csInfo.IsOffline()) { | ||
// skip offline chunkserver or pendding chunkserver | ||
if (csInfo.IsOffline() || csInfo.IsPendding()) { | ||
continue; | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if (csInfo.IsOffline() || csInfo.IsPendding()) { There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
|
@@ -177,6 +181,12 @@ bool LeaderScheduler::transferLeaderOut(ChunkServerIdType source, int count, | |
break; | ||
} | ||
|
||
// can not transfer to pendding chunkserver | ||
if (csInfo.IsPendding()) { | ||
continue; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. if (csInfo.IsOffline() || csInfo.IsPendding()) { There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The offline and pendding cases are handled differently. If the chunkserver is offline, the code breaks out of the while loop. If the chunkserver is pendding, it skips this round with continue. |
||
|
||
// can not transfer to myself | ||
if (source == peerInfo.id) { | ||
continue; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -353,7 +353,7 @@ bool SchedulerHelper::InvovledReplicasSatisfyScatterWidthAfterMigration( | |
return allSatisfy; | ||
} | ||
|
||
void SchedulerHelper::CopySetDistributionInOnlineChunkServer( | ||
void SchedulerHelper::GetCopySetDistributionInOnlineChunkServer( | ||
const std::vector<CopySetInfo> ©setList, | ||
const std::vector<ChunkServerInfo> &chunkserverList, | ||
std::map<ChunkServerIdType, std::vector<CopySetInfo>> *out) { | ||
|
@@ -370,15 +370,30 @@ void SchedulerHelper::CopySetDistributionInOnlineChunkServer( | |
|
||
// remove offline chunkserver, and report empty list for empty chunkserver | ||
for (auto item : chunkserverList) { | ||
// remove offline chunkserver | ||
if (item.IsOffline()) { | ||
out->erase(item.info.id); | ||
continue; | ||
} | ||
|
||
// report empty list for chunkserver with no copyset | ||
if (out->find(item.info.id) == out->end()) { | ||
(*out)[item.info.id] = std::vector<CopySetInfo>{}; | ||
} | ||
} | ||
} | ||
|
||
void SchedulerHelper::FilterCopySetDistributions(const ChunkServerStatus status, | ||
const std::vector<ChunkServerInfo> &chunkserverList, | ||
std::map<ChunkServerIdType, std::vector<CopySetInfo>> *distributions) { | ||
for (auto item : chunkserverList) { | ||
if (item.status != status) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To find the chunkservers with the same status, this condition alone is sufficient. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. done |
||
distributions->erase(item.info.id); | ||
} | ||
} | ||
return; | ||
} | ||
|
||
} // namespace schedule | ||
} // namespace mds | ||
} // namespace curve | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be more reasonable to schedule chunkservers in pendding status through recoverSchedule instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
RecoverSchedule handles copysets that have already lost at least one copy. CopysetScheduler handles copysets that still have all of their copies.