Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Octopus 49533 for me #8

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions doc/man/8/ceph.rst
Original file line number Diff line number Diff line change
Expand Up @@ -985,11 +985,16 @@ data should remain readable and writeable, although data redundancy
may be reduced as some PGs may end up in a degraded (but active)
state. It will return a success code if it is okay to stop the
OSD(s), or an error code and informative message if it is not or if no
conclusion can be drawn at the current time.
conclusion can be drawn at the current time. When ``--max <num>`` is
provided, up to <num> OSD IDs will be returned (including the provided
OSDs) that can all be stopped simultaneously. This allows larger sets
of stoppable OSDs to be generated easily by providing a single
starting OSD and a max. Additional OSDs are drawn from adjacent locations
in the CRUSH hierarchy.

Usage::

ceph osd ok-to-stop <id> [<ids>...]
ceph osd ok-to-stop <id> [<ids>...] [--max <num>]

Subcommand ``pause`` pauses osd.

Expand Down
7 changes: 7 additions & 0 deletions qa/standalone/misc/ok-to-stop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ function TEST_0_osd() {
ceph osd ok-to-stop 3 || return 1
! ceph osd ok-to-stop 0 1 || return 1
! ceph osd ok-to-stop 2 3 || return 1
ceph osd ok-to-stop 0 --max 2 | grep '[0]' || return 1
ceph osd ok-to-stop 1 --max 2 | grep '[1]' || return 1

# with min_size 2 we can stop 1 osds
ceph osd pool set ec min_size 2 || return 1
Expand All @@ -274,6 +276,11 @@ function TEST_0_osd() {
! ceph osd ok-to-stop 0 1 2 || return 1
! ceph osd ok-to-stop 1 2 3 || return 1

ceph osd ok-to-stop 0 --max 2 | grep '[0,1]' || return 1
ceph osd ok-to-stop 0 --max 20 | grep '[0,1]' || return 1
ceph osd ok-to-stop 2 --max 2 | grep '[2,3]' || return 1
ceph osd ok-to-stop 2 --max 20 | grep '[2,3]' || return 1

# we should get the same result with one of the osds already down
kill_daemons $dir TERM osd.0 || return 1
ceph osd down 0 || return 1
Expand Down
25 changes: 12 additions & 13 deletions src/ceph.in
Original file line number Diff line number Diff line change
Expand Up @@ -1221,18 +1221,6 @@ def main():
errno.errorcode.get(ret, 'Unknown'), outs),
file=sys.stderr)

if ret < 0:
ret = -ret
errstr = errno.errorcode.get(ret, 'Unknown')
print(u'Error {0}: {1}'.format(errstr, outs), file=sys.stderr)
if len(targets) > 1:
final_ret = ret
else:
return ret

if outs:
print(prefix + outs, file=sys.stderr)

sys.stdout.flush()

if parsed_args.output_file:
Expand All @@ -1258,12 +1246,23 @@ def main():
except IOError as e:
if e.errno != errno.EPIPE:
raise e
final_e = None
try:
sys.stdout.flush()
except IOError as e:
if e.errno != errno.EPIPE:
raise e
final_e = e

if ret < 0:
ret = -ret
errstr = errno.errorcode.get(ret, 'Unknown')
print(u'Error {0}: {1}'.format(errstr, outs), file=sys.stderr)
final_ret = ret
elif outs:
print(prefix + outs, file=sys.stderr)

if final_e:
raise final_e

# Block until command completion (currently scrub and deep_scrub only)
if block:
Expand Down
206 changes: 151 additions & 55 deletions src/mgr/DaemonServer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,137 @@ void DaemonServer::log_access_denied(
"#client-authentication";
}

/**
 * Classify every PG that references one of the given OSDs, recording in
 * @p report whether it would stay usable if those OSDs went offline.
 *
 * For each entry of pgmap.pg_stat the remaining OSD set is built from
 * avail_no_missing (if the PG is degraded) or from acting (otherwise),
 * leaving out members of @p osds and CRUSH_ITEM_NONE entries.  PGs that do
 * not reference any member of @p osds are ignored.
 *
 * A PG is recorded as not_ok when any of the following holds:
 *  - its pool cannot be found in @p osdmap (bad_no_pool),
 *  - it is not currently active (bad_already_inactive),
 *  - its remaining OSD count is below the pool's min_size
 *    (bad_become_inactive).
 * Otherwise it is recorded as ok, and additionally as
 * ok_become_more_degraded or ok_become_degraded depending on whether it
 * is already degraded.
 *
 * @param osds    OSD ids being considered
 * @param osdmap  used to look up the pool of each PG
 * @param pgmap   source of the per-PG stats
 * @param report  output; reset to a fresh report before being filled in
 */
void DaemonServer::_check_offlines_pgs(
  const set<int>& osds,
  const OSDMap& osdmap,
  const PGMap& pgmap,
  offline_pg_report *report)
{
  // reset output
  *report = offline_pg_report();
  report->osds = osds;

  for (const auto& q : pgmap.pg_stat) {
    set<int32_t> pg_acting;  // net acting sets (with no missing if degraded)
    bool found = false;
    if (q.second.state & PG_STATE_DEGRADED) {
      for (auto& anm : q.second.avail_no_missing) {
        if (osds.count(anm.osd)) {
          found = true;
          continue;
        }
        if (anm.osd != CRUSH_ITEM_NONE) {
          pg_acting.insert(anm.osd);
        }
      }
    } else {
      for (auto& a : q.second.acting) {
        if (osds.count(a)) {
          found = true;
          continue;
        }
        if (a != CRUSH_ITEM_NONE) {
          pg_acting.insert(a);
        }
      }
    }
    if (!found) {
      // this PG does not touch any of the OSDs in question
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool());
    bool dangerous = false;
    if (!pi) {
      report->bad_no_pool.insert(q.first); // pool is creating or deleting
      dangerous = true;
    }
    if (!(q.second.state & PG_STATE_ACTIVE)) {
      report->bad_already_inactive.insert(q.first);
      dangerous = true;
    }
    // pi may be null here (see above); only consult min_size when the pool
    // exists, otherwise we would dereference a null pointer.
    if (pi && pg_acting.size() < pi->min_size) {
      report->bad_become_inactive.insert(q.first);
      dangerous = true;
    }
    if (dangerous) {
      report->not_ok.insert(q.first);
    } else {
      report->ok.insert(q.first);
      if (q.second.state & PG_STATE_DEGRADED) {
        report->ok_become_more_degraded.insert(q.first);
      } else {
        report->ok_become_degraded.insert(q.first);
      }
    }
  }
  dout(20) << osds << " -> " << report->ok.size() << " ok, "
	   << report->not_ok.size() << " not ok" << dendl;
}

/**
 * Try to grow a set of OSDs that is ok to stop, up to @p max members.
 *
 * @p orig_osds is first evaluated on its own into @p out_report.  If that
 * report is not ok_to_stop(), or the set is empty or already holds at least
 * @p max members, nothing more is done.  Otherwise the search starts at the
 * first OSD of the set and repeatedly moves to the next parent reported by
 * crush->get_immediate_parent_id(); at each level every child returned by
 * crush->get_all_children() that is a non-negative id, is up, and is not yet
 * in the set is tentatively added and re-checked.  Additions that keep the
 * report ok_to_stop() are kept and copied into @p out_report; the others
 * are rolled back.
 *
 * The search ends when @p max members are reached, when a CRUSH lookup
 * fails, or after finishing a level at which at least one candidate had to
 * be rejected.
 *
 * @param orig_osds   starting set of OSD ids
 * @param max         upper bound on the size of the resulting set
 * @param osdmap      provides the CRUSH map and OSD up state
 * @param pgmap       passed through to _check_offlines_pgs()
 * @param out_report  output; always holds the report for the last accepted set
 */
void DaemonServer::_maximize_ok_to_stop_set(
  const set<int>& orig_osds,
  unsigned max,
  const OSDMap& osdmap,
  const PGMap& pgmap,
  offline_pg_report *out_report)
{
  dout(20) << "orig_osds " << orig_osds << " max " << max << dendl;
  _check_offlines_pgs(orig_osds, osdmap, pgmap, out_report);
  if (!out_report->ok_to_stop()) {
    return;
  }
  if (orig_osds.empty() || orig_osds.size() >= max) {
    // nothing to start from (dereferencing begin() of an empty set below
    // would be undefined), or already at max
    return;
  }

  // semi-arbitrarily start with the first osd in the set
  offline_pg_report report;
  set<int> osds = orig_osds;
  int parent = *osds.begin();
  set<int> children;

  while (true) {
    // identify the next parent
    int r = osdmap.crush->get_immediate_parent_id(parent, &parent);
    if (r < 0) {
      return;  // just go with what we have so far!
    }

    // get candidate additions that are beneath this point in the tree
    children.clear();
    r = osdmap.crush->get_all_children(parent, &children);
    if (r < 0) {
      return;  // just go with what we have so far!
    }
    dout(20) << " parent " << parent << " children " << children << dendl;

    // try adding in more osds
    int failed = 0;  // how many children we failed to add to our set
    for (auto o : children) {
      if (o >= 0 && osdmap.is_up(o) && osds.count(o) == 0) {
	osds.insert(o);
	_check_offlines_pgs(osds, osdmap, pgmap, &report);
	if (!report.ok_to_stop()) {
	  // roll back this candidate; the previously accepted set stands
	  osds.erase(o);
	  ++failed;
	  continue;
	}
	*out_report = report;
	if (osds.size() == max) {
	  dout(20) << " hit max" << dendl;
	  return;  // yay, we hit the max
	}
      }
    }

    if (failed) {
      // we hit some failures; go with what we have
      dout(20) << " hit some peer failures" << dendl;
      return;
    }
  }
}

bool DaemonServer::_handle_command(
std::shared_ptr<CommandContext>& cmdctx)
{
Expand Down Expand Up @@ -1570,6 +1701,8 @@ bool DaemonServer::_handle_command(
vector<string> ids;
cmd_getval(cmdctx->cmdmap, "ids", ids);
set<int> osds;
int64_t max = 1;
cmd_getval(cmdctx->cmdmap, "max", max);
int r;
cluster_state.with_osdmap([&](const OSDMap& osdmap) {
r = osdmap.parse_osd_id_list(ids, &osds, &ss);
Expand All @@ -1578,78 +1711,41 @@ bool DaemonServer::_handle_command(
ss << "must specify one or more OSDs";
r = -EINVAL;
}
if (max < (int)osds.size()) {
max = osds.size();
}
if (r < 0) {
cmdctx->reply(r, ss);
return true;
}
int touched_pgs = 0;
int dangerous_pgs = 0;
offline_pg_report out_report;
cluster_state.with_osdmap_and_pgmap([&](const OSDMap& osdmap, const PGMap& pg_map) {
if (pg_map.num_pg_unknown > 0) {
ss << pg_map.num_pg_unknown << " pgs have unknown state; "
<< "cannot draw any conclusions";
r = -EAGAIN;
return;
}
for (const auto& q : pg_map.pg_stat) {
set<int32_t> pg_acting; // net acting sets (with no missing if degraded)
bool found = false;
if (q.second.state & PG_STATE_DEGRADED) {
for (auto& anm : q.second.avail_no_missing) {
if (osds.count(anm.osd)) {
found = true;
continue;
}
if (anm.osd != CRUSH_ITEM_NONE) {
pg_acting.insert(anm.osd);
}
}
} else {
for (auto& a : q.second.acting) {
if (osds.count(a)) {
found = true;
continue;
}
if (a != CRUSH_ITEM_NONE) {
pg_acting.insert(a);
}
}
}
if (!found) {
continue;
}
touched_pgs++;
if (!(q.second.state & PG_STATE_ACTIVE) ||
(q.second.state & PG_STATE_DEGRADED)) {
++dangerous_pgs;
continue;
}
const pg_pool_t *pi = osdmap.get_pg_pool(q.first.pool());
if (!pi) {
++dangerous_pgs; // pool is creating or deleting
} else {
if (pg_acting.size() < pi->min_size) {
++dangerous_pgs;
}
}
}
_maximize_ok_to_stop_set(
osds, max, osdmap, pg_map,
&out_report);
});
if (r) {
if (r < 0) {
cmdctx->reply(r, ss);
return true;
}
if (dangerous_pgs) {
ss << dangerous_pgs << " PGs are already too degraded, would become"
<< " too degraded or might become unavailable";
if (!f) {
f.reset(Formatter::create("json"));
}
f->dump_object("ok_to_stop", out_report);
f->flush(cmdctx->odata);
cmdctx->odata.append("\n");
if (!out_report.ok_to_stop()) {
ss << "unsafe to stop osd(s)";
cmdctx->reply(-EBUSY, ss);
return true;
} else {
cmdctx->reply(0, ss);
}
ss << "OSD(s) " << osds << " are ok to stop without reducing"
<< " availability or risking data, provided there are no other concurrent failures"
<< " or interventions." << std::endl;
ss << touched_pgs << " PGs are likely to be"
<< " degraded (but remain available) as a result.";
cmdctx->reply(0, ss);
return true;
} else if (prefix == "pg force-recovery" ||
prefix == "pg force-backfill" ||
Expand Down
Loading