Skip to content

Commit 032951d

Browse files
idryomovgregkh
authored andcommitted
libceph: apply new_state before new_up_client on incrementals
commit 930c532869774ebf8af9efe9484c597f896a7d46 upstream. Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 79cc80f commit 032951d

1 file changed

Lines changed: 113 additions & 43 deletions

File tree

net/ceph/osdmap.c

Lines changed: 113 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,6 +1191,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
11911191
return map;
11921192
}
11931193

1194+
/*
1195+
* Encoding order is (new_up_client, new_state, new_weight). Need to
1196+
* apply in the (new_weight, new_state, new_up_client) order, because
1197+
* an incremental map may look like e.g.
1198+
*
1199+
* new_up_client: { osd=6, addr=... } # set osd_state and addr
1200+
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
1201+
*/
1202+
static int decode_new_up_state_weight(void **p, void *end,
1203+
struct ceph_osdmap *map)
1204+
{
1205+
void *new_up_client;
1206+
void *new_state;
1207+
void *new_weight_end;
1208+
u32 len;
1209+
1210+
new_up_client = *p;
1211+
ceph_decode_32_safe(p, end, len, e_inval);
1212+
len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
1213+
ceph_decode_need(p, end, len, e_inval);
1214+
*p += len;
1215+
1216+
new_state = *p;
1217+
ceph_decode_32_safe(p, end, len, e_inval);
1218+
len *= sizeof(u32) + sizeof(u8);
1219+
ceph_decode_need(p, end, len, e_inval);
1220+
*p += len;
1221+
1222+
/* new_weight */
1223+
ceph_decode_32_safe(p, end, len, e_inval);
1224+
while (len--) {
1225+
s32 osd;
1226+
u32 w;
1227+
1228+
ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
1229+
osd = ceph_decode_32(p);
1230+
w = ceph_decode_32(p);
1231+
BUG_ON(osd >= map->max_osd);
1232+
pr_info("osd%d weight 0x%x %s\n", osd, w,
1233+
w == CEPH_OSD_IN ? "(in)" :
1234+
(w == CEPH_OSD_OUT ? "(out)" : ""));
1235+
map->osd_weight[osd] = w;
1236+
1237+
/*
1238+
* If we are marking in, set the EXISTS, and clear the
1239+
* AUTOOUT and NEW bits.
1240+
*/
1241+
if (w) {
1242+
map->osd_state[osd] |= CEPH_OSD_EXISTS;
1243+
map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
1244+
CEPH_OSD_NEW);
1245+
}
1246+
}
1247+
new_weight_end = *p;
1248+
1249+
/* new_state (up/down) */
1250+
*p = new_state;
1251+
len = ceph_decode_32(p);
1252+
while (len--) {
1253+
s32 osd;
1254+
u8 xorstate;
1255+
int ret;
1256+
1257+
osd = ceph_decode_32(p);
1258+
xorstate = ceph_decode_8(p);
1259+
if (xorstate == 0)
1260+
xorstate = CEPH_OSD_UP;
1261+
BUG_ON(osd >= map->max_osd);
1262+
if ((map->osd_state[osd] & CEPH_OSD_UP) &&
1263+
(xorstate & CEPH_OSD_UP))
1264+
pr_info("osd%d down\n", osd);
1265+
if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
1266+
(xorstate & CEPH_OSD_EXISTS)) {
1267+
pr_info("osd%d does not exist\n", osd);
1268+
map->osd_weight[osd] = CEPH_OSD_IN;
1269+
ret = set_primary_affinity(map, osd,
1270+
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1271+
if (ret)
1272+
return ret;
1273+
memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
1274+
map->osd_state[osd] = 0;
1275+
} else {
1276+
map->osd_state[osd] ^= xorstate;
1277+
}
1278+
}
1279+
1280+
/* new_up_client */
1281+
*p = new_up_client;
1282+
len = ceph_decode_32(p);
1283+
while (len--) {
1284+
s32 osd;
1285+
struct ceph_entity_addr addr;
1286+
1287+
osd = ceph_decode_32(p);
1288+
ceph_decode_copy(p, &addr, sizeof(addr));
1289+
ceph_decode_addr(&addr);
1290+
BUG_ON(osd >= map->max_osd);
1291+
pr_info("osd%d up\n", osd);
1292+
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1293+
map->osd_addr[osd] = addr;
1294+
}
1295+
1296+
*p = new_weight_end;
1297+
return 0;
1298+
1299+
e_inval:
1300+
return -EINVAL;
1301+
}
1302+
11941303
/*
11951304
* decode and apply an incremental map update.
11961305
*/
@@ -1290,49 +1399,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
12901399
__remove_pg_pool(&map->pg_pools, pi);
12911400
}
12921401

1293-
/* new_up */
1294-
ceph_decode_32_safe(p, end, len, e_inval);
1295-
while (len--) {
1296-
u32 osd;
1297-
struct ceph_entity_addr addr;
1298-
ceph_decode_32_safe(p, end, osd, e_inval);
1299-
ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
1300-
ceph_decode_addr(&addr);
1301-
pr_info("osd%d up\n", osd);
1302-
BUG_ON(osd >= map->max_osd);
1303-
map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
1304-
map->osd_addr[osd] = addr;
1305-
}
1306-
1307-
/* new_state */
1308-
ceph_decode_32_safe(p, end, len, e_inval);
1309-
while (len--) {
1310-
u32 osd;
1311-
u8 xorstate;
1312-
ceph_decode_32_safe(p, end, osd, e_inval);
1313-
xorstate = **(u8 **)p;
1314-
(*p)++; /* clean flag */
1315-
if (xorstate == 0)
1316-
xorstate = CEPH_OSD_UP;
1317-
if (xorstate & CEPH_OSD_UP)
1318-
pr_info("osd%d down\n", osd);
1319-
if (osd < map->max_osd)
1320-
map->osd_state[osd] ^= xorstate;
1321-
}
1322-
1323-
/* new_weight */
1324-
ceph_decode_32_safe(p, end, len, e_inval);
1325-
while (len--) {
1326-
u32 osd, off;
1327-
ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
1328-
osd = ceph_decode_32(p);
1329-
off = ceph_decode_32(p);
1330-
pr_info("osd%d weight 0x%x %s\n", osd, off,
1331-
off == CEPH_OSD_IN ? "(in)" :
1332-
(off == CEPH_OSD_OUT ? "(out)" : ""));
1333-
if (osd < map->max_osd)
1334-
map->osd_weight[osd] = off;
1335-
}
1402+
/* new_up_client, new_state, new_weight */
1403+
err = decode_new_up_state_weight(p, end, map);
1404+
if (err)
1405+
goto bad;
13361406

13371407
/* new_pg_temp */
13381408
err = decode_new_pg_temp(p, end, map);

0 commit comments

Comments
 (0)