In fld_client_rpc(), if the FLD_QUERY request between MDTs fails
with -EWOULDBLOCK because the connection is lost, return -EAGAIN
to notify the caller to retry.
It also reverts the patch https://review.whamcloud.com/12586/, which
was landed as b2_6_90_0-5-g6db07f0 to avoid returning -EAGAIN from
lod_object_init(), because that confused lu_object_find_at() (which
assumed the object was dying when it encountered -EAGAIN). In the
current Lustre version, lu_object_find_at() just returns the found
object and lets the caller check whether it is dying.
Fixes:
6db07f095fba ("LU-5871 lod: Do not return EAGAIN in lod_object_init")
Change-Id: Ie83ebfdae2bd50c96a59a065f7f3c3dcfad04e42
Signed-off-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/34962
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
LASSERT(exp != NULL);
-again:
imp = class_exp2cliimp(exp);
switch (fld_op) {
case FLD_QUERY:
req->rq_reply_portal = MDC_REPLY_PORTAL;
ptlrpc_at_set_req_timeout(req);
- obd_get_request_slot(&exp->exp_obd->u.cli);
- rc = ptlrpc_queue_wait(req);
- obd_put_request_slot(&exp->exp_obd->u.cli);
+ /*
+  * Fault injection for OBD_FAIL_FLD_QUERY_REQ: skip sending the RPC and
+  * fake the failure instead, but only for requests marked rq_no_delay.
+  * The fail id must be passed to OBD_FAIL_CHECK() on its own; folding
+  * rq_no_delay into the macro argument with && would collapse the id
+  * to 0 or 1, so the check would never be made against 0x1103.
+  */
+ if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ) && req->rq_no_delay) {
+ /* the same error returned by ptlrpc_import_delay_req */
+ rc = -EWOULDBLOCK;
+ req->rq_status = rc;
+ } else {
+ /* Normal path: send the request while holding a request slot. */
+ obd_get_request_slot(&exp->exp_obd->u.cli);
+ rc = ptlrpc_queue_wait(req);
+ obd_put_request_slot(&exp->exp_obd->u.cli);
+ }
if (rc == -ENOENT) {
/* Don't loop forever on non-existing FID sequences. */
OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) &&
rc != -ENOTSUPP) {
/*
- * Since LWP is not replayable, so it will keep
- * trying unless umount happens or the remote
- * target does not support the operation, otherwise
- * it would cause unecessary failure of the
- * application.
+ * Since LWP is not replayable, so notify the caller
+ * to retry if needed after a while.
*/
- ptlrpc_req_finished(req);
- rc = 0;
- goto again;
+ rc = -EAGAIN;
}
GOTO(out_req, rc);
}
#define OBD_FAIL_FLD 0x1100
#define OBD_FAIL_FLD_QUERY_NET 0x1101
#define OBD_FAIL_FLD_READ_NET 0x1102
+#define OBD_FAIL_FLD_QUERY_REQ 0x1103
#define OBD_FAIL_SEC_CTX 0x1200
#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201
ENTRY;
rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
- if (rc != 0) {
- /* Note: Sometimes, it will Return EAGAIN here, see
- * ptrlpc_import_delay_req(), which might confuse
- * lu_object_find_at() and make it wait there incorrectly.
- * so we convert it to EIO here.*/
- if (rc == -EAGAIN)
- rc = -EIO;
-
+ if (rc != 0)
RETURN(rc);
- }
if (type == LU_SEQ_RANGE_MDT &&
idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
}
run_test 110j "drop update reply during cross-MDT ln"
+# test_110k: restart mds2 with abort_recovery while fail_loc 0x1103
+# (OBD_FAIL_FLD_QUERY_REQ) is set on it, and require that the start
+# succeeds. Afterwards mds2 is restarted normally and the client is
+# remounted.
+test_110k() {
+ # Skip unless there are at least two MDTs and MDS1 is >= 2.12.55.
+ [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTS"
+ [[ $MDS1_VERSION -ge $(version_code 2.12.55) ]] ||
+ { skip "Need MDS version at least 2.12.55"; }
+
+ # Take mds2 down and unmount the client before arming the fail_loc.
+ stop mds2 || error "stop mds2 failed"
+ umount $MOUNT
+
+#define OBD_FAIL_FLD_QUERY_REQ 0x1103
+ # Arm the fail_loc only for the abort_recovery start, then clear it.
+ do_facet mds2 lctl set_param fail_loc=0x1103
+ start mds2 $(mdsdevname 2) -o abort_recovery ||
+ error "start MDS with abort_recovery should succeed"
+ do_facet mds2 lctl set_param fail_loc=0
+
+ # cleanup
+ # Restart mds2 without abort_recovery, remount the client and check
+ # that it is usable again.
+ stop mds2 || error "cleanup: stop mds2 failed"
+ start mds2 $(mdsdevname 2) || error "cleanup: start mds2 failed"
+ zconf_mount $(hostname) $MOUNT || error "cleanup: mount failed"
+ client_up || error "post-failover df failed"
+}
+run_test 110k "FID_QUERY failed during recovery"
+
# LU-2844 mdt prepare fail should not cause umount oops
test_111 ()
{