From 0260125850038b0c16e9aeb17f922eaa71a47306 Mon Sep 17 00:00:00 2001 From: "John L. Hammond" Date: Fri, 3 Sep 2021 13:08:00 -0500 Subject: [PATCH] EX-3749 mdc: non blocking close and changelog release Use interruptible RPCs in mdc_close() and the llog client functions. In chlg_release() send a SIGKILL to the changelog producer thread to wake from waiting on any RPC responses. Add sanity.sh test 160r to verify. Lustre-change: https://review.whamcloud.com/44842 Lustre-commit: 83e0eb2be4fa61344137b2f50724a1ca1dc532ae Signed-off-by: John L. Hammond Change-Id: I5f8b7cd839f5d6cf704ac07c7d583c45013921c4 Reviewed-by: Alex Zhuravlev Reviewed-by: Mike Pershin Reviewed-on: https://review.whamcloud.com/44960 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Andreas Dilger --- lustre/include/obd_support.h | 3 +++ lustre/mdc/mdc_changelog.c | 6 +++++- lustre/mdc/mdc_request.c | 2 ++ lustre/ptlrpc/llog_client.c | 8 ++++++++ lustre/ptlrpc/llog_server.c | 6 ++++++ lustre/tests/hot-pools.sh | 27 +++++++++++++++++++++++++++ lustre/tests/sanity.sh | 34 ++++++++++++++++++++++++++++++++++ 7 files changed, 85 insertions(+), 1 deletion(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index dbe9b51..f7010080 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -556,6 +556,9 @@ extern char obd_jobid_var[]; /* was OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 until 2.1 */ /* was OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 until 1.8 */ /* was OBD_FAIL_LLOG_CATINFO_NET 0x1309 until 2.3 */ +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_OPEN_PAUSE 0x130a +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_PAUSE 0x130b +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_PAUSE 0x130c #define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 #define OBD_FAIL_SEQ_ALLOC 0x1311 #define OBD_FAIL_CAT_RECORDS 0x1312 diff --git a/lustre/mdc/mdc_changelog.c b/lustre/mdc/mdc_changelog.c index c21c829..92b4692 100644 --- a/lustre/mdc/mdc_changelog.c +++ 
b/lustre/mdc/mdc_changelog.c @@ -292,6 +292,8 @@ static int chlg_load(void *args) int rc; ENTRY; + allow_signal(SIGKILL); + crs->crs_last_catidx = 0; crs->crs_last_idx = 0; @@ -661,8 +663,10 @@ static int chlg_release(struct inode *inode, struct file *file) struct chlg_rec_entry *tmp; int rc = 0; - if (crs->crs_prod_task) + if (crs->crs_prod_task) { + send_sig(SIGKILL, crs->crs_prod_task, 1); rc = kthread_stop(crs->crs_prod_task); + } list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) enq_record_delete(rec); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index d47e747..b034fec 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -922,6 +922,8 @@ static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, GOTO(out, rc = -ENOMEM); } + req->rq_allow_intr = 1; + if (u32_count > 0) req_capsule_set_size(&req->rq_pill, &RMF_U32, RCL_CLIENT, u32_count * sizeof(__u32)); diff --git a/lustre/ptlrpc/llog_client.c b/lustre/ptlrpc/llog_client.c index fa9782e..57aa4b2 100644 --- a/lustre/ptlrpc/llog_client.c +++ b/lustre/ptlrpc/llog_client.c @@ -98,6 +98,8 @@ static int llog_client_open(const struct lu_env *env, if (!req) GOTO(out, rc = -ENOMEM); + req->rq_allow_intr = 1; + if (name) req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, strlen(name) + 1); @@ -164,6 +166,8 @@ static int llog_client_next_block(const struct lu_env *env, if (!req) GOTO(err_exit, rc = -ENOMEM); + req->rq_allow_intr = 1; + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); body->lgd_logid = loghandle->lgh_id; body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; @@ -236,6 +240,8 @@ static int llog_client_prev_block(const struct lu_env *env, if (!req) GOTO(err_exit, rc = -ENOMEM); + req->rq_allow_intr = 1; + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); body->lgd_logid = loghandle->lgh_id; body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; @@ -287,6 +293,8 @@ static int 
llog_client_read_header(const struct lu_env *env, if (!req) GOTO(err_exit, rc = -ENOMEM); + req->rq_allow_intr = 1; + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); body->lgd_logid = handle->lgh_id; body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; diff --git a/lustre/ptlrpc/llog_server.c b/lustre/ptlrpc/llog_server.c index e5391a5..05380494 100644 --- a/lustre/ptlrpc/llog_server.c +++ b/lustre/ptlrpc/llog_server.c @@ -65,6 +65,8 @@ int llog_origin_handle_open(struct ptlrpc_request *req) ENTRY; + OBD_FAIL_TIMEOUT(OBD_FAIL_LLOG_ORIGIN_HANDLE_OPEN_PAUSE, cfs_fail_val); + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); if (body == NULL) RETURN(err_serious(-EFAULT)); @@ -123,6 +125,8 @@ int llog_origin_handle_next_block(struct ptlrpc_request *req) ENTRY; + OBD_FAIL_TIMEOUT(OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_PAUSE, cfs_fail_val); + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); if (body == NULL) RETURN(err_serious(-EFAULT)); @@ -245,6 +249,8 @@ int llog_origin_handle_read_header(struct ptlrpc_request *req) ENTRY; + OBD_FAIL_TIMEOUT(OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_PAUSE, cfs_fail_val); + body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); if (body == NULL) RETURN(err_serious(-EFAULT)); diff --git a/lustre/tests/hot-pools.sh b/lustre/tests/hot-pools.sh index d9db469..1e323cb 100644 --- a/lustre/tests/hot-pools.sh +++ b/lustre/tests/hot-pools.sh @@ -1528,6 +1528,33 @@ test_15() { } run_test 15 "lamigo: replicate PFL and FLR files" +test_16() { + local facet + local pid_file + local start_pid + local status + + init_hot_pools_env + facet=${LAMIGO_MDT_FACET[0]} + pid_file=${LAMIGO_PIDFILE[0]} + + start_one_lamigo_cmd + start_pid=$! + echo "start_pid = '${start_pid}'" >&2 + + check_one_lamigo_is_started || error "failed to start lamigo" + + stop $facet + do_facet $facet "pkill --pidfile=${pid_file} lamigo" + + wait ${start_pid} + status=$? 
+ echo "status = '${status}'" >&2 + + start $facet +} +run_test 16 "lamigo can terminate with unmounted MDT" + # lpurge test cases test_51() { init_hot_pools_env diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 3b41b27..28075ce 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -15891,6 +15891,40 @@ test_160q() { } run_test 160q "changelog effective mask is DEFMASK if not set" +test_160r() { + local fail_loc + local pid + local status + + # define OBD_FAIL_LLOG_ORIGIN_HANDLE_OPEN_PAUSE 0x130a + # define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_PAUSE 0x130b + # define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_PAUSE 0x130c + # + # next block and read header may happen in a separate thread + # due to llog_process() setting fork=true, but we can run the + # test anyway. + + umount_client $MOUNT || error "failed to umount client" + + for fail_loc in 0x130a 0x130b 0x130c; do + do_facet $SINGLEMDS $LCTL set_param fail_loc=$fail_loc fail_val=30 + + mount_client $MOUNT "$MOUNT_OPTS" & + pid=$! + + sleep 10 + killall mount.lustre + wait $pid + status=$? + echo "status = '${status}'" >&2 + + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 fail_val=0 + done + + mount_client $MOUNT "$MOUNT_OPTS" || error "cannot mount client" +} +run_test 160r "interrupt mount in llog open, next block, and read header" + test_161a() { [ $PARALLEL == "yes" ] && skip "skip parallel run" -- 1.8.3.1