From 58edec38160f44be0ef784fecfab830a43f92fa8 Mon Sep 17 00:00:00 2001 From: Vladimir Saveliev Date: Thu, 12 Jul 2018 23:27:24 +0300 Subject: [PATCH] LU-11131 target: keep reply data bit set on failover The following scenario leads to failure of recent reint rpc: 1. mdt server has number of rpcs being handled, rpc 1 from client A and rpc 2 from client B. 2. shutdown for the server starts 3. rpc 1 is processed, reply data is added, but client A gets ENODEV in reply (ptlrpc_send_reply()) as shutdown is running 3. shutdown reaches class_disconnect_exports() and links an export A to the list of zombie exports 4. obd_zombid thread wakes up and destroy the export A, which includes freeing of reply data list with clearing bits in lut->lut_reply_bitmap (tgt_free_reply_data()) 5. export B is still processing the rpc 2 and looks for free bit in the lut->lut_reply_bitmap to store reply data (tgt_add_reply_data()). If it finds a bit which has been just freed by obd_zombid thread, then reply data from export A will get overwritten in reply_data file with reply data from export B 6. after failover, reply data gets restored with tgt_reply_data_init(). The reply data of client A is missing 7. client A reconnects and resends its rpc 1. Server does not find reply data and processes the rpc as if it has not been seen yet. In case of unlink, the directory entry already does not exist so rpc 1 fails The fix is to not free bits in lut->lut_reply_bitmap in case of failover. Test illustrating the issue is added. Signed-off-by: Vladimir Saveliev Cray-bug-id: LUS-6004 Reviewed-by: Alexey Lyashkov Reviewed-by: Andriy Skulysh Tested-by: Elena Gryaznova Change-Id: I6db3728f3271ce2751fbe08dadca365eb2ffe727 Reviewed-on: https://review.whamcloud.com/32798 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/mdd/mdd_dir.c | 6 ++++++ lustre/target/tgt_lastrcvd.c | 7 +++++++ lustre/tests/recovery-small.sh | 30 ++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 5ff7fbf..84825eb 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -464,6 +464,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_SLUGGISH_NET 0x719 #define OBD_FAIL_TGT_RCVD_EIO 0x720 #define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 +#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index cf83e1f..121468b 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -1679,6 +1679,9 @@ static int mdd_unlink(const struct lu_env *env, struct md_object *pobj, int rc, is_dir = 0, cl_flags = 0; ENTRY; + /* let shutdown to start */ + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLY_DATA_RACE, 1); + /* cobj == NULL means only delete name entry */ if (likely(cobj != NULL)) { mdd_cobj = md2mdd_obj(cobj); @@ -2901,6 +2904,9 @@ static int mdd_rename(const struct lu_env *env, int rc, rc2; ENTRY; + /* let unlink to complete and commit */ + CFS_FAIL_TIMEOUT(OBD_FAIL_TGT_REPLY_DATA_RACE, 2 + cfs_fail_val); + if (tobj) mdd_tobj = md2mdd_obj(tobj); diff --git a/lustre/target/tgt_lastrcvd.c b/lustre/target/tgt_lastrcvd.c index b316781..9432c31 100644 --- a/lustre/target/tgt_lastrcvd.c +++ b/lustre/target/tgt_lastrcvd.c @@ -148,6 +148,13 @@ static int tgt_clear_reply_slot(struct lu_target *lut, int idx) int chunk; int b; + if (lut->lut_obd->obd_stopping) + /* + * in case of failover keep the bit set in order to + * avoid overwriting slots in reply_data which might + * be required by resent rpcs + */ + return 0; chunk = idx / LUT_REPLY_SLOTS_PER_CHUNK; b = idx % LUT_REPLY_SLOTS_PER_CHUNK; diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index ca6c07b..e36b42f 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2690,6 +2690,36 @@ test_133() { } run_test 133 "don't fail on flock resend" +test_134() { + [ -z "$CLIENTS" ] && skip "Need two or more clients" && return + [ $CLIENTCOUNT -lt 2 ] && + { skip "Need 2+ clients, have $CLIENTCOUNT" && return; } + + mkdir -p $MOUNT/$tdir/1 $MOUNT/$tdir/2 || error "mkdir failed" + touch $MOUNT/$tdir/1/$tfile $MOUNT/$tdir/2/$tfile || + error "touch failed" + zconf_umount_clients $CLIENTS $MOUNT + zconf_mount_clients $CLIENTS $MOUNT + +#define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 + # assume commit interval is 5 + do_facet mds1 "$LCTL set_param fail_loc=0x722 fail_val=5" + + local -a clients=(${CLIENTS//,/ }) + local client1=${clients[0]} + local client2=${clients[1]} + + do_node $client1 rm $MOUNT/$tdir/1/$tfile & + rmpid=$! + do_node $client2 mv $MOUNT/$tdir/2/$tfile $MOUNT/$tdir/2/${tfile}_2 & + mvpid=$! + fail mds1 + wait $rmpid || error "rm failed" + wait $mvpid || error "mv failed" + return 0 +} +run_test 134 "race between failover and search for reply data free slot" + complete $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1