From eb64594e4473af859e74a0e831316cead0f5c49b Mon Sep 17 00:00:00 2001 From: Vladimir Saveliev Date: Thu, 9 Sep 2021 15:05:24 +0300 Subject: [PATCH] LU-12347 llite: do not take mod rpc slot for getxattr The following scenario may lead to client eviction: clientA clientB MDS threadA1: write to file F1, get and hold DoM MDC LDLM lock L1: ->cl_io_loop() ->cl_io_lock() : ->mdc_lock_granted() ->lock->l_writers++ [hold ref until write done] threadA2-A8: create files F2-F8: ->ll_file_open() ->mdc_enqueue_base() ->ldlm_cli_enqueue() ->ptlrpc_get_mod_rpc_slot() ->ptlrpc_queue_wait() [hold RPC slot until create done] OST(s) in recovery. MDS waiting on OST(s) to precreate new objects. threadA1: -> cl_io_start() -> __generic_file_aio_write() -> file_remove_suid() -> ll_xattr_cache_refill() -> mdc_xattr_common() -> ptlrpc_get_mod_rpc_slot() [blocked waiting for RPC slot] threadB1: write file F1, enqueue DoM MDC lock L1 MDS sends blocking AST to clientA for lock L1 ldlm_threadA3: cannot cancel busy lock L1: -> ldlm_handle_bl_callback() ["Lock L1 referenced, will be cancelled later"] MDS evicts clientA for not cancelling lock L1 threadA1: never completes write: ->cl_io_end() ->cl_io_unlock() ->osc_lock_cancel() ->lock->l_writers--; The fix is to add IT_GETXATTR to list of operations which do not need mod rpc slot. Tests to illustrate the issue is added. wait_for_function(): total sleep time (wait) is to be equal to max when 1 is returned. Signed-off-by: Vladimir Saveliev HPE-bug-id: LUS-7271 Change-Id: I1b80677df084bda141b9ac58a78b765bd0b14a41 Reviewed-on: https://review.whamcloud.com/44151 Reviewed-by: Andreas Dilger Reviewed-by: Andrew Perepechko Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/llite/xattr_cache.c | 2 + lustre/mdc/mdc_locks.c | 2 +- lustre/tests/replay-dual.sh | 56 ++++++++++++++++++++++++ lustre/tests/sanity.sh | 46 -------------------- lustre/tests/test-framework.sh | 99 +++++++++++++++++++++++++++++++++--------- 6 files changed, 138 insertions(+), 68 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 83e9f09..723b331 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -602,6 +602,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_LLITE_RACE_MOUNT 0x1417 #define OBD_FAIL_LLITE_PAGE_ALLOC 0x1418 #define OBD_FAIL_LLITE_OPEN_DELAY 0x1419 +#define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420 #define OBD_FAIL_FID_INDIR 0x1501 #define OBD_FAIL_FID_INLMA 0x1502 diff --git a/lustre/llite/xattr_cache.c b/lustre/llite/xattr_cache.c index c0ef7df..a42329d 100644 --- a/lustre/llite/xattr_cache.c +++ b/lustre/llite/xattr_cache.c @@ -443,6 +443,8 @@ static int ll_xattr_cache_refill(struct inode *inode) ENTRY; + CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_XATTR_PAUSE, cfs_fail_val ?: 2); + rc = ll_xattr_find_get_lock(inode, &oit, &req); if (rc) GOTO(err_req, rc); diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 93c5622..8f9a1fb 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -893,7 +893,7 @@ static inline bool mdc_skip_mod_rpc_slot(const struct lookup_intent *it) { if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_READDIR || + it->it_op == IT_READDIR || it->it_op == IT_GETXATTR || (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE)))) return true; return false; diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 2270e1e..f16b900 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -1088,6 +1088,62 @@ test_30() { } run_test 30 "layout lock replay is not blocked on IO" +test_31() { + mkdir_on_mdt0 $DIR1/$tdir + $LFS setstripe -c 1 -i 0 $DIR1/$tdir + for (( i=0; i < 10; i++ )) ; do + mkdir -p $DIR1/$tdir/d.${i} + done + mkdir $DIR1/$tdir/mdtdir + $LFS setstripe -E 1M -L mdt $DIR1/$tdir/mdtdir + + # failover has to take longer than blocking timeout involved + # by second multiop below which is set to obd_timeout/2 by + # disabling AT + local timeout=$(do_facet mds1 $LCTL get_param -n timeout) + + timeout=$((timeout / 2 + 5)) + fail ost1 $timeout & + local failpid=$! + + sleep 1 + + # consume preallocated objects, precreate thread will be awakened + consume_precreations $DIR1/$tdir mds1 0 1 + + # disable AT so that blocking timeout gets set to obd_timeout/2 + local amm=$(at_max_get mds1) + + at_max_set 0 mds1 + stack_trap "at_max_set $amm mds1" + + declare -a multiops + + #define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420 + $LCTL set_param fail_loc=0x80001420 + $MULTIOP $DIR1/$tdir/mdtdir/$tfile Osw4096c & + multiops+=($!) + sleep 0.5 + $MULTIOP $DIR2/$tdir/mdtdir/$tfile oO_WRONLY:w4096c & + multiops+=($!) + sleep 0.5 + local mmrif=$($LCTL get_param -n \ + mdc.$FSNAME-MDT0000-mdc-*.max_mod_rpcs_in_flight | tail -1) + # these are blocked by precreation until ost failover is in progress + for (( i=0; i < $mmrif; i++ )) ; do + $MULTIOP $DIR1/$tdir/d.${i}/parallel Oc & + multiops+=($!) + done + wait $failpid + local failed=0 + + for pid in ${multiops[@]}; do + wait $pid || ((failed++)) + done + ((failed == 0)) || error "$failed multiops failed" +} +run_test 31 "deadlock on file_remove_privs and occupied mod rpc slots" + complete $SECONDS SLEEP=$((SECONDS - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 0a81f05..b3b11f8 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1860,52 +1860,6 @@ reset_enospc() { wait_update_facet $SINGLEMDS "$get_prealloc" "" $delay } -__exhaust_precreations() { - local OSTIDX=$1 - local FAILLOC=$2 - local FAILIDX=${3:-$OSTIDX} - local ofacet=ost$((OSTIDX + 1)) - - mkdir_on_mdt0 $DIR/$tdir - local mdtidx=$($LFS getstripe -m $DIR/$tdir) - local mfacet=mds$((mdtidx + 1)) - echo OSTIDX=$OSTIDX MDTIDX=$mdtidx - - local OST=$(ostname_from_index $OSTIDX) - - # on the mdt's osc - local mdtosc_proc1=$(get_mdtosc_proc_path $mfacet $OST) - local last_id=$(do_facet $mfacet lctl get_param -n \ - osp.$mdtosc_proc1.prealloc_last_id) - local next_id=$(do_facet $mfacet lctl get_param -n \ - osp.$mdtosc_proc1.prealloc_next_id) - - local mdtosc_proc2=$(get_mdtosc_proc_path $mfacet) - do_facet $mfacet lctl get_param osp.$mdtosc_proc2.prealloc* - - test_mkdir -p $DIR/$tdir/${OST} - $LFS setstripe -i $OSTIDX -c 1 $DIR/$tdir/${OST} -#define OBD_FAIL_OST_ENOSPC 0x215 - do_facet $ofacet lctl set_param fail_val=$FAILIDX fail_loc=0x215 - echo "Creating to objid $last_id on ost $OST..." - createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2)) - do_facet $mfacet lctl get_param osp.$mdtosc_proc2.prealloc* - do_facet $ofacet lctl set_param fail_loc=$FAILLOC -} - -exhaust_precreations() { - __exhaust_precreations $1 $2 $3 - sleep_maxage -} - -exhaust_all_precreations() { - local i - for (( i=0; i < OSTCOUNT; i++ )) ; do - __exhaust_precreations $i $1 -1 - done - sleep_maxage -} - test_27n() { [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs" [ $PARALLEL == "yes" ] && skip "skip parallel run" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index bd4e5db..4da8b3f 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -2728,11 +2728,12 @@ remount_facet() { reboot_facet() { local facet=$1 local node=$(facet_active_host $facet) + local sleep_time=${2:-10} if [ "$FAILURE_MODE" = HARD ]; then boot_node $node else - sleep 10 + sleep $sleep_time fi } @@ -3662,7 +3663,7 @@ facet_failover() { facet=$(echo ${affecteds[index]} | tr -s " " | cut -d"," -f 1) echo reboot facets: ${affecteds[index]} - reboot_facet $facet + reboot_facet $facet $sleep_time change_active ${affecteds[index]} @@ -5841,30 +5842,30 @@ check_and_cleanup_lustre() { # General functions wait_for_function () { - local quiet="" - - # suppress fn both stderr and stdout - if [ "$1" = "--quiet" ]; then - shift - quiet=" > /dev/null 2>&1" + local quiet="" - fi + # suppress fn both stderr and stdout + if [ "$1" = "--quiet" ]; then + shift + quiet=" > /dev/null 2>&1" + fi - local fn=$1 - local max=${2:-900} - local sleep=${3:-5} + local fn=$1 + local max=${2:-900} + local sleep=${3:-5} - local wait=0 + local wait=0 - while true; do + while true; do - eval $fn $quiet && return 0 + eval $fn $quiet && return 0 - wait=$((wait + sleep)) - [ $wait -lt $max ] || return 1 - echo waiting $fn, $((max - wait)) secs left ... - sleep $sleep - done + [ $wait -lt $max ] || return 1 + echo waiting $fn, $((max - wait)) secs left ... + wait=$((wait + sleep)) + [ $wait -gt $max ] && ((sleep -= wait - max)) + sleep $sleep + done } check_network() { @@ -6821,7 +6822,7 @@ ostuuid_from_index() } ostname_from_index() { - local uuid=$(ostuuid_from_index $1) + local uuid=$(ostuuid_from_index $1 $2) echo ${uuid/_UUID/} } @@ -10677,3 +10678,59 @@ wait_nm_sync() { fi echo "waited $((i - 1)) seconds for sync" } + +consume_precreations() { + local dir=$1 + local mfacet=$2 + local OSTIDX=$3 + local extra=${4:-2} + local OST=$(ostname_from_index $OSTIDX $dir) + + test_mkdir -p $dir/${OST} + $LFS setstripe -i $OSTIDX -c 1 ${dir}/${OST} + + # on the mdt's osc + local mdtosc_proc=$(get_mdtosc_proc_path $mfacet $OST) + local last_id=$(do_facet $mfacet $LCTL get_param -n \ + osp.$mdtosc_proc.prealloc_last_id) + local next_id=$(do_facet $mfacet $LCTL get_param -n \ + osp.$mdtosc_proc.prealloc_next_id) + echo "Creating to objid $last_id on ost $OST..." + createmany -o $dir/${OST}/f $next_id $((last_id - next_id + extra)) +} + +__exhaust_precreations() { + local OSTIDX=$1 + local FAILLOC=$2 + local FAILIDX=${3:-$OSTIDX} + local ofacet=ost$((OSTIDX + 1)) + + mkdir_on_mdt0 $DIR/$tdir + local mdtidx=$($LFS getstripe -m $DIR/$tdir) + local mfacet=mds$((mdtidx + 1)) + echo OSTIDX=$OSTIDX MDTIDX=$mdtidx + + local mdtosc_proc=$(get_mdtosc_proc_path $mfacet) + do_facet $mfacet $LCTL get_param osp.$mdtosc_proc.prealloc* + +#define OBD_FAIL_OST_ENOSPC 0x215 + do_facet $ofacet $LCTL set_param fail_val=$FAILIDX fail_loc=0x215 + + consume_precreations $DIR/$tdir $mfacet $OSTIDX + + do_facet $mfacet $LCTL get_param osp.$mdtosc_proc.prealloc* + do_facet $ofacet $LCTL set_param fail_loc=$FAILLOC +} + +exhaust_precreations() { + __exhaust_precreations $1 $2 $3 + sleep_maxage +} + +exhaust_all_precreations() { + local i + for (( i=0; i < OSTCOUNT; i++ )) ; do + __exhaust_precreations $i $1 -1 + done + sleep_maxage +} -- 1.8.3.1