Whamcloud - gitweb
LU-12347 llite: do not take mod rpc slot for getxattr 51/44151/10
authorVladimir Saveliev <vlaidimir.saveliev@hpe.com>
Thu, 9 Sep 2021 12:05:24 +0000 (15:05 +0300)
committerOleg Drokin <green@whamcloud.com>
Tue, 30 Nov 2021 03:44:59 +0000 (03:44 +0000)
The following scenario may lead to client eviction:
clientA                clientB                  MDS
threadA1: write to file F1, get
and hold DoM MDC LDLM lock L1:
   ->cl_io_loop()
    ->cl_io_lock()
     :
     ->mdc_lock_granted()
      ->lock->l_writers++
      [hold ref until write done]

threadA2-A8: create files F2-F8:
   ->ll_file_open()
    ->mdc_enqueue_base()
     ->ldlm_cli_enqueue()
      ->ptlrpc_get_mod_rpc_slot()
      ->ptlrpc_queue_wait()
      [hold RPC slot until create done]

                                              OST(s) in recovery.
                                              MDS waiting on OST(s) to
                                              precreate new objects.

threadA1:
   -> cl_io_start()
    -> __generic_file_aio_write()
     -> file_remove_suid()
      -> ll_xattr_cache_refill()
       -> mdc_xattr_common()
        -> ptlrpc_get_mod_rpc_slot()
        [blocked waiting for RPC slot]

                       threadB1: write file F1,
       enqueue DoM MDC lock L1

                                              MDS sends blocking AST
                                              to clientA for lock L1

ldlm_threadA3: cannot cancel busy lock L1:
   -> ldlm_handle_bl_callback()
   ["Lock L1 referenced, will be cancelled later"]

                                              MDS evicts clientA for
                                              not cancelling lock L1

threadA1: never completes write:
  ->cl_io_end()
   ->cl_io_unlock()
    ->osc_lock_cancel()
     ->lock->l_writers--;

The fix is to add IT_GETXATTR to list of operations which do not
need mod rpc slot.

Tests to illustrate the issue is added.

wait_for_function(): total sleep time (wait) is to be equal to max
when 1 is returned.

Signed-off-by: Vladimir Saveliev <vlaidimir.saveliev@hpe.com>
HPE-bug-id: LUS-7271
Change-Id: I1b80677df084bda141b9ac58a78b765bd0b14a41
Reviewed-on: https://review.whamcloud.com/44151
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/llite/xattr_cache.c
lustre/mdc/mdc_locks.c
lustre/tests/replay-dual.sh
lustre/tests/sanity.sh
lustre/tests/test-framework.sh

index 83e9f09..723b331 100644 (file)
@@ -602,6 +602,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_LLITE_RACE_MOUNT                  0x1417
 #define OBD_FAIL_LLITE_PAGE_ALLOC                  0x1418
 #define OBD_FAIL_LLITE_OPEN_DELAY                  0x1419
+#define OBD_FAIL_LLITE_XATTR_PAUSE                 0x1420
 
 #define OBD_FAIL_FID_INDIR     0x1501
 #define OBD_FAIL_FID_INLMA     0x1502
index c0ef7df..a42329d 100644 (file)
@@ -443,6 +443,8 @@ static int ll_xattr_cache_refill(struct inode *inode)
 
        ENTRY;
 
+       CFS_FAIL_TIMEOUT(OBD_FAIL_LLITE_XATTR_PAUSE, cfs_fail_val ?: 2);
+
        rc = ll_xattr_find_get_lock(inode, &oit, &req);
        if (rc)
                GOTO(err_req, rc);
index 93c5622..8f9a1fb 100644 (file)
@@ -893,7 +893,7 @@ static inline bool mdc_skip_mod_rpc_slot(const struct lookup_intent *it)
 {
        if (it != NULL &&
            (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP ||
-            it->it_op == IT_READDIR ||
+            it->it_op == IT_READDIR || it->it_op == IT_GETXATTR ||
             (it->it_op == IT_LAYOUT && !(it->it_flags & MDS_FMODE_WRITE))))
                return true;
        return false;
index 2270e1e..f16b900 100755 (executable)
@@ -1088,6 +1088,62 @@ test_30() {
 }
 run_test 30 "layout lock replay is not blocked on IO"
 
+test_31() {
+       mkdir_on_mdt0 $DIR1/$tdir
+       $LFS setstripe -c 1 -i 0 $DIR1/$tdir
+       for (( i=0; i < 10; i++ )) ; do
+               mkdir -p $DIR1/$tdir/d.${i}
+       done
+       mkdir $DIR1/$tdir/mdtdir
+       $LFS setstripe -E 1M -L mdt $DIR1/$tdir/mdtdir
+
+       # failover has to take longer than blocking timeout involved
+       # by second multiop below which is set to obd_timeout/2 by
+       # disabling AT
+       local timeout=$(do_facet mds1 $LCTL get_param -n timeout)
+
+       timeout=$((timeout / 2 + 5))
+       fail ost1 $timeout &
+       local failpid=$!
+
+       sleep 1
+
+       # consume preallocated objects, precreate thread will be awakened
+       consume_precreations $DIR1/$tdir mds1 0 1
+
+       # disable AT so that blocking timeout gets set to obd_timeout/2
+       local amm=$(at_max_get mds1)
+
+       at_max_set 0 mds1
+       stack_trap "at_max_set $amm mds1"
+
+       declare -a multiops
+
+       #define OBD_FAIL_LLITE_XATTR_PAUSE                 0x1420
+       $LCTL set_param fail_loc=0x80001420
+       $MULTIOP $DIR1/$tdir/mdtdir/$tfile Osw4096c &
+       multiops+=($!)
+       sleep 0.5
+       $MULTIOP $DIR2/$tdir/mdtdir/$tfile oO_WRONLY:w4096c &
+       multiops+=($!)
+       sleep 0.5
+       local mmrif=$($LCTL get_param -n \
+               mdc.$FSNAME-MDT0000-mdc-*.max_mod_rpcs_in_flight | tail -1)
+       # these are blocked by precreation until ost failover is in progress
+       for (( i=0; i < $mmrif; i++ )) ; do
+               $MULTIOP $DIR1/$tdir/d.${i}/parallel Oc &
+               multiops+=($!)
+       done
+       wait $failpid
+       local failed=0
+
+       for pid in ${multiops[@]}; do
+               wait $pid || ((failed++))
+       done
+       ((failed == 0)) || error "$failed multiops failed"
+}
+run_test 31 "deadlock on file_remove_privs and occupied mod rpc slots"
+
 complete $SECONDS
 SLEEP=$((SECONDS - $NOW))
 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
index 0a81f05..b3b11f8 100755 (executable)
@@ -1860,52 +1860,6 @@ reset_enospc() {
        wait_update_facet $SINGLEMDS "$get_prealloc" "" $delay
 }
 
-__exhaust_precreations() {
-       local OSTIDX=$1
-       local FAILLOC=$2
-       local FAILIDX=${3:-$OSTIDX}
-       local ofacet=ost$((OSTIDX + 1))
-
-       mkdir_on_mdt0 $DIR/$tdir
-       local mdtidx=$($LFS getstripe -m $DIR/$tdir)
-       local mfacet=mds$((mdtidx + 1))
-       echo OSTIDX=$OSTIDX MDTIDX=$mdtidx
-
-       local OST=$(ostname_from_index $OSTIDX)
-
-       # on the mdt's osc
-       local mdtosc_proc1=$(get_mdtosc_proc_path $mfacet $OST)
-       local last_id=$(do_facet $mfacet lctl get_param -n \
-                       osp.$mdtosc_proc1.prealloc_last_id)
-       local next_id=$(do_facet $mfacet lctl get_param -n \
-                       osp.$mdtosc_proc1.prealloc_next_id)
-
-       local mdtosc_proc2=$(get_mdtosc_proc_path $mfacet)
-       do_facet $mfacet lctl get_param osp.$mdtosc_proc2.prealloc*
-
-       test_mkdir -p $DIR/$tdir/${OST}
-       $LFS setstripe -i $OSTIDX -c 1 $DIR/$tdir/${OST}
-#define OBD_FAIL_OST_ENOSPC              0x215
-       do_facet $ofacet lctl set_param fail_val=$FAILIDX fail_loc=0x215
-       echo "Creating to objid $last_id on ost $OST..."
-       createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2))
-       do_facet $mfacet lctl get_param osp.$mdtosc_proc2.prealloc*
-       do_facet $ofacet lctl set_param fail_loc=$FAILLOC
-}
-
-exhaust_precreations() {
-       __exhaust_precreations $1 $2 $3
-       sleep_maxage
-}
-
-exhaust_all_precreations() {
-       local i
-       for (( i=0; i < OSTCOUNT; i++ )) ; do
-               __exhaust_precreations $i $1 -1
-       done
-       sleep_maxage
-}
-
 test_27n() {
        [[ $OSTCOUNT -lt 2 ]] && skip_env "needs >= 2 OSTs"
        [ $PARALLEL == "yes" ] && skip "skip parallel run"
index bd4e5db..4da8b3f 100755 (executable)
@@ -2728,11 +2728,12 @@ remount_facet() {
 reboot_facet() {
        local facet=$1
        local node=$(facet_active_host $facet)
+       local sleep_time=${2:-10}
 
        if [ "$FAILURE_MODE" = HARD ]; then
                boot_node $node
        else
-               sleep 10
+               sleep $sleep_time
        fi
 }
 
@@ -3662,7 +3663,7 @@ facet_failover() {
                facet=$(echo ${affecteds[index]} | tr -s " " | cut -d"," -f 1)
                echo reboot facets: ${affecteds[index]}
 
-               reboot_facet $facet
+               reboot_facet $facet $sleep_time
 
                change_active ${affecteds[index]}
 
@@ -5841,30 +5842,30 @@ check_and_cleanup_lustre() {
 # General functions
 
 wait_for_function () {
-    local quiet=""
-
-    # suppress fn both stderr and stdout
-    if [ "$1" = "--quiet" ]; then
-        shift
-        quiet=" > /dev/null 2>&1"
+       local quiet=""
 
-    fi
+       # suppress fn both stderr and stdout
+       if [ "$1" = "--quiet" ]; then
+               shift
+               quiet=" > /dev/null 2>&1"
+       fi
 
-    local fn=$1
-    local max=${2:-900}
-    local sleep=${3:-5}
+       local fn=$1
+       local max=${2:-900}
+       local sleep=${3:-5}
 
-    local wait=0
+       local wait=0
 
-    while true; do
+       while true; do
 
-        eval $fn $quiet && return 0
+               eval $fn $quiet && return 0
 
-        wait=$((wait + sleep))
-        [ $wait -lt $max ] || return 1
-        echo waiting $fn, $((max - wait)) secs left ...
-        sleep $sleep
-    done
+               [ $wait -lt $max ] || return 1
+               echo waiting $fn, $((max - wait)) secs left ...
+               wait=$((wait + sleep))
+               [ $wait -gt $max ] && ((sleep -= wait - max))
+               sleep $sleep
+       done
 }
 
 check_network() {
@@ -6821,7 +6822,7 @@ ostuuid_from_index()
 }
 
 ostname_from_index() {
-    local uuid=$(ostuuid_from_index $1)
+    local uuid=$(ostuuid_from_index $1 $2)
     echo ${uuid/_UUID/}
 }
 
@@ -10677,3 +10678,59 @@ wait_nm_sync() {
        fi
        echo "waited $((i - 1)) seconds for sync"
 }
+
+consume_precreations() {
+       local dir=$1
+       local mfacet=$2
+       local OSTIDX=$3
+       local extra=${4:-2}
+       local OST=$(ostname_from_index $OSTIDX $dir)
+
+       test_mkdir -p $dir/${OST}
+       $LFS setstripe -i $OSTIDX -c 1 ${dir}/${OST}
+
+       # on the mdt's osc
+       local mdtosc_proc=$(get_mdtosc_proc_path $mfacet $OST)
+       local last_id=$(do_facet $mfacet $LCTL get_param -n \
+                       osp.$mdtosc_proc.prealloc_last_id)
+       local next_id=$(do_facet $mfacet $LCTL get_param -n \
+                       osp.$mdtosc_proc.prealloc_next_id)
+       echo "Creating to objid $last_id on ost $OST..."
+       createmany -o $dir/${OST}/f $next_id $((last_id - next_id + extra))
+}
+
+__exhaust_precreations() {
+       local OSTIDX=$1
+       local FAILLOC=$2
+       local FAILIDX=${3:-$OSTIDX}
+       local ofacet=ost$((OSTIDX + 1))
+
+       mkdir_on_mdt0 $DIR/$tdir
+       local mdtidx=$($LFS getstripe -m $DIR/$tdir)
+       local mfacet=mds$((mdtidx + 1))
+       echo OSTIDX=$OSTIDX MDTIDX=$mdtidx
+
+       local mdtosc_proc=$(get_mdtosc_proc_path $mfacet)
+       do_facet $mfacet $LCTL get_param osp.$mdtosc_proc.prealloc*
+
+#define OBD_FAIL_OST_ENOSPC              0x215
+       do_facet $ofacet $LCTL set_param fail_val=$FAILIDX fail_loc=0x215
+
+       consume_precreations $DIR/$tdir $mfacet $OSTIDX
+
+       do_facet $mfacet $LCTL get_param osp.$mdtosc_proc.prealloc*
+       do_facet $ofacet $LCTL set_param fail_loc=$FAILLOC
+}
+
+exhaust_precreations() {
+       __exhaust_precreations $1 $2 $3
+       sleep_maxage
+}
+
+exhaust_all_precreations() {
+       local i
+       for (( i=0; i < OSTCOUNT; i++ )) ; do
+               __exhaust_precreations $i $1 -1
+       done
+       sleep_maxage
+}