Whamcloud - gitweb
LU-17865 osc: fiemap deadlock fix 63/55163/3
author Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Mon, 20 May 2024 18:33:18 +0000 (18:33 +0000)
committer Oleg Drokin <green@whamcloud.com>
Wed, 29 May 2024 04:43:29 +0000 (04:43 +0000)
A fiemap call may deadlock because it wrongly requests an ldlm lock on
the server while the same lock is cached and pinned on the client. Two
PR lock requests are compatible with each other, so the deadlock also
needs a concurrent write lock.

The ll_fiemap_info_key is shared between osc_object_fiemap() calls:
once the OBD_FL_SRVLOCK flag is set, it is reused for all subsequent
RPCs regardless of the local lock caching status. Clear the flag
whenever a matching lock is found cached on the client.

HPE-bug-id: LUS-12353
Signed-off-by: Alexander Zarochentsev <alexander.zarochentsev@hpe.com>
Change-Id: I6e76bc5e4549ed887b8f6177432acf90f9ec614d
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55163
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andrew Perepechko <andrew.perepechko@hpe.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osc/osc_object.c
lustre/tests/sanity.sh

index 787853d..816c2f7 100644 (file)
@@ -441,6 +441,7 @@ extern bool obd_enable_health_write;
 #define OBD_FAIL_OSC_NO_SIZE_DATA        0x415
 #define OBD_FAIL_OSC_DELAY_CANCEL        0x416
 #define OBD_FAIL_OSC_SLOW_PAGE_EVICT    0x417
+#define OBD_FAIL_OSC_FIEMAP             0x418
 #define OBD_FAIL_OSC_MARK_COMPRESSED    0x419
 
 #define OBD_FAIL_PTLRPC                  0x500
index d9b2290..2b1e90c 100644 (file)
@@ -282,13 +282,15 @@ static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj,
                               LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY,
                               &resid, LDLM_EXTENT, &policy,
                               LCK_PR | LCK_PW, &lockh);
+       fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS;
        if (mode) { /* lock is cached on client */
+               fmkey->lfik_oa.o_flags &= ~OBD_FL_SRVLOCK;
                if (mode != LCK_PR) {
                        ldlm_lock_addref(&lockh, LCK_PR);
                        ldlm_lock_decref(&lockh, LCK_PW);
                }
+               CFS_FAIL_TIMEOUT(OBD_FAIL_OSC_FIEMAP, cfs_fail_val);
        } else { /* no cached lock, needs acquire lock on server side */
-               fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS;
                fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK;
        }
 
index dbe6116..5cf8fe3 100755 (executable)
@@ -15987,6 +15987,29 @@ test_130g() {
 }
 run_test 130g "FIEMAP (overstripe file)"
 
+test_130h() {
+       (( OSTCOUNT < 2 )) && skip_env "need 2 OSTs"
+
+       $LFS setstripe -o 0,1 -S 1M $DIR/$tfile
+       $LFS getstripe $DIR/$tfile
+       dd if=/dev/zero of=$DIR/$tfile bs=1M count=2
+       $LCTL set_param ldlm.namespaces.*-OST0000-osc-*.lru_size=clear
+       sleep 1
+       local before=$(date +%s)
+       ##define OBD_FAIL_OSC_FIEMAP              0x418
+       $LCTL set_param fail_loc=0x80000418 fail_val=5
+       checkfiemap $DIR/$tfile $((2 * 1024 * 1024)) &
+       sleep 1
+       dd if=/dev/zero of=$DIR/$tfile bs=1M count=3
+       wait
+       $LCTL set_param fail_loc=0 fail_val=0
+       # check for client eviction
+       local evict=$($LCTL get_param osc.$FSNAME-OST0001-osc-f*.state |
+          awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }')
+       [ -z "$evict" ] || [[ $evict -le $before ]] || error "eviction happened"
+}
+run_test 130h "FIEMAP deadlock"
+
 # Test for writev/readv
 test_131a() {
        rwv -f $DIR/$tfile -w -n 3 524288 1048576 1572864 ||