Whamcloud - gitweb
LU-18174 osd-ldiskfs: do not miss readdir's actor failure 68/56168/5
authorVladimir Saveliev <vladimir.saveliev@hpe.com>
Thu, 9 Jan 2025 14:58:15 +0000 (17:58 +0300)
committerOleg Drokin <green@whamcloud.com>
Sun, 2 Feb 2025 06:24:57 +0000 (06:24 +0000)
lfsck falls into an endless loop in osd_check_lmv() if osd_iget2()
returns an error:
lfsck_master_engine
 lfsck_master_oit_engine
  lfsck_object_find_bottom
   lfsck_object_find_by_dev
    lu_object_find_at
     lu_object_start
      osd_object_init
       osd_fid_lookup
        osd_check_lmv      << this endlessly calls iterate_dir
         iterate_dir
          ldiskfs_readdir
           ldiskfs_dx_readdir
            call_filldir
             dir_emit
              osd_stripe_dir_filldir
               osd_iget
                osd_iget2   << this returns error

Use struct osd_check_lmv_buf to inform osd_check_lmv() about an error
in the readdir callback.

A test to illustrate the issue is added.

Test-Parameters: trivial testlist=sanity-lfsck env=ONLY=43
HPE-bug-id: LUS-12365
Signed-off-by: Vladimir Saveliev <vladimir.saveliev@hpe.com>
Change-Id: I57a4b739c9ad5a8c09bdad05752714830d584595
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56168
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osd-ldiskfs/osd_handler.c
lustre/tests/conf-sanity.sh
lustre/tests/sanity-lfsck.sh
lustre/tests/test-framework.sh

index b6223d5..d274b28 100644 (file)
@@ -283,6 +283,8 @@ extern bool obd_enable_health_write;
 
 #define OBD_FAIL_OFD_SET_OID                           0x1e0
 #define OBD_FAIL_OFD_COMMITRW_DELAY                    0x1e1
+#define OBD_FAIL_OFD_IGET_FAIL_TO_START                        0x1e2
+#define OBD_FAIL_OFD_IGET_FAIL                         0x1e3
 
 #define OBD_FAIL_OST                           0x200
 #define OBD_FAIL_OST_CONNECT_NET               0x201
index 253d1a8..1d4ff86 100644 (file)
@@ -904,11 +904,12 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj)
 
 struct osd_check_lmv_buf {
        /* please keep it as first member */
-       struct dir_context ctx;
+       struct dir_context      oclb_ctx;
        struct osd_thread_info *oclb_info;
-       struct osd_device *oclb_dev;
-       int oclb_items;
-       bool oclb_found;
+       struct osd_device      *oclb_dev;
+       int                     oclb_items;
+       bool                    oclb_found;
+       int                     oclb_rc;
 };
 
 /**
@@ -950,10 +951,18 @@ static int osd_stripe_dir_filldir(void *buf,
 
        osd_id_gen(id, ino, OSD_OII_NOGEN);
        inode = osd_iget(oti, dev, id, 0);
-       if (IS_ERR(inode))
+       if (IS_ERR(inode)) {
+               oclb->oclb_rc = PTR_ERR(inode);
                return PTR_ERR(inode);
+       }
 
        iput(inode);
+
+       if (CFS_FAIL_CHECK(OBD_FAIL_OFD_IGET_FAIL)) {
+               oclb->oclb_rc = -ESTALE;
+               RETURN(-ESTALE);
+       }
+
        osd_add_oi_cache(oti, dev, id, fid);
        /* Check shard by scrub only if it has a problem with OI */
        if (osd_oi_lookup(oti, dev, fid, &id2, 0) || !osd_id_eq(id, &id2))
@@ -1008,7 +1017,7 @@ static int osd_check_lmv(struct osd_thread_info *oti, struct osd_device *dev,
        struct file *filp;
        struct lmv_mds_md_v1 *lmv1;
        struct osd_check_lmv_buf oclb = {
-               .ctx.actor = osd_stripe_dir_filldir,
+               .oclb_ctx.actor = osd_stripe_dir_filldir,
                .oclb_info = oti,
                .oclb_dev = dev,
                .oclb_found = false,
@@ -1061,11 +1070,18 @@ again:
        if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1)
                GOTO(out, rc = 0);
 
+       CFS_FAIL_CHECK_RESET(OBD_FAIL_OFD_IGET_FAIL_TO_START,
+                            OBD_FAIL_OFD_IGET_FAIL);
        do {
                oclb.oclb_items = 0;
-               rc = iterate_dir(filp, &oclb.ctx);
+               oclb.oclb_rc = 0;
+               rc = iterate_dir(filp, &oclb.oclb_ctx);
+               if (rc == 0)
+                       rc = oclb.oclb_rc;
        } while (rc >= 0 && oclb.oclb_items > 0 && !oclb.oclb_found &&
                 filp->f_pos != LDISKFS_HTREE_EOF_64BIT);
+       CFS_FAIL_CHECK_RESET(OBD_FAIL_OFD_IGET_FAIL, 0);
+
 out:
        fput(filp);
        if (rc < 0)
index d76268d..f38ccf9 100755 (executable)
@@ -11530,18 +11530,6 @@ test_136() {
 }
 run_test 136 "don't panic with bad obdecho setup"
 
-wait_osp_import() {
-       local facet=$1
-       local remtgt=$(facet_svc $2)
-       local expected=$3
-       local loctgt=$(facet_svc $facet)
-       local param="osp.$remtgt-os[pc]-${loctgt#*-}.*_server_uuid"
-
-       do_rpc_nodes "$(facet_active_host $facet)" \
-                       wait_import_state $expected $param ||
-               error "$param: import is not in expected state"
-}
-
 test_137() {
        (( MDS1_VERSION >= $(version_code 2.15.61) )) ||
                skip "need MDS version at least 2.15.61"
index fd504f1..da611e3 100755 (executable)
@@ -6302,6 +6302,28 @@ test_42() {
 }
 run_test 42 "LFSCK can repair inconsistent MDT-object/OST-object encryption flags"
 
+test_43()
+{
+       [[ $mds1_FSTYPE == ldiskfs ]] || skip "only ldiskfs uses iterate_dir"
+       [[ $MDSCOUNT -lt 2 ]] && skip "needs >= 2 MDTs"
+
+       $LFS mkdir -i 1 -c 2 $DIR/$tdir-{1..10} || error "(1) Fail to mkdir"
+
+       remount_facet mds2 "-o abort_recov"
+
+       #define OBD_FAIL_OFD_IGET_FAIL_TO_START                        0x1e2
+       do_facet mds2 $LCTL set_param fail_loc=0x1e2
+       do_facet mds2 $LCTL lfsck_start -M ${FSNAME}-MDT0001 -t namespace
+
+       wait_update_facet mds2 \
+               "$LCTL get_param -n mdd.$(facet_svc mds2).lfsck_namespace |
+               awk '/^status/ { print \\\$2 }'" "completed" 32 || {
+               error "(5) mds2 is not the expected 'completed'"
+       }
+       wait_osp_import mds1 mds2 FULL
+}
+run_test 43 "LFSCK does not loop endlessly on iget failure in scanning-phase1"
+
 test_44() {
        lfsck_prep 3 3
 
index 2aac1d0..81f0faf 100755 (executable)
@@ -3184,7 +3184,7 @@ remount_facet() {
        local facet=$1
 
        stop $facet
-       mount_facet $facet
+       mount_facet $@
 }
 
 reboot_facet() {
@@ -8938,6 +8938,18 @@ wait_mgc_import_state() {
        fi
 }
 
+wait_osp_import() {
+       local facet=$1
+       local remtgt=$(facet_svc $2)
+       local expected=$3
+       local loctgt=$(facet_svc $facet)
+       local param="osp.$remtgt-os[pc]-${loctgt#*-}.*_server_uuid"
+
+       do_rpc_nodes "$(facet_active_host $facet)" \
+                       wait_import_state $expected $param ||
+               error "$param: import is not in expected state"
+}
+
 wait_dne_interconnect() {
        local num