Whamcloud - gitweb
LU-14719 lod: distributed transaction check space 39/47039/8
authorLai Siyao <lai.siyao@whamcloud.com>
Wed, 30 Mar 2022 21:50:22 +0000 (17:50 -0400)
committerOleg Drokin <green@whamcloud.com>
Tue, 25 Oct 2022 17:32:01 +0000 (17:32 +0000)
A distributed transaction failure may leave files missing or directories
disconnected. To avoid such failures when a disk is full, check the free
space on remote MDTs before the transaction starts.

The block/inode watermarks in obd_statfs_info are used to check
whether an MDT has enough free blocks/inodes.

Add sanity 230x.

Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Change-Id: I0922e9c8668e8b842d313576bd68b52fa5d434ac
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/47039
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Qian Yingjin <qian@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/lod/lod_dev.c
lustre/lod/lod_internal.h
lustre/lod/lproc_lod.c
lustre/osp/osp_dev.c
lustre/osp/osp_precreate.c
lustre/tests/replay-single.sh
lustre/tests/sanity.sh

index 7a69520..1c72ce4 100644 (file)
@@ -1444,6 +1444,52 @@ static struct thandle *lod_trans_create(const struct lu_env *env,
        return th;
 }
 
+/* A failed distributed transaction may leave objects missing or directories
+ * disconnected, so statfs each non-master sub-thandle device before start.
+ */
+static int lod_trans_space_check(const struct lu_env *env,
+                                struct thandle *th)
+{
+       struct lod_thread_info *info = lod_env_info(env);
+       struct obd_statfs *sfs = &info->lti_osfs;
+       struct top_thandle *top_th = container_of(th, struct top_thandle,
+                                                 tt_super);
+       struct top_multiple_thandle *tmt = top_th->tt_multiple_thandle;
+       struct sub_thandle *st;
+       int rc;
+
+       if (likely(!tmt)) /* no tt_multiple_thandle: nothing to check */
+               return 0;
+
+       list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) {
+               struct dt_device *sub_dt;
+
+               if (st->st_sub_th == NULL)
+                       continue;
+
+               if (st->st_sub_th == top_th->tt_master_sub_thandle)
+                       continue; /* master sub-thandle is skipped */
+
+               sub_dt = st->st_sub_th->th_dev;
+               rc = dt_statfs(env, sub_dt, sfs);
+               if (rc) {
+                       CDEBUG(D_INFO, "%s: fail - statfs error: rc = %d\n",
+                              sub_dt->dd_lu_dev.ld_obd->obd_name, rc);
+                       return rc; /* dt_statfs() error is passed up */
+               }
+
+               if (unlikely(sfs->os_state &
+                            (OS_STATFS_ENOINO | OS_STATFS_ENOSPC))) {
+                       CDEBUG(D_INFO, "%s: fail - target state %x: rc = %d\n",
+                              sub_dt->dd_lu_dev.ld_obd->obd_name,
+                              sfs->os_state, -ENOSPC);
+                       return -ENOSPC; /* ENOINO or ENOSPC set in os_state */
+               }
+       }
+
+       return 0;
+}
+
 /**
  * Implementation of dt_device_operations::dt_trans_start() for LOD
  *
@@ -1455,7 +1501,17 @@ static struct thandle *lod_trans_create(const struct lu_env *env,
 static int lod_trans_start(const struct lu_env *env, struct dt_device *dt,
                            struct thandle *th)
 {
-       return top_trans_start(env, dt2lod_dev(dt)->lod_child, th);
+       struct lod_device *lod = dt2lod_dev(dt);
+
+       if (lod->lod_dist_txn_check_space) { /* per-device on/off flag */
+               int rc;
+
+               rc = lod_trans_space_check(env, th);
+               if (rc)
+                       return rc; /* fail before top_trans_start() runs */
+       }
+
+       return top_trans_start(env, lod->lod_child, th);
 }
 
 static int lod_trans_cb_add(struct thandle *th,
@@ -1827,6 +1883,7 @@ static int lod_init0(const struct lu_env *env, struct lod_device *lod,
        lu_tgt_descs_init(&lod->lod_ost_descs, false);
        lu_qos_rr_init(&lod->lod_mdt_descs.ltd_qos.lq_rr);
        lu_qos_rr_init(&lod->lod_ost_descs.ltd_qos.lq_rr);
+       lod->lod_dist_txn_check_space = 1;
 
        RETURN(0);
 
index 43a1c09..09863d9 100644 (file)
@@ -115,7 +115,8 @@ struct lod_device {
        unsigned int          lod_recovery_completed:1,
                              lod_initialized:1,
                              lod_lmv_failout:1,
-                             lod_child_got_update_log:1;
+                             lod_child_got_update_log:1,
+                             lod_dist_txn_check_space:1;
 
        /* protect ld_active_tgt_count, ltd_active and lod_md_root */
        spinlock_t           lod_lock;
index 363b3a7..3130a72 100644 (file)
@@ -1098,6 +1098,37 @@ out:
 }
 LUSTRE_RW_ATTR(mdt_hash);
 
+static ssize_t dist_txn_check_space_show(struct kobject *kobj,
+                                        struct attribute *attr,
+                                        char *buf) /* flag printed as %d */
+{
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", lod->lod_dist_txn_check_space);
+}
+
+static ssize_t dist_txn_check_space_store(struct kobject *kobj,
+                                         struct attribute *attr,
+                                         const char *buffer, size_t count)
+{
+       struct dt_device *dt = container_of(kobj, struct dt_device,
+                                           dd_kobj);
+       struct lod_device *lod = dt2lod_dev(dt);
+       bool val = 0;
+       int rc;
+
+       rc = kstrtobool(buffer, &val);
+       if (rc)
+               return rc; /* unparsable input: hand the error back */
+
+       lod->lod_dist_txn_check_space = val;
+
+       return count; /* report the whole write as consumed */
+}
+LUSTRE_RW_ATTR(dist_txn_check_space);
+
 static const struct proc_ops lod_proc_mdt_fops = {
        PROC_OWNER(THIS_MODULE)
        .proc_open      = lod_mdts_seq_open,
@@ -1319,6 +1350,7 @@ static struct attribute *lod_attrs[] = {
        &lustre_attr_mdt_qos_prio_free.attr,
        &lustre_attr_mdt_qos_threshold_rr.attr,
        &lustre_attr_mdt_hash.attr,
+       &lustre_attr_dist_txn_check_space.attr,
        NULL,
 };
 
index a8cd2e5..1fae9ab 100644 (file)
@@ -795,9 +795,6 @@ static int osp_statfs(const struct lu_env *env, struct dt_device *dev,
                info->os_reserved_mb_high = d->opd_reserved_mb_high;
        }
 
-       if (d->opd_pre == NULL)
-               RETURN(0);
-
        CDEBUG(D_OTHER, "%s: %llu blocks, %llu free, %llu avail, "
               "%u bsize, %u reserved mb low, %u reserved mb high, "
               "%llu files, %llu free files\n", d->opd_obd->obd_name,
@@ -805,8 +802,7 @@ static int osp_statfs(const struct lu_env *env, struct dt_device *dev,
               d->opd_reserved_mb_low, d->opd_reserved_mb_high,
               sfs->os_files, sfs->os_ffree);
 
-
-       if (info && !info->os_enable_pre)
+       if (d->opd_pre == NULL || (info && !info->os_enable_pre))
                RETURN(0);
 
        /*
index 672cd82..94c315b 100644 (file)
@@ -161,7 +161,7 @@ static int osp_statfs_interpret(const struct lu_env *env,
        if (d->opd_pre)
                osp_pre_update_status_msfs(d, msfs, 0);
        else
-               d->opd_statfs = *msfs;
+               osp_pre_update_msfs(d, msfs);
 
        /* schedule next update */
        maxage_ns = d->opd_statfs_maxage * NSEC_PER_SEC;
@@ -171,12 +171,13 @@ static int osp_statfs_interpret(const struct lu_env *env,
        d->opd_statfs_update_in_progress = 0;
 
        sfs = &d->opd_statfs;
-       CDEBUG(D_CACHE, "%s (%p): %llu blocks, %llu free, %llu avail, "
-              "%u bsize, %u reserved mb low, %u reserved mb high,"
-              "%llu files, %llu free files\n", d->opd_obd->obd_name, d,
-              sfs->os_blocks, sfs->os_bfree, sfs->os_bavail, sfs->os_bsize,
-              d->opd_reserved_mb_low, d->opd_reserved_mb_high,
-              sfs->os_files, sfs->os_ffree);
+       CDEBUG(D_CACHE,
+              "%s (%p): %llu blocks, %llu free, %llu avail, %u bsize, %u reserved mb low, %u reserved mb high, %u reserved ino low, %u reserved ino high, %llu files, %llu free files %#x\n",
+              d->opd_obd->obd_name, d, sfs->os_blocks, sfs->os_bfree,
+              sfs->os_bavail, sfs->os_bsize, d->opd_reserved_mb_low,
+              d->opd_reserved_mb_high, d->opd_reserved_ino_low,
+              d->opd_reserved_ino_high, sfs->os_files, sfs->os_ffree,
+              sfs->os_state);
 
        RETURN(0);
 out:
@@ -1062,8 +1063,8 @@ static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs)
 
        if (unlikely(d->opd_reserved_ino_high == 0 &&
                     d->opd_reserved_ino_low == 0)) {
-               /* Use ~0.1% by default to disallow distributed transactions,
-                * and ~0.2% to allow, set both watermark
+               /* Use ~0.0001% by default to disallow distributed transactions,
+                * and ~0.0002% to allow, set both watermark
                 */
                spin_lock(&d->opd_pre_lock);
                if (d->opd_reserved_ino_high == 0 &&
@@ -1095,7 +1096,10 @@ static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs)
               d->opd_obd->obd_name, msfs->os_blocks, msfs->os_bfree,
               msfs->os_bavail, available_mb, d->opd_reserved_mb_high,
               msfs->os_files, msfs->os_ffree, msfs->os_state,
-              d->opd_pre_status);
+              d->opd_pre ? d->opd_pre_status : 0);
+
+       if (!d->opd_pre)
+               goto update;
 
        if (msfs->os_state & (OS_STATFS_ENOINO | OS_STATFS_ENOSPC)) {
                d->opd_pre_status = -ENOSPC;
@@ -1128,6 +1132,7 @@ static void osp_pre_update_msfs(struct osp_device *d, struct obd_statfs *msfs)
                msfs->os_state |= OS_STATFS_NOPRECREATE;
        /* else don't clear flags in new msfs->os_state sent from OST */
 
+update:
        /* copy only new statfs state to make it visible to MDS threads */
        if (&d->opd_statfs != msfs)
                d->opd_statfs = *msfs;
index a0ac037..47989b8 100755 (executable)
@@ -4159,23 +4159,25 @@ test_111f() {
 run_test 111f "DNE: unlink striped dir, uncommit on MDT1, fail MDT1/MDT2"
 
 test_111g() {
-       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
-       [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
+       (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
+       (( $MDS1_VERSION -ge $(version_code 2.7.56) )) ||
                skip "Need MDS version at least 2.7.56"
 
-       ([ $FAILURE_MODE == "HARD" ] &&
-               [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
-               skip "MDTs needs to be on diff hosts for HARD fail mode" &&
-               return 0
+       ([ $FAILURE_MODE != "HARD" ] ||
+               [ "$(facet_host mds1)" != "$(facet_host mds2)" ]) ||
+               skip "MDTs needs to be on diff hosts for HARD fail mode"
 
+       start_full_debug_logging
        mkdir -p $DIR/$tdir
        $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
+       $LFS df -i
        replay_barrier mds1
        replay_barrier mds2
        rm -rf $DIR/$tdir/striped_dir
        fail mds1,mds2
        $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
-                       error "striped dir still exists"
+               error "striped dir still exists"
+       stop_full_debug_logging
        return 0
 }
 run_test 111g "DNE: unlink striped dir, fail MDT1/MDT2"
index 119e91c..ff60160 100755 (executable)
@@ -21027,6 +21027,48 @@ test_230w() {
 }
 run_test 230w "non-recursive mode dir migration"
 
+test_230x() { # dir migration vs. the OSP reserved_ino_low watermark
+       (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
+       (( MDS1_VERSION >= $(version_code 2.15.0) )) ||
+               skip "Need MDS version at least 2.15.0"
+
+       mkdir -p $DIR/$tdir || error "mkdir failed"
+       createmany -d $DIR/$tdir/sub 100 || error "createmany failed"
+
+       local mdt_name=$(mdtname_from_index 0)
+       local low=$(do_facet mds2 $LCTL get_param -n \
+               osp.*$mdt_name-osp-MDT0001.reserved_ino_low)
+       local high=$(do_facet mds2 $LCTL get_param -n \
+               osp.*$mdt_name-osp-MDT0001.reserved_ino_high)
+       local ffree=$($LFS df -i $MOUNT | awk "/$mdt_name/ { print \$4 }")
+       local maxage=$(do_facet mds2 $LCTL get_param -n \
+               osp.*$mdt_name-osp-MDT0001.maxage)
+
+       stack_trap "do_facet mds2 $LCTL set_param -n \
+               osp.*$mdt_name-osp-MDT0001.reserved_ino_low=$low \
+               osp.*$mdt_name-osp-MDT0001.reserved_ino_high=$high" EXIT
+       stack_trap "do_facet mds2 $LCTL set_param -n \
+               osp.*$mdt_name-osp-MDT0001.maxage=$maxage" EXIT
+
+       do_facet mds2 $LCTL set_param -n \
+               osp.*$mdt_name-osp-MDT0001.reserved_ino_low=$((ffree + 1)) # > ffree
+       do_facet mds2 $LCTL set_param -n osp.*$mdt_name-osp-MDT0001.maxage=1
+       sleep 4 # presumably lets the maxage=1 statfs refresh - TODO confirm
+       $LFS migrate -m 1 -c $MDSCOUNT $DIR/$tdir &&
+               error "migrate $tdir should fail"
+
+       do_facet mds2 $LCTL set_param -n \
+               osp.*$mdt_name-osp-MDT0001.reserved_ino_low=$low
+       do_facet mds2 $LCTL set_param -n \
+               osp.*$mdt_name-osp-MDT0001.reserved_ino_high=$high
+       sleep 4 # presumably waits for the next statfs refresh - TODO confirm
+       $LFS migrate -m 1 -c $MDSCOUNT $DIR/$tdir ||
+               error "migrate failed"
+       (( $($LFS getdirstripe -c $DIR/$tdir) == $MDSCOUNT )) ||
+               error "$tdir stripe count mismatch"
+}
+run_test 230x "dir migration check space"
+
 test_231a()
 {
        # For simplicity this test assumes that max_pages_per_rpc