Whamcloud - gitweb
LU-14565 ofd: Do not rely on tgd_blockbit 54/43154/23
authorArshad Hussain <arshad.hussain@aeoncomputing.com>
Mon, 29 Mar 2021 05:22:11 +0000 (10:52 +0530)
committerOleg Drokin <green@whamcloud.com>
Wed, 19 May 2021 02:03:22 +0000 (02:03 +0000)
tgd_blockbits holds the recordsize bits set during mkfs.
Once set, this value does not change. However, 'zfs set'
can be used to change the OST blocksize. Instead
of using the cached value of 'tgd_blockbits', always
calculate the blocksize bits, which may have
changed.

Test-case: sanity/104c added.

Signed-off-by: Arshad Hussain <arshad.hussain@aeoncomputing.com>
Change-Id: Icc100cca0d5ae492c41d60f0bf97512450f796bc
Reviewed-on: https://review.whamcloud.com/43154
Reviewed-by: Wang Shilong <wshilong@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/mdt/mdt_handler.c
lustre/ofd/ofd_obd.c
lustre/tests/sanity.sh

index fa160bc..29a4f51 100644 (file)
@@ -435,6 +435,7 @@ static int mdt_statfs(struct tgt_session_info *tsi)
        struct mdt_body *reqbody = NULL;
        struct mdt_statfs_cache *msf;
        ktime_t kstart = ktime_get();
+       int current_blockbits;
        int rc;
 
        ENTRY;
@@ -491,6 +492,15 @@ static int mdt_statfs(struct tgt_session_info *tsi)
                        spin_unlock(&mdt->mdt_lock);
        }
 
+       /* tgd_blockbits holds the recordsize bits set during mkfs
+        * and, once set, never changes. However, 'zfs set' can be
+        * used to change the MDT blocksize afterwards. Instead of
+        * using the cached value of 'tgd_blockbits', always
+        * recalculate the blocksize bits from the current
+        * os_bsize, which may have changed.
+        */
+       current_blockbits = fls64(osfs->os_bsize) - 1;
+
        /* at least try to account for cached pages.  its still racy and
         * might be under-reporting if clients haven't announced their
         * caches with brw recently */
@@ -498,12 +508,12 @@ static int mdt_statfs(struct tgt_session_info *tsi)
               " pending %llu free %llu avail %llu\n",
               tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
               tgd->tgd_tot_pending,
-              osfs->os_bfree << tgd->tgd_blockbits,
-              osfs->os_bavail << tgd->tgd_blockbits);
+              osfs->os_bfree << current_blockbits,
+              osfs->os_bavail << current_blockbits);
 
        osfs->os_bavail -= min_t(u64, osfs->os_bavail,
                                 ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending +
-                                  osfs->os_bsize - 1) >> tgd->tgd_blockbits));
+                                  osfs->os_bsize - 1) >> current_blockbits));
 
        tgt_grant_sanity_check(mdt->mdt_lu_dev.ld_obd, __func__);
        CDEBUG(D_CACHE, "%llu blocks: %llu free, %llu avail; "
@@ -512,15 +522,15 @@ static int mdt_statfs(struct tgt_session_info *tsi)
               osfs->os_files, osfs->os_ffree, osfs->os_state);
 
        if (!exp_grant_param_supp(tsi->tsi_exp) &&
-           tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) {
+           current_blockbits > COMPAT_BSIZE_SHIFT) {
                /* clients which don't support OBD_CONNECT_GRANT_PARAM
                 * should not see a block size > page size, otherwise
                 * cl_lost_grant goes mad. Therefore, we emulate a 4KB (=2^12)
                 * block size which is the biggest block size known to work
                 * with all client's page size. */
-               osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
-               osfs->os_bfree  <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
-               osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_blocks <<= current_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bfree  <<= current_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bavail <<= current_blockbits - COMPAT_BSIZE_SHIFT;
                osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT;
        }
        if (rc == 0)
index ced6f6a..67ebade 100644 (file)
@@ -709,6 +709,7 @@ int ofd_statfs(const struct lu_env *env,  struct obd_export *exp,
        struct obd_device *obd = class_exp2obd(exp);
        struct ofd_device *ofd = ofd_exp(exp);
        struct tg_grants_data *tgd = &ofd->ofd_lut.lut_tgd;
+       int current_blockbits;
        int rc;
 
        ENTRY;
@@ -717,22 +718,30 @@ int ofd_statfs(const struct lu_env *env,  struct obd_export *exp,
        if (unlikely(rc))
                GOTO(out, rc);
 
+       /* tgd_blockbits holds the recordsize bits set during mkfs
+        * and, once set, never changes. However, 'zfs set' can be
+        * used to change the OST blocksize afterwards. Instead of
+        * using the cached value of 'tgd_blockbits', always
+        * recalculate the blocksize bits from the current
+        * os_bsize, which may have changed.
+        */
+       current_blockbits = fls64(osfs->os_bsize) - 1;
+
        /*
         * at least try to account for cached pages.  its still racy and
         * might be under-reporting if clients haven't announced their
         * caches with brw recently
         */
-
        CDEBUG(D_SUPER | D_CACHE,
               "blocks cached %llu granted %llu pending %llu free %llu avail %llu\n",
               tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
               tgd->tgd_tot_pending,
-              osfs->os_bfree << tgd->tgd_blockbits,
-              osfs->os_bavail << tgd->tgd_blockbits);
+              osfs->os_bfree << current_blockbits,
+              osfs->os_bavail << current_blockbits);
 
        osfs->os_bavail -= min_t(u64, osfs->os_bavail,
                                 ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending +
-                                  osfs->os_bsize - 1) >> tgd->tgd_blockbits));
+                                  osfs->os_bsize - 1) >> current_blockbits));
 
        /*
         * The QoS code on the MDS does not care about space reserved for
@@ -743,7 +752,7 @@ int ofd_statfs(const struct lu_env *env,  struct obd_export *exp,
 
                ted = &obd->obd_self_export->exp_target_data;
                osfs->os_granted = min_t(u64, osfs->os_bavail,
-                                         ted->ted_grant >> tgd->tgd_blockbits);
+                                         ted->ted_grant >> current_blockbits);
                osfs->os_bavail -= osfs->os_granted;
        }
 
@@ -768,7 +777,7 @@ int ofd_statfs(const struct lu_env *env,  struct obd_export *exp,
                osfs->os_state |= OS_STATFS_NOPRECREATE;
 
        if (obd->obd_self_export != exp && !exp_grant_param_supp(exp) &&
-           tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) {
+           current_blockbits > COMPAT_BSIZE_SHIFT) {
                /*
                 * clients which don't support OBD_CONNECT_GRANT_PARAM
                 * should not see a block size > page size, otherwise
@@ -776,10 +785,10 @@ int ofd_statfs(const struct lu_env *env,  struct obd_export *exp,
                 * block size which is the biggest block size known to work
                 * with all client's page size.
                 */
-               osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
-               osfs->os_bfree  <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
-               osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
-               osfs->os_granted <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_blocks <<= current_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bfree  <<= current_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_bavail <<= current_blockbits - COMPAT_BSIZE_SHIFT;
+               osfs->os_granted <<= current_blockbits - COMPAT_BSIZE_SHIFT;
                osfs->os_bsize    = 1 << COMPAT_BSIZE_SHIFT;
        }
 
index 38a91c2..2bc8ed7 100755 (executable)
@@ -10900,6 +10900,106 @@ test_104b() {
 }
 run_test 104b "$RUNAS lfs check servers test ===================="
 
+#
+# Verify $1 is within range of $2.
+# Success when $1 is within 2% of $2, that is, when $1 is >= 98% of $2
+# and $1 is <= 102% of $2 (integer arithmetic). Else Fail.
+#
+value_in_range() {
+       # Strip all units (M, G, T); both values must use the same unit
+       local actual=$(echo "$1" | tr -d A-Z)
+       local expect=$(echo "$2" | tr -d A-Z)
+
+       local expect_lo=$(($expect * 98 / 100)) # 2% below
+       local expect_hi=$(($expect * 102 / 100)) # 2% above
+
+       # permit 2% drift above and below; exit status is the result
+       (( $actual >= $expect_lo && $actual <= $expect_hi ))
+}
+
+#
+# Check that the sizes reported by 'lfs df' and 'df' stay within 2% of
+# their previous values after the ZFS recordsize of every OST and MDT
+# facet has been changed to 32768. ZFS-only; skipped for parallel runs.
+# The original recordsize is put back before the results are checked.
+#
+test_104c() {
+       [ "$PARALLEL" == "yes" ] && skip "skip parallel run"
+       [ "$ost1_FSTYPE" == "zfs" ] || skip "zfs only test"
+
+       local ost_param="osd-zfs.$FSNAME-OST0000."
+       local mdt_param="osd-zfs.$FSNAME-MDT0000."
+       local ofacets=$(get_facets OST)
+       local mfacets=$(get_facets MDS)
+       local saved_ost_blocks=
+       local saved_mdt_blocks=
+       # keep scratch variables out of the global test namespace
+       local lfs_df df lfs_df_after df_after
+       local facet ost mdt
+       # NOTE(review): MOUNT is assumed to be the framework's client mount
+       # point; falls back to the previously hard-coded path when unset
+       local mnt=${MOUNT:-/mnt/lustre}
+
+       echo "Before recordsize change"
+       lfs_df=($($LFS df -h | grep "filesystem_summary:"))
+       df=($(df -h | grep "$mnt"$))
+
+       # For checking.
+       echo "lfs output : ${lfs_df[*]}"
+       echo "df  output : ${df[*]}"
+
+       for facet in ${ofacets//,/ }; do
+               # remember the blocksize reported for the first facet only
+               if [ -z "$saved_ost_blocks" ]; then
+                       saved_ost_blocks=$(do_facet $facet \
+                               lctl get_param -n $ost_param.blocksize)
+                       echo "OST Blocksize: $saved_ost_blocks"
+               fi
+               ost=$(do_facet $facet lctl get_param -n $ost_param.mntdev)
+               do_facet $facet zfs set recordsize=32768 $ost
+       done
+
+       # BS too small. Sufficient for functional testing.
+       for facet in ${mfacets//,/ }; do
+               # remember the blocksize reported for the first facet only
+               if [ -z "$saved_mdt_blocks" ]; then
+                       saved_mdt_blocks=$(do_facet $facet \
+                               lctl get_param -n $mdt_param.blocksize)
+                       echo "MDT Blocksize: $saved_mdt_blocks"
+               fi
+               mdt=$(do_facet $facet lctl get_param -n $mdt_param.mntdev)
+               do_facet $facet zfs set recordsize=32768 $mdt
+       done
+
+       # Give new values chance to reflect change
+       sleep 2
+
+       echo "After recordsize change"
+       lfs_df_after=($($LFS df -h | grep "filesystem_summary:"))
+       df_after=($(df -h | grep "$mnt"$))
+
+       # For checking.
+       echo "lfs output : ${lfs_df_after[*]}"
+       echo "df  output : ${df_after[*]}"
+
+       # Restore before verifying: the checks below only use the values
+       # captured above, and a failing check (error presumably aborts the
+       # test) must not leave the targets at recordsize=32768.
+
+       # Restore MDT recordsize back to original
+       for facet in ${mfacets//,/ }; do
+               mdt=$(do_facet $facet lctl get_param -n $mdt_param.mntdev)
+               do_facet $facet zfs set recordsize=$saved_mdt_blocks $mdt
+       done
+
+       # Restore OST recordsize back to original
+       for facet in ${ofacets//,/ }; do
+               ost=$(do_facet $facet lctl get_param -n $ost_param.mntdev)
+               do_facet $facet zfs set recordsize=$saved_ost_blocks $ost
+       done
+
+       # Verify lfs df (fields 1-3; any fractional part is dropped)
+       value_in_range ${lfs_df_after[1]%.*} ${lfs_df[1]%.*} ||
+               error "lfs_df bytes: ${lfs_df_after[1]%.*} != ${lfs_df[1]%.*}"
+       value_in_range ${lfs_df_after[2]%.*} ${lfs_df[2]%.*} ||
+               error "lfs_df used: ${lfs_df_after[2]%.*} != ${lfs_df[2]%.*}"
+       value_in_range ${lfs_df_after[3]%.*} ${lfs_df[3]%.*} ||
+               error "lfs_df avail: ${lfs_df_after[3]%.*} != ${lfs_df[3]%.*}"
+
+       # Verify df (fields 1-3; any fractional part is dropped)
+       value_in_range ${df_after[1]%.*} ${df[1]%.*} ||
+               error "df bytes: ${df_after[1]%.*} != ${df[1]%.*}"
+       value_in_range ${df_after[2]%.*} ${df[2]%.*} ||
+               error "df used: ${df_after[2]%.*} != ${df[2]%.*}"
+       value_in_range ${df_after[3]%.*} ${df[3]%.*} ||
+               error "df avail: ${df_after[3]%.*} != ${df[3]%.*}"
+
+       return 0
+}
+run_test 104c "Verify df vs lfs_df stays same after recordsize change"
+
 test_105a() {
        # doesn't work on 2.4 kernels
        touch $DIR/$tfile