Whamcloud - gitweb
LU-13765 osd-ldiskfs: Extend credit correctly for fallocate 42/39342/18
authorArshad Hussain <arshad.super@gmail.com>
Wed, 9 Sep 2020 23:18:13 +0000 (04:48 +0530)
committerOleg Drokin <green@whamcloud.com>
Thu, 29 Oct 2020 04:28:42 +0000 (04:28 +0000)
In the OSD layer, Lustre has already created a journal handle
for the fallocate transaction before calling ->fallocate(). In
ldiskfs/ext4, a very large range fallocate may be split into
multiple transactions, calling journal start/stop multiple
times inside fallocate. However, a nested journal start ignores
the requested credits, which results in running out of credits
at the end.

Since we cannot predict the total number of credits needed in
advance, especially for a large fallocate, this patch moves the
fallocate logic into the Lustre OSD so that it can reserve
credits correctly. When the remaining buffer credits are fewer
than needed, it extends the credits of the current transaction
or, failing that, restarts the transaction.

Test cases sanity/150e and sanity-quota/1h are added to verify
the fix.

Test-Parameters: trivial testlist=sanity ostsizegb=12 env=ONLY="150e"
Test-Parameters: testlist=sanity-quota
Signed-off-by: Arshad Hussain <arshad.super@gmail.com>
Signed-off-by: Qian Yingjin <qian@ddn.com>
Change-Id: Ib7565ed2c1ae72eef4832fbcb710e0ee70c53aec
Reviewed-on: https://review.whamcloud.com/39342
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Wang Shilong <wshilong@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/dt_object.h
lustre/ofd/ofd_objects.c
lustre/osd-ldiskfs/osd_io.c
lustre/osd-zfs/osd_io.c
lustre/tests/sanity-quota.sh
lustre/tests/sanity.sh

index f260afb..601b918 100644 (file)
@@ -1436,8 +1436,8 @@ struct dt_body_operations {
         * \retval negative     negated errno on error
         */
        int (*dbo_declare_fallocate)(const struct lu_env *env,
-                                   struct dt_object *dt,
-                                   struct thandle *th);
+                                   struct dt_object *dt, __u64 start,
+                                   __u64 end, int mode, struct thandle *th);
        /**
         * Allocate specified region for an object
         *
@@ -2587,14 +2587,16 @@ static inline int dt_ladvise(const struct lu_env *env, struct dt_object *dt,
 }
 
 static inline int dt_declare_falloc(const struct lu_env *env,
-                                     struct dt_object *dt, struct thandle *th)
+                                   struct dt_object *dt, __u64 start,
+                                   __u64 end, int mode, struct thandle *th)
 {
        LASSERT(dt);
        if (!dt->do_body_ops)
                return -EOPNOTSUPP;
        LASSERT(dt->do_body_ops);
        LASSERT(dt->do_body_ops->dbo_declare_fallocate);
-       return dt->do_body_ops->dbo_declare_fallocate(env, dt, th);
+       return dt->do_body_ops->dbo_declare_fallocate(env, dt, start, end,
+                                                     mode, th);
 }
 
 static inline int dt_falloc(const struct lu_env *env, struct dt_object *dt,
index f99d86b..5447b54 100644 (file)
@@ -795,7 +795,7 @@ int ofd_object_fallocate(const struct lu_env *env, struct ofd_object *fo,
        if (rc)
                GOTO(stop, rc);
 
-       rc = dt_declare_falloc(env, dob, th);
+       rc = dt_declare_falloc(env, dob, start, end, mode, th);
        if (rc)
                GOTO(stop, rc);
 
index 4e1735a..e1fea27 100644 (file)
@@ -38,6 +38,8 @@
  *
  */
 
+#define DEBUG_SUBSYSTEM        S_OSD
+
 /* prerequisite for linux/xattr.h */
 #include <linux/types.h>
 /* prerequisite for linux/xattr.h */
@@ -1888,36 +1890,116 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
 }
 
 static int osd_declare_fallocate(const struct lu_env *env,
-                                struct dt_object *dt, struct thandle *th)
+                                struct dt_object *dt, __u64 start, __u64 end,
+                                int mode, struct thandle *th)
 {
-       struct osd_thandle *oh;
-       struct inode *inode;
+       struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
+       struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
+       struct inode *inode = osd_dt_obj(dt)->oo_inode;
+       long long quota_space = 0;
+       /* 5 is max tree depth. (inode + 4 index blocks) */
+       int depth = 5;
        int rc;
+
        ENTRY;
 
-       LASSERT(th);
-       oh = container_of(th, struct osd_thandle, ot_super);
+       /*
+        * Only mode == 0 (standard prealloc) and FALLOC_FL_KEEP_SIZE are
+        * supported now. The rest of the mode options are not supported yet.
+        */
+       if (mode & ~FALLOC_FL_KEEP_SIZE)
+               RETURN(-EOPNOTSUPP);
 
-       osd_trans_declare_op(env, oh, OSD_OT_PREALLOC,
-                            osd_dto_credits_noquota[DTO_WRITE_BLOCK]);
-       inode = osd_dt_obj(dt)->oo_inode;
+       LASSERT(th);
        LASSERT(inode);
 
+       /* quota space for metadata blocks
+        * approximate metadata estimate should be good enough.
+        */
+       quota_space += PAGE_SIZE;
+       quota_space += depth * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+
+       /* quota space should be reported in 1K blocks */
+       quota_space = toqb(quota_space) + toqb(end - start) +
+                     LDISKFS_META_TRANS_BLOCKS(inode->i_sb);
+
+       /* We don't need to reserve credits for whole fallocate here.
+        * We reserve space only for metadata. Fallocate credits are
+        * extended as required
+        */
        rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
-                                  i_projid_read(inode), 0, oh, osd_dt_obj(dt),
-                                  NULL, OSD_QID_BLK);
+                                  i_projid_read(inode), quota_space, oh,
+                                  osd_dt_obj(dt), NULL, OSD_QID_BLK);
        RETURN(rc);
 }
 
+/* Borrowed from ext4_chunk_trans_blocks() */
+static int osd_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+       ldiskfs_group_t groups;
+       int gdpblocks;
+       int idxblocks;
+       int depth;
+       int ret;
+
+       depth = ext_depth(inode);
+       idxblocks = depth * 2;
+
+       /*
+        * Now let's see how many group bitmaps and group descriptors need
+        * to account.
+        */
+       groups = idxblocks + 1;
+       gdpblocks = groups;
+       if (groups > LDISKFS_SB(inode->i_sb)->s_groups_count)
+               groups = LDISKFS_SB(inode->i_sb)->s_groups_count;
+       if (gdpblocks > LDISKFS_SB(inode->i_sb)->s_gdb_count)
+               gdpblocks = LDISKFS_SB(inode->i_sb)->s_gdb_count;
+
+       /* bitmaps and block group descriptor blocks */
+       ret = idxblocks + groups + gdpblocks;
+
+       /* Blocks for super block, inode, quota and xattr blocks */
+       ret += LDISKFS_META_TRANS_BLOCKS(inode->i_sb);
+
+       return ret;
+}
+
+static int osd_extend_restart_trans(handle_t *handle, int needed)
+{
+       int rc;
+
+       if (ldiskfs_handle_has_enough_credits(handle, needed))
+               return 0;
+
+       rc = ldiskfs_journal_extend(handle, needed - handle->h_buffer_credits);
+       if (rc <= 0)
+               return rc;
+
+       rc = ldiskfs_journal_restart(handle, needed);
+
+       return rc;
+}
+
 static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
                         __u64 start, __u64 end, int mode, struct thandle *th)
 {
+       struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
+       handle_t *handle = ldiskfs_journal_current_handle();
+       unsigned int save_credits = oh->ot_credits;
        struct osd_object *obj = osd_dt_obj(dt);
        struct inode *inode = obj->oo_inode;
-       struct file *file;
+       struct ldiskfs_map_blocks map;
+       unsigned int credits;
+       ldiskfs_lblk_t blen;
+       ldiskfs_lblk_t boff;
+       loff_t new_size = 0;
+       int depth = 0;
+       int flags;
        int rc = 0;
 
        ENTRY;
+
        /*
         * Only mode == 0 (which is standard prealloc) is supported now.
         * Rest of mode options is not supported yet.
@@ -1928,17 +2010,105 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
        LASSERT(dt_object_exists(dt));
        LASSERT(osd_invariant(obj));
        LASSERT(inode != NULL);
+
+       CDEBUG(D_INODE, "fallocate: inode #%lu: start %llu end %llu mode %d\n",
+              inode->i_ino, start, end, mode);
+
        dquot_initialize(inode);
 
        LASSERT(th);
 
-       osd_trans_exec_op(env, th, OSD_OT_PREALLOC);
+       boff = start >> inode->i_blkbits;
+       blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
+
+       flags = LDISKFS_GET_BLOCKS_CREATE;
+       if (mode & FALLOC_FL_KEEP_SIZE)
+               flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
+
+       inode_lock(inode);
 
        /*
-        * Because f_op->fallocate() does not have an inode arg
+        * We support preallocation for extent-based files only.
         */
-       file = osd_quasi_file(env, inode);
-       rc = file->f_op->fallocate(file, mode, start, end - start);
+       if (!(ldiskfs_test_inode_flag(inode, LDISKFS_INODE_EXTENTS)))
+               GOTO(out, rc = -EOPNOTSUPP);
+
+       if (!(mode & FALLOC_FL_KEEP_SIZE) && (end > i_size_read(inode) ||
+           end > LDISKFS_I(inode)->i_disksize)) {
+               new_size = end;
+               rc = inode_newsize_ok(inode, new_size);
+               if (rc)
+                       GOTO(out, rc);
+       }
+
+       inode_dio_wait(inode);
+
+       map.m_lblk = boff;
+       map.m_len = blen;
+
+       /*
+        * Don't normalize the request if it can fit in one extent so
+        * that it doesn't get unnecessarily split into multiple
+        * extents.
+        */
+       if (blen <= EXT_UNWRITTEN_MAX_LEN)
+               flags |= LDISKFS_GET_BLOCKS_NO_NORMALIZE;
+
+       /*
+        * credits to insert 1 extent into extent tree.
+        */
+       credits = osd_chunk_trans_blocks(inode, blen);
+       depth = ext_depth(inode);
+
+       while (rc >= 0 && blen) {
+               loff_t epos;
+
+               /*
+                * Recalculate credits when extent tree depth changes.
+                */
+               if (depth != ext_depth(inode)) {
+                       credits = osd_chunk_trans_blocks(inode, blen);
+                       depth = ext_depth(inode);
+               }
+
+               /* TODO: quota check */
+               rc = osd_extend_restart_trans(handle, credits);
+               if (rc)
+                       break;
+
+               rc = ldiskfs_map_blocks(handle, inode, &map, flags);
+               if (rc <= 0) {
+                       CDEBUG(D_INODE,
+                              "inode #%lu: block %u: len %u: ldiskfs_map_blocks returned %d\n",
+                              inode->i_ino, map.m_lblk, map.m_len, rc);
+                       ldiskfs_mark_inode_dirty(handle, inode);
+                       break;
+               }
+
+               map.m_lblk += rc;
+               map.m_len = blen = blen - rc;
+               epos = (loff_t)map.m_lblk << inode->i_blkbits;
+               inode->i_ctime = current_time(inode);
+               if (new_size) {
+                       if (epos > end)
+                               epos = end;
+                       if (ldiskfs_update_inode_size(inode, epos) & 0x1)
+                               inode->i_mtime = inode->i_ctime;
+               } else {
+                       if (epos > inode->i_size)
+                               ldiskfs_set_inode_flag(inode,
+                                                      LDISKFS_INODE_EOFBLOCKS);
+               }
+
+               ldiskfs_mark_inode_dirty(handle, inode);
+       }
+
+out:
+       inode_unlock(inode);
+
+       /* extend credits if needed for operations such as attribute set */
+       if (rc >= 0)
+               rc = osd_extend_restart_trans(handle, save_credits);
 
        RETURN(rc);
 }
index 1f71bd9..e528ae5 100644 (file)
@@ -1166,7 +1166,8 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
 }
 
 static int osd_declare_fallocate(const struct lu_env *env,
-                                struct dt_object *dt, struct thandle *th)
+                                struct dt_object *dt, __u64 start, __u64 end,
+                                int mode, struct thandle *th)
 {
        int rc = -EOPNOTSUPP;
        ENTRY;
index e579ee1..ea32869 100755 (executable)
@@ -558,6 +558,28 @@ test_1_check_write() {
                        "user write success, but expect EDQUOT"
 }
 
+check_write_fallocate() {
+       local testfile="$1"
+       local qtype="$2"
+       local limit=$3
+       local short_qtype=${qtype:0:1}
+
+       count=$((limit/2))
+       log "Write ${count}MiB Using Fallocate"
+       $RUNAS fallocate -l${count}MiB $testfile ||
+               quota_error $short_qtype $TSTUSR "Write ${count}MiB fail"
+
+       cancel_lru_locks osc
+       sync; sync_all_data || true
+       sleep 2
+
+       count=$((limit + 1))
+       log "Write ${count}MiB Using Fallocate"
+       $RUNAS fallocate -l${count}MiB $testfile &&
+               quota_error $short_qtype $TSTUSR \
+               "Write success, expect EDQUOT" || true
+}
+
 # test block hardlimit
 test_1a() {
        local limit=10  # 10M
@@ -1062,6 +1084,44 @@ test_1g() {
 }
 run_test 1g "Quota pools: Block hard limit with wide striping"
 
+test_1h() {
+       local limit=10  # 10M
+       local testfile="$DIR/$tdir/$tfile-0"
+
+       [ "$ost1_FSTYPE" != ldiskfs ] && skip "non-ldiskfs backend"
+       [ $OST1_VERSION -lt $(version_code 2.13.50) ] &&
+               skip "Need OST version at least 2.13.53"
+
+       setup_quota_test || error "setup quota failed with $?"
+       trap cleanup_quota_test EXIT
+
+       # enable ost quota
+       set_ost_qtype $QTYPE || error "enable ost quota failed"
+
+       # test for user
+       log "User quota (block hardlimit:$limit MB)"
+       $LFS setquota -u $TSTUSR -b 0 -B ${limit}M -i 0 -I 0 $DIR ||
+               error "set user quota failed"
+
+       # make sure the system is clean
+       local used=$(getquota -u $TSTUSR global curspace)
+       [ $used -ne 0 ] && error "Used space($used) for user $TSTUSR isn't 0."
+
+       $LFS setstripe $testfile -c 1 || error "setstripe $testfile failed"
+       chown $TSTUSR.$TSTUSR $testfile || error "chown $testfile failed"
+
+       check_write_fallocate $testfile "user" $limit
+
+       rm -f $testfile
+       wait_delete_completed || error "wait_delete_completed failed"
+       sync_all_data || true
+       used=$(getquota -u $TSTUSR global curspace)
+       [ $used -ne 0 ] && quota_error u $TSTUSR \
+               "user quota isn't released after deletion"
+       resetquota -u $TSTUSR
+}
+run_test 1h "Block hard limit test using fallocate"
+
 # test inode hardlimit
 test_2() {
        local TESTFILE="$DIR/$tdir/$tfile-0"
index 44f5646..78fa3ed 100755 (executable)
@@ -13415,6 +13415,51 @@ test_150d() {
 }
 run_test 150d "Verify fallocate Size and Blocks - Non zero start"
 
+test_150e() {
+       [ "$ost1_FSTYPE" != ldiskfs ] && skip "non-ldiskfs backend"
+       [ $OST1_VERSION -ge $(version_code 2.13.55) ] ||
+               skip "Need OST version at least 2.13.55"
+
+       echo "df before:"
+       $LFS df
+       $LFS setstripe -c${OSTCOUNT} $DIR/$tfile ||
+               error "$LFS setstripe -c${OSTCOUNT} $DIR/$tfile failed"
+
+       # Find OST with Minimum Size
+       min_size_ost=$($LFS df | awk "/$FSNAME-OST/ { print \$4 }" |
+                      sort -un | head -1)
+
+       # Get 90% of the available space
+       local space=$(((min_size_ost * 90)/100 * OSTCOUNT))
+
+       fallocate -l${space}k $DIR/$tfile ||
+               error "fallocate ${space}k $DIR/$tfile failed"
+       echo "'fallocate -l ${space}k $DIR/$tfile' succeeded"
+
+       # get size immediately after fallocate. This should be correctly
+       # updated
+       local size=$(stat -c '%s' $DIR/$tfile)
+       local used=$(( $(stat -c '%b * %B' $DIR/$tfile) / 1024))
+
+       # Sleep for a while so statfs gets updated and is not pulled from cache.
+       sleep 2
+
+       echo "df after fallocate:"
+       $LFS df
+
+       (( size / 1024 == space )) || error "size $size != requested $space"
+       [ "$ost1_FSTYPE" != ldiskfs ] || (( used >= space )) ||
+               error "used $used < space $space"
+
+       rm $DIR/$tfile || error "rm failed"
+       sync
+       wait_delete_completed
+
+       echo "df after unlink:"
+       $LFS df
+}
+run_test 150e "Verify 90% of available OST space consumed by fallocate"
+
 #LU-2902 roc_hit was not able to read all values from lproc
 function roc_hit_init() {
        local list=$(comma_list $(osts_nodes))