Whamcloud - gitweb
LU-16032 osd: move unlink of large objects to separate thread 95/47995/31
authorArtem Blagodarenko <ablagodarenko@whamcloud.com>
Fri, 13 Oct 2023 07:49:07 +0000 (15:49 +0800)
committerOleg Drokin <green@whamcloud.com>
Wed, 8 Nov 2023 21:58:46 +0000 (21:58 +0000)
The final unlink and freeing of blocks for large objects can leave a
service thread hung with this call stack:

  Net: Service thread pid 1739 was inactive for 200.16s.
  The thread might be hung, or it might only be slow and will
  resume later.
  Dumping the stack trace for debugging purposes:
    __wait_on_buffer+0x2a/0x30
    ldiskfs_wait_block_bitmap+0xe0/0xf0 [ldiskfs]
    ldiskfs_read_block_bitmap+0x31/0x60 [ldiskfs]
    ldiskfs_free_blocks+0x329/0xbb0 [ldiskfs]
    ldiskfs_ext_remove_space+0x8a9/0x1150 [ldiskfs]
    ldiskfs_ext_truncate+0xb0/0xe0 [ldiskfs]
    ldiskfs_truncate+0x3b7/0x3f0 [ldiskfs]
    ldiskfs_evict_inode+0x58a/0x630 [ldiskfs]
    evict+0xb4/0x180
    iput+0xfc/0x190
    osd_object_delete+0x1f8/0x370 [osd_ldiskfs]
    lu_object_free.isra.30+0x68/0x170 [obdclass]
    lu_object_put+0xc5/0x3e0 [obdclass]
    ofd_destroy_by_fid+0x20e/0x500 [ofd]
    ofd_destroy_hdl+0x267/0x9f0 [ofd]
    tgt_request_handle+0xaee/0x15f0 [ptlrpc]
    ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc]
    ptlrpc_main+0xb34/0x1470 [ptlrpc]
    kthread+0xd1/0xe0

Let's move the final unlink to a workqueue if the inode has more than
1GiB of allocated blocks.  The threshold can be configured by setting
the minimum async truncate size with the
"osd-ldiskfs.delayed_unlink_mb" parameter.

Writes to the "osd-ldiskfs.*.force_sync" parameter will flush pending
delayed unlinks so that space can be reclaimed as needed.

Change-Id: Id535ae4c58732769effabee42835bc2da8cb5cc1
Signed-off-by: Artem Blagodarenko <ablagodarenko@whamcloud.com>
DDN-bug-id: DDN-3144
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/47995
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_lproc.c
lustre/tests/sanity.sh

index 3a38bff..68ae68a 100644 (file)
@@ -94,6 +94,9 @@ int ldiskfs_track_declares_assert;
 module_param(ldiskfs_track_declares_assert, int, 0644);
 MODULE_PARM_DESC(ldiskfs_track_declares_assert, "LBUG during tracking of declares");
 
+/*
+ * Delayed-unlink threshold, in 512-byte sectors; the default is 1 GiB.
+ * osd_object_delete() compares inode->i_blocks against this value and
+ * sets up a deferred iput for inodes above it.  The value is exported
+ * (in MiB) through the "delayed_unlink_mb" sysfs attribute.
+ */
+int ldiskfs_delayed_unlink_blocks = (1 << (30 - 9));
+
 /* Slab to allocate dynlocks */
 struct kmem_cache *dynlock_cachep;
 
@@ -2266,6 +2269,37 @@ static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb)
        return 0;
 }
 
+struct osd_delayed_iput_work {  /* one deferred iput() request */
+       struct work_struct diw_work;    /* work item queued on the sb's s_misc_wq */
+       struct inode      *diw_inode;   /* inode the worker calls iput() on */
+};
+
+/*
+ * Workqueue callback for a deferred iput().
+ *
+ * Recovers the request embedding \a work, logs the inode being released,
+ * drops the inode reference and then frees the request itself.
+ */
+static void osd_delayed_iput_fn(struct work_struct *work)
+{
+       struct osd_delayed_iput_work *req =
+               container_of(work, struct osd_delayed_iput_work, diw_work);
+       struct inode *victim = req->diw_inode;
+
+       CDEBUG(D_INODE, "%s: delayed iput (ino=%lu)\n",
+              victim->i_sb->s_id, victim->i_ino);
+       iput(victim);
+       OBD_FREE_PTR(req);
+}
+
+/*
+ * Drop a reference on \a inode, either right away or from a workqueue.
+ *
+ * \param inode   inode to release
+ * \param diwork  preallocated request, or NULL to call iput() directly;
+ *                when non-NULL it is queued on the superblock's s_misc_wq
+ *                and freed by osd_delayed_iput_fn()
+ */
+noinline void osd_delayed_iput(struct inode *inode,
+                              struct osd_delayed_iput_work *diwork)
+{
+       /* no request available: release synchronously in this thread */
+       if (!diwork) {
+               iput(inode);
+               return;
+       }
+
+       diwork->diw_inode = inode;
+       INIT_WORK(&diwork->diw_work, osd_delayed_iput_fn);
+       queue_work(LDISKFS_SB(inode->i_sb)->s_misc_wq, &diwork->diw_work);
+}
+
 /*
  * Called just before object is freed. Releases all resources except for
  * object itself (that is released by osd_object_free()).
@@ -2278,6 +2312,7 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
        struct osd_object *obj = osd_obj(l);
        struct qsd_instance *qsd = osd_def_qsd(osd_obj2dev(obj));
        struct inode *inode = obj->oo_inode;
+       struct osd_delayed_iput_work *diwork = NULL;
        __u64 projid;
        qid_t uid;
        qid_t gid;
@@ -2293,6 +2328,9 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
        if (!inode)
                return;
 
+       if (inode->i_blocks > ldiskfs_delayed_unlink_blocks)
+               OBD_ALLOC(diwork, sizeof(*diwork));
+
        if (osd_has_index(obj) &&  obj->oo_dt.do_index_ops == &osd_index_iam_ops)
                ldiskfs_set_inode_flag(inode, LDISKFS_INODE_JOURNAL_DATA);
 
@@ -2301,7 +2339,7 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l)
        projid = i_projid_read(inode);
 
        obj->oo_inode = NULL;
-       iput(inode);
+       osd_delayed_iput(inode, diwork);
 
        /* do not rebalance quota if the caller needs to release memory
         * otherwise qsd_refresh_usage() may went into a new ldiskfs
@@ -8901,6 +8939,31 @@ static const struct obd_ops osd_obd_device_ops = {
        .o_health_check = osd_health_check,
 };
 
+/*
+ * sysfs show handler for "delayed_unlink_mb": prints the delayed-unlink
+ * threshold converted from 512-byte sectors to MiB.
+ */
+static ssize_t delayed_unlink_mb_show(struct kobject *kobj,
+                                     struct attribute *attr, char *buf)
+{
+       /* 2^11 sectors of 512 bytes make one MiB */
+       int threshold_mib = ldiskfs_delayed_unlink_blocks >> 11;
+
+       return snprintf(buf, PAGE_SIZE, "%d\n", threshold_mib);
+}
+
+/*
+ * sysfs store handler for "delayed_unlink_mb".
+ *
+ * Parses \a buffer as a size (a bare number is taken as MiB), converts it
+ * to 512-byte sectors and stores it as the delayed-unlink threshold.
+ *
+ * \retval count    on success
+ * \retval -ERANGE  if the sector count does not fit the int threshold
+ * \retval negative error returned by sysfs_memparse() on a parse failure
+ */
+static ssize_t delayed_unlink_mb_store(struct kobject *kobj,
+                                      struct attribute *attr,
+                                      const char *buffer, size_t count)
+{
+       u64 delayed_unlink_bytes;
+       u64 delayed_unlink_sectors;
+       int rc;
+
+       rc = sysfs_memparse(buffer, count, &delayed_unlink_bytes, "MiB");
+       if (rc)
+               return rc;
+
+       /*
+        * The threshold is an int: without this check 1 TiB would be stored
+        * as a negative value and 2 TiB would wrap around to 0.
+        */
+       delayed_unlink_sectors = delayed_unlink_bytes >> 9;
+       if (delayed_unlink_sectors > INT_MAX)
+               return -ERANGE;
+
+       ldiskfs_delayed_unlink_blocks = delayed_unlink_sectors;
+
+       return count;
+}
+LUSTRE_RW_ATTR(delayed_unlink_mb);
+
+
 static ssize_t track_declares_assert_show(struct kobject *kobj,
                                   struct attribute *attr,
                                   char *buf)
@@ -8954,11 +9017,21 @@ static int __init osd_init(void)
        if (kobj) {
                rc = sysfs_create_file(kobj,
                                       &lustre_attr_track_declares_assert.attr);
-               kobject_put(kobj);
                if (rc) {
-                       CWARN("osd-ldiskfs: track_declares_assert failed to register with sysfs\n");
+                       CWARN("%s: track_declares_assert sysfs registration failed: rc = %d\n",
+                             "osd-ldiskfs", rc);
                        rc = 0;
                }
+
+               rc = sysfs_create_file(kobj,
+                                      &lustre_attr_delayed_unlink_mb.attr);
+               if (rc) {
+                       CWARN("%s: delayed_unlink_mb registration failed: rc = %d\n",
+                             "osd-ldiskfs", rc);
+                       rc = 0;
+               }
+
+               kobject_put(kobj);
        }
 
 #ifndef HAVE_FLUSH_DELAYED_FPUT
index a547352..f4a1fcc 100644 (file)
@@ -288,22 +288,16 @@ LUSTRE_RW_ATTR(fallocate_zero_blocks);
 ssize_t force_sync_store(struct kobject *kobj, struct attribute *attr,
                         const char *buffer, size_t count)
 {
-       struct dt_device *dt = container_of(kobj, struct dt_device,
-                                           dd_kobj);
+       struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj);
        struct osd_device *osd = osd_dt_dev(dt);
-       struct lu_env env;
        int rc;
 
        LASSERT(osd);
        if (unlikely(!osd->od_mnt))
                return -EINPROGRESS;
 
-       rc = lu_env_init(&env, LCT_LOCAL);
-       if (rc)
-               return rc;
-
-       rc = dt_sync(&env, dt);
-       lu_env_fini(&env);
+       flush_workqueue(LDISKFS_SB(osd_sb(osd_dt_dev(dt)))->s_misc_wq);
+       rc = dt_sync(NULL, dt);
 
        return rc == 0 ? count : rc;
 }
index 5ef65dc..7719047 100755 (executable)
@@ -26564,6 +26564,42 @@ test_319() {
 }
 run_test 319 "lost lease lock on migrate error"
 
+# Verify that unlinking objects above the delayed_unlink_mb threshold is
+# handed to the delayed iput path on the OST ("delayed iput" debug messages).
+test_360() {
+       (( $OST1_VERSION >= $(version_code 2.15.58.96) )) ||
+               skip "Need OST version at least 2.15.58.96"
+       [[ "$ost1_FSTYPE" == "ldiskfs" ]] || skip "ldiskfs only test"
+
+       check_set_fallocate_or_skip
+
+       # lower the threshold so that small files take the delayed path, and
+       # put the previous value back once the test is over
+       local param="osd-ldiskfs.delayed_unlink_mb"
+       local old_mb=$(do_facet ost1 "$LCTL get_param -n $param")
+
+       [[ -z "$old_mb" ]] ||
+               stack_trap "do_facet ost1 $LCTL set_param $param=$old_mb"
+       do_facet ost1 "$LCTL set_param $param=1MiB"
+
+       mkdir $DIR/$tdir/
+       do_facet ost1 $LCTL set_param debug=+inode
+       do_facet ost1 $LCTL clear
+       local files=100
+       local i
+
+       for ((i = 0; i < $files; i++)); do
+               fallocate -l 1280k $DIR/$tdir/$tfile.$i ||
+                       error "fallocate 1280k $DIR/$tdir/$tfile.$i failed"
+       done
+       # expect at least half of the files with objects on OST0 to be seen
+       local min=$(($($LFS find $DIR/$tdir --ost 0 | wc -l) / 2))
+
+       for ((i = 0; i < $files; i++)); do
+               unlink $DIR/$tdir/$tfile.$i ||
+                       error "unlink $DIR/$tdir/$tfile.$i failed"
+       done
+
+       local count=0
+       local loop
+
+       # poll the OST debug log for up to 30s, accumulating matches
+       for (( loop = 0; loop < 30 && count < min; loop++)); do
+               sleep 1
+               (( count += $(do_facet ost1 $LCTL dk | grep -c "delayed iput")))
+               echo "Count[$loop]: $count"
+       done
+       (( count >= min )) || error "$count < $min delayed iput after $loop s"
+}
+run_test 360 "ldiskfs unlink in a separate thread"
+
 test_398a() { # LU-4198
        local ost1_imp=$(get_osc_import_name client ost1)
        local imp_name=$($LCTL list_param osc.$ost1_imp | head -n1 |