From: Artem Blagodarenko Date: Fri, 13 Oct 2023 07:49:07 +0000 (+0800) Subject: LU-16032 osd: move unlink of large objects to separate thread X-Git-Tag: 2.15.59~38 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=a772e90243ea0ff1de6ae9c67e1f6384c431d200;p=fs%2Flustre-release.git LU-16032 osd: move unlink of large objects to separate thread The final unlink and freeing of blocks for large objects can cause a service thread to hang with this call stack: Net: Service thread pid 1739 was inactive for 200.16s. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes: __wait_on_buffer+0x2a/0x30 ldiskfs_wait_block_bitmap+0xe0/0xf0 [ldiskfs] ldiskfs_read_block_bitmap+0x31/0x60 [ldiskfs] ldiskfs_free_blocks+0x329/0xbb0 [ldiskfs] ldiskfs_ext_remove_space+0x8a9/0x1150 [ldiskfs] ldiskfs_ext_truncate+0xb0/0xe0 [ldiskfs] ldiskfs_truncate+0x3b7/0x3f0 [ldiskfs] ldiskfs_evict_inode+0x58a/0x630 [ldiskfs] evict+0xb4/0x180 iput+0xfc/0x190 osd_object_delete+0x1f8/0x370 [osd_ldiskfs] lu_object_free.isra.30+0x68/0x170 [obdclass] lu_object_put+0xc5/0x3e0 [obdclass] ofd_destroy_by_fid+0x20e/0x500 [ofd] ofd_destroy_hdl+0x267/0x9f0 [ofd] tgt_request_handle+0xaee/0x15f0 [ptlrpc] ptlrpc_server_handle_request+0x24b/0xab0 [ptlrpc] ptlrpc_main+0xb34/0x1470 [ptlrpc] kthread+0xd1/0xe0 Move the final unlink to a workqueue if the inode has more than 1GiB of allocated blocks. The size threshold can be configured by setting the minimum async truncate size with the "osd-ldiskfs.delayed_unlink_mb" parameter. Writes to the "osd-ldiskfs.*.force_sync" parameter will flush pending delayed unlinks so that space can be reclaimed as needed. 
Change-Id: Id535ae4c58732769effabee42835bc2da8cb5cc1 Signed-off-by: Artem Blagodarenko DDN-bug-id: DDN-3144 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/47995 Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 3a38bff..68ae68a 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -94,6 +94,9 @@ int ldiskfs_track_declares_assert; module_param(ldiskfs_track_declares_assert, int, 0644); MODULE_PARM_DESC(ldiskfs_track_declares_assert, "LBUG during tracking of declares"); +/* 1 GiB in 512-byte sectors */ +int ldiskfs_delayed_unlink_blocks = (1 << (30 - 9)); + /* Slab to allocate dynlocks */ struct kmem_cache *dynlock_cachep; @@ -2266,6 +2269,37 @@ static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb) return 0; } +struct osd_delayed_iput_work { + struct work_struct diw_work; + struct inode *diw_inode; +}; + +static void osd_delayed_iput_fn(struct work_struct *work) +{ + struct osd_delayed_iput_work *diwork; + struct inode *inode; + + diwork = container_of(work, struct osd_delayed_iput_work, diw_work); + inode = diwork->diw_inode; + CDEBUG(D_INODE, "%s: delayed iput (ino=%lu)\n", + inode->i_sb->s_id, inode->i_ino); + iput(inode); + OBD_FREE_PTR(diwork); +} + +noinline void osd_delayed_iput(struct inode *inode, + struct osd_delayed_iput_work *diwork) +{ + if (!diwork) { + iput(inode); + } else { + INIT_WORK(&diwork->diw_work, osd_delayed_iput_fn); + diwork->diw_inode = inode; + queue_work(LDISKFS_SB(inode->i_sb)->s_misc_wq, + &diwork->diw_work); + } +} + /* * Called just before object is freed. Releases all resources except for * object itself (that is released by osd_object_free()). 
@@ -2278,6 +2312,7 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l) struct osd_object *obj = osd_obj(l); struct qsd_instance *qsd = osd_def_qsd(osd_obj2dev(obj)); struct inode *inode = obj->oo_inode; + struct osd_delayed_iput_work *diwork = NULL; __u64 projid; qid_t uid; qid_t gid; @@ -2293,6 +2328,9 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l) if (!inode) return; + if (inode->i_blocks > ldiskfs_delayed_unlink_blocks) + OBD_ALLOC(diwork, sizeof(*diwork)); + if (osd_has_index(obj) && obj->oo_dt.do_index_ops == &osd_index_iam_ops) ldiskfs_set_inode_flag(inode, LDISKFS_INODE_JOURNAL_DATA); @@ -2301,7 +2339,7 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l) projid = i_projid_read(inode); obj->oo_inode = NULL; - iput(inode); + osd_delayed_iput(inode, diwork); /* do not rebalance quota if the caller needs to release memory * otherwise qsd_refresh_usage() may went into a new ldiskfs @@ -8901,6 +8939,31 @@ static const struct obd_ops osd_obd_device_ops = { .o_health_check = osd_health_check, }; +static ssize_t delayed_unlink_mb_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", + ldiskfs_delayed_unlink_blocks >> 11); +} + +static ssize_t delayed_unlink_mb_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + u64 delayed_unlink_bytes; + int rc; + + rc = sysfs_memparse(buffer, count, &delayed_unlink_bytes, "MiB"); + if (rc) + return rc; + + ldiskfs_delayed_unlink_blocks = delayed_unlink_bytes >> 9; + + return count; +} +LUSTRE_RW_ATTR(delayed_unlink_mb); + + static ssize_t track_declares_assert_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -8954,11 +9017,21 @@ static int __init osd_init(void) if (kobj) { rc = sysfs_create_file(kobj, &lustre_attr_track_declares_assert.attr); - kobject_put(kobj); if (rc) { - CWARN("osd-ldiskfs: track_declares_assert failed to 
register with sysfs\n"); + CWARN("%s: track_declares_assert sysfs registration failed: rc = %d\n", + "osd-ldiskfs", rc); rc = 0; } + + rc = sysfs_create_file(kobj, + &lustre_attr_delayed_unlink_mb.attr); + if (rc) { + CWARN("%s: delayed_unlink_mb registration failed: rc = %d\n", + "osd-ldiskfs", rc); + rc = 0; + } + + kobject_put(kobj); } #ifndef HAVE_FLUSH_DELAYED_FPUT diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index a547352..f4a1fcc 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -288,22 +288,16 @@ LUSTRE_RW_ATTR(fallocate_zero_blocks); ssize_t force_sync_store(struct kobject *kobj, struct attribute *attr, const char *buffer, size_t count) { - struct dt_device *dt = container_of(kobj, struct dt_device, - dd_kobj); + struct dt_device *dt = container_of(kobj, struct dt_device, dd_kobj); struct osd_device *osd = osd_dt_dev(dt); - struct lu_env env; int rc; LASSERT(osd); if (unlikely(!osd->od_mnt)) return -EINPROGRESS; - rc = lu_env_init(&env, LCT_LOCAL); - if (rc) - return rc; - - rc = dt_sync(&env, dt); - lu_env_fini(&env); + flush_workqueue(LDISKFS_SB(osd_sb(osd_dt_dev(dt)))->s_misc_wq); + rc = dt_sync(NULL, dt); return rc == 0 ? 
count : rc; } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 5ef65dc..7719047 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -26564,6 +26564,42 @@ test_319() { } run_test 319 "lost lease lock on migrate error" +test_360() { + (( $OST1_VERSION >= $(version_code 2.15.58.96) )) || + skip "Need OST version at least 2.15.58.96" + [[ "$ost1_FSTYPE" == "ldiskfs" ]] || skip "ldiskfs only test" + + check_set_fallocate_or_skip + do_facet ost1 "$LCTL set_param osd-ldiskfs.delayed_unlink_mb=1MiB" + + mkdir $DIR/$tdir/ + do_facet ost1 $LCTL set_param debug=+inode + do_facet ost1 $LCTL clear + local files=100 + + for ((i = 0; i < $files; i++)); do + fallocate -l 1280k $DIR/$tdir/$tfile.$i || + error "fallocate 1280k $DIR/$tdir/$tfile.$i failed" + done + local min=$(($($LFS find $DIR/$tdir --ost 0 | wc -l) / 2)) + + for ((i = 0; i < $files; i++)); do + unlink $DIR/$tdir/$tfile.$i || + error "unlink $DIR/$tdir/$tfile.$i failed" + done + + local count=0 + local loop + + for (( loop = 0; loop < 30 && count < min; loop++)); do + sleep 1 + (( count += $(do_facet ost1 $LCTL dk | grep -c "delayed iput"))) + echo "Count[$loop]: $count" + done + (( count >= min )) || error "$count < $min delayed iput after $loop s" +} +run_test 360 "ldiskfs unlink in a separate thread" + test_398a() { # LU-4198 local ost1_imp=$(get_osc_import_name client ost1) local imp_name=$($LCTL list_param osc.$ost1_imp | head -n1 |