From 5968fec949e5f30033cbb4ee695920a7e051d793 Mon Sep 17 00:00:00 2001 From: alex Date: Sun, 11 Jan 2004 19:43:34 +0000 Subject: [PATCH] - initial release of reworked ext3-delete-thread for 2.4.24. the idea is very simple: don't drop i_nlink to 0 in ext3_unlink() if file is large NOTE: to be discussed yet --- .../patches/ext3-delete_thread-2.4.24.patch | 434 +++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch new file mode 100644 index 0000000..b274755 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.24.patch @@ -0,0 +1,434 @@ + fs/ext3/file.c | 4 + fs/ext3/inode.c | 116 ++++++++++++++++++++++ + fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++ + include/linux/ext3_fs.h | 5 + include/linux/ext3_fs_sb.h | 10 + + 5 files changed, 365 insertions(+) + +Index: linux-2.4.24-vanilla/fs/ext3/super.c +=================================================================== +--- linux-2.4.24-vanilla.orig/fs/ext3/super.c 2004-01-10 17:49:10.000000000 +0300 ++++ linux-2.4.24-vanilla/fs/ext3/super.c 2004-01-11 22:30:24.000000000 +0300 +@@ -400,6 +400,126 @@ + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. ++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ struct inode *inode; ++ unsigned long blocks; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ wait_event_interruptible(sbi->s_delete_thread_queue, ++ !list_empty(&sbi->s_delete_list) || ++ !test_opt(sb, ASYNCDEL)); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ inode = list_entry(sbi->s_delete_list.next, ++ struct inode, i_devices); ++ blocks = inode->i_blocks >> (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_devices); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ J_ASSERT(inode->i_nlink == 1); ++ inode->i_nlink = 0; ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ } ++ spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ ++ if (!test_opt(sb, ASYNCDEL)) ++ return; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ if (sbi->s_delete_list.next == 0) /* thread never started */ ++ return; ++ ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next == 0); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -527,6 +647,13 @@ + clear_opt (*mount_options, XATTR_USER); + else + #endif ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else if (!strcmp(this_char, "noasyncdel")) ++ clear_opt(*mount_options, ASYNCDEL); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1227,6 +1354,7 @@ + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock +@@ -1618,7 +1746,12 @@ + static int ext3_sync_fs(struct super_block *sb) + { + tid_t target; +- ++ ++ if (atomic_read(&sb->s_active) == 0) { ++ /* fs is being umounted: time to stop delete thread */ ++ ext3_stop_delete_thread(EXT3_SB(sb)); ++ } ++ + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + log_wait_commit(EXT3_SB(sb)->s_journal, target); +@@ -1682,6 +1815,9 @@ + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; + ++ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) ++ ext3_stop_delete_thread(sbi); ++ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + +Index: linux-2.4.24-vanilla/fs/ext3/inode.c +=================================================================== +--- linux-2.4.24-vanilla.orig/fs/ext3/inode.c 2004-01-10 17:49:11.000000000 +0300 ++++ linux-2.4.24-vanilla/fs/ext3/inode.c 2004-01-11 22:26:24.000000000 +0300 +@@ -2551,6 +2551,118 @@ + return err; + } + ++#ifdef EXT3_DELETE_THREAD ++/* Move blocks from to-be-truncated inode over to a new inode, and delete ++ * that one from the delete thread instead. This avoids a lot of latency ++ * when truncating large files. ++ * ++ * If we have any problem deferring the truncate, just truncate it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. ++ */ ++void ext3_truncate_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ handle_t *handle; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_truncate; ++ ++ /* XXX This is a temporary limitation for code simplicity. ++ * We could truncate to arbitrary sizes at some later time. ++ */ ++ if (old_inode->i_size != 0) ++ goto out_truncate; ++ ++ /* We may want to truncate the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ old_inode->i_size > oei->i_disksize) ++ goto out_truncate; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_truncate; ++ } ++ ++ ext3_discard_prealloc(old_inode); ++ ++ /* old_inode = 1 ++ * new_inode = sb + GDT + ibitmap ++ * orphan list = 1 inode/superblock for add, 2 inodes for del ++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ */ ++ handle = ext3_journal_start(old_inode, 7); ++ if (IS_ERR(handle)) ++ goto out_truncate; ++ ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); ++ if (IS_ERR(new_inode)) { ++ ext3_debug("truncate inode %lu directly (no new inodes)\n", ++ old_inode->i_ino); ++ goto out_journal; ++ } ++ ++ nei = EXT3_I(new_inode); ++ ++ down_write(&oei->truncate_sem); ++ new_inode->i_size = old_inode->i_size; ++ new_inode->i_blocks = old_inode->i_blocks; ++ new_inode->i_uid = old_inode->i_uid; ++ new_inode->i_gid = old_inode->i_gid; ++ new_inode->i_nlink = 0; ++ ++ /* FIXME when we do arbitrary truncates */ ++ old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0; ++ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; ++ ++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); ++ memset(oei->i_data, 0, sizeof(oei->i_data)); ++ ++ nei->i_disksize = oei->i_disksize; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up_write(&oei->truncate_sem); ++ ++ if (ext3_orphan_add(handle, new_inode) < 0) ++ goto out_journal; ++ ++ if (ext3_orphan_del(handle, old_inode) < 0) { ++ ext3_orphan_del(handle, new_inode); ++ iput(new_inode); ++ goto out_journal; ++ } ++ ++ ext3_journal_stop(handle, old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_devices)); ++ list_add_tail(&new_inode->i_devices, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_journal: ++ ext3_journal_stop(handle, old_inode); ++out_truncate: ++ ext3_truncate(old_inode); ++} ++#endif /* EXT3_DELETE_THREAD */ ++ + /* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. +Index: linux-2.4.24-vanilla/fs/ext3/file.c +=================================================================== +--- linux-2.4.24-vanilla.orig/fs/ext3/file.c 2004-01-10 17:49:10.000000000 +0300 ++++ linux-2.4.24-vanilla/fs/ext3/file.c 2004-01-11 22:26:24.000000000 +0300 +@@ -126,7 +126,11 @@ + }; + + struct inode_operations ext3_file_inode_operations = { ++#ifdef EXT3_DELETE_THREAD ++ truncate: ext3_truncate_thread, /* BKL held */ ++#else + truncate: ext3_truncate, /* BKL held */ ++#endif + setattr: ext3_setattr, /* BKL held */ + setxattr: ext3_setxattr, /* BKL held */ + getxattr: ext3_getxattr, /* BKL held */ +Index: linux-2.4.24-vanilla/fs/ext3/namei.c +=================================================================== +--- linux-2.4.24-vanilla.orig/fs/ext3/namei.c 2004-01-10 17:49:10.000000000 +0300 ++++ linux-2.4.24-vanilla/fs/ext3/namei.c 2004-01-11 22:26:24.000000000 +0300 +@@ -1936,6 +1936,36 @@ + return retval; + } + ++static int ext3_try_to_delay_deletion(struct inode *inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long blocks; ++ ++ if (!test_opt(inode->i_sb, ASYNCDEL)) ++ return 0; ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ blocks = inode->i_blocks >> (inode->i_blkbits - 9); ++ if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS) ++ return 0; ++ ++ inode->i_nlink = 1; ++ atomic_inc(&inode->i_count); ++ ei->i_state |= EXT3_STATE_DELETE; ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&inode->i_devices)); ++ list_add_tail(&inode->i_devices, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ ++ return 0; ++} ++ + static int ext3_unlink(struct inode * dir, struct dentry *dentry) + { + int retval; +@@ -1977,8 +2007,10 @@ + ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; +- if (!inode->i_nlink) ++ if (!inode->i_nlink) { ++ ext3_try_to_delay_deletion(inode); + ext3_orphan_add(handle, inode); ++ } + inode->i_ctime = dir->i_ctime; + ext3_mark_inode_dirty(handle, inode); + retval = 0; +Index: linux-2.4.24-vanilla/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.24-vanilla.orig/include/linux/ext3_fs.h 2004-01-10 17:49:11.000000000 +0300 ++++ linux-2.4.24-vanilla/include/linux/ext3_fs.h 2004-01-11 22:26:24.000000000 +0300 +@@ -193,6 +193,7 @@ + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +@@ -320,6 +321,7 @@ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -697,6 +699,9 @@ + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++#ifdef EXT3_DELETE_THREAD ++extern void ext3_truncate_thread(struct inode *inode); ++#endif + extern void ext3_set_inode_flags(struct inode *); + + /* ioctl.c */ +Index: linux-2.4.24-vanilla/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.4.24-vanilla.orig/include/linux/ext3_fs_sb.h 2004-01-10 17:49:10.000000000 +0300 ++++ linux-2.4.24-vanilla/include/linux/ext3_fs_sb.h 2004-01-11 22:28:44.000000000 +0300 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 8 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -76,6 +78,14 @@ + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; ++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ -- 1.8.3.1