fs/ext3/file.c | 4 fs/ext3/inode.c | 116 ++++++++++++++++++++++ fs/ext3/super.c | 230 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/ext3_fs.h | 5 include/linux/ext3_fs_sb.h | 10 + 5 files changed, 365 insertions(+) Index: linux-2.4.24/fs/ext3/super.c =================================================================== --- linux-2.4.24.orig/fs/ext3/super.c 2004-01-12 20:36:31.000000000 +0300 +++ linux-2.4.24/fs/ext3/super.c 2004-01-13 16:27:43.000000000 +0300 @@ -400,6 +400,127 @@ } } +#ifdef EXT3_DELETE_THREAD +/* + * Delete inodes in a loop until there are no more to be deleted. + * Normally, we run in the background doing the deletes and sleeping again, + * and clients just add new inodes to be deleted onto the end of the list. + * If someone is concerned about free space (e.g. block allocation or similar) + * then they can sleep on s_delete_waiter_queue and be woken up when space + * has been freed. + */ +int ext3_delete_thread(void *data) +{ + struct super_block *sb = data; + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct task_struct *tsk = current; + + /* Almost like daemonize, but not quite */ + exit_mm(current); + tsk->session = 1; + tsk->pgrp = 1; + tsk->tty = NULL; + exit_files(current); + reparent_to_init(); + + sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); + sigfillset(&tsk->blocked); /* put every signal into this thread's blocked set */ + + /*tsk->flags |= PF_KERNTHREAD;*/ + + INIT_LIST_HEAD(&sbi->s_delete_list); /* ext3_start_delete_thread() waits for s_delete_list.next to become non-zero */ + wake_up(&sbi->s_delete_waiter_queue); + ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); + + /* main loop */ + for (;;) { + wait_event_interruptible(sbi->s_delete_thread_queue, + !list_empty(&sbi->s_delete_list) || + !test_opt(sb, ASYNCDEL)); /* sleep until an inode is queued or ASYNCDEL is cleared */ + ext3_debug("%s woken up: %lu inodes, %lu blocks\n", + tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); + + spin_lock(&sbi->s_delete_lock); + if (list_empty(&sbi->s_delete_list)) { /* nothing queued after the wakeup: clear ASYNCDEL, zero the list head and leave the loop */ + clear_opt(sbi->s_mount_opt, ASYNCDEL); + memset(&sbi->s_delete_list, 0, + sizeof(sbi->s_delete_list)); /* ext3_stop_delete_thread() and ext3_truncate_thread() test s_delete_list.next against zero */ + 
spin_unlock(&sbi->s_delete_lock); + ext3_debug("delete thread on %s exiting\n", + kdevname(sb->s_dev)); + wake_up(&sbi->s_delete_waiter_queue); + break; + } + + while (!list_empty(&sbi->s_delete_list)) { + struct inode *inode=list_entry(sbi->s_delete_list.next, + struct inode, i_devices); + unsigned long blocks = inode->i_blocks >> + (inode->i_blkbits - 9); + + list_del_init(&inode->i_devices); /* i_devices is the linkage used for s_delete_list */ + spin_unlock(&sbi->s_delete_lock); /* lock is dropped around iput() and retaken below */ + ext3_debug("%s delete ino %lu blk %lu\n", + tsk->comm, inode->i_ino, blocks); + + J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE); + J_ASSERT(inode->i_nlink == 1); + inode->i_nlink = 0; + iput(inode); /* NOTE(review): presumably the final reference, so this performs the real delete -- confirm */ + + spin_lock(&sbi->s_delete_lock); + sbi->s_delete_blocks -= blocks; + sbi->s_delete_inodes--; + } + if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { /* list is empty here, so both counters are expected to be zero */ + ext3_warning(sb, __FUNCTION__, + "%lu blocks, %lu inodes on list?\n", + sbi->s_delete_blocks,sbi->s_delete_inodes); + sbi->s_delete_blocks = 0; + sbi->s_delete_inodes = 0; + } + spin_unlock(&sbi->s_delete_lock); + wake_up(&sbi->s_delete_waiter_queue); /* batch finished: wake sleepers on s_delete_waiter_queue */ + } + + return 0; +} + +static void ext3_start_delete_thread(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + int rc; + + spin_lock_init(&sbi->s_delete_lock); + init_waitqueue_head(&sbi->s_delete_thread_queue); + init_waitqueue_head(&sbi->s_delete_waiter_queue); + + if (!test_opt(sb, ASYNCDEL)) + return; + + rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); + if (rc < 0) + printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", + rc); /* NOTE(review): ASYNCDEL is left set on this path while the list head is never initialised -- confirm that callers testing only ASYNCDEL are safe */ + else + wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); /* returns once the thread has run INIT_LIST_HEAD(); NOTE(review): assumes the list head was zero beforehand -- confirm */ +} + +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) +{ + if (sbi->s_delete_list.next == 0) /* thread never started */ + return; + + clear_opt(sbi->s_mount_opt, ASYNCDEL); /* a cleared ASYNCDEL satisfies the wait condition in ext3_delete_thread() */ + wake_up(&sbi->s_delete_thread_queue); + wait_event(sbi->s_delete_waiter_queue, + sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0); /* the thread zeroes the list head just before it exits */ +} +#else +#define ext3_start_delete_thread(sbi) 
do {} while(0) +#define ext3_stop_delete_thread(sbi) do {} while(0) +#endif /* EXT3_DELETE_THREAD */ + void ext3_put_super (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -407,6 +529,9 @@ kdev_t j_dev = sbi->s_journal->j_dev; int i; +#ifdef EXT3_DELETE_THREAD + J_ASSERT(sbi->s_delete_inodes == 0); /* no inode may still be queued for deletion at put_super time */ +#endif ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { @@ -527,6 +650,13 @@ clear_opt (*mount_options, XATTR_USER); else #endif +#ifdef EXT3_DELETE_THREAD + if (!strcmp(this_char, "asyncdel")) + set_opt(*mount_options, ASYNCDEL); /* mount option that turns deferred deletion on */ + else if (!strcmp(this_char, "noasyncdel")) + clear_opt(*mount_options, ASYNCDEL); /* mount option that turns deferred deletion off */ + else +#endif if (!strcmp (this_char, "bsddf")) clear_opt (*mount_options, MINIX_DF); else if (!strcmp (this_char, "nouid32")) { @@ -1227,6 +1357,7 @@ } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + ext3_start_delete_thread(sb); /* initialises the lock and wait queues; spawns the thread only when ASYNCDEL is set */ /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -1618,7 +1749,12 @@ static int ext3_sync_fs(struct super_block *sb) { tid_t target; - + + if (atomic_read(&sb->s_active) == 0) { + /* fs is being umounted: time to stop delete thread */ + ext3_stop_delete_thread(EXT3_SB(sb)); + } + sb->s_dirt = 0; target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -1682,6 +1818,9 @@ if (!parse_options(data, &tmp, sbi, &tmp, 1)) return -EINVAL; + if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) + ext3_stop_delete_thread(sbi); /* stop the thread when asyncdel is off or the remount requests read-only; NOTE(review): no visible hunk restarts it on a later remount -- confirm */ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) ext3_abort(sb, __FUNCTION__, "Abort forced by user"); Index: linux-2.4.24/fs/ext3/inode.c =================================================================== --- linux-2.4.24.orig/fs/ext3/inode.c 2004-01-12 20:36:31.000000000 +0300 +++ linux-2.4.24/fs/ext3/inode.c 2004-01-12 20:36:32.000000000 +0300 @@ -2551,6 +2551,118 @@ return err; } +#ifdef EXT3_DELETE_THREAD 
+/* Move blocks from to-be-truncated inode over to a new inode, and delete + * that one from the delete thread instead. This avoids a lot of latency + * when truncating large files. + * + * If we have any problem deferring the truncate, just truncate it right away. + * If we defer it, we also mark how many blocks it would free, so that we + * can keep the statfs data correct, and we know if we should sleep on the + * delete thread when we run out of space. + */ +void ext3_truncate_thread(struct inode *old_inode) +{ + struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); + struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); + struct inode *new_inode; + handle_t *handle; + unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); + + if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) + goto out_truncate; /* option is off, or the list head is zero (no delete thread): truncate synchronously */ + + /* XXX This is a temporary limitation for code simplicity. + * We could truncate to arbitrary sizes at some later time. + */ + if (old_inode->i_size != 0) + goto out_truncate; + + /* We may want to truncate the inode immediately and not defer it */ + if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || + old_inode->i_size > oei->i_disksize) + goto out_truncate; /* NOTE(review): i_size is already known to be 0 here, so the i_size > i_disksize test looks redundant -- confirm */ + + /* We can't use the delete thread as-is during real orphan recovery, + * as we add to the orphan list here, causing ext3_orphan_cleanup() + * to loop endlessly. It would be nice to do so, but needs work. 
+ */ + if (oei->i_state & EXT3_STATE_DELETE || + sbi->s_mount_state & EXT3_ORPHAN_FS) { /* inode is itself a deferred-delete inode, or EXT3_ORPHAN_FS is set: do the truncate here */ + ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", + old_inode->i_ino, blocks); + goto out_truncate; + } + + ext3_discard_prealloc(old_inode); + + /* old_inode = 1 + * new_inode = sb + GDT + ibitmap + * orphan list = 1 inode/superblock for add, 2 inodes for del + * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + */ + handle = ext3_journal_start(old_inode, 7); + if (IS_ERR(handle)) + goto out_truncate; /* no handle was obtained, so skip the journal_stop at out_journal */ + + new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); + if (IS_ERR(new_inode)) { + ext3_debug("truncate inode %lu directly (no new inodes)\n", + old_inode->i_ino); + goto out_journal; + } + + nei = EXT3_I(new_inode); + + down_write(&oei->truncate_sem); /* held while size, block count and block map are moved to new_inode */ + new_inode->i_size = old_inode->i_size; + new_inode->i_blocks = old_inode->i_blocks; + new_inode->i_uid = old_inode->i_uid; + new_inode->i_gid = old_inode->i_gid; + new_inode->i_nlink = 1; /* ext3_delete_thread() asserts i_nlink == 1 before setting it to 0 */ + + /* FIXME when we do arbitrary truncates */ + old_inode->i_blocks = oei->i_file_acl ? 
old_inode->i_blksize / 512 : 0; /* presumably keeps one block accounted for the i_file_acl block -- confirm */ + old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; + + memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); /* new_inode takes over the whole i_data array */ + memset(oei->i_data, 0, sizeof(oei->i_data)); /* old_inode no longer references any of the moved blocks */ + + nei->i_disksize = oei->i_disksize; + nei->i_state |= EXT3_STATE_DELETE; + up_write(&oei->truncate_sem); + + if (ext3_orphan_add(handle, new_inode) < 0) + goto out_journal; /* NOTE(review): new_inode is not released on this path and already owns the moved blocks, while the fallback ext3_truncate(old_inode) sees a zeroed i_data -- looks like a leak, confirm */ + + if (ext3_orphan_del(handle, old_inode) < 0) { + ext3_orphan_del(handle, new_inode); + iput(new_inode); /* NOTE(review): i_nlink is still 1 here -- confirm the moved blocks are freed on this path */ + goto out_journal; + } + + ext3_journal_stop(handle, old_inode); + + spin_lock(&sbi->s_delete_lock); + J_ASSERT(list_empty(&new_inode->i_devices)); + list_add_tail(&new_inode->i_devices, &sbi->s_delete_list); /* queue new_inode at the tail for ext3_delete_thread() */ + sbi->s_delete_blocks += blocks; + sbi->s_delete_inodes++; + spin_unlock(&sbi->s_delete_lock); + + ext3_debug("delete inode %lu (%lu blocks) by thread\n", + new_inode->i_ino, blocks); + + wake_up(&sbi->s_delete_thread_queue); /* the delete thread sleeps on this queue */ + return; + +out_journal: + ext3_journal_stop(handle, old_inode); +out_truncate: + ext3_truncate(old_inode); /* synchronous fallback used by every bail-out path */ +} +#endif /* EXT3_DELETE_THREAD */ + /* * On success, We end up with an outstanding reference count against * iloc->bh. This _must_ be cleaned up later. 
Index: linux-2.4.24/fs/ext3/file.c =================================================================== --- linux-2.4.24.orig/fs/ext3/file.c 2004-01-12 20:36:29.000000000 +0300 +++ linux-2.4.24/fs/ext3/file.c 2004-01-12 20:36:32.000000000 +0300 @@ -126,7 +126,11 @@ }; struct inode_operations ext3_file_inode_operations = { +#ifdef EXT3_DELETE_THREAD + truncate: ext3_truncate_thread, /* BKL held */ +#else truncate: ext3_truncate, /* BKL held */ +#endif setattr: ext3_setattr, /* BKL held */ setxattr: ext3_setxattr, /* BKL held */ getxattr: ext3_getxattr, /* BKL held */ Index: linux-2.4.24/fs/ext3/namei.c =================================================================== --- linux-2.4.24.orig/fs/ext3/namei.c 2004-01-12 20:36:31.000000000 +0300 +++ linux-2.4.24/fs/ext3/namei.c 2004-01-12 20:36:32.000000000 +0300 @@ -1936,6 +1936,40 @@ return retval; } +#ifdef EXT3_DELETE_THREAD +static int ext3_try_to_delay_deletion(struct inode *inode) +{ + struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); + struct ext3_inode_info *ei = EXT3_I(inode); + unsigned long blocks; + + if (!test_opt(inode->i_sb, ASYNCDEL)) + return 0; /* NOTE(review): unlike ext3_truncate_thread() this does not also test s_delete_list.next -- confirm the thread is always running when ASYNCDEL is set */ + + /* We may want to delete the inode immediately and not defer it */ + blocks = inode->i_blocks >> (inode->i_blkbits - 9); + if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS) + return 0; + + inode->i_nlink = 1; /* ext3_delete_thread() asserts i_nlink == 1 and then sets it to 0 */ + atomic_inc(&inode->i_count); /* extra reference, matched by the iput() in ext3_delete_thread() */ + ei->i_state |= EXT3_STATE_DELETE; + + spin_lock(&sbi->s_delete_lock); + J_ASSERT(list_empty(&inode->i_devices)); + list_add_tail(&inode->i_devices, &sbi->s_delete_list); /* queue the inode at the tail of the delete list */ + sbi->s_delete_blocks += blocks; + sbi->s_delete_inodes++; + spin_unlock(&sbi->s_delete_lock); + + wake_up(&sbi->s_delete_thread_queue); + + return 0; /* NOTE(review): every path returns 0 and ext3_unlink() ignores the result */ +} +#else +#define ext3_try_to_delay_deletion(inode) do {} while (0) +#endif + static int ext3_unlink(struct inode * dir, struct dentry *dentry) { int retval; @@ -1977,8 +2007,10 @@ ext3_update_dx_flag(dir); ext3_mark_inode_dirty(handle, dir); inode->i_nlink--; - if (!inode->i_nlink) + if (!inode->i_nlink) 
{ + ext3_try_to_delay_deletion(inode); /* may set i_nlink back to 1 and queue the inode for the delete thread */ ext3_orphan_add(handle, inode); + } inode->i_ctime = dir->i_ctime; ext3_mark_inode_dirty(handle, inode); retval = 0; Index: linux-2.4.24/include/linux/ext3_fs.h =================================================================== --- linux-2.4.24.orig/include/linux/ext3_fs.h 2004-01-12 20:36:31.000000000 +0300 +++ linux-2.4.24/include/linux/ext3_fs.h 2004-01-12 20:36:32.000000000 +0300 @@ -193,6 +193,7 @@ */ #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ +#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ /* * ioctl commands @@ -320,6 +321,7 @@ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ +#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -697,6 +699,9 @@ extern void ext3_dirty_inode(struct inode *); extern int ext3_change_inode_journal_flag(struct inode *, int); extern void ext3_truncate (struct inode *); +#ifdef EXT3_DELETE_THREAD +extern void ext3_truncate_thread(struct inode *inode); +#endif extern void ext3_set_inode_flags(struct inode *); /* ioctl.c */ Index: linux-2.4.24/include/linux/ext3_fs_sb.h =================================================================== --- linux-2.4.24.orig/include/linux/ext3_fs_sb.h 2004-01-12 20:36:31.000000000 +0300 +++ linux-2.4.24/include/linux/ext3_fs_sb.h 2004-01-12 20:36:32.000000000 +0300 @@ -29,6 +29,8 @@ #define EXT3_MAX_GROUP_LOADED 8 +#define EXT3_DELETE_THREAD /* compile-time switch guarding all of the delete-thread code in this patch */ + /* * third extended-fs super-block data in memory */ @@ -76,6 +78,14 @@ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif +#ifdef 
EXT3_DELETE_THREAD + spinlock_t s_delete_lock; /* taken around updates to s_delete_list and the two counters below */ + struct list_head s_delete_list; /* inodes queued for deletion, linked through i_devices; a zeroed head means no thread */ + unsigned long s_delete_blocks; /* sum of the block counts recorded when inodes were queued */ + unsigned long s_delete_inodes; /* number of inodes currently queued */ + wait_queue_head_t s_delete_thread_queue; /* the delete thread sleeps here until work arrives or ASYNCDEL is cleared */ + wait_queue_head_t s_delete_waiter_queue; /* woken by the thread at start-up, after each batch and on exit */ +#endif }; #endif /* _LINUX_EXT3_FS_SB */