X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fkernel_patches%2Fpatches%2Fext3-delete_thread-2.4.29.patch;fp=lustre%2Fkernel_patches%2Fpatches%2Fext3-delete_thread-2.4.29.patch;h=39c47a74aa5e5a35e3b622f358a2bb5ef5d506ed;hb=113303973ec9f8484eb2355a1a6ef3c4c7fd6a56;hp=0000000000000000000000000000000000000000;hpb=272294dfa235dac803ed2d2b2ee8e0bd402622bb;p=fs%2Flustre-release.git diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch new file mode 100644 index 0000000..39c47a7 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch @@ -0,0 +1,442 @@ +Index: linux-2.4.29/fs/ext3/super.c +=================================================================== +--- linux-2.4.29.orig/fs/ext3/super.c 2005-05-03 15:53:33.047533872 +0300 ++++ linux-2.4.29/fs/ext3/super.c 2005-05-03 15:54:47.192262160 +0300 +@@ -400,6 +400,127 @@ + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. ++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ wait_event_interruptible(sbi->s_delete_thread_queue, ++ !list_empty(&sbi->s_delete_list) || ++ !test_opt(sb, ASYNCDEL)); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_devices); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_devices); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE); ++ J_ASSERT(inode->i_nlink == 1); ++ inode->i_nlink = 0; ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ } ++ spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ ++ if (!test_opt(sb, ASYNCDEL)) ++ return; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ if (sbi->s_delete_list.next == 0) /* thread never started */ ++ return; ++ ++ clear_opt(sbi->s_mount_opt, ASYNCDEL); ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, ++ sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -407,6 +528,9 @@ + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++#ifdef EXT3_DELETE_THREAD ++ J_ASSERT(sbi->s_delete_inodes == 0); ++#endif + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -526,6 +650,13 @@ + clear_opt (*mount_options, XATTR_USER); + else + #endif ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else if (!strcmp(this_char, "noasyncdel")) ++ clear_opt(*mount_options, ASYNCDEL); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1244,6 +1375,7 @@ + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; + ext3_orphan_cleanup(sb, es); + EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; +@@ -1626,7 +1758,12 @@ + static int ext3_sync_fs(struct super_block *sb) + { + tid_t target; +- ++ ++ if (atomic_read(&sb->s_active) == 0) { ++ /* fs is being umounted: time to stop delete thread */ ++ ext3_stop_delete_thread(EXT3_SB(sb)); ++ } ++ + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + log_wait_commit(EXT3_SB(sb)->s_journal, target); +@@ -1690,6 +1827,9 @@ + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; + ++ if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY)) ++ ext3_stop_delete_thread(sbi); ++ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + +Index: linux-2.4.29/fs/ext3/inode.c +=================================================================== +--- linux-2.4.29.orig/fs/ext3/inode.c 2005-05-03 15:53:36.555000656 +0300 ++++ linux-2.4.29/fs/ext3/inode.c 2005-05-03 15:53:56.901907456 +0300 +@@ -2562,6 +2562,118 @@ + return err; + } + ++#ifdef EXT3_DELETE_THREAD ++/* Move blocks from to-be-truncated inode over to a new inode, and delete ++ * that one from the delete thread instead. This avoids a lot of latency ++ * when truncating large files. ++ * ++ * If we have any problem deferring the truncate, just truncate it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * delete thread when we run out of space. ++ */ ++void ext3_truncate_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct ext3_inode_info *nei, *oei = EXT3_I(old_inode); ++ struct inode *new_inode; ++ handle_t *handle; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) ++ goto out_truncate; ++ ++ /* XXX This is a temporary limitation for code simplicity. ++ * We could truncate to arbitrary sizes at some later time. ++ */ ++ if (old_inode->i_size != 0) ++ goto out_truncate; ++ ++ /* We may want to truncate the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ old_inode->i_size > oei->i_disksize) ++ goto out_truncate; ++ ++ /* We can't use the delete thread as-is during real orphan recovery, ++ * as we add to the orphan list here, causing ext3_orphan_cleanup() ++ * to loop endlessly. It would be nice to do so, but needs work. ++ */ ++ if (oei->i_state & EXT3_STATE_DELETE || ++ sbi->s_mount_state & EXT3_ORPHAN_FS) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ goto out_truncate; ++ } ++ ++ ext3_discard_prealloc(old_inode); ++ ++ /* old_inode = 1 ++ * new_inode = sb + GDT + ibitmap ++ * orphan list = 1 inode/superblock for add, 2 inodes for del ++ * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ */ ++ handle = ext3_journal_start(old_inode, 7); ++ if (IS_ERR(handle)) ++ goto out_truncate; ++ ++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode); ++ if (IS_ERR(new_inode)) { ++ ext3_debug("truncate inode %lu directly (no new inodes)\n", ++ old_inode->i_ino); ++ goto out_journal; ++ } ++ ++ nei = EXT3_I(new_inode); ++ ++ down_write(&oei->truncate_sem); ++ new_inode->i_size = old_inode->i_size; ++ new_inode->i_blocks = old_inode->i_blocks; ++ new_inode->i_uid = old_inode->i_uid; ++ new_inode->i_gid = old_inode->i_gid; ++ new_inode->i_nlink = 1; ++ ++ /* FIXME when we do arbitrary truncates */ ++ old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0; ++ old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME; ++ ++ memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); ++ memset(oei->i_data, 0, sizeof(oei->i_data)); ++ ++ nei->i_disksize = oei->i_disksize; ++ nei->i_state |= EXT3_STATE_DELETE; ++ up_write(&oei->truncate_sem); ++ ++ if (ext3_orphan_add(handle, new_inode) < 0) ++ goto out_journal; ++ ++ if (ext3_orphan_del(handle, old_inode) < 0) { ++ ext3_orphan_del(handle, new_inode); ++ iput(new_inode); ++ goto out_journal; ++ } ++ ++ ext3_journal_stop(handle, old_inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_devices)); ++ list_add_tail(&new_inode->i_devices, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ return; ++ ++out_journal: ++ ext3_journal_stop(handle, old_inode); ++out_truncate: ++ ext3_truncate(old_inode); ++} ++#endif /* EXT3_DELETE_THREAD */ ++ + /* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. +Index: linux-2.4.29/fs/ext3/file.c +=================================================================== +--- linux-2.4.29.orig/fs/ext3/file.c 2005-04-07 19:31:00.000000000 +0300 ++++ linux-2.4.29/fs/ext3/file.c 2005-05-03 15:53:56.902907304 +0300 +@@ -123,7 +123,11 @@ + }; + + struct inode_operations ext3_file_inode_operations = { ++#ifdef EXT3_DELETE_THREAD ++ truncate: ext3_truncate_thread, /* BKL held */ ++#else + truncate: ext3_truncate, /* BKL held */ ++#endif + setattr: ext3_setattr, /* BKL held */ + setxattr: ext3_setxattr, /* BKL held */ + getxattr: ext3_getxattr, /* BKL held */ +Index: linux-2.4.29/fs/ext3/namei.c +=================================================================== +--- linux-2.4.29.orig/fs/ext3/namei.c 2005-05-03 15:53:33.044534328 +0300 ++++ linux-2.4.29/fs/ext3/namei.c 2005-05-03 15:53:56.905906848 +0300 +@@ -838,6 +838,40 @@ + return retval; + } + ++#ifdef EXT3_DELETE_THREAD ++static int ext3_try_to_delay_deletion(struct inode *inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long blocks; ++ ++ if (!test_opt(inode->i_sb, ASYNCDEL)) ++ return 0; ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ blocks = inode->i_blocks >> (inode->i_blkbits - 9); ++ if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS) ++ return 0; ++ ++ inode->i_nlink = 1; ++ atomic_inc(&inode->i_count); ++ ei->i_state |= EXT3_STATE_DELETE; ++ ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&inode->i_devices)); ++ list_add_tail(&inode->i_devices, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++ ++ return 0; ++} ++#else ++#define ext3_try_to_delay_deletion(inode) do {} while (0) ++#endif ++ + static int ext3_unlink(struct inode * dir, struct dentry *dentry) + { + int retval; +@@ -878,8 +912,10 @@ + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; +- if (!inode->i_nlink) ++ if (!inode->i_nlink) { ++ ext3_try_to_delay_deletion(inode); + ext3_orphan_add(handle, inode); ++ } + inode->i_ctime = dir->i_ctime; + ext3_mark_inode_dirty(handle, inode); + retval = 0; +Index: linux-2.4.29/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.29.orig/include/linux/ext3_fs.h 2005-05-03 15:53:37.124914016 +0300 ++++ linux-2.4.29/include/linux/ext3_fs.h 2005-05-03 15:53:56.907906544 +0300 +@@ -188,6 +188,7 @@ + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +@@ -315,6 +316,7 @@ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -639,6 +641,9 @@ + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + extern void ext3_truncate (struct inode *); ++#ifdef EXT3_DELETE_THREAD ++extern void ext3_truncate_thread(struct inode *inode); ++#endif + extern void ext3_set_inode_flags(struct inode *); + + /* ioctl.c */ +Index: linux-2.4.29/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-2.4.29.orig/include/linux/ext3_fs_sb.h 2005-05-03 15:53:33.048533720 +0300 ++++ linux-2.4.29/include/linux/ext3_fs_sb.h 2005-05-03 15:53:56.909906240 +0300 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 8 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -74,6 +76,14 @@ + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; ++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */