Whamcloud - gitweb
land b1_5 onto HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-delete_thread-2.4.29.patch
diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.29.patch
new file mode 100644 (file)
index 0000000..39c47a7
--- /dev/null
@@ -0,0 +1,442 @@
+Index: linux-2.4.29/fs/ext3/super.c
+===================================================================
+--- linux-2.4.29.orig/fs/ext3/super.c  2005-05-03 15:53:33.047533872 +0300
++++ linux-2.4.29/fs/ext3/super.c       2005-05-03 15:54:47.192262160 +0300
+@@ -400,6 +400,127 @@
+       }
+ }
++#ifdef EXT3_DELETE_THREAD
++/*
++ * Delete inodes in a loop until there are no more to be deleted.
++ * Normally, we run in the background doing the deletes and sleeping again,
++ * and clients just add new inodes to be deleted onto the end of the list.
++ * If someone is concerned about free space (e.g. block allocation or similar)
++ * then they can sleep on s_delete_waiter_queue and be woken up when space
++ * has been freed.
++ */
++int ext3_delete_thread(void *data)
++{
++      struct super_block *sb = data;
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      struct task_struct *tsk = current;
++
++      /* Almost like daemonize, but not quite */
++      exit_mm(current);
++      tsk->session = 1;
++      tsk->pgrp = 1;
++      tsk->tty = NULL;
++      exit_files(current);
++      reparent_to_init();
++
++      sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
++      sigfillset(&tsk->blocked);
++
++      /*tsk->flags |= PF_KERNTHREAD;*/
++
++      INIT_LIST_HEAD(&sbi->s_delete_list);
++      wake_up(&sbi->s_delete_waiter_queue);
++      ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
++
++      /* main loop */
++      for (;;) {
++              wait_event_interruptible(sbi->s_delete_thread_queue,
++                                       !list_empty(&sbi->s_delete_list) ||
++                                       !test_opt(sb, ASYNCDEL));
++              ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
++                         tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
++
++              spin_lock(&sbi->s_delete_lock);
++              if (list_empty(&sbi->s_delete_list)) {
++                      clear_opt(sbi->s_mount_opt, ASYNCDEL);
++                      memset(&sbi->s_delete_list, 0,
++                             sizeof(sbi->s_delete_list));
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("delete thread on %s exiting\n",
++                                 kdevname(sb->s_dev));
++                      wake_up(&sbi->s_delete_waiter_queue);
++                      break;
++              }
++
++              while (!list_empty(&sbi->s_delete_list)) {
++                      struct inode *inode=list_entry(sbi->s_delete_list.next,
++                                                     struct inode, i_devices);
++                      unsigned long blocks = inode->i_blocks >>
++                                                      (inode->i_blkbits - 9);
++
++                      list_del_init(&inode->i_devices);
++                      spin_unlock(&sbi->s_delete_lock);
++                      ext3_debug("%s delete ino %lu blk %lu\n",
++                                 tsk->comm, inode->i_ino, blocks);
++
++                      J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE);
++                      J_ASSERT(inode->i_nlink == 1);
++                      inode->i_nlink = 0;
++                      iput(inode);
++
++                      spin_lock(&sbi->s_delete_lock);
++                      sbi->s_delete_blocks -= blocks;
++                      sbi->s_delete_inodes--;
++              }
++              if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
++                      ext3_warning(sb, __FUNCTION__,
++                                   "%lu blocks, %lu inodes on list?\n",
++                                   sbi->s_delete_blocks,sbi->s_delete_inodes);
++                      sbi->s_delete_blocks = 0;
++                      sbi->s_delete_inodes = 0;
++              }
++              spin_unlock(&sbi->s_delete_lock);
++              wake_up(&sbi->s_delete_waiter_queue);
++      }
++
++      return 0;
++}
++
++static void ext3_start_delete_thread(struct super_block *sb)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(sb);
++      int rc;
++
++      spin_lock_init(&sbi->s_delete_lock);
++      init_waitqueue_head(&sbi->s_delete_thread_queue);
++      init_waitqueue_head(&sbi->s_delete_waiter_queue);
++
++      if (!test_opt(sb, ASYNCDEL))
++              return;
++
++      rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
++      if (rc < 0)
++              printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
++                     rc);
++      else
++              wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
++}
++
++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
++{
++      if (sbi->s_delete_list.next == 0)       /* thread never started */
++              return;
++
++      clear_opt(sbi->s_mount_opt, ASYNCDEL);
++      wake_up(&sbi->s_delete_thread_queue);
++      wait_event(sbi->s_delete_waiter_queue,
++                      sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
++}
++#else
++#define ext3_start_delete_thread(sbi) do {} while(0)
++#define ext3_stop_delete_thread(sbi) do {} while(0)
++#endif /* EXT3_DELETE_THREAD */
++
+ void ext3_put_super (struct super_block * sb)
+ {
+       struct ext3_sb_info *sbi = EXT3_SB(sb);
+@@ -407,6 +528,9 @@
+       kdev_t j_dev = sbi->s_journal->j_dev;
+       int i;
++#ifdef EXT3_DELETE_THREAD
++      J_ASSERT(sbi->s_delete_inodes == 0);
++#endif
+       ext3_xattr_put_super(sb);
+       journal_destroy(sbi->s_journal);
+       if (!(sb->s_flags & MS_RDONLY)) {
+@@ -526,6 +650,13 @@
+                       clear_opt (*mount_options, XATTR_USER);
+               else
+ #endif
++#ifdef EXT3_DELETE_THREAD
++              if (!strcmp(this_char, "asyncdel"))
++                      set_opt(*mount_options, ASYNCDEL);
++              else if (!strcmp(this_char, "noasyncdel"))
++                      clear_opt(*mount_options, ASYNCDEL);
++              else
++#endif
+               if (!strcmp (this_char, "bsddf"))
+                       clear_opt (*mount_options, MINIX_DF);
+               else if (!strcmp (this_char, "nouid32")) {
+@@ -1244,6 +1375,7 @@
+       }
+       ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
++      ext3_start_delete_thread(sb);
+       EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
+       ext3_orphan_cleanup(sb, es);
+       EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
+@@ -1626,7 +1758,12 @@
+ static int ext3_sync_fs(struct super_block *sb)
+ {
+       tid_t target;
+-      
++
++      if (atomic_read(&sb->s_active) == 0) {
++              /* fs is being umounted: time to stop delete thread */
++              ext3_stop_delete_thread(EXT3_SB(sb));
++      }
++
+       sb->s_dirt = 0;
+       target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+       log_wait_commit(EXT3_SB(sb)->s_journal, target);
+@@ -1690,6 +1827,9 @@
+       if (!parse_options(data, &tmp, sbi, &tmp, 1))
+               return -EINVAL;
++      if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
++              ext3_stop_delete_thread(sbi);
++
+       if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+               ext3_abort(sb, __FUNCTION__, "Abort forced by user");
+Index: linux-2.4.29/fs/ext3/inode.c
+===================================================================
+--- linux-2.4.29.orig/fs/ext3/inode.c  2005-05-03 15:53:36.555000656 +0300
++++ linux-2.4.29/fs/ext3/inode.c       2005-05-03 15:53:56.901907456 +0300
+@@ -2562,6 +2562,118 @@
+       return err;
+ }
++#ifdef EXT3_DELETE_THREAD
++/* Move blocks from to-be-truncated inode over to a new inode, and delete
++ * that one from the delete thread instead.  This avoids a lot of latency
++ * when truncating large files.
++ *
++ * If we have any problem deferring the truncate, just truncate it right away.
++ * If we defer it, we also mark how many blocks it would free, so that we
++ * can keep the statfs data correct, and we know if we should sleep on the
++ * delete thread when we run out of space.
++ */
++void ext3_truncate_thread(struct inode *old_inode)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
++      struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
++      struct inode *new_inode;
++      handle_t *handle;
++      unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
++
++      if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
++              goto out_truncate;
++
++      /* XXX This is a temporary limitation for code simplicity.
++       *     We could truncate to arbitrary sizes at some later time.
++       */
++      if (old_inode->i_size != 0)
++              goto out_truncate;
++
++      /* We may want to truncate the inode immediately and not defer it */
++      if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
++          old_inode->i_size > oei->i_disksize)
++              goto out_truncate;
++
++      /* We can't use the delete thread as-is during real orphan recovery,
++       * as we add to the orphan list here, causing ext3_orphan_cleanup()
++       * to loop endlessly.  It would be nice to do so, but needs work.
++       */
++      if (oei->i_state & EXT3_STATE_DELETE ||
++          sbi->s_mount_state & EXT3_ORPHAN_FS) {
++              ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
++                         old_inode->i_ino, blocks);
++              goto out_truncate;
++      }
++
++      ext3_discard_prealloc(old_inode);
++
++      /* old_inode   = 1
++       * new_inode   = sb + GDT + ibitmap
++       * orphan list = 1 inode/superblock for add, 2 inodes for del
++       * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
++       */
++      handle = ext3_journal_start(old_inode, 7);
++      if (IS_ERR(handle))
++              goto out_truncate;
++
++      new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
++      if (IS_ERR(new_inode)) {
++              ext3_debug("truncate inode %lu directly (no new inodes)\n",
++                         old_inode->i_ino);
++              goto out_journal;
++      }
++
++      nei = EXT3_I(new_inode);
++
++      down_write(&oei->truncate_sem);
++      new_inode->i_size = old_inode->i_size;
++      new_inode->i_blocks = old_inode->i_blocks;
++      new_inode->i_uid = old_inode->i_uid;
++      new_inode->i_gid = old_inode->i_gid;
++      new_inode->i_nlink = 1;
++
++      /* FIXME when we do arbitrary truncates */
++      old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
++      old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
++
++      memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
++      memset(oei->i_data, 0, sizeof(oei->i_data));
++
++      nei->i_disksize = oei->i_disksize;
++      nei->i_state |= EXT3_STATE_DELETE;
++      up_write(&oei->truncate_sem);
++
++      if (ext3_orphan_add(handle, new_inode) < 0)
++              goto out_journal;
++
++      if (ext3_orphan_del(handle, old_inode) < 0) {
++              ext3_orphan_del(handle, new_inode);
++              iput(new_inode);
++              goto out_journal;
++      }
++
++      ext3_journal_stop(handle, old_inode);
++
++      spin_lock(&sbi->s_delete_lock);
++      J_ASSERT(list_empty(&new_inode->i_devices));
++      list_add_tail(&new_inode->i_devices, &sbi->s_delete_list);
++      sbi->s_delete_blocks += blocks;
++      sbi->s_delete_inodes++;
++      spin_unlock(&sbi->s_delete_lock);
++
++      ext3_debug("delete inode %lu (%lu blocks) by thread\n",
++                 new_inode->i_ino, blocks);
++
++      wake_up(&sbi->s_delete_thread_queue);
++      return;
++
++out_journal:
++      ext3_journal_stop(handle, old_inode);
++out_truncate:
++      ext3_truncate(old_inode);
++}
++#endif /* EXT3_DELETE_THREAD */
++
+ /* 
+  * On success, We end up with an outstanding reference count against
+  * iloc->bh.  This _must_ be cleaned up later. 
+Index: linux-2.4.29/fs/ext3/file.c
+===================================================================
+--- linux-2.4.29.orig/fs/ext3/file.c   2005-04-07 19:31:00.000000000 +0300
++++ linux-2.4.29/fs/ext3/file.c        2005-05-03 15:53:56.902907304 +0300
+@@ -123,7 +123,11 @@
+ };
+ struct inode_operations ext3_file_inode_operations = {
++#ifdef EXT3_DELETE_THREAD
++      truncate:       ext3_truncate_thread,   /* BKL held */
++#else
+       truncate:       ext3_truncate,          /* BKL held */
++#endif
+       setattr:        ext3_setattr,           /* BKL held */
+       setxattr:       ext3_setxattr,          /* BKL held */
+       getxattr:       ext3_getxattr,          /* BKL held */
+Index: linux-2.4.29/fs/ext3/namei.c
+===================================================================
+--- linux-2.4.29.orig/fs/ext3/namei.c  2005-05-03 15:53:33.044534328 +0300
++++ linux-2.4.29/fs/ext3/namei.c       2005-05-03 15:53:56.905906848 +0300
+@@ -838,6 +838,40 @@
+       return retval;
+ }
++#ifdef EXT3_DELETE_THREAD
++static int ext3_try_to_delay_deletion(struct inode *inode)
++{
++      struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
++      struct ext3_inode_info *ei = EXT3_I(inode);
++      unsigned long blocks;
++
++      if (!test_opt(inode->i_sb, ASYNCDEL))
++              return 0;
++
++      /* We may want to delete the inode immediately and not defer it */
++      blocks = inode->i_blocks >> (inode->i_blkbits - 9);
++      if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS)
++              return 0;
++
++      inode->i_nlink = 1;
++      atomic_inc(&inode->i_count);
++      ei->i_state |= EXT3_STATE_DELETE;
++
++      spin_lock(&sbi->s_delete_lock);
++      J_ASSERT(list_empty(&inode->i_devices));
++      list_add_tail(&inode->i_devices, &sbi->s_delete_list);
++      sbi->s_delete_blocks += blocks;
++      sbi->s_delete_inodes++;
++      spin_unlock(&sbi->s_delete_lock);
++
++      wake_up(&sbi->s_delete_thread_queue);
++
++      return 0;
++}
++#else
++#define ext3_try_to_delay_deletion(inode) do {} while (0)
++#endif
++
+ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
+ {
+       int retval;
+@@ -878,8 +912,10 @@
+       dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+       ext3_mark_inode_dirty(handle, dir);
+       inode->i_nlink--;
+-      if (!inode->i_nlink)
++      if (!inode->i_nlink) {
++              ext3_try_to_delay_deletion(inode);
+               ext3_orphan_add(handle, inode);
++      }
+       inode->i_ctime = dir->i_ctime;
+       ext3_mark_inode_dirty(handle, inode);
+       retval = 0;
+Index: linux-2.4.29/include/linux/ext3_fs.h
+===================================================================
+--- linux-2.4.29.orig/include/linux/ext3_fs.h  2005-05-03 15:53:37.124914016 +0300
++++ linux-2.4.29/include/linux/ext3_fs.h       2005-05-03 15:53:56.907906544 +0300
+@@ -188,6 +188,7 @@
+  */
+ #define EXT3_STATE_JDATA              0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW                        0x00000002 /* inode is newly created */
++#define EXT3_STATE_DELETE             0x00000010 /* deferred delete inode */
+ /*
+  * ioctl commands
+@@ -315,6 +316,7 @@
+ #define EXT3_MOUNT_UPDATE_JOURNAL     0x1000  /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32           0x2000  /* Disable 32-bit UIDs */
+ #define EXT3_MOUNT_XATTR_USER         0x4000  /* Extended user attributes */
++#define EXT3_MOUNT_ASYNCDEL           0x20000 /* Delayed deletion */
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -639,6 +641,9 @@
+ extern void ext3_dirty_inode(struct inode *);
+ extern int ext3_change_inode_journal_flag(struct inode *, int);
+ extern void ext3_truncate (struct inode *);
++#ifdef EXT3_DELETE_THREAD
++extern void ext3_truncate_thread(struct inode *inode);
++#endif
+ extern void ext3_set_inode_flags(struct inode *);
+ /* ioctl.c */
+Index: linux-2.4.29/include/linux/ext3_fs_sb.h
+===================================================================
+--- linux-2.4.29.orig/include/linux/ext3_fs_sb.h       2005-05-03 15:53:33.048533720 +0300
++++ linux-2.4.29/include/linux/ext3_fs_sb.h    2005-05-03 15:53:56.909906240 +0300
+@@ -29,6 +29,8 @@
+ #define EXT3_MAX_GROUP_LOADED 8
++#define EXT3_DELETE_THREAD
++
+ /*
+  * third extended-fs super-block data in memory
+  */
+@@ -74,6 +76,14 @@
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+ #endif
++#ifdef EXT3_DELETE_THREAD
++      spinlock_t s_delete_lock;
++      struct list_head s_delete_list;
++      unsigned long s_delete_blocks;
++      unsigned long s_delete_inodes;
++      wait_queue_head_t s_delete_thread_queue;
++      wait_queue_head_t s_delete_waiter_queue;
++#endif
+ };
+ #endif        /* _LINUX_EXT3_FS_SB */