Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-delete_thread-2.4.24.patch
1  fs/ext3/file.c             |    4 
2  fs/ext3/inode.c            |  112 ++++++++++++++++++++++
3  fs/ext3/namei.c            |   38 ++++++-
4  fs/ext3/super.c            |  142 ++++++++++++++++++++++++++++-
5  include/linux/ext3_fs.h    |    5 
6  include/linux/ext3_fs_sb.h |   10 +
7  6 files changed, 309 insertions(+), 2 deletions(-)
8 Index: linux-2.4.24/fs/ext3/super.c
9 ===================================================================
10 --- linux-2.4.24.orig/fs/ext3/super.c   2004-01-12 20:36:31.000000000 +0300
11 +++ linux-2.4.24/fs/ext3/super.c        2004-01-13 16:27:43.000000000 +0300
12 @@ -400,6 +400,127 @@
13         }
14  }
15  
16 +#ifdef EXT3_DELETE_THREAD
17 +/*
18 + * Kernel thread body (data is the struct super_block *): drain
19 + * sbi->s_delete_list in a loop, calling iput() on each queued inode.
20 + * Clients just add new inodes onto the end of the list and wake
21 + * s_delete_thread_queue.  If someone is concerned about free space they can
22 + * sleep on s_delete_waiter_queue and be woken up after each pass over the
23 + * list.  The thread returns 0 once it wakes up and finds the list empty.
24 + */
25 +int ext3_delete_thread(void *data)
26 +{
27 +       struct super_block *sb = data;
28 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
29 +       struct task_struct *tsk = current;
30 +
31 +       /* Almost like daemonize, but not quite */
32 +       exit_mm(current);
33 +       tsk->session = 1;
34 +       tsk->pgrp = 1;
35 +       tsk->tty = NULL;
36 +       exit_files(current);
37 +       reparent_to_init();
38 +
39 +       sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
40 +       sigfillset(&tsk->blocked);
41 +
42 +       /*tsk->flags |= PF_KERNTHREAD;*/
43 +
44 +       INIT_LIST_HEAD(&sbi->s_delete_list); /* .next != 0: thread is up */
45 +       wake_up(&sbi->s_delete_waiter_queue); /* release the starter */
46 +       ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
47 +
48 +       /* main loop: sleep until work is queued or ASYNCDEL is cleared */
49 +       for (;;) {
50 +               wait_event_interruptible(sbi->s_delete_thread_queue,
51 +                                        !list_empty(&sbi->s_delete_list) ||
52 +                                        !test_opt(sb, ASYNCDEL));
53 +               ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
54 +                          tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
55 +
56 +               spin_lock(&sbi->s_delete_lock);
57 +               if (list_empty(&sbi->s_delete_list)) { /* no work: exit */
58 +                       clear_opt(sbi->s_mount_opt, ASYNCDEL);
59 +                       memset(&sbi->s_delete_list, 0,
60 +                              sizeof(sbi->s_delete_list)); /* .next = 0 */
61 +                       spin_unlock(&sbi->s_delete_lock);
62 +                       ext3_debug("delete thread on %s exiting\n",
63 +                                  kdevname(sb->s_dev));
64 +                       wake_up(&sbi->s_delete_waiter_queue);
65 +                       break;
66 +               }
67 +
68 +               while (!list_empty(&sbi->s_delete_list)) {
69 +                       struct inode *inode=list_entry(sbi->s_delete_list.next,
70 +                                                      struct inode, i_devices);
71 +                       unsigned long blocks = inode->i_blocks >>
72 +                                                       (inode->i_blkbits - 9);
73 +
74 +                       list_del_init(&inode->i_devices);
75 +                       spin_unlock(&sbi->s_delete_lock); /* not held in iput */
76 +                       ext3_debug("%s delete ino %lu blk %lu\n",
77 +                                  tsk->comm, inode->i_ino, blocks);
78 +
79 +                       J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE);
80 +                       J_ASSERT(inode->i_nlink == 1);
81 +                       inode->i_nlink = 0; /* queuers left it at 1 */
82 +                       iput(inode);
83 +
84 +                       spin_lock(&sbi->s_delete_lock);
85 +                       sbi->s_delete_blocks -= blocks;
86 +                       sbi->s_delete_inodes--;
87 +               }
88 +               if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
89 +                       ext3_warning(sb, __FUNCTION__,
90 +                                    "%lu blocks, %lu inodes on list?\n",
91 +                                    sbi->s_delete_blocks,sbi->s_delete_inodes);
92 +                       sbi->s_delete_blocks = 0;
93 +                       sbi->s_delete_inodes = 0;
94 +               }
95 +               spin_unlock(&sbi->s_delete_lock);
96 +               wake_up(&sbi->s_delete_waiter_queue);
97 +       }
98 +
99 +       return 0;
100 +}
101 +/* Init delete-thread state; with ASYNCDEL set, spawn the thread and wait. */
102 +static void ext3_start_delete_thread(struct super_block *sb)
103 +{
104 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
105 +       int rc;
106 +
107 +       spin_lock_init(&sbi->s_delete_lock);
108 +       init_waitqueue_head(&sbi->s_delete_thread_queue);
109 +       init_waitqueue_head(&sbi->s_delete_waiter_queue);
110 +
111 +       if (!test_opt(sb, ASYNCDEL))
112 +               return;
113 +
114 +       rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
115 +       if (rc < 0) /* NOTE(review): ASYNCDEL is left set on this path */
116 +               printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
117 +                      rc);
118 +       else /* block until the thread has initialised s_delete_list */
119 +               wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
120 +}
121 +/* Clear ASYNCDEL, wake the delete thread and wait until it has exited. */
122 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
123 +{
124 +       if (sbi->s_delete_list.next == 0)       /* thread not running */
125 +               return;
126 +
127 +       clear_opt(sbi->s_mount_opt, ASYNCDEL); /* ends the thread's wait */
128 +       wake_up(&sbi->s_delete_thread_queue);
129 +       wait_event(sbi->s_delete_waiter_queue,
130 +                       sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
131 +}
132 +#else /* !EXT3_DELETE_THREAD: start/stop compile to nothing */
133 +#define ext3_start_delete_thread(sbi) do {} while(0)
134 +#define ext3_stop_delete_thread(sbi) do {} while(0)
135 +#endif /* EXT3_DELETE_THREAD */
136 +
137  void ext3_put_super (struct super_block * sb)
138  {
139         struct ext3_sb_info *sbi = EXT3_SB(sb);
140 @@ -407,6 +529,9 @@
141         kdev_t j_dev = sbi->s_journal->j_dev;
142         int i;
143  
144 +#ifdef EXT3_DELETE_THREAD
145 +       J_ASSERT(sbi->s_delete_inodes == 0);
146 +#endif
147         ext3_xattr_put_super(sb);
148         journal_destroy(sbi->s_journal);
149         if (!(sb->s_flags & MS_RDONLY)) {
150 @@ -527,6 +650,13 @@
151                         clear_opt (*mount_options, XATTR_USER);
152                 else
153  #endif
154 +#ifdef EXT3_DELETE_THREAD
155 +               if (!strcmp(this_char, "asyncdel"))
156 +                       set_opt(*mount_options, ASYNCDEL);
157 +               else if (!strcmp(this_char, "noasyncdel"))
158 +                       clear_opt(*mount_options, ASYNCDEL);
159 +               else
160 +#endif
161                 if (!strcmp (this_char, "bsddf"))
162                         clear_opt (*mount_options, MINIX_DF);
163                 else if (!strcmp (this_char, "nouid32")) {
164 @@ -1227,6 +1357,7 @@
165         }
166  
167         ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
168 +       ext3_start_delete_thread(sb);
169         /*
170          * akpm: core read_super() calls in here with the superblock locked.
171          * That deadlocks, because orphan cleanup needs to lock the superblock
172 @@ -1618,7 +1749,12 @@
173  static int ext3_sync_fs(struct super_block *sb)
174  {
175         tid_t target;
176 -       
177 +
178 +       if (atomic_read(&sb->s_active) == 0) {
179 +               /* fs is being umounted: time to stop delete thread */
180 +               ext3_stop_delete_thread(EXT3_SB(sb));
181 +       }
182 +
183         sb->s_dirt = 0;
184         target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
185         log_wait_commit(EXT3_SB(sb)->s_journal, target);
186 @@ -1682,6 +1818,9 @@
187         if (!parse_options(data, &tmp, sbi, &tmp, 1))
188                 return -EINVAL;
189  
190 +       if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
191 +               ext3_stop_delete_thread(sbi);
192 +
193         if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
194                 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
195  
196 Index: linux-2.4.24/fs/ext3/inode.c
197 ===================================================================
198 --- linux-2.4.24.orig/fs/ext3/inode.c   2004-01-12 20:36:31.000000000 +0300
199 +++ linux-2.4.24/fs/ext3/inode.c        2004-01-12 20:36:32.000000000 +0300
200 @@ -2551,6 +2551,118 @@
201         return err;
202  }
203  
204 +#ifdef EXT3_DELETE_THREAD
205 +/* Truncate hook used when EXT3_DELETE_THREAD is defined: move the block
206 + * pointers (i_data) of a to-be-truncated inode over to a new inode, and let
207 + * the delete thread free that one instead.  This avoids a lot of latency
208 + * when truncating large files.  Only truncates to size 0 are deferred.
209 + * If we have any problem deferring the truncate, just truncate it right away.
210 + * If we defer it, the block count is added to s_delete_blocks, so that the
211 + * statfs data can be kept correct, and we know if we should sleep on the
212 + * delete thread when we run out of space.
213 + */
214 +void ext3_truncate_thread(struct inode *old_inode)
215 +{
216 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
217 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
218 +       struct inode *new_inode;
219 +       handle_t *handle;
220 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
221 +
222 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
223 +               goto out_truncate;
224 +
225 +       /* XXX This is a temporary limitation for code simplicity.
226 +        *     We could truncate to arbitrary sizes at some later time.
227 +        */
228 +       if (old_inode->i_size != 0)
229 +               goto out_truncate;
230 +
231 +       /* We may want to truncate the inode immediately and not defer it */
232 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
233 +           old_inode->i_size > oei->i_disksize)
234 +               goto out_truncate;
235 +
236 +       /* We can't use the delete thread as-is during real orphan recovery,
237 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
238 +        * to loop endlessly.  It would be nice to do so, but needs work.
239 +        */
240 +       if (oei->i_state & EXT3_STATE_DELETE ||
241 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
242 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
243 +                          old_inode->i_ino, blocks);
244 +               goto out_truncate;
245 +       }
246 +
247 +       ext3_discard_prealloc(old_inode);
248 +
249 +       /* old_inode   = 1
250 +        * new_inode   = sb + GDT + ibitmap
251 +        * orphan list = 1 inode/superblock for add, 2 inodes for del
252 +        * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
253 +        */
254 +       handle = ext3_journal_start(old_inode, 7);
255 +       if (IS_ERR(handle))
256 +               goto out_truncate;
257 +
258 +       new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
259 +       if (IS_ERR(new_inode)) {
260 +               ext3_debug("truncate inode %lu directly (no new inodes)\n",
261 +                          old_inode->i_ino);
262 +               goto out_journal;
263 +       }
264 +
265 +       nei = EXT3_I(new_inode);
266 +
267 +       down_write(&oei->truncate_sem); /* hand the blocks over under the sem */
268 +       new_inode->i_size = old_inode->i_size;
269 +       new_inode->i_blocks = old_inode->i_blocks;
270 +       new_inode->i_uid = old_inode->i_uid;
271 +       new_inode->i_gid = old_inode->i_gid;
272 +       new_inode->i_nlink = 1; /* delete thread sets 0 before its iput() */
273 +
274 +       /* FIXME when we do arbitrary truncates */
275 +       old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
276 +       old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
277 +
278 +       memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
279 +       memset(oei->i_data, 0, sizeof(oei->i_data));
280 +
281 +       nei->i_disksize = oei->i_disksize;
282 +       nei->i_state |= EXT3_STATE_DELETE; /* asserted by the delete thread */
283 +       up_write(&oei->truncate_sem);
284 +
285 +       if (ext3_orphan_add(handle, new_inode) < 0)
286 +               goto out_journal; /* NOTE(review): no iput(new_inode) here */
287 +
288 +       if (ext3_orphan_del(handle, old_inode) < 0) {
289 +               ext3_orphan_del(handle, new_inode);
290 +               iput(new_inode);
291 +               goto out_journal;
292 +       }
293 +
294 +       ext3_journal_stop(handle, old_inode);
295 +
296 +       spin_lock(&sbi->s_delete_lock);
297 +       J_ASSERT(list_empty(&new_inode->i_devices));
298 +       list_add_tail(&new_inode->i_devices, &sbi->s_delete_list);
299 +       sbi->s_delete_blocks += blocks;
300 +       sbi->s_delete_inodes++;
301 +       spin_unlock(&sbi->s_delete_lock);
302 +
303 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
304 +                  new_inode->i_ino, blocks);
305 +
306 +       wake_up(&sbi->s_delete_thread_queue);
307 +       return;
308 +
309 +out_journal:
310 +       ext3_journal_stop(handle, old_inode);
311 +out_truncate:
312 +       ext3_truncate(old_inode);
313 +}
314 +#endif /* EXT3_DELETE_THREAD */
315 +
316  /* 
317   * On success, We end up with an outstanding reference count against
318   * iloc->bh.  This _must_ be cleaned up later. 
319 Index: linux-2.4.24/fs/ext3/file.c
320 ===================================================================
321 --- linux-2.4.24.orig/fs/ext3/file.c    2004-01-12 20:36:29.000000000 +0300
322 +++ linux-2.4.24/fs/ext3/file.c 2004-01-12 20:36:32.000000000 +0300
323 @@ -126,7 +126,11 @@
324  };
325  
326  struct inode_operations ext3_file_inode_operations = {
327 +#ifdef EXT3_DELETE_THREAD
328 +       truncate:       ext3_truncate_thread,   /* BKL held */
329 +#else
330         truncate:       ext3_truncate,          /* BKL held */
331 +#endif
332         setattr:        ext3_setattr,           /* BKL held */
333         setxattr:       ext3_setxattr,          /* BKL held */
334         getxattr:       ext3_getxattr,          /* BKL held */
335 Index: linux-2.4.24/fs/ext3/namei.c
336 ===================================================================
337 --- linux-2.4.24.orig/fs/ext3/namei.c   2004-01-12 20:36:31.000000000 +0300
338 +++ linux-2.4.24/fs/ext3/namei.c        2004-01-12 20:36:32.000000000 +0300
339 @@ -1936,6 +1936,40 @@
340         return retval;
341  }
342  
343 +#ifdef EXT3_DELETE_THREAD
344 +/* Queue a just-unlinked inode for the delete thread; always returns 0. */
345 +static int ext3_try_to_delay_deletion(struct inode *inode)
346 +{
347 +       struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
348 +       struct ext3_inode_info *ei = EXT3_I(inode);
349 +       unsigned long blocks;
350 +
351 +       /* Not enabled, or delete thread not running (list head is zeroed) */
352 +       if (!test_opt(inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
353 +               return 0;
354 +
355 +       /* We may want to delete the inode immediately and not defer it */
356 +       blocks = inode->i_blocks >> (inode->i_blkbits - 9);
357 +       if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS)
358 +               return 0;
359 +
360 +       inode->i_nlink = 1; /* delete thread sets 0 before its iput() */
361 +       atomic_inc(&inode->i_count); /* reference owned by the delete list */
362 +       ei->i_state |= EXT3_STATE_DELETE;
363 +
364 +       spin_lock(&sbi->s_delete_lock);
365 +       J_ASSERT(list_empty(&inode->i_devices));
366 +       list_add_tail(&inode->i_devices, &sbi->s_delete_list);
367 +       sbi->s_delete_blocks += blocks;
368 +       sbi->s_delete_inodes++;
369 +       spin_unlock(&sbi->s_delete_lock);
370 +       wake_up(&sbi->s_delete_thread_queue);
371 +       return 0;
372 +}
373 +#else /* !EXT3_DELETE_THREAD: unlink never defers the delete */
374 +#define ext3_try_to_delay_deletion(inode) do {} while (0)
375 +#endif /* EXT3_DELETE_THREAD */
376 +
377  static int ext3_unlink(struct inode * dir, struct dentry *dentry)
378  {
379         int retval;
380 @@ -1977,8 +2007,10 @@
381         ext3_update_dx_flag(dir);
382         ext3_mark_inode_dirty(handle, dir);
383         inode->i_nlink--;
384 -       if (!inode->i_nlink)
385 +       if (!inode->i_nlink) {
386 +               ext3_try_to_delay_deletion(inode);
387                 ext3_orphan_add(handle, inode);
388 +       }
389         inode->i_ctime = dir->i_ctime;
390         ext3_mark_inode_dirty(handle, inode);
391         retval = 0;
392 Index: linux-2.4.24/include/linux/ext3_fs.h
393 ===================================================================
394 --- linux-2.4.24.orig/include/linux/ext3_fs.h   2004-01-12 20:36:31.000000000 +0300
395 +++ linux-2.4.24/include/linux/ext3_fs.h        2004-01-12 20:36:32.000000000 +0300
396 @@ -193,6 +193,7 @@
397   */
398  #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
399  #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
400 +#define EXT3_STATE_DELETE              0x00000010 /* deferred delete inode */
401  
402  /*
403   * ioctl commands
404 @@ -320,6 +321,7 @@
405  #define EXT3_MOUNT_UPDATE_JOURNAL      0x1000  /* Update the journal format */
406  #define EXT3_MOUNT_NO_UID32            0x2000  /* Disable 32-bit UIDs */
407  #define EXT3_MOUNT_XATTR_USER          0x4000  /* Extended user attributes */
408 +#define EXT3_MOUNT_ASYNCDEL            0x20000 /* Delayed deletion */
409  
410  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
411  #ifndef _LINUX_EXT2_FS_H
412 @@ -697,6 +699,9 @@
413  extern void ext3_dirty_inode(struct inode *);
414  extern int ext3_change_inode_journal_flag(struct inode *, int);
415  extern void ext3_truncate (struct inode *);
416 +#ifdef EXT3_DELETE_THREAD
417 +extern void ext3_truncate_thread(struct inode *inode);
418 +#endif
419  extern void ext3_set_inode_flags(struct inode *);
420  
421  /* ioctl.c */
422 Index: linux-2.4.24/include/linux/ext3_fs_sb.h
423 ===================================================================
424 --- linux-2.4.24.orig/include/linux/ext3_fs_sb.h        2004-01-12 20:36:31.000000000 +0300
425 +++ linux-2.4.24/include/linux/ext3_fs_sb.h     2004-01-12 20:36:32.000000000 +0300
426 @@ -29,6 +29,8 @@
427  
428  #define EXT3_MAX_GROUP_LOADED  8
429  
430 +#define EXT3_DELETE_THREAD
431 +
432  /*
433   * third extended-fs super-block data in memory
434   */
435 @@ -76,6 +78,14 @@
436         struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
437         wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
438  #endif
439 +#ifdef EXT3_DELETE_THREAD
440 +       spinlock_t s_delete_lock;
441 +       struct list_head s_delete_list;
442 +       unsigned long s_delete_blocks;
443 +       unsigned long s_delete_inodes;
444 +       wait_queue_head_t s_delete_thread_queue;
445 +       wait_queue_head_t s_delete_waiter_queue;
446 +#endif
447  };
448  
449  #endif /* _LINUX_EXT3_FS_SB */