Whamcloud - gitweb
b=3119
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3_delete_thread_2.4.20_chaos.patch
1  fs/ext3/file.c             |    4 
2  fs/ext3/inode.c            |  116 ++++++++++++++++++++++
3  fs/ext3/super.c            |  230 +++++++++++++++++++++++++++++++++++++++++++++
4  include/linux/ext3_fs.h    |    5 
5  include/linux/ext3_fs_sb.h |   10 +
6  5 files changed, 365 insertions(+)
7
8 Index: linux-2.4.20-rh-20.9/fs/ext3/super.c
9 ===================================================================
10 --- linux-2.4.20-rh-20.9.orig/fs/ext3/super.c   2004-01-12 19:27:46.000000000 +0300
11 +++ linux-2.4.20-rh-20.9/fs/ext3/super.c        2004-01-13 17:20:31.000000000 +0300
12 @@ -400,6 +400,221 @@
13         }
14  }
15  
16 +#ifdef EXT3_DELETE_THREAD
17 +/*
18 + * Per-superblock delete thread; @data is the struct super_block to serve.
19 + * Normally, we run in the background doing the deletes and sleeping again,
20 + * and clients just add new inodes to be deleted onto the end of the list.
21 + * If someone is concerned about free space (e.g. block allocation or similar)
22 + * then they can sleep on s_delete_waiter_queue and be woken up after each
23 + * pass.  Returns 0 once woken with an empty list (ASYNCDEL was cleared).
24 + */
25 +int ext3_delete_thread(void *data)
26 +{
27 +       struct super_block *sb = data;
28 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
29 +       struct task_struct *tsk = current;
30 +
31 +       /* Drop the spawner's mm, tty and files: like daemonize, but not quite */
32 +       exit_mm(current);
33 +       tsk->session = 1;
34 +       tsk->pgrp = 1;
35 +       tsk->tty = NULL;
36 +       exit_files(current);
37 +       reparent_to_init();
38 +
39 +       sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
40 +       sigfillset(&tsk->blocked);
41 +
42 +       /*tsk->flags |= PF_KERNTHREAD;*/
43 +
44 +       INIT_LIST_HEAD(&sbi->s_delete_list); /* non-NULL ->next == "thread is up" */
45 +       wake_up(&sbi->s_delete_waiter_queue); /* release ext3_start_delete_thread() */
46 +       ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
47 +
48 +       /* main loop: drain the whole list each time we are woken */
49 +       for (;;) {
50 +               wait_event_interruptible(sbi->s_delete_thread_queue,
51 +                                        !list_empty(&sbi->s_delete_list) ||
52 +                                        !test_opt(sb, ASYNCDEL)); /* NOTE(review): result ignored */
53 +               ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
54 +                          tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
55 +
56 +               spin_lock(&sbi->s_delete_lock);
57 +               if (list_empty(&sbi->s_delete_list)) { /* nothing queued: shut down */
58 +                       clear_opt(sbi->s_mount_opt, ASYNCDEL);
59 +                       memset(&sbi->s_delete_list, 0,
60 +                              sizeof(sbi->s_delete_list)); /* ->next == 0: thread gone */
61 +                       spin_unlock(&sbi->s_delete_lock);
62 +                       ext3_debug("delete thread on %s exiting\n",
63 +                                  kdevname(sb->s_dev));
64 +                       wake_up(&sbi->s_delete_waiter_queue);
65 +                       break;
66 +               }
67 +
68 +               while (!list_empty(&sbi->s_delete_list)) {
69 +                       struct inode *inode=list_entry(sbi->s_delete_list.next,
70 +                                                      struct inode, i_dentry); /* queued via i_dentry */
71 +                       unsigned long blocks = inode->i_blocks >>
72 +                                                       (inode->i_blkbits - 9);
73 +
74 +                       list_del_init(&inode->i_dentry);
75 +                       spin_unlock(&sbi->s_delete_lock); /* iput() below runs unlocked */
76 +                       ext3_debug("%s delete ino %lu blk %lu\n",
77 +                                  tsk->comm, inode->i_ino, blocks);
78 +
79 +                       iput(inode);
80 +
81 +                       spin_lock(&sbi->s_delete_lock);
82 +                       sbi->s_delete_blocks -= blocks;
83 +                       sbi->s_delete_inodes--;
84 +               }
85 +               if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { /* list is empty here */
86 +                       ext3_warning(sb, __FUNCTION__,
87 +                                    "%lu blocks, %lu inodes on list?\n",
88 +                                    sbi->s_delete_blocks,sbi->s_delete_inodes);
89 +                       sbi->s_delete_blocks = 0;
90 +                       sbi->s_delete_inodes = 0;
91 +               }
92 +               spin_unlock(&sbi->s_delete_lock);
93 +               wake_up(&sbi->s_delete_waiter_queue); /* tell waiters a pass has finished */
94 +       }
95 +
96 +       return 0;
97 +}
98 +
99 +static void ext3_start_delete_thread(struct super_block *sb) /* init state; spawn thread if ASYNCDEL */
100 +{
101 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
102 +       int rc;
103 +
104 +       spin_lock_init(&sbi->s_delete_lock); /* lock and queues are set up even without ASYNCDEL */
105 +       init_waitqueue_head(&sbi->s_delete_thread_queue);
106 +       init_waitqueue_head(&sbi->s_delete_waiter_queue);
107 +
108 +       if (!test_opt(sb, ASYNCDEL)) /* option off: s_delete_list is left untouched */
109 +               return;
110 +
111 +       rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
112 +       if (rc < 0)
113 +               printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
114 +                      rc);
115 +       else /* wait for the thread's INIT_LIST_HEAD; NOTE(review): assumes ->next began NULL */
116 +               wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
117 +}
118 +
119 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) /* ask the thread to exit, then wait */
120 +{
121 +       if (sbi->s_delete_list.next == 0)       /* thread never started, or already exited */
122 +               return;
123 +
124 +       clear_opt(sbi->s_mount_opt, ASYNCDEL); /* satisfies the thread's wait condition */
125 +       wake_up(&sbi->s_delete_thread_queue);
126 +       wait_event(sbi->s_delete_waiter_queue, /* thread zeroes the list head as it exits */
127 +                       sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
128 +}
129 +
130 +/* Instead of playing games with the inode flags, destruction, etc we just
131 + * get a second in-core inode (iget) and put it on a list for the delete thread.
132 + * We need large parts of the inode struct in order to complete the
133 + * truncate and unlink, so we may as well just have a real inode to do it.
134 + * @old_inode is the (already unhashed) inode passed to the delete_inode hook.
135 + * If we have any problem deferring the delete, just delete it right away.
136 + * If we defer it, we also mark how many blocks it would free, so that we
137 + * can keep the statfs data correct, and we know if we should sleep on the
138 + * delete thread when we run out of space.
139 + */
140 +static void ext3_delete_inode_thread(struct inode *old_inode)
141 +{
142 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
143 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
144 +       struct inode *new_inode;
145 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
146 +
147 +       if (is_bad_inode(old_inode)) {
148 +               clear_inode(old_inode);
149 +               return;
150 +       }
151 +
152 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) /* no thread running */
153 +               goto out_delete;
154 +
155 +       /* We may want to delete the inode immediately and not defer it */
156 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
157 +               goto out_delete;
158 +
159 +       /* We can't use the delete thread as-is during real orphan recovery,
160 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
161 +        * to loop endlessly.  It would be nice to do so, but needs work.
162 +        */
163 +       if (oei->i_state & EXT3_STATE_DELETE || /* set on the copy we queued earlier */
164 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
165 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
166 +                          old_inode->i_ino, blocks);
167 +               goto out_delete;
168 +       }
169 +
170 +       /* We can iget this inode again here, because our caller has unhashed
171 +        * old_inode, so new_inode will be in a different inode struct.
172 +        *
173 +        * We need to ensure that the i_orphan pointers in the other inodes
174 +        * point at the new inode copy instead of the old one so the orphan
175 +        * list doesn't get corrupted when the old orphan inode is freed.
176 +        */
177 +       down(&sbi->s_orphan_lock);
178 +
179 +       sbi->s_mount_state |= EXT3_ORPHAN_FS; /* NOTE(review): set only around iget(); reason not visible here */
180 +       new_inode = iget(old_inode->i_sb, old_inode->i_ino);
181 +       sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
182 +       if (new_inode && is_bad_inode(new_inode)) { /* iget() may return NULL: test before deref */
183 +               printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
184 +               iput(new_inode);
185 +               new_inode = NULL;
186 +       }
187 +       if (!new_inode) { /* no usable copy: fall back to a synchronous delete */
188 +               up(&sbi->s_orphan_lock);
189 +               ext3_debug("delete inode %lu directly (bad read)\n",
190 +                          old_inode->i_ino);
191 +               goto out_delete;
192 +       }
193 +       J_ASSERT(new_inode != old_inode);
194 +
195 +       J_ASSERT(!list_empty(&oei->i_orphan));
196 +
197 +       nei = EXT3_I(new_inode);
198 +       /* Ugh.  We need to insert new_inode into the same spot on the list
199 +        * as old_inode was, to ensure the in-memory orphan list is still
200 +        * in the same order as the on-disk orphan list (badness otherwise).
201 +        */
202 +       nei->i_orphan = oei->i_orphan; /* copy links, then repoint both neighbours */
203 +       nei->i_orphan.next->prev = &nei->i_orphan;
204 +       nei->i_orphan.prev->next = &nei->i_orphan;
205 +       nei->i_state |= EXT3_STATE_DELETE; /* makes the thread's iput() take out_delete above */
206 +       up(&sbi->s_orphan_lock);
207 +
208 +       clear_inode(old_inode);
209 +
210 +       spin_lock(&sbi->s_delete_lock);
211 +       J_ASSERT(list_empty(&new_inode->i_dentry));
212 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); /* i_dentry reused as list link */
213 +       sbi->s_delete_blocks += blocks;
214 +       sbi->s_delete_inodes++;
215 +       spin_unlock(&sbi->s_delete_lock);
216 +
217 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
218 +                  new_inode->i_ino, blocks);
219 +
220 +       wake_up(&sbi->s_delete_thread_queue);
221 +       return;
222 +
223 +out_delete:
224 +       ext3_delete_inode(old_inode);
225 +}
226 +#else /* !EXT3_DELETE_THREAD: start/stop expand to empty statements */
227 +#define ext3_start_delete_thread(sbi) do {} while(0)
228 +#define ext3_stop_delete_thread(sbi) do {} while(0)
229 +#endif /* EXT3_DELETE_THREAD */
230 +
231  void ext3_put_super (struct super_block * sb)
232  {
233         struct ext3_sb_info *sbi = EXT3_SB(sb);
234 @@ -407,6 +622,7 @@
235         kdev_t j_dev = sbi->s_journal->j_dev;
236         int i;
237  
238 +       J_ASSERT(sbi->s_delete_inodes == 0);
239         ext3_xattr_put_super(sb);
240         journal_destroy(sbi->s_journal);
241         if (!(sb->s_flags & MS_RDONLY)) {
242 @@ -455,7 +671,11 @@
243         write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
244         dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
245         put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
246 +#ifdef EXT3_DELETE_THREAD
247 +       delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
248 +#else
249         delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
250 +#endif
251         put_super:      ext3_put_super,         /* BKL held */
252         write_super:    ext3_write_super,       /* BKL held */
253         sync_fs:        ext3_sync_fs,
254 @@ -524,6 +744,13 @@
255                         clear_opt (*mount_options, XATTR_USER);
256                 else
257  #endif
258 +#ifdef EXT3_DELETE_THREAD
259 +               if (!strcmp(this_char, "asyncdel"))
260 +                       set_opt(*mount_options, ASYNCDEL);
261 +               else if (!strcmp(this_char, "noasyncdel"))
262 +                       clear_opt(*mount_options, ASYNCDEL);
263 +               else
264 +#endif
265                 if (!strcmp (this_char, "bsddf"))
266                         clear_opt (*mount_options, MINIX_DF);
267                 else if (!strcmp (this_char, "nouid32")) {
268 @@ -1223,6 +1450,7 @@
269         }
270  
271         ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
272 +       ext3_start_delete_thread(sb);
273         /*
274          * akpm: core read_super() calls in here with the superblock locked.
275          * That deadlocks, because orphan cleanup needs to lock the superblock
276 @@ -1614,7 +1842,12 @@
277  static int ext3_sync_fs(struct super_block *sb)
278  {
279         tid_t target;
280 -       
281 +
282 +       if (atomic_read(&sb->s_active) == 0) {
283 +               /* fs is being umounted: time to stop delete thread */
284 +               ext3_stop_delete_thread(EXT3_SB(sb));
285 +       }
286 +
287         sb->s_dirt = 0;
288         target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
289         log_wait_commit(EXT3_SB(sb)->s_journal, target);
290 @@ -1678,6 +1911,9 @@
291         if (!parse_options(data, &tmp, sbi, &tmp, 1))
292                 return -EINVAL;
293  
294 +       if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
295 +               ext3_stop_delete_thread(sbi);
296 +
297         if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
298                 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
299  
300 Index: linux-2.4.20-rh-20.9/fs/ext3/inode.c
301 ===================================================================
302 --- linux-2.4.20-rh-20.9.orig/fs/ext3/inode.c   2004-01-12 19:27:46.000000000 +0300
303 +++ linux-2.4.20-rh-20.9/fs/ext3/inode.c        2004-01-13 17:15:48.000000000 +0300
304 @@ -2017,6 +2017,122 @@
305         ext3_journal_stop(handle, inode);
306  }
307  
308 +#ifdef EXT3_DELETE_THREAD
309 +/* Move blocks from to-be-truncated inode over to a new inode, and delete
310 + * that one from the delete thread instead.  This avoids a lot of latency
311 + * when truncating large files.  Only truncates to size 0 are deferred.
312 + *
313 + * If we have any problem deferring the truncate, just truncate it right away.
314 + * If we defer it, we also mark how many blocks it would free, so that we
315 + * can keep the statfs data correct, and we know if we should sleep on the
316 + * delete thread when we run out of space.
317 + *
318 + * During normal filesystem usage, we are always called here with a
319 + * transaction already started.  The only time ext3_truncate is called
320 + * without a started transaction is from ext3_orphan_cleanup(), and we
321 + * currently just do a direct truncate in that case.
322 + */
323 +void ext3_truncate_thread(struct inode *old_inode)
324 +{
325 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
326 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
327 +       struct inode *new_inode;
328 +       handle_t *handle;
329 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
330 +
331 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next) /* no thread running */
332 +               goto out_truncate;
333 +
334 +       /* XXX This is a temporary limitation for code simplicity.
335 +        *     We could truncate to arbitrary sizes at some later time.
336 +        */
337 +       if (old_inode->i_size != 0)
338 +               goto out_truncate;
339 +
340 +       /* We may want to truncate the inode immediately and not defer it */
341 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
342 +           old_inode->i_size > oei->i_disksize) /* i_size is already known to be 0 here */
343 +               goto out_truncate;
344 +
345 +       /* We can't use the delete thread as-is during real orphan recovery,
346 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
347 +        * to loop endlessly.  It would be nice to do so, but needs work.
348 +        */
349 +       if (oei->i_state & EXT3_STATE_DELETE ||
350 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
351 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
352 +                          old_inode->i_ino, blocks);
353 +               goto out_truncate;
354 +       }
355 +
356 +       ext3_discard_prealloc(old_inode);
357 +
358 +       /* Journal credits requested below:
359 +        * old_inode   = 1, new_inode = sb + GDT + ibitmap
360 +        * orphan list = 1 inode/superblock for add, 2 inodes for del
361 +        * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
362 +        */
363 +       handle = ext3_journal_start(old_inode, 7);
364 +       if (IS_ERR(handle))
365 +               goto out_truncate;
366 +
367 +       new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
368 +       if (IS_ERR(new_inode)) {
369 +               ext3_debug("truncate inode %lu directly (no new inodes)\n",
370 +                          old_inode->i_ino);
371 +               goto out_journal;
372 +       }
373 +
374 +       if (ext3_orphan_add(handle, new_inode) < 0)
375 +               goto out_journal; /* NOTE(review): no iput(new_inode) here, unlike below - leak? */
376 +
377 +       if (ext3_orphan_del(handle, old_inode) < 0) {
378 +               ext3_orphan_del(handle, new_inode); /* undo the add just above */
379 +               iput(new_inode);
380 +               goto out_journal;
381 +       }
382 +
383 +       nei = EXT3_I(new_inode);
384 +
385 +       down_write(&oei->truncate_sem); /* hand-over of size and block map happens under this */
386 +       new_inode->i_size = old_inode->i_size;
387 +       new_inode->i_blocks = old_inode->i_blocks;
388 +       new_inode->i_uid = old_inode->i_uid;
389 +       new_inode->i_gid = old_inode->i_gid;
390 +       new_inode->i_nlink = 0;
391 +
392 +       /* FIXME when we do arbitrary truncates; only the xattr block (if any) stays */
393 +       old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
394 +
395 +       memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); /* new inode takes i_data */
396 +       memset(oei->i_data, 0, sizeof(oei->i_data)); /* old inode's i_data is left zeroed */
397 +
398 +       nei->i_disksize = oei->i_disksize;
399 +       nei->i_state |= EXT3_STATE_DELETE; /* tested by the guards above and in the delete hook */
400 +       up_write(&oei->truncate_sem);
401 +
402 +       ext3_journal_stop(handle, old_inode);
403 +
404 +       spin_lock(&sbi->s_delete_lock);
405 +       J_ASSERT(list_empty(&new_inode->i_dentry));
406 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); /* i_dentry reused as list link */
407 +       sbi->s_delete_blocks += blocks;
408 +       sbi->s_delete_inodes++;
409 +       spin_unlock(&sbi->s_delete_lock);
410 +
411 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
412 +                  new_inode->i_ino, blocks);
413 +
414 +       wake_up(&sbi->s_delete_thread_queue);
415 +       return;
416 +
417 +out_journal: /* could not defer: close our handle, then truncate synchronously */
418 +       ext3_journal_stop(handle, old_inode);
419 +out_truncate:
420 +       ext3_truncate(old_inode);
421 +}
422 +#endif /* EXT3_DELETE_THREAD */
423 +
424  /* 
425   * ext3_get_inode_loc returns with an extra refcount against the
426   * inode's underlying buffer_head on success. 
427 Index: linux-2.4.20-rh-20.9/fs/ext3/file.c
428 ===================================================================
429 --- linux-2.4.20-rh-20.9.orig/fs/ext3/file.c    2004-01-12 19:27:46.000000000 +0300
430 +++ linux-2.4.20-rh-20.9/fs/ext3/file.c 2004-01-13 17:15:48.000000000 +0300
431 @@ -125,7 +125,11 @@
432  };
433  
434  struct inode_operations ext3_file_inode_operations = {
435 +#ifdef EXT3_DELETE_THREAD
436 +       truncate:       ext3_truncate_thread,   /* BKL held */
437 +#else
438         truncate:       ext3_truncate,          /* BKL held */
439 +#endif
440         setattr:        ext3_setattr,           /* BKL held */
441         setxattr:       ext3_setxattr,          /* BKL held */
442         getxattr:       ext3_getxattr,          /* BKL held */
443 Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs.h
444 ===================================================================
445 --- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs.h   2004-01-12 19:27:46.000000000 +0300
446 +++ linux-2.4.20-rh-20.9/include/linux/ext3_fs.h        2004-01-13 17:15:48.000000000 +0300
447 @@ -193,6 +193,7 @@
448   */
449  #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
450  #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
451 +#define EXT3_STATE_DELETE              0x00000010 /* deferred delete inode */
452  
453  /*
454   * ioctl commands
455 @@ -320,6 +321,7 @@
456  #define EXT3_MOUNT_UPDATE_JOURNAL      0x1000  /* Update the journal format */
457  #define EXT3_MOUNT_NO_UID32            0x2000  /* Disable 32-bit UIDs */
458  #define EXT3_MOUNT_XATTR_USER          0x4000  /* Extended user attributes */
459 +#define EXT3_MOUNT_ASYNCDEL            0x20000 /* Delayed deletion */
460  
461  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
462  #ifndef _LINUX_EXT2_FS_H
463 @@ -695,6 +697,9 @@
464  extern void ext3_dirty_inode(struct inode *);
465  extern int ext3_change_inode_journal_flag(struct inode *, int);
466  extern void ext3_truncate (struct inode *);
467 +#ifdef EXT3_DELETE_THREAD
468 +extern void ext3_truncate_thread(struct inode *inode);
469 +#endif
470  extern void ext3_set_inode_flags(struct inode *);
471  
472  /* ioctl.c */
473 Index: linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h
474 ===================================================================
475 --- linux-2.4.20-rh-20.9.orig/include/linux/ext3_fs_sb.h        2004-01-12 19:27:46.000000000 +0300
476 +++ linux-2.4.20-rh-20.9/include/linux/ext3_fs_sb.h     2004-01-13 17:15:48.000000000 +0300
477 @@ -29,6 +29,8 @@
478  
479  #define EXT3_MAX_GROUP_LOADED  32
480  
481 +#define EXT3_DELETE_THREAD
482 +
483  /*
484   * third extended-fs super-block data in memory
485   */
486 @@ -76,6 +78,14 @@
487         struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
488         wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
489  #endif
490 +#ifdef EXT3_DELETE_THREAD
491 +       spinlock_t s_delete_lock;
492 +       struct list_head s_delete_list;
493 +       unsigned long s_delete_blocks;
494 +       unsigned long s_delete_inodes;
495 +       wait_queue_head_t s_delete_thread_queue;
496 +       wait_queue_head_t s_delete_waiter_queue;
497 +#endif
498  };
499  
500  #endif /* _LINUX_EXT3_FS_SB */