Whamcloud - gitweb
a6a64dee613d7a8beef9b76fe3447ca4a762e186
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-delete_thread-2.4.18.patch
1
2 Create a service thread to handle delete and truncate of inodes, to avoid
3 long latency while truncating very large files.
4
5
6  fs/ext3/inode.c            |  116 ++++++++++++++++++++++
7  fs/ext3/super.c            |  231 +++++++++++++++++++++++++++++++++++++++++++++
8  include/linux/ext3_fs.h    |    5 
9  include/linux/ext3_fs_sb.h |   10 +
10  4 files changed, 362 insertions(+)
11
12 Index: linux-2.4.18-chaos/fs/ext3/super.c
13 ===================================================================
14 --- linux-2.4.18-chaos.orig/fs/ext3/super.c     2004-01-13 15:39:03.000000000 +0300
15 +++ linux-2.4.18-chaos/fs/ext3/super.c  2004-01-13 16:35:05.000000000 +0300
16 @@ -398,6 +398,221 @@
17         }
18  }
19  
20 +#ifdef EXT3_DELETE_THREAD
21 +/*
22 + * Service thread for one superblock (@data): iput() every inode queued on
23 + * s_delete_list, then sleep on s_delete_thread_queue until more are queued.
24 + * Clients just add new inodes to be deleted onto the end of the list.
25 + * If someone is concerned about free space (e.g. block allocation or similar)
26 + * then they can sleep on s_delete_waiter_queue, woken after each batch.
27 + * Exits (returning 0) when it wakes up and finds the list empty.
28 + */
29 +int ext3_delete_thread(void *data)
30 +{
31 +       struct super_block *sb = data;
32 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
33 +       struct task_struct *tsk = current;
34 +
35 +       /* Almost like daemonize, but not quite */
36 +       exit_mm(current);
37 +       tsk->session = 1;
38 +       tsk->pgrp = 1;
39 +       tsk->tty = NULL;
40 +       exit_files(current);
41 +       reparent_to_init();
42 +
43 +       sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
44 +       sigfillset(&tsk->blocked);      /* block every signal */
45 +
46 +       /*tsk->flags |= PF_KERNTHREAD;*/
47 +
48 +       INIT_LIST_HEAD(&sbi->s_delete_list);    /* non-zero ->next is what the start/stop code tests */
49 +       wake_up(&sbi->s_delete_waiter_queue);   /* lets ext3_start_delete_thread() return */
50 +       ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
51 +
52 +       /* main loop */
53 +       for (;;) {
54 +               wait_event_interruptible(sbi->s_delete_thread_queue,
55 +                                        !list_empty(&sbi->s_delete_list) ||
56 +                                        !test_opt(sb, ASYNCDEL));       /* work queued, or ASYNCDEL cleared */
57 +               ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
58 +                          tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
59 +
60 +               spin_lock(&sbi->s_delete_lock);
61 +               if (list_empty(&sbi->s_delete_list)) {  /* woken with nothing queued: shut down */
62 +                       clear_opt(sbi->s_mount_opt, ASYNCDEL);
63 +                       memset(&sbi->s_delete_list, 0,
64 +                              sizeof(sbi->s_delete_list));     /* ->next == 0 again: thread is gone */
65 +                       spin_unlock(&sbi->s_delete_lock);
66 +                       ext3_debug("delete thread on %s exiting\n",
67 +                                  kdevname(sb->s_dev));
68 +                       wake_up(&sbi->s_delete_waiter_queue);   /* ext3_stop_delete_thread() waits on this */
69 +                       break;
70 +               }
71 +
72 +               while (!list_empty(&sbi->s_delete_list)) {
73 +                       struct inode *inode=list_entry(sbi->s_delete_list.next,
74 +                                                      struct inode, i_dentry); /* inodes are queued via i_dentry */
75 +                       unsigned long blocks = inode->i_blocks >>
76 +                                                       (inode->i_blkbits - 9);
77 +
78 +                       list_del_init(&inode->i_dentry);
79 +                       spin_unlock(&sbi->s_delete_lock);       /* s_delete_lock is not held across iput() */
80 +                       ext3_debug("%s delete ino %lu blk %lu\n",
81 +                                  tsk->comm, inode->i_ino, blocks);
82 +
83 +                       iput(inode);
84 +
85 +                       spin_lock(&sbi->s_delete_lock);
86 +                       sbi->s_delete_blocks -= blocks; /* undo the accounting done when it was queued */
87 +                       sbi->s_delete_inodes--;
88 +               }
89 +               if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {   /* list drained, counters should be 0 */
90 +                       ext3_warning(sb, __FUNCTION__,
91 +                                    "%lu blocks, %lu inodes on list?\n",
92 +                                    sbi->s_delete_blocks,sbi->s_delete_inodes);
93 +                       sbi->s_delete_blocks = 0;
94 +                       sbi->s_delete_inodes = 0;
95 +               }
96 +               spin_unlock(&sbi->s_delete_lock);
97 +               wake_up(&sbi->s_delete_waiter_queue);   /* batch done: wake anyone waiting on this queue */
98 +       }
99 +
100 +       return 0;
101 +}
102 +/* Initialise the delete-thread lock and wait queues; if ASYNCDEL is set, start the thread. */
103 +static void ext3_start_delete_thread(struct super_block *sb)
104 +{
105 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
106 +       int rc;
107 +
108 +       spin_lock_init(&sbi->s_delete_lock);
109 +       init_waitqueue_head(&sbi->s_delete_thread_queue);
110 +       init_waitqueue_head(&sbi->s_delete_waiter_queue);
111 +
112 +       if (!test_opt(sb, ASYNCDEL))    /* async delete not requested: no thread */
113 +               return;
114 +
115 +       rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
116 +       if (rc < 0)     /* failure is only logged, not returned to the caller */
117 +               printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
118 +                      rc);
119 +       else    /* wait for the thread's INIT_LIST_HEAD(); assumes the list head starts zeroed - TODO confirm */
120 +               wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
121 +}
122 +/* Ask the delete thread to exit, then wait until it is gone and no inodes remain queued. */
123 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
124 +{
125 +       if (sbi->s_delete_list.next == 0)       /* thread not running (never started, or already exited) */
126 +               return;
127 +
128 +       clear_opt(sbi->s_mount_opt, ASYNCDEL);  /* makes the thread's wait condition true */
129 +       wake_up(&sbi->s_delete_thread_queue);
130 +       wait_event(sbi->s_delete_waiter_queue,
131 +                       sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);     /* thread zeroes the list head on exit */
132 +}
133 +
134 +/* Instead of playing games with the inode flags, destruction, etc we just
135 + * create a new inode locally and put it on a list for the delete thread.
136 + * We need large parts of the inode struct in order to complete the
137 + * truncate and unlink, so we may as well just have a real inode to do it.
138 + *
139 + * If we have any problem deferring the delete, just delete it right away.
140 + * If we defer it, we also mark how many blocks it would free, so that we
141 + * can keep the statfs data correct, and we know if we should sleep on the
142 + * delete thread when we run out of space.
143 + */
144 +static void ext3_delete_inode_thread(struct inode *old_inode)
145 +{
146 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
147 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
148 +       struct inode *new_inode;
149 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
150 +
151 +       if (is_bad_inode(old_inode)) {
152 +               clear_inode(old_inode);
153 +               return;
154 +       }
155 +
156 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)   /* no thread to hand off to */
157 +               goto out_delete;
158 +
159 +       /* We may want to delete the inode immediately and not defer it */
160 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
161 +               goto out_delete;
162 +
163 +       /* We can't use the delete thread as-is during real orphan recovery,
164 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
165 +        * to loop endlessly.  It would be nice to do so, but needs work.
166 +        */
167 +       if (oei->i_state & EXT3_STATE_DELETE ||
168 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
169 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
170 +                          old_inode->i_ino, blocks);
171 +               goto out_delete;
172 +       }
173 +
174 +       /* We can iget this inode again here, because our caller has unhashed
175 +        * old_inode, so new_inode will be in a different inode struct.
176 +        *
177 +        * We need to ensure that the i_orphan pointers in the other inodes
178 +        * point at the new inode copy instead of the old one so the orphan
179 +        * list doesn't get corrupted when the old orphan inode is freed.
180 +        */
181 +       down(&sbi->s_orphan_lock);
182 +
183 +       sbi->s_mount_state |= EXT3_ORPHAN_FS;   /* NOTE(review): presumably lets iget() read an unlinked inode - confirm */
184 +       new_inode = iget(old_inode->i_sb, old_inode->i_ino);
185 +       sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
186 +       if (new_inode && is_bad_inode(new_inode)) {     /* iget() may return NULL; don't dereference it */
187 +               printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
188 +               iput(new_inode);
189 +               new_inode = NULL;
190 +       }
191 +       if (!new_inode) {       /* no usable copy: fall back to a direct delete */
192 +               up(&sbi->s_orphan_lock);
193 +               ext3_debug("delete inode %lu directly (bad read)\n",
194 +                          old_inode->i_ino);
195 +               goto out_delete;
196 +       }
197 +       J_ASSERT(new_inode != old_inode);
198 +
199 +       J_ASSERT(!list_empty(&oei->i_orphan));
200 +
201 +       nei = EXT3_I(new_inode);
202 +       /* Ugh.  We need to insert new_inode into the same spot on the list
203 +        * as old_inode was, to ensure the in-memory orphan list is still
204 +        * in the same order as the on-disk orphan list (badness otherwise).
205 +        */
206 +       nei->i_orphan = oei->i_orphan;
207 +       nei->i_orphan.next->prev = &nei->i_orphan;
208 +       nei->i_orphan.prev->next = &nei->i_orphan;
209 +       nei->i_state |= EXT3_STATE_DELETE;      /* a later delete of new_inode takes the direct path above */
210 +       up(&sbi->s_orphan_lock);
211 +
212 +       clear_inode(old_inode);
213 +
214 +       spin_lock(&sbi->s_delete_lock);
215 +       J_ASSERT(list_empty(&new_inode->i_dentry));
216 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);       /* queued through i_dentry */
217 +       sbi->s_delete_blocks += blocks;
218 +       sbi->s_delete_inodes++;
219 +       spin_unlock(&sbi->s_delete_lock);
220 +
221 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
222 +                  new_inode->i_ino, blocks);
223 +
224 +       wake_up(&sbi->s_delete_thread_queue);
225 +       return;
226 +
227 +out_delete:
228 +       ext3_delete_inode(old_inode);
229 +}
230 +#else
231 +#define ext3_start_delete_thread(sbi) do {} while(0)
232 +#define ext3_stop_delete_thread(sbi) do {} while(0)
233 +#endif /* EXT3_DELETE_THREAD */
234 +
235  void ext3_put_super (struct super_block * sb)
236  {
237         struct ext3_sb_info *sbi = EXT3_SB(sb);
238 @@ -405,6 +620,8 @@
239         kdev_t j_dev = sbi->s_journal->j_dev;
240         int i;
241  
242 +       J_ASSERT(sbi->s_delete_inodes == 0);
243 +
244         ext3_xattr_put_super(sb);
245         journal_destroy(sbi->s_journal);
246         if (!(sb->s_flags & MS_RDONLY)) {
247 @@ -453,7 +670,11 @@
248         write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
249         dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
250         put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
251 +#ifdef EXT3_DELETE_THREAD
252 +       delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
253 +#else
254         delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
255 +#endif
256         put_super:      ext3_put_super,         /* BKL held */
257         write_super:    ext3_write_super,       /* BKL held */
258         sync_fs:        ext3_sync_fs,
259 @@ -514,6 +735,14 @@
260              this_char = strtok (NULL, ",")) {
261                 if ((value = strchr (this_char, '=')) != NULL)
262                         *value++ = 0;
263 +#ifdef EXT3_DELETE_THREAD
264 +               if (!strcmp(this_char, "asyncdel"))
265 +                       set_opt(*mount_options, ASYNCDEL);
266 +               else if (!strcmp(this_char, "noasyncdel"))
267 +                       clear_opt(*mount_options, ASYNCDEL);
268 +               else
269 +#endif
270 +
271                 if (!strcmp (this_char, "bsddf"))
272                         clear_opt (*mount_options, MINIX_DF);
273                 else if (!strcmp (this_char, "nouid32")) {
274 @@ -1209,6 +1438,7 @@
275         }
276  
277         ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
278 +       ext3_start_delete_thread(sb);
279         /*
280          * akpm: core read_super() calls in here with the superblock locked.
281          * That deadlocks, because orphan cleanup needs to lock the superblock
282 @@ -1585,7 +1815,12 @@
283  static int ext3_sync_fs(struct super_block *sb)
284  {
285         tid_t target;
286 -       
287 +
288 +       if (atomic_read(&sb->s_active) == 0) {
289 +               /* fs is being umounted: time to stop delete thread */
290 +               ext3_stop_delete_thread(EXT3_SB(sb));
291 +       }
292 +
293         sb->s_dirt = 0;
294         target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
295         log_wait_commit(EXT3_SB(sb)->s_journal, target);
296 @@ -1649,6 +1884,9 @@
297         if (!parse_options(data, &tmp, sbi, &tmp, 1))
298                 return -EINVAL;
299  
300 +       if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
301 +               ext3_stop_delete_thread(sbi);
302 +
303         if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
304                 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
305  
306 Index: linux-2.4.18-chaos/fs/ext3/file.c
307 ===================================================================
308 --- linux-2.4.18-chaos.orig/fs/ext3/file.c      2003-07-28 17:52:04.000000000 +0400
309 +++ linux-2.4.18-chaos/fs/ext3/file.c   2004-01-13 16:26:01.000000000 +0300
310 @@ -121,7 +121,11 @@
311  };
312  
313  struct inode_operations ext3_file_inode_operations = {
314 +#ifdef EXT3_DELETE_THREAD
315 +       truncate:       ext3_truncate_thread,   /* BKL held */
316 +#else
317         truncate:       ext3_truncate,          /* BKL held */
318 +#endif
319         setattr:        ext3_setattr,           /* BKL held */
320  };
321  
322 Index: linux-2.4.18-chaos/fs/ext3/inode.c
323 ===================================================================
324 --- linux-2.4.18-chaos.orig/fs/ext3/inode.c     2004-01-13 15:39:03.000000000 +0300
325 +++ linux-2.4.18-chaos/fs/ext3/inode.c  2004-01-13 16:26:01.000000000 +0300
326 @@ -2041,6 +2041,118 @@
327         return;         /* AKPM: return what? */
328  }
329  
330 +#ifdef EXT3_DELETE_THREAD
331 +/* Move blocks from to-be-truncated inode over to a new inode, and delete
332 + * that one from the delete thread instead.  This avoids a lot of latency
333 + * when truncating large files.
334 + *
335 + * If we have any problem deferring the truncate, just truncate it right away.
336 + * If we defer it, we also mark how many blocks it would free, so that we
337 + * can keep the statfs data correct, and we know if we should sleep on the
338 + * delete thread when we run out of space.
339 + */
340 +void ext3_truncate_thread(struct inode *old_inode)
341 +{
342 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
343 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
344 +       struct inode *new_inode;
345 +       handle_t *handle;
346 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
347 +
348 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)   /* no thread to hand off to */
349 +               goto out_truncate;
350 +
351 +       /* XXX This is a temporary limitation for code simplicity.
352 +        *     We could truncate to arbitrary sizes at some later time.
353 +        */
354 +       if (old_inode->i_size != 0)     /* only truncate-to-zero is deferred */
355 +               goto out_truncate;
356 +
357 +       /* We may want to truncate the inode immediately and not defer it */
358 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
359 +           old_inode->i_size > oei->i_disksize)
360 +               goto out_truncate;
361 +
362 +       /* We can't use the delete thread as-is during real orphan recovery,
363 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
364 +        * to loop endlessly.  It would be nice to do so, but needs work.
365 +        */
366 +       if (oei->i_state & EXT3_STATE_DELETE ||
367 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
368 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
369 +                          old_inode->i_ino, blocks);
370 +               goto out_truncate;
371 +       }
372 +
373 +       ext3_discard_prealloc(old_inode);
374 +
375 +       /* old_inode   = 1
376 +        * new_inode   = sb + GDT + ibitmap
377 +        * orphan list = 1 inode/superblock for add, 2 inodes for del
378 +        * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
379 +        */
380 +       handle = ext3_journal_start(old_inode, 7);
381 +       if (IS_ERR(handle))
382 +               goto out_truncate;
383 +
384 +       new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
385 +       if (IS_ERR(new_inode)) {
386 +               ext3_debug("truncate inode %lu directly (no new inodes)\n",
387 +                          old_inode->i_ino);
388 +               goto out_journal;
389 +       }
390 +
391 +       nei = EXT3_I(new_inode);
392 +
393 +       down_write(&oei->truncate_sem); /* held while i_data is handed over to new_inode */
394 +       new_inode->i_size = old_inode->i_size;
395 +       new_inode->i_blocks = old_inode->i_blocks;
396 +       new_inode->i_uid = old_inode->i_uid;
397 +       new_inode->i_gid = old_inode->i_gid;
398 +       new_inode->i_nlink = 0; /* presumably so the thread's iput() ends in a delete - confirm */
399 +
400 +       /* FIXME when we do arbitrary truncates */
401 +       old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
402 +       old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
403 +
404 +       memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
405 +       memset(oei->i_data, 0, sizeof(oei->i_data));    /* old_inode keeps no i_data entries */
406 +
407 +       nei->i_disksize = oei->i_disksize;
408 +       nei->i_state |= EXT3_STATE_DELETE;      /* marks new_inode as a deferred-delete inode */
409 +       up_write(&oei->truncate_sem);
410 +       /* NOTE(review): if orphan_add fails, new_inode (which now holds i_data) is never iput() - confirm */
411 +       if (ext3_orphan_add(handle, new_inode) < 0)
412 +               goto out_journal;
413 +
414 +       if (ext3_orphan_del(handle, old_inode) < 0) {
415 +               ext3_orphan_del(handle, new_inode);
416 +               iput(new_inode);
417 +               goto out_journal;
418 +       }
419 +
420 +       ext3_journal_stop(handle, old_inode);
421 +
422 +       spin_lock(&sbi->s_delete_lock);
423 +       J_ASSERT(list_empty(&new_inode->i_dentry));
424 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);       /* queued through i_dentry */
425 +       sbi->s_delete_blocks += blocks;
426 +       sbi->s_delete_inodes++;
427 +       spin_unlock(&sbi->s_delete_lock);
428 +
429 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
430 +                  new_inode->i_ino, blocks);
431 +
432 +       wake_up(&sbi->s_delete_thread_queue);
433 +       return;
434 +
435 +out_journal:
436 +       ext3_journal_stop(handle, old_inode);
437 +out_truncate:
438 +       ext3_truncate(old_inode);       /* fall back to a synchronous truncate */
439 +}
440 +#endif /* EXT3_DELETE_THREAD */
441 +
442  /* 
443   * ext3_get_inode_loc returns with an extra refcount against the
444   * inode's underlying buffer_head on success. 
445 Index: linux-2.4.18-chaos/fs/buffer.c
446 ===================================================================
447 --- linux-2.4.18-chaos.orig/fs/buffer.c 2003-07-28 17:52:03.000000000 +0400
448 +++ linux-2.4.18-chaos/fs/buffer.c      2004-01-13 16:34:43.000000000 +0300
449 @@ -352,9 +352,9 @@
450         lock_super(sb);
451         if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
452                 sb->s_op->write_super(sb);
453 +       unlock_super(sb);
454         if (sb->s_op && sb->s_op->sync_fs)
455                 sb->s_op->sync_fs(sb);
456 -       unlock_super(sb);
457         unlock_kernel();
458  
459         return sync_buffers(dev, 1);
460 Index: linux-2.4.18-chaos/include/linux/ext3_fs.h
461 ===================================================================
462 --- linux-2.4.18-chaos.orig/include/linux/ext3_fs.h     2004-01-13 15:39:03.000000000 +0300
463 +++ linux-2.4.18-chaos/include/linux/ext3_fs.h  2004-01-13 16:26:01.000000000 +0300
464 @@ -190,6 +190,7 @@
465   */
466  #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
467  #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
468 +#define EXT3_STATE_DELETE              0x00000010 /* deferred delete inode */
469  
470  /*
471   * ioctl commands
472 @@ -317,6 +318,7 @@
473  #define EXT3_MOUNT_UPDATE_JOURNAL      0x1000  /* Update the journal format */
474  #define EXT3_MOUNT_NO_UID32            0x2000  /* Disable 32-bit UIDs */
475  #define EXT3_MOUNT_INDEX               0x4000  /* Enable directory index */
476 +#define EXT3_MOUNT_ASYNCDEL            0x20000 /* Delayed deletion */
477  
478  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
479  #ifndef _LINUX_EXT2_FS_H
480 @@ -651,6 +653,9 @@
481  extern void ext3_dirty_inode(struct inode *);
482  extern int ext3_change_inode_journal_flag(struct inode *, int);
483  extern void ext3_truncate (struct inode *);
484 +#ifdef EXT3_DELETE_THREAD
485 +extern void ext3_truncate_thread(struct inode *inode);
486 +#endif
487  
488  /* ioctl.c */
489  extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
490 Index: linux-2.4.18-chaos/include/linux/ext3_fs_sb.h
491 ===================================================================
492 --- linux-2.4.18-chaos.orig/include/linux/ext3_fs_sb.h  2004-01-13 15:39:03.000000000 +0300
493 +++ linux-2.4.18-chaos/include/linux/ext3_fs_sb.h       2004-01-13 16:26:01.000000000 +0300
494 @@ -29,6 +29,8 @@
495  
496  #define EXT3_MAX_GROUP_LOADED  32
497  
498 +#define EXT3_DELETE_THREAD
499 +
500  /*
501   * third extended-fs super-block data in memory
502   */
503 @@ -74,6 +76,14 @@
504         struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
505         wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
506  #endif
507 +#ifdef EXT3_DELETE_THREAD
508 +       spinlock_t s_delete_lock;
509 +       struct list_head s_delete_list;
510 +       unsigned long s_delete_blocks;
511 +       unsigned long s_delete_inodes;
512 +       wait_queue_head_t s_delete_thread_queue;
513 +       wait_queue_head_t s_delete_waiter_queue;
514 +#endif
515  };
516  
517  #endif /* _LINUX_EXT3_FS_SB */