Whamcloud - gitweb
- list_for_each_entry_safe(), list_move() and list_move_tail() have been added
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-delete_thread-2.4.18.patch
1
2 Create a service thread to handle delete and truncate of inodes, to avoid
3 long latency while truncating very large files.
4
5
6  fs/ext3/inode.c            |  116 ++++++++++++++++++++++
7  fs/ext3/super.c            |  231 +++++++++++++++++++++++++++++++++++++++++++++
8  include/linux/ext3_fs.h    |    5 
9  include/linux/ext3_fs_sb.h |   10 +
10  4 files changed, 362 insertions(+)
11
12 --- linux-2.4.18-18.8.0-l15/fs/ext3/super.c~ext3-delete_thread-2.4.18   Tue Jun  3 17:26:21 2003
13 +++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/super.c     Wed Jul  2 23:49:40 2003
14 @@ -396,6 +396,220 @@ static void dump_orphan_list(struct supe
15         }
16  }
17  
18 +#ifdef EXT3_DELETE_THREAD
19 +/*
20 + * Delete inodes in a loop until there are no more to be deleted.
21 + * Normally, we run in the background doing the deletes and sleeping again,
22 + * and clients just add new inodes to be deleted onto the end of the list.
23 + * If someone is concerned about free space (e.g. block allocation or similar)
24 + * then they can sleep on s_delete_waiter_queue and be woken up when space
25 + * has been freed.
26 + */
27 +int ext3_delete_thread(void *data)
28 +{
29 +       struct super_block *sb = data;
30 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
31 +       struct task_struct *tsk = current;
32 +
33 +       /* Almost like daemonize, but not quite */
34 +       exit_mm(current);
35 +       tsk->session = 1;
36 +       tsk->pgrp = 1;
37 +       tsk->tty = NULL;
38 +       exit_files(current);
39 +       reparent_to_init();
40 +       /* Name the thread after the device and block every signal */
41 +       sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
42 +       sigfillset(&tsk->blocked);
43 +
44 +       /*tsk->flags |= PF_KERNTHREAD;*/
45 +       /* Non-NULL s_delete_list.next is what the starter is waiting for */
46 +       INIT_LIST_HEAD(&sbi->s_delete_list);
47 +       wake_up(&sbi->s_delete_waiter_queue);
48 +       ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
49 +
50 +       /* main loop */
51 +       for (;;) {
52 +               wait_event_interruptible(sbi->s_delete_thread_queue,
53 +                                        !list_empty(&sbi->s_delete_list) ||
54 +                                        !test_opt(sb, ASYNCDEL));
55 +               ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
56 +                          tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
57 +               /* Nothing queued: exit, zeroing the list to flag "no thread" */
58 +               spin_lock(&sbi->s_delete_lock);
59 +               if (list_empty(&sbi->s_delete_list)) {
60 +                       clear_opt(sbi->s_mount_opt, ASYNCDEL);
61 +                       memset(&sbi->s_delete_list, 0,
62 +                              sizeof(sbi->s_delete_list));
63 +                       spin_unlock(&sbi->s_delete_lock);
64 +                       ext3_debug("delete thread on %s exiting\n",
65 +                                  kdevname(sb->s_dev));
66 +                       wake_up(&sbi->s_delete_waiter_queue);
67 +                       break;
68 +               }
69 +               /* Drain the list, dropping s_delete_lock around each iput() */
70 +               while (!list_empty(&sbi->s_delete_list)) {
71 +                       struct inode *inode=list_entry(sbi->s_delete_list.next,
72 +                                                      struct inode, i_dentry);
73 +                       unsigned long blocks = inode->i_blocks >>
74 +                                                       (inode->i_blkbits - 9);
75 +                       /* Inodes are chained on this list via i_dentry */
76 +                       list_del_init(&inode->i_dentry);
77 +                       spin_unlock(&sbi->s_delete_lock);
78 +                       ext3_debug("%s delete ino %lu blk %lu\n",
79 +                                  tsk->comm, inode->i_ino, blocks);
80 +                       /* presumably the last reference: does the real delete */
81 +                       iput(inode);
82 +
83 +                       spin_lock(&sbi->s_delete_lock);
84 +                       sbi->s_delete_blocks -= blocks;
85 +                       sbi->s_delete_inodes--;
86 +               }
87 +               if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
88 +                       ext3_warning(sb, __FUNCTION__,
89 +                                    "%lu blocks, %lu inodes on list?\n",
90 +                                    sbi->s_delete_blocks,sbi->s_delete_inodes);
91 +                       sbi->s_delete_blocks = 0;
92 +                       sbi->s_delete_inodes = 0;
93 +               }
94 +               spin_unlock(&sbi->s_delete_lock);
95 +               wake_up(&sbi->s_delete_waiter_queue);
96 +       }
97 +
98 +       return 0;
99 +}
100 +
101 +static void ext3_start_delete_thread(struct super_block *sb)
102 +{
103 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
104 +       int rc;
105 +       /* Lock and wait queues are set up even if no thread is started */
106 +       spin_lock_init(&sbi->s_delete_lock);
107 +       init_waitqueue_head(&sbi->s_delete_thread_queue);
108 +       init_waitqueue_head(&sbi->s_delete_waiter_queue);
109 +       /* The thread is only spawned when the ASYNCDEL option is set */
110 +       if (!test_opt(sb, ASYNCDEL))
111 +               return;
112 +       /* On success, sleep until the thread has run INIT_LIST_HEAD() */
113 +       rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
114 +       if (rc < 0)
115 +               printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
116 +                      rc);
117 +       else
118 +               wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
119 +}
120 +
121 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
122 +{
123 +       if (sbi->s_delete_list.next == 0)       /* thread is not running */
124 +               return;
125 +       /* The thread zeroes s_delete_list when it exits, so wait for that */
126 +       clear_opt(sbi->s_mount_opt, ASYNCDEL);
127 +       wake_up(&sbi->s_delete_thread_queue);
128 +       wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next == 0);
129 +}
130 +
131 +/* Instead of playing games with the inode flags, destruction, etc we just
132 + * create a new inode locally and put it on a list for the delete thread.
133 + * We need large parts of the inode struct in order to complete the
134 + * truncate and unlink, so we may as well just have a real inode to do it.
135 + *
136 + * If we have any problem deferring the delete, just delete it right away.
137 + * If we defer it, we also mark how many blocks it would free, so that we
138 + * can keep the statfs data correct, and we know if we should sleep on the
139 + * delete thread when we run out of space.
140 + */
141 +static void ext3_delete_inode_thread(struct inode *old_inode)
142 +{
143 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
144 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
145 +       struct inode *new_inode;
146 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
147 +       /* blocks = i_blocks scaled from 512-byte units to fs-block units */
148 +       if (is_bad_inode(old_inode)) {
149 +               clear_inode(old_inode);
150 +               return;
151 +       }
152 +       /* Option off or thread not running (zeroed list): delete inline */
153 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
154 +               goto out_delete;
155 +
156 +       /* We may want to delete the inode immediately and not defer it */
157 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
158 +               goto out_delete;
159 +
160 +       /* We can't use the delete thread as-is during real orphan recovery,
161 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
162 +        * to loop endlessly.  It would be nice to do so, but needs work.
163 +        */
164 +       if (oei->i_state & EXT3_STATE_DELETE ||
165 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
166 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
167 +                          old_inode->i_ino, blocks);
168 +               goto out_delete;
169 +       }
170 +
171 +       /* We can iget this inode again here, because our caller has unhashed
172 +        * old_inode, so new_inode will be in a different inode struct.
173 +        *
174 +        * We need to ensure that the i_orphan pointers in the other inodes
175 +        * point at the new inode copy instead of the old one so the orphan
176 +        * list doesn't get corrupted when the old orphan inode is freed.
177 +        */
178 +       down(&sbi->s_orphan_lock);
179 +
180 +       sbi->s_mount_state |= EXT3_ORPHAN_FS;
181 +       new_inode = iget(old_inode->i_sb, old_inode->i_ino);
182 +       sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
183 +       if (new_inode && is_bad_inode(new_inode)) { /* iget may give NULL */
184 +               printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
185 +               iput(new_inode);
186 +               new_inode = NULL;
187 +       }
188 +       if (!new_inode) {
189 +               up(&sbi->s_orphan_lock);
190 +               ext3_debug("delete inode %lu directly (bad read)\n",
191 +                          old_inode->i_ino);
192 +               goto out_delete;
193 +       }
194 +       J_ASSERT(new_inode != old_inode);
195 +
196 +       J_ASSERT(!list_empty(&oei->i_orphan));
197 +
198 +       nei = EXT3_I(new_inode);
199 +       /* Ugh.  We need to insert new_inode into the same spot on the list
200 +        * as old_inode was, to ensure the in-memory orphan list is still
201 +        * in the same order as the on-disk orphan list (badness otherwise).
202 +        */
203 +       nei->i_orphan = oei->i_orphan;
204 +       nei->i_orphan.next->prev = &nei->i_orphan;
205 +       nei->i_orphan.prev->next = &nei->i_orphan;
206 +       nei->i_state |= EXT3_STATE_DELETE;
207 +       up(&sbi->s_orphan_lock);
208 +       /* Drop the old in-core inode; new_inode is what gets queued */
209 +       clear_inode(old_inode);
210 +
211 +       spin_lock(&sbi->s_delete_lock);
212 +       J_ASSERT(list_empty(&new_inode->i_dentry));
213 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
214 +       sbi->s_delete_blocks += blocks;
215 +       sbi->s_delete_inodes++;
216 +       spin_unlock(&sbi->s_delete_lock);
217 +
218 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
219 +                  new_inode->i_ino, blocks);
220 +
221 +       wake_up(&sbi->s_delete_thread_queue);
222 +       return;
223 +
224 +out_delete:
225 +       ext3_delete_inode(old_inode);
226 +}
227 +#else
228 +#define ext3_start_delete_thread(sbi) do {} while(0)
229 +#define ext3_stop_delete_thread(sbi) do {} while(0)
230 +#endif /* EXT3_DELETE_THREAD */
231 +
232  void ext3_put_super (struct super_block * sb)
233  {
234         struct ext3_sb_info *sbi = EXT3_SB(sb);
235 @@ -403,6 +617,7 @@ void ext3_put_super (struct super_block 
236         kdev_t j_dev = sbi->s_journal->j_dev;
237         int i;
238  
239 +       ext3_stop_delete_thread(sbi);
240         ext3_xattr_put_super(sb);
241         journal_destroy(sbi->s_journal);
242         if (!(sb->s_flags & MS_RDONLY)) {
243 @@ -451,7 +666,11 @@ static struct super_operations ext3_sops
244         write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
245         dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
246         put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
247 +#ifdef EXT3_DELETE_THREAD
248 +       delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
249 +#else
250         delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
251 +#endif
252         put_super:      ext3_put_super,         /* BKL held */
253         write_super:    ext3_write_super,       /* BKL held */
254         write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
255 @@ -511,6 +730,14 @@ static int parse_options (char * options
256              this_char = strtok (NULL, ",")) {
257                 if ((value = strchr (this_char, '=')) != NULL)
258                         *value++ = 0;
259 +#ifdef EXT3_DELETE_THREAD
260 +               if (!strcmp(this_char, "asyncdel"))
261 +                       set_opt(*mount_options, ASYNCDEL);
262 +               else if (!strcmp(this_char, "noasyncdel"))
263 +                       clear_opt(*mount_options, ASYNCDEL);
264 +               else
265 +#endif
266 +
267                 if (!strcmp (this_char, "bsddf"))
268                         clear_opt (*mount_options, MINIX_DF);
269                 else if (!strcmp (this_char, "nouid32")) {
270 @@ -1206,6 +1433,7 @@ struct super_block * ext3_read_super (st
271         }
272  
273         ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
274 +       ext3_start_delete_thread(sb);
275         /*
276          * akpm: core read_super() calls in here with the superblock locked.
277          * That deadlocks, because orphan cleanup needs to lock the superblock
278 @@ -1648,6 +1876,9 @@ int ext3_remount (struct super_block * s
279         if (!parse_options(data, &tmp, sbi, &tmp, 1))
280                 return -EINVAL;
281  
282 +       if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
283 +               ext3_stop_delete_thread(sbi);
284 +
285         if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
286                 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
287  
288 --- linux/fs/ext3/file.c.orig   Fri Jan 17 10:57:31 2003
289 +++ linux/fs/ext3/file.c        Mon Jun 30 13:28:52 2003
290 @@ -121,7 +121,11 @@ struct file_operations ext3_file_operati
291  };
292  
293  struct inode_operations ext3_file_inode_operations = {
294 +#ifdef EXT3_DELETE_THREAD
295 +       truncate:       ext3_truncate_thread,   /* BKL held */
296 +#else
297         truncate:       ext3_truncate,          /* BKL held */
298 +#endif
299         setattr:        ext3_setattr,           /* BKL held */
300  };
301  
302 --- linux-2.4.18-18.8.0-l15/fs/ext3/inode.c~ext3-delete_thread-2.4.18   Wed Jul  2 23:13:58 2003
303 +++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/inode.c     Wed Jul  2 23:50:29 2003
304 @@ -2004,6 +2004,118 @@ out_stop:
305         ext3_journal_stop(handle, inode);
306  }
307  
308 +#ifdef EXT3_DELETE_THREAD
309 +/* Move blocks from to-be-truncated inode over to a new inode, and delete
310 + * that one from the delete thread instead.  This avoids a lot of latency
311 + * when truncating large files.
312 + *
313 + * If we have any problem deferring the truncate, just truncate it right away.
314 + * If we defer it, we also mark how many blocks it would free, so that we
315 + * can keep the statfs data correct, and we know if we should sleep on the
316 + * delete thread when we run out of space.
317 + */
318 +void ext3_truncate_thread(struct inode *old_inode)
319 +{
320 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
321 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
322 +       struct inode *new_inode;
323 +       handle_t *handle;
324 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
325 +       /* Option off or thread not running (zeroed list): truncate inline */
326 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
327 +               goto out_truncate;
328 +
329 +       /* XXX This is a temporary limitation for code simplicity.
330 +        *     We could truncate to arbitrary sizes at some later time.
331 +        */
332 +       if (old_inode->i_size != 0)
333 +               goto out_truncate;
334 +
335 +       /* We may want to truncate the inode immediately and not defer it */
336 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
337 +           old_inode->i_size > oei->i_disksize)
338 +               goto out_truncate;
339 +
340 +       /* We can't use the delete thread as-is during real orphan recovery,
341 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
342 +        * to loop endlessly.  It would be nice to do so, but needs work.
343 +        */
344 +       if (oei->i_state & EXT3_STATE_DELETE ||
345 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
346 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
347 +                          old_inode->i_ino, blocks);
348 +               goto out_truncate;
349 +       }
350 +
351 +       ext3_discard_prealloc(old_inode);
352 +
353 +       /* old_inode   = 1
354 +        * new_inode   = sb + GDT + ibitmap
355 +        * orphan list = 1 inode/superblock for add, 2 inodes for del
356 +        * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
357 +        */
358 +       handle = ext3_journal_start(old_inode, 7);
359 +       if (IS_ERR(handle))
360 +               goto out_truncate;
361 +
362 +       new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
363 +       if (IS_ERR(new_inode)) {
364 +               ext3_debug("truncate inode %lu directly (no new inodes)\n",
365 +                          old_inode->i_ino);
366 +               goto out_journal;
367 +       }
368 +
369 +       nei = EXT3_I(new_inode);
370 +       /* Move size, ownership and block map to new_inode under truncate_sem */
371 +       down_write(&oei->truncate_sem);
372 +       new_inode->i_size = old_inode->i_size;
373 +       new_inode->i_blocks = old_inode->i_blocks;
374 +       new_inode->i_uid = old_inode->i_uid;
375 +       new_inode->i_gid = old_inode->i_gid;
376 +       new_inode->i_nlink = 0;
377 +
378 +       /* FIXME when we do arbitrary truncates */
379 +       old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
380 +       old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
381 +
382 +       memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
383 +       memset(oei->i_data, 0, sizeof(oei->i_data));
384 +
385 +       nei->i_disksize = oei->i_disksize;
386 +       nei->i_state |= EXT3_STATE_DELETE;
387 +       up_write(&oei->truncate_sem);
388 +
389 +       if (ext3_orphan_add(handle, new_inode) < 0) {
390 +               iput(new_inode);        /* drop our reference, as below */
391 +               goto out_journal;
392 +       }
393 +       if (ext3_orphan_del(handle, old_inode) < 0) {
394 +               ext3_orphan_del(handle, new_inode);
395 +               iput(new_inode);
396 +               goto out_journal;
397 +       }
398 +       ext3_journal_stop(handle, old_inode);
399 +       /* Queue new_inode for the delete thread and account for its blocks */
400 +       spin_lock(&sbi->s_delete_lock);
401 +       J_ASSERT(list_empty(&new_inode->i_dentry));
402 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
403 +       sbi->s_delete_blocks += blocks;
404 +       sbi->s_delete_inodes++;
405 +       spin_unlock(&sbi->s_delete_lock);
406 +
407 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
408 +                  new_inode->i_ino, blocks);
409 +
410 +       wake_up(&sbi->s_delete_thread_queue);
411 +       return;
412 +
413 +out_journal:
414 +       ext3_journal_stop(handle, old_inode);
415 +out_truncate:
416 +       ext3_truncate(old_inode);
417 +}
418 +#endif /* EXT3_DELETE_THREAD */
419 +
420  /* 
421   * ext3_get_inode_loc returns with an extra refcount against the
422   * inode's underlying buffer_head on success. 
423 --- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18   Tue Jun  3 17:26:20 2003
424 +++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs.h     Wed Jul  2 23:19:09 2003
425 @@ -190,6 +190,7 @@ struct ext3_group_desc
426   */
427  #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
428  #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
429 +#define EXT3_STATE_DELETE              0x00000010 /* deferred delete inode */
430  
431  /*
432   * ioctl commands
433 @@ -317,6 +318,7 @@ struct ext3_inode {
434  #define EXT3_MOUNT_UPDATE_JOURNAL      0x1000  /* Update the journal format */
435  #define EXT3_MOUNT_NO_UID32            0x2000  /* Disable 32-bit UIDs */
436  #define EXT3_MOUNT_INDEX               0x4000  /* Enable directory index */
437 +#define EXT3_MOUNT_ASYNCDEL            0x20000 /* Delayed deletion */
438  
439  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
440  #ifndef _LINUX_EXT2_FS_H
441 @@ -651,6 +653,9 @@ extern void ext3_discard_prealloc (struc
442  extern void ext3_dirty_inode(struct inode *);
443  extern int ext3_change_inode_journal_flag(struct inode *, int);
444  extern void ext3_truncate (struct inode *);
445 +#ifdef EXT3_DELETE_THREAD
446 +extern void ext3_truncate_thread(struct inode *inode);
447 +#endif
448  
449  /* ioctl.c */
450  extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
451 --- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18        Tue Jun  3 17:26:21 2003
452 +++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs_sb.h  Wed Jul  2 23:19:09 2003
453 @@ -29,6 +29,8 @@
454  
455  #define EXT3_MAX_GROUP_LOADED  32
456  
457 +#define EXT3_DELETE_THREAD
458 +
459  /*
460   * third extended-fs super-block data in memory
461   */
462 @@ -74,6 +76,14 @@ struct ext3_sb_info {
463         struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
464         wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
465  #endif
466 +#ifdef EXT3_DELETE_THREAD
467 +       spinlock_t s_delete_lock;
468 +       struct list_head s_delete_list;
469 +       unsigned long s_delete_blocks;
470 +       unsigned long s_delete_inodes;
471 +       wait_queue_head_t s_delete_thread_queue;
472 +       wait_queue_head_t s_delete_waiter_queue;
473 +#endif
474  };
475  
476  #endif /* _LINUX_EXT3_FS_SB */
477
478 _