Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-delete_thread-2.4.29.patch
1 Index: linux-2.4.29/fs/ext3/super.c
2 ===================================================================
3 --- linux-2.4.29.orig/fs/ext3/super.c   2005-05-03 15:53:33.047533872 +0300
4 +++ linux-2.4.29/fs/ext3/super.c        2005-05-03 15:54:47.192262160 +0300
5 @@ -400,6 +400,127 @@
6         }
7  }
8  
9 +#ifdef EXT3_DELETE_THREAD
10 +/*
11 + * Per-superblock delete thread; @data is the struct super_block.  Sleeps on
12 + * s_delete_thread_queue until s_delete_list is non-empty, then sets i_nlink
13 + * to 0 and iput()s each queued inode, waking s_delete_waiter_queue after each
14 + * batch so that anyone concerned about free space can sleep there.  When woken
15 + * with an empty list it clears ASYNCDEL, zeroes s_delete_list (so ->next is
16 + * NULL again), wakes the waiters one last time and returns 0.
17 + */
18 +int ext3_delete_thread(void *data)
19 +{
20 +       struct super_block *sb = data;
21 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
22 +       struct task_struct *tsk = current;
23 +
24 +       /* Almost like daemonize, but not quite: drop mm, tty and files */
25 +       exit_mm(current);
26 +       tsk->session = 1;
27 +       tsk->pgrp = 1;
28 +       tsk->tty = NULL;
29 +       exit_files(current);
30 +       reparent_to_init();
31 +
32 +       sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
33 +       sigfillset(&tsk->blocked);
34 +
35 +       /*tsk->flags |= PF_KERNTHREAD;*/
36 +       /* a non-NULL s_delete_list.next tells the starter we are running */
37 +       INIT_LIST_HEAD(&sbi->s_delete_list);
38 +       wake_up(&sbi->s_delete_waiter_queue);
39 +       ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
40 +
41 +       /* main loop: sleep until there is work or ASYNCDEL was cleared */
42 +       for (;;) {
43 +               wait_event_interruptible(sbi->s_delete_thread_queue,
44 +                                        !list_empty(&sbi->s_delete_list) ||
45 +                                        !test_opt(sb, ASYNCDEL));
46 +               ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
47 +                          tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
48 +
49 +               spin_lock(&sbi->s_delete_lock);
50 +               if (list_empty(&sbi->s_delete_list)) { /* no work: exit */
51 +                       clear_opt(sbi->s_mount_opt, ASYNCDEL);
52 +                       memset(&sbi->s_delete_list, 0,
53 +                              sizeof(sbi->s_delete_list));
54 +                       spin_unlock(&sbi->s_delete_lock);
55 +                       ext3_debug("delete thread on %s exiting\n",
56 +                                  kdevname(sb->s_dev));
57 +                       wake_up(&sbi->s_delete_waiter_queue);
58 +                       break;
59 +               }
60 +               /* drain the list, dropping the lock around each iput() */
61 +               while (!list_empty(&sbi->s_delete_list)) {
62 +                       struct inode *inode=list_entry(sbi->s_delete_list.next,
63 +                                                      struct inode, i_devices);
64 +                       unsigned long blocks = inode->i_blocks >>
65 +                                                       (inode->i_blkbits - 9);
66 +
67 +                       list_del_init(&inode->i_devices);
68 +                       spin_unlock(&sbi->s_delete_lock);
69 +                       ext3_debug("%s delete ino %lu blk %lu\n",
70 +                                  tsk->comm, inode->i_ino, blocks);
71 +                       /* queued with i_nlink == 1; zero it before iput() */
72 +                       J_ASSERT(EXT3_I(inode)->i_state & EXT3_STATE_DELETE);
73 +                       J_ASSERT(inode->i_nlink == 1);
74 +                       inode->i_nlink = 0;
75 +                       iput(inode);
76 +
77 +                       spin_lock(&sbi->s_delete_lock);
78 +                       sbi->s_delete_blocks -= blocks;
79 +                       sbi->s_delete_inodes--;
80 +               }
81 +               if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
82 +                       ext3_warning(sb, __FUNCTION__,
83 +                                    "%lu blocks, %lu inodes on list?\n",
84 +                                    sbi->s_delete_blocks,sbi->s_delete_inodes);
85 +                       sbi->s_delete_blocks = 0;
86 +                       sbi->s_delete_inodes = 0;
87 +               }
88 +               spin_unlock(&sbi->s_delete_lock);
89 +               wake_up(&sbi->s_delete_waiter_queue);
90 +       }
91 +
92 +       return 0;
93 +}
94 +
95 +static void ext3_start_delete_thread(struct super_block *sb)
96 +{
97 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
98 +       int rc;
99 +
100 +       spin_lock_init(&sbi->s_delete_lock);
101 +       init_waitqueue_head(&sbi->s_delete_thread_queue);
102 +       init_waitqueue_head(&sbi->s_delete_waiter_queue);
103 +       /* lock and queues are always set up; the thread needs "asyncdel" */
104 +       if (!test_opt(sb, ASYNCDEL))
105 +               return;
106 +       /* on success, wait until the thread has initialised s_delete_list */
107 +       rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
108 +       if (rc < 0)     /* NOTE(review): ASYNCDEL is left set on failure */
109 +               printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
110 +                      rc);
111 +       else
112 +               wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
113 +}
114 +
115 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
116 +{
117 +       if (sbi->s_delete_list.next == 0)       /* never started, or exited */
118 +               return;
119 +       /* with ASYNCDEL clear the woken thread drains its list and exits */
120 +       clear_opt(sbi->s_mount_opt, ASYNCDEL);
121 +       wake_up(&sbi->s_delete_thread_queue);
122 +       wait_event(sbi->s_delete_waiter_queue,
123 +                       sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
124 +}
125 +#else
126 +#define ext3_start_delete_thread(sbi) do {} while(0)
127 +#define ext3_stop_delete_thread(sbi) do {} while(0)
128 +#endif /* EXT3_DELETE_THREAD */
129 +
130  void ext3_put_super (struct super_block * sb)
131  {
132         struct ext3_sb_info *sbi = EXT3_SB(sb);
133 @@ -407,6 +528,9 @@
134         kdev_t j_dev = sbi->s_journal->j_dev;
135         int i;
136  
137 +#ifdef EXT3_DELETE_THREAD
138 +       J_ASSERT(sbi->s_delete_inodes == 0);
139 +#endif
140         ext3_xattr_put_super(sb);
141         journal_destroy(sbi->s_journal);
142         if (!(sb->s_flags & MS_RDONLY)) {
143 @@ -526,6 +650,13 @@
144                         clear_opt (*mount_options, XATTR_USER);
145                 else
146  #endif
147 +#ifdef EXT3_DELETE_THREAD
148 +               if (!strcmp(this_char, "asyncdel"))
149 +                       set_opt(*mount_options, ASYNCDEL);
150 +               else if (!strcmp(this_char, "noasyncdel"))
151 +                       clear_opt(*mount_options, ASYNCDEL);
152 +               else
153 +#endif
154                 if (!strcmp (this_char, "bsddf"))
155                         clear_opt (*mount_options, MINIX_DF);
156                 else if (!strcmp (this_char, "nouid32")) {
157 @@ -1244,6 +1375,7 @@
158         }
159  
160         ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
161 +       ext3_start_delete_thread(sb);
162         EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
163         ext3_orphan_cleanup(sb, es);
164         EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
165 @@ -1626,7 +1758,12 @@
166  static int ext3_sync_fs(struct super_block *sb)
167  {
168         tid_t target;
169 -       
170 +
171 +       if (atomic_read(&sb->s_active) == 0) {
172 +               /* fs is being umounted: time to stop delete thread */
173 +               ext3_stop_delete_thread(EXT3_SB(sb));
174 +       }
175 +
176         sb->s_dirt = 0;
177         target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
178         log_wait_commit(EXT3_SB(sb)->s_journal, target);
179 @@ -1690,6 +1827,9 @@
180         if (!parse_options(data, &tmp, sbi, &tmp, 1))
181                 return -EINVAL;
182  
183 +       if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
184 +               ext3_stop_delete_thread(sbi);
185 +
186         if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
187                 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
188  
189 Index: linux-2.4.29/fs/ext3/inode.c
190 ===================================================================
191 --- linux-2.4.29.orig/fs/ext3/inode.c   2005-05-03 15:53:36.555000656 +0300
192 +++ linux-2.4.29/fs/ext3/inode.c        2005-05-03 15:53:56.901907456 +0300
193 @@ -2562,6 +2562,118 @@
194         return err;
195  }
196  
197 +#ifdef EXT3_DELETE_THREAD
198 +/* Move blocks from to-be-truncated inode over to a new inode, and delete
199 + * that one from the delete thread instead.  This avoids a lot of latency
200 + * when truncating large files.  Only truncates to size 0 are deferred, and
201 + * only when the delete thread is running and orphan recovery is not.
202 + * If we have any problem deferring the truncate, just truncate it right away.
203 + * If we defer it, we also mark how many blocks it would free, so that we
204 + * can keep the statfs data correct, and we know if we should sleep on the
205 + * delete thread when we run out of space.
206 + */
207 +void ext3_truncate_thread(struct inode *old_inode)
208 +{
209 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
210 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
211 +       struct inode *new_inode;
212 +       handle_t *handle;
213 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
214 +
215 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
216 +               goto out_truncate; /* no delete thread to hand off to */
217 +
218 +       /* XXX This is a temporary limitation for code simplicity.
219 +        *     We could truncate to arbitrary sizes at some later time.
220 +        */
221 +       if (old_inode->i_size != 0)
222 +               goto out_truncate;
223 +
224 +       /* We may want to truncate the inode immediately and not defer it */
225 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
226 +           old_inode->i_size > oei->i_disksize)
227 +               goto out_truncate;
228 +
229 +       /* We can't use the delete thread as-is during real orphan recovery,
230 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
231 +        * to loop endlessly.  It would be nice to do so, but needs work.
232 +        */
233 +       if (oei->i_state & EXT3_STATE_DELETE ||
234 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
235 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
236 +                          old_inode->i_ino, blocks);
237 +               goto out_truncate;
238 +       }
239 +
240 +       ext3_discard_prealloc(old_inode);
241 +
242 +       /* Journal credits: old_inode = 1, new_inode = sb + GDT + ibitmap,
243 +        * orphan list = 1 inode/superblock for add, 2 inodes for del,
244 +        * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS.
245 +        * NOTE(review): only 7 are requested below - confirm quota is covered.
246 +        */
247 +       handle = ext3_journal_start(old_inode, 7);
248 +       if (IS_ERR(handle))
249 +               goto out_truncate;
250 +
251 +       new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
252 +       if (IS_ERR(new_inode)) {
253 +               ext3_debug("truncate inode %lu directly (no new inodes)\n",
254 +                          old_inode->i_ino);
255 +               goto out_journal;
256 +       }
257 +
258 +       nei = EXT3_I(new_inode);
259 +       /* new_inode takes over size, blocks and ownership of old_inode */
260 +       down_write(&oei->truncate_sem);
261 +       new_inode->i_size = old_inode->i_size;
262 +       new_inode->i_blocks = old_inode->i_blocks;
263 +       new_inode->i_uid = old_inode->i_uid;
264 +       new_inode->i_gid = old_inode->i_gid;
265 +       new_inode->i_nlink = 1;
266 +
267 +       /* FIXME when we do arbitrary truncates */
268 +       old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
269 +       old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
270 +       /* move i_data across, leaving old_inode's copy zeroed */
271 +       memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
272 +       memset(oei->i_data, 0, sizeof(oei->i_data));
273 +
274 +       nei->i_disksize = oei->i_disksize;
275 +       nei->i_state |= EXT3_STATE_DELETE;
276 +       up_write(&oei->truncate_sem);
277 +       /* NOTE(review): new_inode is not iput() if this orphan_add fails */
278 +       if (ext3_orphan_add(handle, new_inode) < 0)
279 +               goto out_journal;
280 +
281 +       if (ext3_orphan_del(handle, old_inode) < 0) {
282 +               ext3_orphan_del(handle, new_inode);
283 +               iput(new_inode);
284 +               goto out_journal;
285 +       }
286 +
287 +       ext3_journal_stop(handle, old_inode);
288 +       /* hand new_inode to the delete thread and account for its blocks */
289 +       spin_lock(&sbi->s_delete_lock);
290 +       J_ASSERT(list_empty(&new_inode->i_devices));
291 +       list_add_tail(&new_inode->i_devices, &sbi->s_delete_list);
292 +       sbi->s_delete_blocks += blocks;
293 +       sbi->s_delete_inodes++;
294 +       spin_unlock(&sbi->s_delete_lock);
295 +
296 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
297 +                  new_inode->i_ino, blocks);
298 +
299 +       wake_up(&sbi->s_delete_thread_queue);
300 +       return;
301 +
302 +out_journal:
303 +       ext3_journal_stop(handle, old_inode);
304 +out_truncate:
305 +       ext3_truncate(old_inode);
306 +}
307 +#endif /* EXT3_DELETE_THREAD */
308 +
309  /* 
310   * On success, We end up with an outstanding reference count against
311   * iloc->bh.  This _must_ be cleaned up later. 
312 Index: linux-2.4.29/fs/ext3/file.c
313 ===================================================================
314 --- linux-2.4.29.orig/fs/ext3/file.c    2005-04-07 19:31:00.000000000 +0300
315 +++ linux-2.4.29/fs/ext3/file.c 2005-05-03 15:53:56.902907304 +0300
316 @@ -123,7 +123,11 @@
317  };
318  
319  struct inode_operations ext3_file_inode_operations = {
320 +#ifdef EXT3_DELETE_THREAD
321 +       truncate:       ext3_truncate_thread,   /* BKL held */
322 +#else
323         truncate:       ext3_truncate,          /* BKL held */
324 +#endif
325         setattr:        ext3_setattr,           /* BKL held */
326         setxattr:       ext3_setxattr,          /* BKL held */
327         getxattr:       ext3_getxattr,          /* BKL held */
328 Index: linux-2.4.29/fs/ext3/namei.c
329 ===================================================================
330 --- linux-2.4.29.orig/fs/ext3/namei.c   2005-05-03 15:53:33.044534328 +0300
331 +++ linux-2.4.29/fs/ext3/namei.c        2005-05-03 15:53:56.905906848 +0300
332 @@ -838,6 +838,40 @@
333         return retval;
334  }
335  
336 +#ifdef EXT3_DELETE_THREAD
337 +static int ext3_try_to_delay_deletion(struct inode *inode)
338 +{
339 +       struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
340 +       struct ext3_inode_info *ei = EXT3_I(inode);
341 +       unsigned long blocks;
342 +       /* Queue an unlinked inode for the delete thread; always returns 0 */
343 +       if (!test_opt(inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
344 +               return 0;       /* no delete thread: leave the inode alone */
345 +
346 +       /* We may want to delete the inode immediately and not defer it */
347 +       blocks = inode->i_blocks >> (inode->i_blkbits - 9);
348 +       if (IS_SYNC(inode) || blocks <= EXT3_NDIR_BLOCKS)
349 +               return 0;
350 +       /* keep the inode alive (nlink 1 + extra ref) until the thread runs */
351 +       inode->i_nlink = 1;
352 +       atomic_inc(&inode->i_count);
353 +       ei->i_state |= EXT3_STATE_DELETE;
354 +
355 +       spin_lock(&sbi->s_delete_lock);
356 +       J_ASSERT(list_empty(&inode->i_devices));
357 +       list_add_tail(&inode->i_devices, &sbi->s_delete_list);
358 +       sbi->s_delete_blocks += blocks;
359 +       sbi->s_delete_inodes++;
360 +       spin_unlock(&sbi->s_delete_lock);
361 +
362 +       wake_up(&sbi->s_delete_thread_queue);
363 +
364 +       return 0;
365 +}
366 +#else
367 +#define ext3_try_to_delay_deletion(inode) do {} while (0)
368 +#endif
369 +
370  static int ext3_unlink(struct inode * dir, struct dentry *dentry)
371  {
372         int retval;
373 @@ -878,8 +912,10 @@
374         dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
375         ext3_mark_inode_dirty(handle, dir);
376         inode->i_nlink--;
377 -       if (!inode->i_nlink)
378 +       if (!inode->i_nlink) {
379 +               ext3_try_to_delay_deletion(inode);
380                 ext3_orphan_add(handle, inode);
381 +       }
382         inode->i_ctime = dir->i_ctime;
383         ext3_mark_inode_dirty(handle, inode);
384         retval = 0;
385 Index: linux-2.4.29/include/linux/ext3_fs.h
386 ===================================================================
387 --- linux-2.4.29.orig/include/linux/ext3_fs.h   2005-05-03 15:53:37.124914016 +0300
388 +++ linux-2.4.29/include/linux/ext3_fs.h        2005-05-03 15:53:56.907906544 +0300
389 @@ -188,6 +188,7 @@
390   */
391  #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
392  #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
393 +#define EXT3_STATE_DELETE              0x00000010 /* deferred delete inode */
394  
395  /*
396   * ioctl commands
397 @@ -315,6 +316,7 @@
398  #define EXT3_MOUNT_UPDATE_JOURNAL      0x1000  /* Update the journal format */
399  #define EXT3_MOUNT_NO_UID32            0x2000  /* Disable 32-bit UIDs */
400  #define EXT3_MOUNT_XATTR_USER          0x4000  /* Extended user attributes */
401 +#define EXT3_MOUNT_ASYNCDEL            0x20000 /* Delayed deletion */
402  
403  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
404  #ifndef _LINUX_EXT2_FS_H
405 @@ -639,6 +641,9 @@
406  extern void ext3_dirty_inode(struct inode *);
407  extern int ext3_change_inode_journal_flag(struct inode *, int);
408  extern void ext3_truncate (struct inode *);
409 +#ifdef EXT3_DELETE_THREAD
410 +extern void ext3_truncate_thread(struct inode *inode);
411 +#endif
412  extern void ext3_set_inode_flags(struct inode *);
413  
414  /* ioctl.c */
415 Index: linux-2.4.29/include/linux/ext3_fs_sb.h
416 ===================================================================
417 --- linux-2.4.29.orig/include/linux/ext3_fs_sb.h        2005-05-03 15:53:33.048533720 +0300
418 +++ linux-2.4.29/include/linux/ext3_fs_sb.h     2005-05-03 15:53:56.909906240 +0300
419 @@ -29,6 +29,8 @@
420  
421  #define EXT3_MAX_GROUP_LOADED  8
422  
423 +#define EXT3_DELETE_THREAD
424 +
425  /*
426   * third extended-fs super-block data in memory
427   */
428 @@ -74,6 +76,14 @@
429         struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
430         wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
431  #endif
432 +#ifdef EXT3_DELETE_THREAD
433 +       spinlock_t s_delete_lock;
434 +       struct list_head s_delete_list;
435 +       unsigned long s_delete_blocks;
436 +       unsigned long s_delete_inodes;
437 +       wait_queue_head_t s_delete_thread_queue;
438 +       wait_queue_head_t s_delete_waiter_queue;
439 +#endif
440  };
441  
442  #endif /* _LINUX_EXT3_FS_SB */