Whamcloud - gitweb
oops make a mistake, change snapfs_core-2.4.20.path to snapfs_core-2.4.20.patch
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-delete_thread-2.4.20.patch
1  fs/ext3/file.c             |    4 
2  fs/ext3/inode.c            |  116 ++++++++++++++++++++++
3  fs/ext3/super.c            |  230 +++++++++++++++++++++++++++++++++++++++++++++
4  include/linux/ext3_fs.h    |    5 
5  include/linux/ext3_fs_sb.h |   10 +
6  5 files changed, 365 insertions(+)
7
8 --- linux/fs/ext3/super.c~ext3-delete_thread-2.4.20     Thu Jul 10 14:11:32 2003
9 +++ linux-mmonroe/fs/ext3/super.c       Thu Jul 10 14:11:33 2003
10 @@ -400,6 +400,220 @@ static void dump_orphan_list(struct supe
11         }
12  }
13  
14 +#ifdef EXT3_DELETE_THREAD
15 +/*
16 + * Per-superblock kernel thread (data is the struct super_block *) that
17 + * drains s_delete_list, calling iput() on each queued inode, then sleeps on
18 + * s_delete_thread_queue until more inodes are queued at the tail of the list.
19 + * Waiters concerned about free space (e.g. block allocation) can sleep on
20 + * s_delete_waiter_queue, which is woken after every pass over the list.  The
21 + * thread returns 0 once it is woken with ASYNCDEL cleared and the list empty.
22 + */
23 +int ext3_delete_thread(void *data)
24 +{
25 +       struct super_block *sb = data;
26 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
27 +       struct task_struct *tsk = current;
28 +
29 +       /* Almost like daemonize, but not quite */
30 +       exit_mm(current);
31 +       tsk->session = 1;
32 +       tsk->pgrp = 1;
33 +       tsk->tty = NULL;
34 +       exit_files(current);
35 +       reparent_to_init();
36 +
37 +       sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
38 +       sigfillset(&tsk->blocked);
39 +
40 +       /*tsk->flags |= PF_KERNTHREAD;*/
41 +       /* A non-zero s_delete_list.next tells the rest of ext3 we are running */
42 +       INIT_LIST_HEAD(&sbi->s_delete_list);
43 +       wake_up(&sbi->s_delete_waiter_queue);
44 +       ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
45 +
46 +       /* main loop: sleep until inodes are queued or ASYNCDEL is cleared */
47 +       for (;;) {
48 +               wait_event_interruptible(sbi->s_delete_thread_queue,
49 +                                        !list_empty(&sbi->s_delete_list) ||
50 +                                        !test_opt(sb, ASYNCDEL));
51 +               ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
52 +                          tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
53 +               /* Woken with nothing queued: zero the list head again and exit */
54 +               spin_lock(&sbi->s_delete_lock);
55 +               if (list_empty(&sbi->s_delete_list)) {
56 +                       clear_opt(sbi->s_mount_opt, ASYNCDEL);
57 +                       memset(&sbi->s_delete_list, 0,
58 +                              sizeof(sbi->s_delete_list));
59 +                       spin_unlock(&sbi->s_delete_lock);
60 +                       ext3_debug("delete thread on %s exiting\n",
61 +                                  kdevname(sb->s_dev));
62 +                       wake_up(&sbi->s_delete_waiter_queue);
63 +                       break;
64 +               }
65 +               /* Drain the list; the lock is dropped around each iput() */
66 +               while (!list_empty(&sbi->s_delete_list)) {
67 +                       struct inode *inode=list_entry(sbi->s_delete_list.next,
68 +                                                      struct inode, i_dentry);
69 +                       unsigned long blocks = inode->i_blocks >>
70 +                                                       (inode->i_blkbits - 9);
71 +
72 +                       list_del_init(&inode->i_dentry);
73 +                       spin_unlock(&sbi->s_delete_lock);
74 +                       ext3_debug("%s delete ino %lu blk %lu\n",
75 +                                  tsk->comm, inode->i_ino, blocks);
76 +
77 +                       iput(inode);
78 +
79 +                       spin_lock(&sbi->s_delete_lock);
80 +                       sbi->s_delete_blocks -= blocks;
81 +                       sbi->s_delete_inodes--;
82 +               }
83 +               if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
84 +                       ext3_warning(sb, __FUNCTION__,
85 +                                    "%lu blocks, %lu inodes on list?\n",
86 +                                    sbi->s_delete_blocks,sbi->s_delete_inodes);
87 +                       sbi->s_delete_blocks = 0;
88 +                       sbi->s_delete_inodes = 0;
89 +               }
90 +               spin_unlock(&sbi->s_delete_lock);
91 +               wake_up(&sbi->s_delete_waiter_queue);
92 +       }
93 +
94 +       return 0;
95 +}
96 +
97 +static void ext3_start_delete_thread(struct super_block *sb)
98 +{
99 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
100 +       int rc;
101 +       /* Set up delete-thread state; spawn the thread only if ASYNCDEL is set */
102 +       spin_lock_init(&sbi->s_delete_lock);
103 +       init_waitqueue_head(&sbi->s_delete_thread_queue);
104 +       init_waitqueue_head(&sbi->s_delete_waiter_queue);
105 +
106 +       if (!test_opt(sb, ASYNCDEL))
107 +               return;
108 +       /* On success, block until the thread has initialised s_delete_list */
109 +       rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
110 +       if (rc < 0)
111 +               printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
112 +                      rc);
113 +       else
114 +               wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
115 +}
116 +
117 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
118 +{
119 +       if (sbi->s_delete_list.next == 0)       /* thread never started */
120 +               return;
121 +       /* Ask the thread to exit; it zeroes s_delete_list once it is done. */
122 +       clear_opt(sbi->s_mount_opt, ASYNCDEL);
123 +       wake_up(&sbi->s_delete_thread_queue);
124 +       wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next == 0);
125 +}
126 +
127 +/* Instead of playing games with the inode flags, destruction, etc we just
128 + * create a new inode locally and put it on a list for the truncate thread.
129 + * We need large parts of the inode struct in order to complete the
130 + * truncate and unlink, so we may as well just have a real inode to do it.
131 + *
132 + * If we have any problem deferring the delete, just delete it right away.
133 + * If we defer it, we also mark how many blocks it would free, so that we
134 + * can keep the statfs data correct, and we know if we should sleep on the
135 + * delete thread when we run out of space.
136 + */
137 +static void ext3_delete_inode_thread(struct inode *old_inode)
138 +{
139 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
140 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
141 +       struct inode *new_inode;
142 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
143 +
144 +       if (is_bad_inode(old_inode)) {
145 +               clear_inode(old_inode);
146 +               return;
147 +       }
148 +
149 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
150 +               goto out_delete;
151 +
152 +       /* We may want to delete the inode immediately and not defer it */
153 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
154 +               goto out_delete;
155 +
156 +       /* We can't use the delete thread as-is during real orphan recovery,
157 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
158 +        * to loop endlessly.  It would be nice to do so, but needs work.
159 +        */
160 +       if (oei->i_state & EXT3_STATE_DELETE ||
161 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
162 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
163 +                          old_inode->i_ino, blocks);
164 +               goto out_delete;
165 +       }
166 +
167 +       /* We can iget this inode again here, because our caller has unhashed
168 +        * old_inode, so new_inode will be in a different inode struct.
169 +        *
170 +        * We need to ensure that the i_orphan pointers in the other inodes
171 +        * point at the new inode copy instead of the old one so the orphan
172 +        * list doesn't get corrupted when the old orphan inode is freed.
173 +        */
174 +       down(&sbi->s_orphan_lock);
175 +
176 +       sbi->s_mount_state |= EXT3_ORPHAN_FS;
177 +       new_inode = iget(old_inode->i_sb, old_inode->i_ino);
178 +       sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
179 +       if (new_inode && is_bad_inode(new_inode)) {     /* NULL: see below */
180 +               printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
181 +               iput(new_inode);
182 +               new_inode = NULL;
183 +       }
184 +       if (!new_inode) {
185 +               up(&sbi->s_orphan_lock);
186 +               ext3_debug("delete inode %lu directly (bad read)\n",
187 +                          old_inode->i_ino);
188 +               goto out_delete;
189 +       }
190 +       J_ASSERT(new_inode != old_inode);
191 +
192 +       J_ASSERT(!list_empty(&oei->i_orphan));
193 +
194 +       nei = EXT3_I(new_inode);
195 +       /* Ugh.  We need to insert new_inode into the same spot on the list
196 +        * as old_inode was, to ensure the in-memory orphan list is still
197 +        * in the same order as the on-disk orphan list (badness otherwise).
198 +        */
199 +       nei->i_orphan = oei->i_orphan;
200 +       nei->i_orphan.next->prev = &nei->i_orphan;
201 +       nei->i_orphan.prev->next = &nei->i_orphan;
202 +       nei->i_state |= EXT3_STATE_DELETE;
203 +       up(&sbi->s_orphan_lock);
204 +
205 +       clear_inode(old_inode);
206 +
207 +       spin_lock(&sbi->s_delete_lock);
208 +       J_ASSERT(list_empty(&new_inode->i_dentry));
209 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
210 +       sbi->s_delete_blocks += blocks;
211 +       sbi->s_delete_inodes++;
212 +       spin_unlock(&sbi->s_delete_lock);
213 +
214 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
215 +                  new_inode->i_ino, blocks);
216 +
217 +       wake_up(&sbi->s_delete_thread_queue);
218 +       return;
219 +
220 +out_delete:
221 +       ext3_delete_inode(old_inode);
222 +}
223 +#else
224 +#define ext3_start_delete_thread(sbi) do {} while(0)
225 +#define ext3_stop_delete_thread(sbi) do {} while(0)
226 +#endif /* EXT3_DELETE_THREAD */
227 +
228  void ext3_put_super (struct super_block * sb)
229  {
230         struct ext3_sb_info *sbi = EXT3_SB(sb);
231 @@ -407,6 +621,7 @@ void ext3_put_super (struct super_block 
232         kdev_t j_dev = sbi->s_journal->j_dev;
233         int i;
234  
235 +       ext3_stop_delete_thread(sbi);
236         ext3_xattr_put_super(sb);
237         journal_destroy(sbi->s_journal);
238         if (!(sb->s_flags & MS_RDONLY)) {
239 @@ -455,7 +670,11 @@ static struct super_operations ext3_sops
240         write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
241         dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
242         put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
243 +#ifdef EXT3_DELETE_THREAD
244 +       delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
245 +#else
246         delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
247 +#endif
248         put_super:      ext3_put_super,         /* BKL held */
249         write_super:    ext3_write_super,       /* BKL held */
250         sync_fs:        ext3_sync_fs,
251 @@ -524,6 +743,13 @@ static int parse_options (char * options
252                         clear_opt (*mount_options, XATTR_USER);
253                 else
254  #endif
255 +#ifdef EXT3_DELETE_THREAD
256 +               if (!strcmp(this_char, "asyncdel"))
257 +                       set_opt(*mount_options, ASYNCDEL);
258 +               else if (!strcmp(this_char, "noasyncdel"))
259 +                       clear_opt(*mount_options, ASYNCDEL);
260 +               else
261 +#endif
262                 if (!strcmp (this_char, "bsddf"))
263                         clear_opt (*mount_options, MINIX_DF);
264                 else if (!strcmp (this_char, "nouid32")) {
265 @@ -1223,6 +1449,7 @@ struct super_block * ext3_read_super (st
266         }
267  
268         ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
269 +       ext3_start_delete_thread(sb);
270         /*
271          * akpm: core read_super() calls in here with the superblock locked.
272          * That deadlocks, because orphan cleanup needs to lock the superblock
273 @@ -1678,6 +1905,9 @@ int ext3_remount (struct super_block * s
274         if (!parse_options(data, &tmp, sbi, &tmp, 1))
275                 return -EINVAL;
276  
277 +       if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
278 +               ext3_stop_delete_thread(sbi);
279 +
280         if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
281                 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
282  
283 --- linux/fs/ext3/inode.c~ext3-delete_thread-2.4.20     Thu Jul 10 14:11:29 2003
284 +++ linux-mmonroe/fs/ext3/inode.c       Thu Jul 10 14:11:33 2003
285 @@ -2013,6 +2013,118 @@ out_stop:
286         ext3_journal_stop(handle, inode);
287  }
288  
289 +#ifdef EXT3_DELETE_THREAD
290 +/* Move blocks from to-be-truncated inode over to a new inode, and delete
291 + * that one from the delete thread instead.  This avoids a lot of latency
292 + * when truncating large files.
293 + *
294 + * If we have any problem deferring the truncate, just truncate it right away.
295 + * If we defer it, we also mark how many blocks it would free, so that we
296 + * can keep the statfs data correct, and we know if we should sleep on the
297 + * delete thread when we run out of space.
298 + */
299 +void ext3_truncate_thread(struct inode *old_inode)
300 +{
301 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
302 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
303 +       struct inode *new_inode;
304 +       handle_t *handle;
305 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
306 +
307 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
308 +               goto out_truncate;
309 +
310 +       /* XXX This is a temporary limitation for code simplicity.
311 +        *     We could truncate to arbitrary sizes at some later time. */
312 +       if (old_inode->i_size != 0)
313 +               goto out_truncate;
314 +
315 +       /* We may want to truncate the inode immediately and not defer it */
316 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
317 +           old_inode->i_size > oei->i_disksize)
318 +               goto out_truncate;
319 +
320 +       /* We can't use the delete thread as-is during real orphan recovery,
321 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
322 +        * to loop endlessly.  It would be nice to do so, but needs work.
323 +        */
324 +       if (oei->i_state & EXT3_STATE_DELETE ||
325 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
326 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
327 +                          old_inode->i_ino, blocks);
328 +               goto out_truncate;
329 +       }
330 +
331 +       ext3_discard_prealloc(old_inode);
332 +
333 +       /* old_inode   = 1
334 +        * new_inode   = sb + GDT + ibitmap
335 +        * orphan list = 1 inode/superblock for add, 2 inodes for del
336 +        * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS */
337 +       handle = ext3_journal_start(old_inode, 7);
338 +       if (IS_ERR(handle))
339 +               goto out_truncate;
340 +
341 +       new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
342 +       if (IS_ERR(new_inode)) {
343 +               ext3_debug("truncate inode %lu directly (no new inodes)\n",
344 +                          old_inode->i_ino);
345 +               goto out_journal;
346 +       }
347 +
348 +       nei = EXT3_I(new_inode);
349 +
350 +       down_write(&oei->truncate_sem);
351 +       new_inode->i_size = old_inode->i_size;
352 +       new_inode->i_blocks = old_inode->i_blocks;
353 +       new_inode->i_uid = old_inode->i_uid;
354 +       new_inode->i_gid = old_inode->i_gid;
355 +       new_inode->i_nlink = 0;
356 +
357 +       /* FIXME when we do arbitrary truncates */
358 +       old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
359 +       old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
360 +
361 +       memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
362 +       memset(oei->i_data, 0, sizeof(oei->i_data));
363 +
364 +       nei->i_disksize = oei->i_disksize;
365 +       nei->i_state |= EXT3_STATE_DELETE;
366 +       up_write(&oei->truncate_sem);
367 +
368 +       if (ext3_orphan_add(handle, new_inode) < 0) {
369 +               iput(new_inode);        /* it holds the moved blocks */
370 +               goto out_journal;
371 +       }
372 +
373 +       if (ext3_orphan_del(handle, old_inode) < 0) {
374 +               ext3_orphan_del(handle, new_inode);
375 +               iput(new_inode);
376 +               goto out_journal;
377 +       }
378 +
379 +       ext3_journal_stop(handle, old_inode);
380 +
381 +       spin_lock(&sbi->s_delete_lock);
382 +       J_ASSERT(list_empty(&new_inode->i_dentry));
383 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
384 +       sbi->s_delete_blocks += blocks;
385 +       sbi->s_delete_inodes++;
386 +       spin_unlock(&sbi->s_delete_lock);
387 +
388 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
389 +                  new_inode->i_ino, blocks);
390 +
391 +       wake_up(&sbi->s_delete_thread_queue);
392 +       return;
393 +
394 +out_journal:
395 +       ext3_journal_stop(handle, old_inode);
396 +out_truncate:
397 +       ext3_truncate(old_inode);
398 +}
399 +#endif /* EXT3_DELETE_THREAD */
400 +
401  /* 
402   * ext3_get_inode_loc returns with an extra refcount against the
403   * inode's underlying buffer_head on success. 
404 --- linux/fs/ext3/file.c~ext3-delete_thread-2.4.20      Thu Jul 10 14:11:21 2003
405 +++ linux-mmonroe/fs/ext3/file.c        Thu Jul 10 14:12:17 2003
406 @@ -125,7 +125,11 @@ struct file_operations ext3_file_operati
407  };
408  
409  struct inode_operations ext3_file_inode_operations = {
410 +#ifdef EXT3_DELETE_THREAD
411 +       truncate:       ext3_truncate_thread,   /* BKL held */
412 +#else
413         truncate:       ext3_truncate,          /* BKL held */
414 +#endif
415         setattr:        ext3_setattr,           /* BKL held */
416         setxattr:       ext3_setxattr,          /* BKL held */
417         getxattr:       ext3_getxattr,          /* BKL held */
418 --- linux/include/linux/ext3_fs.h~ext3-delete_thread-2.4.20     Thu Jul 10 14:11:26 2003
419 +++ linux-mmonroe/include/linux/ext3_fs.h       Thu Jul 10 14:11:33 2003
420 @@ -193,6 +193,7 @@ struct ext3_group_desc
421   */
422  #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
423  #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
424 +#define EXT3_STATE_DELETE              0x00000010 /* deferred delete inode */
425  
426  /*
427   * ioctl commands
428 @@ -320,6 +321,7 @@ struct ext3_inode {
429  #define EXT3_MOUNT_UPDATE_JOURNAL      0x1000  /* Update the journal format */
430  #define EXT3_MOUNT_NO_UID32            0x2000  /* Disable 32-bit UIDs */
431  #define EXT3_MOUNT_XATTR_USER          0x4000  /* Extended user attributes */
432 +#define EXT3_MOUNT_ASYNCDEL            0x20000 /* Delayed deletion */
433  
434  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
435  #ifndef _LINUX_EXT2_FS_H
436 @@ -694,6 +696,9 @@ extern void ext3_discard_prealloc (struc
437  extern void ext3_dirty_inode(struct inode *);
438  extern int ext3_change_inode_journal_flag(struct inode *, int);
439  extern void ext3_truncate (struct inode *);
440 +#ifdef EXT3_DELETE_THREAD
441 +extern void ext3_truncate_thread(struct inode *inode);
442 +#endif
443  
444  /* ioctl.c */
445  extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
446 --- linux/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.20  Thu Jul 10 14:11:32 2003
447 +++ linux-mmonroe/include/linux/ext3_fs_sb.h    Thu Jul 10 14:11:33 2003
448 @@ -29,6 +29,8 @@
449  
450  #define EXT3_MAX_GROUP_LOADED  8
451  
452 +#define EXT3_DELETE_THREAD
453 +
454  /*
455   * third extended-fs super-block data in memory
456   */
457 @@ -76,6 +78,14 @@ struct ext3_sb_info {
458         struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
459         wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
460  #endif
461 +#ifdef EXT3_DELETE_THREAD
462 +       spinlock_t s_delete_lock;
463 +       struct list_head s_delete_list;
464 +       unsigned long s_delete_blocks;
465 +       unsigned long s_delete_inodes;
466 +       wait_queue_head_t s_delete_thread_queue;
467 +       wait_queue_head_t s_delete_waiter_queue;
468 +#endif
469  };
470  
471  #endif /* _LINUX_EXT3_FS_SB */
472
473 _