Whamcloud - gitweb
b=3119
[fs/lustre-release.git] / lustre / kernel_patches / patches / ext3-delete_thread-2.4.18-2.patch
1  fs/ext3/file.c             |    4 
2  fs/ext3/inode.c            |  112 +++++++++++++++++++++
3  fs/ext3/super.c            |  231 +++++++++++++++++++++++++++++++++++++++++++++
4  include/linux/ext3_fs.h    |    5 
5  include/linux/ext3_fs_sb.h |   10 +
6  5 files changed, 362 insertions(+)
7
8 --- linux-2.4.18-chaos/fs/ext3/file.c~ext3-delete_thread-2.4.18-2       2003-09-16 23:34:07.000000000 +0400
9 +++ linux-2.4.18-chaos-alexey/fs/ext3/file.c    2003-09-16 23:42:34.000000000 +0400
10 @@ -124,7 +124,11 @@ struct file_operations ext3_file_operati
11  };
12  
13  struct inode_operations ext3_file_inode_operations = {
14 +#ifdef EXT3_DELETE_THREAD
15 +       truncate:       ext3_truncate_thread,   /* BKL held */
16 +#else
17         truncate:       ext3_truncate,          /* BKL held */
18 +#endif
19         setattr:        ext3_setattr,           /* BKL held */
20  };
21  
22 --- linux-2.4.18-chaos/fs/ext3/inode.c~ext3-delete_thread-2.4.18-2      2003-09-16 23:39:37.000000000 +0400
23 +++ linux-2.4.18-chaos-alexey/fs/ext3/inode.c   2003-09-16 23:42:34.000000000 +0400
24 @@ -2041,6 +2041,118 @@ out_unlock:
25         return;         /* AKPM: return what? */
26  }
27  
28 +#ifdef EXT3_DELETE_THREAD
29 +/* Move the blocks of a to-be-truncated inode over to a new, unlinked inode
30 + * and queue that inode for the delete thread instead.  This avoids a lot of
31 + * latency when truncating large files.  Only truncates to size 0 are deferred.
32 + *
33 + * If we have any problem deferring the truncate, just truncate it right away.
34 + * If we defer it, we also record how many blocks it would free (in
35 + * sbi->s_delete_blocks), so that we can keep the statfs data correct, and we
36 + * know if we should sleep on the delete thread when we run out of space.
37 + */
38 +void ext3_truncate_thread(struct inode *old_inode)
39 +{
40 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
41 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
42 +       struct inode *new_inode;
43 +       handle_t *handle;
44 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
45 +
46 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
47 +               goto out_truncate; /* asyncdel off, or no delete thread running */
48 +
49 +       /* XXX This is a temporary limitation for code simplicity.
50 +        *     We could truncate to arbitrary sizes at some later time.
51 +        */
52 +       if (old_inode->i_size != 0)
53 +               goto out_truncate;
54 +
55 +       /* Don't defer for sync inodes, small files, or i_size > i_disksize */
56 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
57 +           old_inode->i_size > oei->i_disksize)
58 +               goto out_truncate;
59 +
60 +       /* We can't use the delete thread as-is during real orphan recovery,
61 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
62 +        * to loop endlessly.  It would be nice to do so, but needs work.
63 +        */
64 +       if (oei->i_state & EXT3_STATE_DELETE ||
65 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
66 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
67 +                          old_inode->i_ino, blocks);
68 +               goto out_truncate;
69 +       }
70 +
71 +       ext3_discard_prealloc(old_inode);
72 +
73 +       /* old_inode   = 1
74 +        * new_inode   = sb + GDT + ibitmap
75 +        * orphan list = 1 inode/superblock for add, 2 inodes for del
76 +        * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
77 +        */
78 +       handle = ext3_journal_start(old_inode, 7); /* NOTE(review): 1+3+1+2 = 7 leaves out the quota term listed above - confirm */
79 +       if (IS_ERR(handle))
80 +               goto out_truncate;
81 +
82 +       new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
83 +       if (IS_ERR(new_inode)) {
84 +               ext3_debug("truncate inode %lu directly (no new inodes)\n",
85 +                          old_inode->i_ino);
86 +               goto out_journal;
87 +       }
88 +
89 +       nei = EXT3_I(new_inode);
90 +
91 +       down_write(&oei->truncate_sem);
92 +       new_inode->i_size = old_inode->i_size;
93 +       new_inode->i_blocks = old_inode->i_blocks;
94 +       new_inode->i_uid = old_inode->i_uid;
95 +       new_inode->i_gid = old_inode->i_gid;
96 +       new_inode->i_nlink = 0;
97 +
98 +       /* FIXME when we do arbitrary truncates */
99 +       old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
100 +       old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
101 +
102 +       memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data)); /* new_inode takes over i_data */
103 +       memset(oei->i_data, 0, sizeof(oei->i_data)); /* old_inode keeps none of it */
104 +
105 +       nei->i_disksize = oei->i_disksize;
106 +       nei->i_state |= EXT3_STATE_DELETE; /* checked above and in ext3_delete_inode_thread() to avoid re-deferring */
107 +       up_write(&oei->truncate_sem);
108 +
109 +       if (ext3_orphan_add(handle, new_inode) < 0)
110 +               goto out_journal; /* NOTE(review): no iput(new_inode) here, unlike the orphan_del failure path below - possible leak, confirm */
111 +
112 +       if (ext3_orphan_del(handle, old_inode) < 0) {
113 +               ext3_orphan_del(handle, new_inode);
114 +               iput(new_inode);
115 +               goto out_journal;
116 +       }
117 +
118 +       ext3_journal_stop(handle, old_inode);
119 +
120 +       spin_lock(&sbi->s_delete_lock);
121 +       J_ASSERT(list_empty(&new_inode->i_dentry));
122 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); /* i_dentry is used as the delete-list link */
123 +       sbi->s_delete_blocks += blocks;
124 +       sbi->s_delete_inodes++;
125 +       spin_unlock(&sbi->s_delete_lock);
126 +
127 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
128 +                  new_inode->i_ino, blocks);
129 +
130 +       wake_up(&sbi->s_delete_thread_queue);
131 +       return;
132 +
133 +out_journal:
134 +       ext3_journal_stop(handle, old_inode);
135 +out_truncate:
136 +       ext3_truncate(old_inode); /* synchronous fallback for every non-deferred case */
137 +}
138 +#endif /* EXT3_DELETE_THREAD */
139 +
140  /* 
141   * ext3_get_inode_loc returns with an extra refcount against the
142   * inode's underlying buffer_head on success. 
143 --- linux-2.4.18-chaos/fs/ext3/super.c~ext3-delete_thread-2.4.18-2      2003-09-16 23:42:33.000000000 +0400
144 +++ linux-2.4.18-chaos-alexey/fs/ext3/super.c   2003-09-16 23:42:34.000000000 +0400
145 @@ -398,6 +398,220 @@ static void dump_orphan_list(struct supe
146         }
147  }
148  
149 +#ifdef EXT3_DELETE_THREAD
150 +/*
151 + * Kernel thread body: iput() every inode queued on sbi->s_delete_list, then
152 + * sleep on s_delete_thread_queue until more are queued or ASYNCDEL is cleared.
153 + * Clients just add new inodes to be deleted onto the end of the list.
154 + * If someone is concerned about free space (e.g. block allocation or similar)
155 + * then they can sleep on s_delete_waiter_queue, which is woken each time the
156 + * list has been drained.  The thread exits once it wakes to an empty list.
157 + */
158 +int ext3_delete_thread(void *data)
159 +{
160 +       struct super_block *sb = data;
161 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
162 +       struct task_struct *tsk = current;
163 +
164 +       /* Almost like daemonize, but not quite */
165 +       exit_mm(current);
166 +       tsk->session = 1;
167 +       tsk->pgrp = 1;
168 +       tsk->tty = NULL;
169 +       exit_files(current);
170 +       reparent_to_init();
171 +
172 +       sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
173 +       sigfillset(&tsk->blocked);
174 +
175 +       /*tsk->flags |= PF_KERNTHREAD;*/
176 +
177 +       INIT_LIST_HEAD(&sbi->s_delete_list); /* non-NULL .next is what callers test for "thread running" */
178 +       wake_up(&sbi->s_delete_waiter_queue); /* lets ext3_start_delete_thread() return */
179 +       ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
180 +
181 +       /* main loop */
182 +       for (;;) {
183 +               wait_event_interruptible(sbi->s_delete_thread_queue,
184 +                                        !list_empty(&sbi->s_delete_list) ||
185 +                                        !test_opt(sb, ASYNCDEL));
186 +               ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
187 +                          tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
188 +
189 +               spin_lock(&sbi->s_delete_lock);
190 +               if (list_empty(&sbi->s_delete_list)) { /* woken with nothing queued: shut down */
191 +                       clear_opt(sbi->s_mount_opt, ASYNCDEL);
192 +                       memset(&sbi->s_delete_list, 0,
193 +                              sizeof(sbi->s_delete_list)); /* .next == NULL again: no thread */
194 +                       spin_unlock(&sbi->s_delete_lock);
195 +                       ext3_debug("delete thread on %s exiting\n",
196 +                                  kdevname(sb->s_dev));
197 +                       wake_up(&sbi->s_delete_waiter_queue);
198 +                       break;
199 +               }
200 +
201 +               while (!list_empty(&sbi->s_delete_list)) {
202 +                       struct inode *inode=list_entry(sbi->s_delete_list.next,
203 +                                                      struct inode, i_dentry);
204 +                       unsigned long blocks = inode->i_blocks >>
205 +                                                       (inode->i_blkbits - 9);
206 +
207 +                       list_del_init(&inode->i_dentry);
208 +                       spin_unlock(&sbi->s_delete_lock);
209 +                       ext3_debug("%s delete ino %lu blk %lu\n",
210 +                                  tsk->comm, inode->i_ino, blocks);
211 +
212 +                       iput(inode); /* called with s_delete_lock dropped */
213 +
214 +                       spin_lock(&sbi->s_delete_lock);
215 +                       sbi->s_delete_blocks -= blocks;
216 +                       sbi->s_delete_inodes--;
217 +               }
218 +               if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) { /* counters expected to be 0 once the list is empty */
219 +                       ext3_warning(sb, __FUNCTION__,
220 +                                    "%lu blocks, %lu inodes on list?\n",
221 +                                    sbi->s_delete_blocks,sbi->s_delete_inodes);
222 +                       sbi->s_delete_blocks = 0;
223 +                       sbi->s_delete_inodes = 0;
224 +               }
225 +               spin_unlock(&sbi->s_delete_lock);
226 +               wake_up(&sbi->s_delete_waiter_queue); /* list drained: notify anyone waiting */
227 +       }
228 +
229 +       return 0;
230 +}
231 +
232 +static void ext3_start_delete_thread(struct super_block *sb)
233 +{ /* Init the delete-thread lock and wait queues; spawn the thread if ASYNCDEL is set */
234 +       struct ext3_sb_info *sbi = EXT3_SB(sb);
235 +       int rc;
236 +
237 +       spin_lock_init(&sbi->s_delete_lock);
238 +       init_waitqueue_head(&sbi->s_delete_thread_queue);
239 +       init_waitqueue_head(&sbi->s_delete_waiter_queue);
240 +
241 +       if (!test_opt(sb, ASYNCDEL))
242 +               return; /* asyncdel not enabled: no thread is started */
243 +
244 +       rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
245 +       if (rc < 0)
246 +               printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
247 +                      rc); /* failure is only logged here */
248 +       else /* wait until the thread has run INIT_LIST_HEAD() on s_delete_list */
249 +               wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
250 +}
251 +
252 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
253 +{ /* Clear ASYNCDEL, wake the delete thread and wait for an empty delete list */
254 +       if (sbi->s_delete_list.next == 0)       /* no thread running (list head is zeroed) */
255 +               return;
256 +
257 +       clear_opt(sbi->s_mount_opt, ASYNCDEL); /* makes the thread's wait condition true */
258 +       wake_up(&sbi->s_delete_thread_queue); /* NOTE(review): the thread memset()s the list head on exit, after which list_empty() below is false - possible race, confirm */
259 +       wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list));
260 +}
261 +
262 +/* Instead of playing games with the inode flags, destruction, etc we just
263 + * read a second in-core copy of the inode and put it on a list for the delete
264 + * thread.  We need large parts of the inode struct in order to complete the
265 + * truncate and unlink, so we may as well just have a real inode to do it.
266 + *
267 + * If we have any problem deferring the delete, just delete it right away.
268 + * If we defer it, we also mark how many blocks it would free, so that we
269 + * can keep the statfs data correct, and we know if we should sleep on the
270 + * delete thread when we run out of space.
271 + */
272 +static void ext3_delete_inode_thread(struct inode *old_inode)
273 +{
274 +       struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
275 +       struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
276 +       struct inode *new_inode;
277 +       unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
278 +
279 +       if (is_bad_inode(old_inode)) {
280 +               clear_inode(old_inode); /* bad inode: just clear it, nothing is deferred or deleted */
281 +               return;
282 +       }
283 +
284 +       if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
285 +               goto out_delete; /* asyncdel off, or no delete thread running */
286 +
287 +       /* Don't defer for sync inodes or files of <= EXT3_NDIR_BLOCKS blocks */
288 +       if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
289 +               goto out_delete;
290 +
291 +       /* We can't use the delete thread as-is during real orphan recovery,
292 +        * as we add to the orphan list here, causing ext3_orphan_cleanup()
293 +        * to loop endlessly.  It would be nice to do so, but needs work.
294 +        */
295 +       if (oei->i_state & EXT3_STATE_DELETE ||
296 +           sbi->s_mount_state & EXT3_ORPHAN_FS) {
297 +               ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
298 +                          old_inode->i_ino, blocks);
299 +               goto out_delete;
300 +       }
301 +
302 +       /* We can iget this inode again here, because our caller has unhashed
303 +        * old_inode, so new_inode will be in a different inode struct.
304 +        *
305 +        * We need to ensure that the i_orphan pointers in the other inodes
306 +        * point at the new inode copy instead of the old one so the orphan
307 +        * list doesn't get corrupted when the old orphan inode is freed.
308 +        */
309 +       down(&sbi->s_orphan_lock);
310 +
311 +       sbi->s_mount_state |= EXT3_ORPHAN_FS; /* NOTE(review): set only around iget(); presumably so the read path accepts this unlinked inode - confirm */
312 +       new_inode = iget(old_inode->i_sb, old_inode->i_ino);
313 +       sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
314 +       if (is_bad_inode(new_inode)) {
315 +               printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
316 +               iput(new_inode);
317 +               new_inode = NULL;
318 +       }
319 +       if (!new_inode) {
320 +               up(&sbi->s_orphan_lock);
321 +               ext3_debug("delete inode %lu directly (bad read)\n",
322 +                          old_inode->i_ino);
323 +               goto out_delete;
324 +       }
325 +       J_ASSERT(new_inode != old_inode);
326 +
327 +       J_ASSERT(!list_empty(&oei->i_orphan));
328 +
329 +       nei = EXT3_I(new_inode);
330 +       /* Ugh.  We need to insert new_inode into the same spot on the list
331 +        * as old_inode was, to ensure the in-memory orphan list is still
332 +        * in the same order as the on-disk orphan list (badness otherwise).
333 +        */
334 +       nei->i_orphan = oei->i_orphan;
335 +       nei->i_orphan.next->prev = &nei->i_orphan;
336 +       nei->i_orphan.prev->next = &nei->i_orphan;
337 +       nei->i_state |= EXT3_STATE_DELETE; /* so the later delete of new_inode is not deferred again */
338 +       up(&sbi->s_orphan_lock);
339 +
340 +       clear_inode(old_inode);
341 +
342 +       spin_lock(&sbi->s_delete_lock);
343 +       J_ASSERT(list_empty(&new_inode->i_dentry));
344 +       list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); /* i_dentry is used as the delete-list link */
345 +       sbi->s_delete_blocks += blocks;
346 +       sbi->s_delete_inodes++;
347 +       spin_unlock(&sbi->s_delete_lock);
348 +
349 +       ext3_debug("delete inode %lu (%lu blocks) by thread\n",
350 +                  new_inode->i_ino, blocks);
351 +
352 +       wake_up(&sbi->s_delete_thread_queue);
353 +       return;
354 +
355 +out_delete:
356 +       ext3_delete_inode(old_inode); /* synchronous fallback for every non-deferred case */
357 +}
358 +#else
359 +#define ext3_start_delete_thread(sbi) do {} while(0)
360 +#define ext3_stop_delete_thread(sbi) do {} while(0)
361 +#endif /* EXT3_DELETE_THREAD */
362 +
363  void ext3_put_super (struct super_block * sb)
364  {
365         struct ext3_sb_info *sbi = EXT3_SB(sb);
366 @@ -405,6 +619,7 @@ void ext3_put_super (struct super_block 
367         kdev_t j_dev = sbi->s_journal->j_dev;
368         int i;
369  
370 +       ext3_stop_delete_thread(sbi);
371         ext3_xattr_put_super(sb);
372         journal_destroy(sbi->s_journal);
373         if (!(sb->s_flags & MS_RDONLY)) {
374 @@ -453,7 +668,11 @@ static struct super_operations ext3_sops
375         write_inode:    ext3_write_inode,       /* BKL not held.  Don't need */
376         dirty_inode:    ext3_dirty_inode,       /* BKL not held.  We take it */
377         put_inode:      ext3_put_inode,         /* BKL not held.  Don't need */
378 +#ifdef EXT3_DELETE_THREAD
379 +       delete_inode:   ext3_delete_inode_thread,/* BKL not held. We take it */
380 +#else
381         delete_inode:   ext3_delete_inode,      /* BKL not held.  We take it */
382 +#endif
383         put_super:      ext3_put_super,         /* BKL held */
384         write_super:    ext3_write_super,       /* BKL held */
385         sync_fs:        ext3_sync_fs,
386 @@ -514,6 +733,14 @@ static int parse_options (char * options
387              this_char = strtok (NULL, ",")) {
388                 if ((value = strchr (this_char, '=')) != NULL)
389                         *value++ = 0;
390 +#ifdef EXT3_DELETE_THREAD
391 +               if (!strcmp(this_char, "asyncdel"))
392 +                       set_opt(*mount_options, ASYNCDEL);
393 +               else if (!strcmp(this_char, "noasyncdel"))
394 +                       clear_opt(*mount_options, ASYNCDEL);
395 +               else
396 +#endif
397 +
398                 if (!strcmp (this_char, "bsddf"))
399                         clear_opt (*mount_options, MINIX_DF);
400                 else if (!strcmp (this_char, "nouid32")) {
401 @@ -1203,6 +1430,7 @@ struct super_block * ext3_read_super (st
402         }
403  
404         ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
405 +       ext3_start_delete_thread(sb);
406         /*
407          * akpm: core read_super() calls in here with the superblock locked.
408          * That deadlocks, because orphan cleanup needs to lock the superblock
409 @@ -1643,6 +1871,9 @@ int ext3_remount (struct super_block * s
410         if (!parse_options(data, &tmp, sbi, &tmp, 1))
411                 return -EINVAL;
412  
413 +       if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
414 +               ext3_stop_delete_thread(sbi);
415 +
416         if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
417                 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
418  
419 --- linux-2.4.18-chaos/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18-2      2003-09-16 23:39:37.000000000 +0400
420 +++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs.h   2003-09-16 23:42:34.000000000 +0400
421 @@ -195,6 +195,7 @@ struct ext3_group_desc
422   */
423  #define EXT3_STATE_JDATA               0x00000001 /* journaled data exists */
424  #define EXT3_STATE_NEW                 0x00000002 /* inode is newly created */
425 +#define EXT3_STATE_DELETE              0x00000010 /* deferred delete inode */
426  
427  /*
428   * ioctl commands
429 @@ -322,6 +323,7 @@ struct ext3_inode {
430    #define EXT3_MOUNT_WRITEBACK_DATA    0x0C00  /* No data ordering */
431  #define EXT3_MOUNT_UPDATE_JOURNAL      0x1000  /* Update the journal format */
432  #define EXT3_MOUNT_NO_UID32            0x2000  /* Disable 32-bit UIDs */
433 +#define EXT3_MOUNT_ASYNCDEL            0x20000 /* Delayed deletion */
434  
435  /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
436  #ifndef _LINUX_EXT2_FS_H
437 @@ -708,6 +710,9 @@ extern void ext3_discard_prealloc (struc
438  extern void ext3_dirty_inode(struct inode *);
439  extern int ext3_change_inode_journal_flag(struct inode *, int);
440  extern void ext3_truncate (struct inode *);
441 +#ifdef EXT3_DELETE_THREAD
442 +extern void ext3_truncate_thread(struct inode *inode);
443 +#endif
444  
445  /* ioctl.c */
446  extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
447 --- linux-2.4.18-chaos/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18-2   2003-09-16 23:42:33.000000000 +0400
448 +++ linux-2.4.18-chaos-alexey/include/linux/ext3_fs_sb.h        2003-09-16 23:42:34.000000000 +0400
449 @@ -29,6 +29,8 @@
450  
451  #define EXT3_MAX_GROUP_LOADED  32
452  
453 +#define EXT3_DELETE_THREAD
454 +
455  /*
456   * third extended-fs super-block data in memory
457   */
458 @@ -76,6 +78,14 @@ struct ext3_sb_info {
459         struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
460         wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
461  #endif
462 +#ifdef EXT3_DELETE_THREAD
463 +       spinlock_t s_delete_lock;
464 +       struct list_head s_delete_list;
465 +       unsigned long s_delete_blocks;
466 +       unsigned long s_delete_inodes;
467 +       wait_queue_head_t s_delete_thread_queue;
468 +       wait_queue_head_t s_delete_waiter_queue;
469 +#endif
470  };
471  
472  #endif /* _LINUX_EXT3_FS_SB */
473
474 _