2 Create a service thread to handle deletion and truncation of inodes, to avoid
3 long latency while truncating very large files.
6 fs/ext3/inode.c | 116 ++++++++++++++++++++++
7 fs/ext3/super.c | 231 +++++++++++++++++++++++++++++++++++++++++++++
8 include/linux/ext3_fs.h | 5
9 include/linux/ext3_fs_sb.h | 10 +
10 4 files changed, 362 insertions(+)
12 Index: linux-2.4.18-chaos/fs/ext3/super.c
13 ===================================================================
14 --- linux-2.4.18-chaos.orig/fs/ext3/super.c 2004-01-13 15:39:03.000000000 +0300
15 +++ linux-2.4.18-chaos/fs/ext3/super.c 2004-01-13 16:35:05.000000000 +0300
20 +#ifdef EXT3_DELETE_THREAD
22 + * Delete inodes in a loop until there are no more to be deleted.
23 + * Normally, we run in the background doing the deletes and sleeping again,
24 + * and clients just add new inodes to be deleted onto the end of the list.
25 + * If someone is concerned about free space (e.g. block allocation or similar)
26 + * then they can sleep on s_delete_waiter_queue and be woken up when space
29 +int ext3_delete_thread(void *data)
31 + struct super_block *sb = data;
32 + struct ext3_sb_info *sbi = EXT3_SB(sb);
33 + struct task_struct *tsk = current;
35 + /* Almost like daemonize, but not quite */
40 + exit_files(current);
43 + sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
44 + sigfillset(&tsk->blocked);
46 + /*tsk->flags |= PF_KERNTHREAD;*/
48 + INIT_LIST_HEAD(&sbi->s_delete_list);
49 + wake_up(&sbi->s_delete_waiter_queue);
50 + ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
54 + wait_event_interruptible(sbi->s_delete_thread_queue,
55 + !list_empty(&sbi->s_delete_list) ||
56 + !test_opt(sb, ASYNCDEL));
57 + ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
58 + tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
60 + spin_lock(&sbi->s_delete_lock);
61 + if (list_empty(&sbi->s_delete_list)) {
62 + clear_opt(sbi->s_mount_opt, ASYNCDEL);
63 + memset(&sbi->s_delete_list, 0,
64 + sizeof(sbi->s_delete_list));
65 + spin_unlock(&sbi->s_delete_lock);
66 + ext3_debug("delete thread on %s exiting\n",
67 + kdevname(sb->s_dev));
68 + wake_up(&sbi->s_delete_waiter_queue);
72 + while (!list_empty(&sbi->s_delete_list)) {
73 + struct inode *inode=list_entry(sbi->s_delete_list.next,
74 + struct inode, i_dentry);
75 + unsigned long blocks = inode->i_blocks >>
76 + (inode->i_blkbits - 9);
78 + list_del_init(&inode->i_dentry);
79 + spin_unlock(&sbi->s_delete_lock);
80 + ext3_debug("%s delete ino %lu blk %lu\n",
81 + tsk->comm, inode->i_ino, blocks);
85 + spin_lock(&sbi->s_delete_lock);
86 + sbi->s_delete_blocks -= blocks;
87 + sbi->s_delete_inodes--;
89 + if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
90 + ext3_warning(sb, __FUNCTION__,
91 + "%lu blocks, %lu inodes on list?\n",
92 + sbi->s_delete_blocks,sbi->s_delete_inodes);
93 + sbi->s_delete_blocks = 0;
94 + sbi->s_delete_inodes = 0;
96 + spin_unlock(&sbi->s_delete_lock);
97 + wake_up(&sbi->s_delete_waiter_queue);
103 +static void ext3_start_delete_thread(struct super_block *sb)
105 + struct ext3_sb_info *sbi = EXT3_SB(sb);
108 + spin_lock_init(&sbi->s_delete_lock);
109 + init_waitqueue_head(&sbi->s_delete_thread_queue);
110 + init_waitqueue_head(&sbi->s_delete_waiter_queue);
112 + if (!test_opt(sb, ASYNCDEL))
115 + rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
117 + printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
120 + wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
123 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
125 + if (sbi->s_delete_list.next == 0) /* thread never started */
128 + clear_opt(sbi->s_mount_opt, ASYNCDEL);
129 + wake_up(&sbi->s_delete_thread_queue);
130 + wait_event(sbi->s_delete_waiter_queue,
131 + sbi->s_delete_list.next == 0 && sbi->s_delete_inodes == 0);
134 +/* Instead of playing games with the inode flags, destruction, etc., we just
135 + * create a new inode locally and put it on a list for the truncate thread.
136 + * We need large parts of the inode struct in order to complete the
137 + * truncate and unlink, so we may as well just have a real inode to do it.
139 + * If we have any problem deferring the delete, just delete it right away.
140 + * If we defer it, we also mark how many blocks it would free, so that we
141 + * can keep the statfs data correct, and we know if we should sleep on the
142 + * delete thread when we run out of space.
144 +static void ext3_delete_inode_thread(struct inode *old_inode)
146 + struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
147 + struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
148 + struct inode *new_inode;
149 + unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
151 + if (is_bad_inode(old_inode)) {
152 + clear_inode(old_inode);
156 + if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
159 + /* We may want to delete the inode immediately and not defer it */
160 + if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
163 + /* We can't use the delete thread as-is during real orphan recovery,
164 + * as we add to the orphan list here, causing ext3_orphan_cleanup()
165 + * to loop endlessly. It would be nice to do so, but needs work.
167 + if (oei->i_state & EXT3_STATE_DELETE ||
168 + sbi->s_mount_state & EXT3_ORPHAN_FS) {
169 + ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
170 + old_inode->i_ino, blocks);
174 + /* We can iget this inode again here, because our caller has unhashed
175 + * old_inode, so new_inode will be in a different inode struct.
177 + * We need to ensure that the i_orphan pointers in the other inodes
178 + * point at the new inode copy instead of the old one so the orphan
179 + * list doesn't get corrupted when the old orphan inode is freed.
181 + down(&sbi->s_orphan_lock);
183 + sbi->s_mount_state |= EXT3_ORPHAN_FS;
184 + new_inode = iget(old_inode->i_sb, old_inode->i_ino);
185 + sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
186 + if (is_bad_inode(new_inode)) {
187 + printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
192 + up(&sbi->s_orphan_lock);
193 + ext3_debug("delete inode %lu directly (bad read)\n",
197 + J_ASSERT(new_inode != old_inode);
199 + J_ASSERT(!list_empty(&oei->i_orphan));
201 + nei = EXT3_I(new_inode);
202 + /* Ugh. We need to insert new_inode into the same spot on the list
203 + * as old_inode was, to ensure the in-memory orphan list is still
204 + * in the same order as the on-disk orphan list (badness otherwise).
206 + nei->i_orphan = oei->i_orphan;
207 + nei->i_orphan.next->prev = &nei->i_orphan;
208 + nei->i_orphan.prev->next = &nei->i_orphan;
209 + nei->i_state |= EXT3_STATE_DELETE;
210 + up(&sbi->s_orphan_lock);
212 + clear_inode(old_inode);
214 + spin_lock(&sbi->s_delete_lock);
215 + J_ASSERT(list_empty(&new_inode->i_dentry));
216 + list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
217 + sbi->s_delete_blocks += blocks;
218 + sbi->s_delete_inodes++;
219 + spin_unlock(&sbi->s_delete_lock);
221 + ext3_debug("delete inode %lu (%lu blocks) by thread\n",
222 + new_inode->i_ino, blocks);
224 + wake_up(&sbi->s_delete_thread_queue);
228 + ext3_delete_inode(old_inode);
231 +#define ext3_start_delete_thread(sbi) do {} while(0)
232 +#define ext3_stop_delete_thread(sbi) do {} while(0)
233 +#endif /* EXT3_DELETE_THREAD */
235 void ext3_put_super (struct super_block * sb)
237 struct ext3_sb_info *sbi = EXT3_SB(sb);
239 kdev_t j_dev = sbi->s_journal->j_dev;
242 + J_ASSERT(sbi->s_delete_inodes == 0);
244 ext3_xattr_put_super(sb);
245 journal_destroy(sbi->s_journal);
246 if (!(sb->s_flags & MS_RDONLY)) {
248 write_inode: ext3_write_inode, /* BKL not held. Don't need */
249 dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */
250 put_inode: ext3_put_inode, /* BKL not held. Don't need */
251 +#ifdef EXT3_DELETE_THREAD
252 + delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */
254 delete_inode: ext3_delete_inode, /* BKL not held. We take it */
256 put_super: ext3_put_super, /* BKL held */
257 write_super: ext3_write_super, /* BKL held */
258 sync_fs: ext3_sync_fs,
260 this_char = strtok (NULL, ",")) {
261 if ((value = strchr (this_char, '=')) != NULL)
263 +#ifdef EXT3_DELETE_THREAD
264 + if (!strcmp(this_char, "asyncdel"))
265 + set_opt(*mount_options, ASYNCDEL);
266 + else if (!strcmp(this_char, "noasyncdel"))
267 + clear_opt(*mount_options, ASYNCDEL);
271 if (!strcmp (this_char, "bsddf"))
272 clear_opt (*mount_options, MINIX_DF);
273 else if (!strcmp (this_char, "nouid32")) {
274 @@ -1209,6 +1438,7 @@
277 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
278 + ext3_start_delete_thread(sb);
280 * akpm: core read_super() calls in here with the superblock locked.
281 * That deadlocks, because orphan cleanup needs to lock the superblock
282 @@ -1585,7 +1815,12 @@
283 static int ext3_sync_fs(struct super_block *sb)
288 + if (atomic_read(&sb->s_active) == 0) {
289 + /* fs is being unmounted: time to stop delete thread */
290 + ext3_stop_delete_thread(EXT3_SB(sb));
294 target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
295 log_wait_commit(EXT3_SB(sb)->s_journal, target);
296 @@ -1649,6 +1884,9 @@
297 if (!parse_options(data, &tmp, sbi, &tmp, 1))
300 + if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
301 + ext3_stop_delete_thread(sbi);
303 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
304 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
306 Index: linux-2.4.18-chaos/fs/ext3/file.c
307 ===================================================================
308 --- linux-2.4.18-chaos.orig/fs/ext3/file.c 2003-07-28 17:52:04.000000000 +0400
309 +++ linux-2.4.18-chaos/fs/ext3/file.c 2004-01-13 16:26:01.000000000 +0300
313 struct inode_operations ext3_file_inode_operations = {
314 +#ifdef EXT3_DELETE_THREAD
315 + truncate: ext3_truncate_thread, /* BKL held */
317 truncate: ext3_truncate, /* BKL held */
319 setattr: ext3_setattr, /* BKL held */
322 Index: linux-2.4.18-chaos/fs/ext3/inode.c
323 ===================================================================
324 --- linux-2.4.18-chaos.orig/fs/ext3/inode.c 2004-01-13 15:39:03.000000000 +0300
325 +++ linux-2.4.18-chaos/fs/ext3/inode.c 2004-01-13 16:26:01.000000000 +0300
326 @@ -2041,6 +2041,118 @@
327 return; /* AKPM: return what? */
330 +#ifdef EXT3_DELETE_THREAD
331 +/* Move blocks from to-be-truncated inode over to a new inode, and delete
332 + * that one from the delete thread instead. This avoids a lot of latency
333 + * when truncating large files.
335 + * If we have any problem deferring the truncate, just truncate it right away.
336 + * If we defer it, we also mark how many blocks it would free, so that we
337 + * can keep the statfs data correct, and we know if we should sleep on the
338 + * delete thread when we run out of space.
340 +void ext3_truncate_thread(struct inode *old_inode)
342 + struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
343 + struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
344 + struct inode *new_inode;
346 + unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
348 + if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
351 + /* XXX This is a temporary limitation for code simplicity.
352 + * We could truncate to arbitrary sizes at some later time.
354 + if (old_inode->i_size != 0)
357 + /* We may want to truncate the inode immediately and not defer it */
358 + if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
359 + old_inode->i_size > oei->i_disksize)
362 + /* We can't use the delete thread as-is during real orphan recovery,
363 + * as we add to the orphan list here, causing ext3_orphan_cleanup()
364 + * to loop endlessly. It would be nice to do so, but needs work.
366 + if (oei->i_state & EXT3_STATE_DELETE ||
367 + sbi->s_mount_state & EXT3_ORPHAN_FS) {
368 + ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
369 + old_inode->i_ino, blocks);
373 + ext3_discard_prealloc(old_inode);
376 + * new_inode = sb + GDT + ibitmap
377 + * orphan list = 1 inode/superblock for add, 2 inodes for del
378 + * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
380 + handle = ext3_journal_start(old_inode, 7);
381 + if (IS_ERR(handle))
384 + new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
385 + if (IS_ERR(new_inode)) {
386 + ext3_debug("truncate inode %lu directly (no new inodes)\n",
391 + nei = EXT3_I(new_inode);
393 + down_write(&oei->truncate_sem);
394 + new_inode->i_size = old_inode->i_size;
395 + new_inode->i_blocks = old_inode->i_blocks;
396 + new_inode->i_uid = old_inode->i_uid;
397 + new_inode->i_gid = old_inode->i_gid;
398 + new_inode->i_nlink = 0;
400 + /* FIXME when we do arbitrary truncates */
401 + old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
402 + old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
404 + memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
405 + memset(oei->i_data, 0, sizeof(oei->i_data));
407 + nei->i_disksize = oei->i_disksize;
408 + nei->i_state |= EXT3_STATE_DELETE;
409 + up_write(&oei->truncate_sem);
411 + if (ext3_orphan_add(handle, new_inode) < 0)
414 + if (ext3_orphan_del(handle, old_inode) < 0) {
415 + ext3_orphan_del(handle, new_inode);
420 + ext3_journal_stop(handle, old_inode);
422 + spin_lock(&sbi->s_delete_lock);
423 + J_ASSERT(list_empty(&new_inode->i_dentry));
424 + list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
425 + sbi->s_delete_blocks += blocks;
426 + sbi->s_delete_inodes++;
427 + spin_unlock(&sbi->s_delete_lock);
429 + ext3_debug("delete inode %lu (%lu blocks) by thread\n",
430 + new_inode->i_ino, blocks);
432 + wake_up(&sbi->s_delete_thread_queue);
436 + ext3_journal_stop(handle, old_inode);
438 + ext3_truncate(old_inode);
440 +#endif /* EXT3_DELETE_THREAD */
443 * ext3_get_inode_loc returns with an extra refcount against the
444 * inode's underlying buffer_head on success.
445 Index: linux-2.4.18-chaos/fs/buffer.c
446 ===================================================================
447 --- linux-2.4.18-chaos.orig/fs/buffer.c 2003-07-28 17:52:03.000000000 +0400
448 +++ linux-2.4.18-chaos/fs/buffer.c 2004-01-13 16:34:43.000000000 +0300
451 if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
452 sb->s_op->write_super(sb);
454 if (sb->s_op && sb->s_op->sync_fs)
455 sb->s_op->sync_fs(sb);
459 return sync_buffers(dev, 1);
460 Index: linux-2.4.18-chaos/include/linux/ext3_fs.h
461 ===================================================================
462 --- linux-2.4.18-chaos.orig/include/linux/ext3_fs.h 2004-01-13 15:39:03.000000000 +0300
463 +++ linux-2.4.18-chaos/include/linux/ext3_fs.h 2004-01-13 16:26:01.000000000 +0300
466 #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */
467 #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */
468 +#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */
473 #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
474 #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
475 #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */
476 +#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
478 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
479 #ifndef _LINUX_EXT2_FS_H
481 extern void ext3_dirty_inode(struct inode *);
482 extern int ext3_change_inode_journal_flag(struct inode *, int);
483 extern void ext3_truncate (struct inode *);
484 +#ifdef EXT3_DELETE_THREAD
485 +extern void ext3_truncate_thread(struct inode *inode);
489 extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
490 Index: linux-2.4.18-chaos/include/linux/ext3_fs_sb.h
491 ===================================================================
492 --- linux-2.4.18-chaos.orig/include/linux/ext3_fs_sb.h 2004-01-13 15:39:03.000000000 +0300
493 +++ linux-2.4.18-chaos/include/linux/ext3_fs_sb.h 2004-01-13 16:26:01.000000000 +0300
496 #define EXT3_MAX_GROUP_LOADED 32
498 +#define EXT3_DELETE_THREAD
501 * third extended-fs super-block data in memory
504 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
505 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
507 +#ifdef EXT3_DELETE_THREAD
508 + spinlock_t s_delete_lock;
509 + struct list_head s_delete_list;
510 + unsigned long s_delete_blocks;
511 + unsigned long s_delete_inodes;
512 + wait_queue_head_t s_delete_thread_queue;
513 + wait_queue_head_t s_delete_waiter_queue;
517 #endif /* _LINUX_EXT3_FS_SB */