2 Create a service thread to handle deletion and truncation of inodes in the
3 background, to avoid long latency when unlinking or truncating very large files.
6 fs/ext3/inode.c | 116 ++++++++++++++++++++++
7 fs/ext3/super.c | 231 +++++++++++++++++++++++++++++++++++++++++++++
8 include/linux/ext3_fs.h | 5
9 include/linux/ext3_fs_sb.h | 10 +
10 4 files changed, 362 insertions(+)
12 --- linux-2.4.18-18.8.0-l15/fs/ext3/super.c~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003
13 +++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/super.c Wed Jul 2 23:49:40 2003
14 @@ -396,6 +396,220 @@ static void dump_orphan_list(struct supe
18 +#ifdef EXT3_DELETE_THREAD
20 + * Delete inodes in a loop until there are no more to be deleted.
21 + * Normally, we run in the background doing the deletes and sleeping again,
22 + * and clients just add new inodes to be deleted onto the end of the list.
23 + * If someone is concerned about free space (e.g. block allocation or similar)
24 + * then they can sleep on s_delete_waiter_queue until the thread frees space.
27 +int ext3_delete_thread(void *data)
29 + struct super_block *sb = data;
30 + struct ext3_sb_info *sbi = EXT3_SB(sb);
31 + struct task_struct *tsk = current;
33 + /* Almost like daemonize, but not quite */
38 + exit_files(current);
41 + sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev));
42 + sigfillset(&tsk->blocked);
44 + /*tsk->flags |= PF_KERNTHREAD;*/
46 + INIT_LIST_HEAD(&sbi->s_delete_list);
47 + wake_up(&sbi->s_delete_waiter_queue);
48 + ext3_debug("delete thread on %s started\n", kdevname(sb->s_dev));
52 + wait_event_interruptible(sbi->s_delete_thread_queue,
53 + !list_empty(&sbi->s_delete_list) ||
54 + !test_opt(sb, ASYNCDEL));
55 + ext3_debug("%s woken up: %lu inodes, %lu blocks\n",
56 + tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks);
58 + spin_lock(&sbi->s_delete_lock);
59 + if (list_empty(&sbi->s_delete_list)) {
60 + clear_opt(sbi->s_mount_opt, ASYNCDEL);
61 + memset(&sbi->s_delete_list, 0,
62 + sizeof(sbi->s_delete_list));
63 + spin_unlock(&sbi->s_delete_lock);
64 + ext3_debug("delete thread on %s exiting\n",
65 + kdevname(sb->s_dev));
66 + wake_up(&sbi->s_delete_waiter_queue);
70 + while (!list_empty(&sbi->s_delete_list)) {
71 + struct inode *inode=list_entry(sbi->s_delete_list.next,
72 + struct inode, i_dentry);
73 + unsigned long blocks = inode->i_blocks >>
74 + (inode->i_blkbits - 9);
76 + list_del_init(&inode->i_dentry);
77 + spin_unlock(&sbi->s_delete_lock);
78 + ext3_debug("%s delete ino %lu blk %lu\n",
79 + tsk->comm, inode->i_ino, blocks);
83 + spin_lock(&sbi->s_delete_lock);
84 + sbi->s_delete_blocks -= blocks;
85 + sbi->s_delete_inodes--;
87 + if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) {
88 + ext3_warning(sb, __FUNCTION__,
89 + "%lu blocks, %lu inodes on list?\n",
90 + sbi->s_delete_blocks,sbi->s_delete_inodes);
91 + sbi->s_delete_blocks = 0;
92 + sbi->s_delete_inodes = 0;
94 + spin_unlock(&sbi->s_delete_lock);
95 + wake_up(&sbi->s_delete_waiter_queue);
101 +static void ext3_start_delete_thread(struct super_block *sb)
103 + struct ext3_sb_info *sbi = EXT3_SB(sb);
106 + spin_lock_init(&sbi->s_delete_lock);
107 + init_waitqueue_head(&sbi->s_delete_thread_queue);
108 + init_waitqueue_head(&sbi->s_delete_waiter_queue);
110 + if (!test_opt(sb, ASYNCDEL))
113 + rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES);
115 + printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n",
118 + wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next);
121 +static void ext3_stop_delete_thread(struct ext3_sb_info *sbi)
123 + if (sbi->s_delete_list.next == 0) /* thread never started */
126 + clear_opt(sbi->s_mount_opt, ASYNCDEL);
127 + wake_up(&sbi->s_delete_thread_queue);
128 + wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list));
131 +/* Instead of playing games with the inode flags, destruction, etc. we just
132 + * create a new inode locally and put it on a list for the delete thread.
133 + * We need large parts of the inode struct in order to complete the
134 + * truncate and unlink, so we may as well just have a real inode to do it.
136 + * If we have any problem deferring the delete, just delete it right away.
137 + * If we defer it, we also mark how many blocks it would free, so that we
138 + * can keep the statfs data correct, and we know if we should sleep on the
139 + * delete thread when we run out of space.
141 +static void ext3_delete_inode_thread(struct inode *old_inode)
143 + struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
144 + struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
145 + struct inode *new_inode;
146 + unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
148 + if (is_bad_inode(old_inode)) {
149 + clear_inode(old_inode);
153 + if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
156 + /* We may want to delete the inode immediately and not defer it */
157 + if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS)
160 + /* We can't use the delete thread as-is during real orphan recovery,
161 + * as we add to the orphan list here, causing ext3_orphan_cleanup()
162 + * to loop endlessly. It would be nice to do so, but needs work.
164 + if (oei->i_state & EXT3_STATE_DELETE ||
165 + sbi->s_mount_state & EXT3_ORPHAN_FS) {
166 + ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
167 + old_inode->i_ino, blocks);
171 + /* We can iget this inode again here, because our caller has unhashed
172 + * old_inode, so new_inode will be in a different inode struct.
174 + * We need to ensure that the i_orphan pointers in the other inodes
175 + * point at the new inode copy instead of the old one so the orphan
176 + * list doesn't get corrupted when the old orphan inode is freed.
178 + down(&sbi->s_orphan_lock);
180 + sbi->s_mount_state |= EXT3_ORPHAN_FS;
181 + new_inode = iget(old_inode->i_sb, old_inode->i_ino);
182 + sbi->s_mount_state &= ~EXT3_ORPHAN_FS;
183 + if (is_bad_inode(new_inode)) {
184 + printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino);
189 + up(&sbi->s_orphan_lock);
190 + ext3_debug("delete inode %lu directly (bad read)\n",
194 + J_ASSERT(new_inode != old_inode);
196 + J_ASSERT(!list_empty(&oei->i_orphan));
198 + nei = EXT3_I(new_inode);
199 + /* Ugh. We need to insert new_inode into the same spot on the list
200 + * as old_inode was, to ensure the in-memory orphan list is still
201 + * in the same order as the on-disk orphan list (badness otherwise).
203 + nei->i_orphan = oei->i_orphan;
204 + nei->i_orphan.next->prev = &nei->i_orphan;
205 + nei->i_orphan.prev->next = &nei->i_orphan;
206 + nei->i_state |= EXT3_STATE_DELETE;
207 + up(&sbi->s_orphan_lock);
209 + clear_inode(old_inode);
211 + spin_lock(&sbi->s_delete_lock);
212 + J_ASSERT(list_empty(&new_inode->i_dentry));
213 + list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
214 + sbi->s_delete_blocks += blocks;
215 + sbi->s_delete_inodes++;
216 + spin_unlock(&sbi->s_delete_lock);
218 + ext3_debug("delete inode %lu (%lu blocks) by thread\n",
219 + new_inode->i_ino, blocks);
221 + wake_up(&sbi->s_delete_thread_queue);
225 + ext3_delete_inode(old_inode);
228 +#define ext3_start_delete_thread(sbi) do {} while(0)
229 +#define ext3_stop_delete_thread(sbi) do {} while(0)
230 +#endif /* EXT3_DELETE_THREAD */
232 void ext3_put_super (struct super_block * sb)
234 struct ext3_sb_info *sbi = EXT3_SB(sb);
235 @@ -403,6 +617,7 @@ void ext3_put_super (struct super_block
236 kdev_t j_dev = sbi->s_journal->j_dev;
239 + ext3_stop_delete_thread(sbi);
240 ext3_xattr_put_super(sb);
241 journal_destroy(sbi->s_journal);
242 if (!(sb->s_flags & MS_RDONLY)) {
243 @@ -451,7 +666,11 @@ static struct super_operations ext3_sops
244 write_inode: ext3_write_inode, /* BKL not held. Don't need */
245 dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */
246 put_inode: ext3_put_inode, /* BKL not held. Don't need */
247 +#ifdef EXT3_DELETE_THREAD
248 + delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */
250 delete_inode: ext3_delete_inode, /* BKL not held. We take it */
252 put_super: ext3_put_super, /* BKL held */
253 write_super: ext3_write_super, /* BKL held */
254 write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
255 @@ -511,6 +730,14 @@ static int parse_options (char * options
256 this_char = strtok (NULL, ",")) {
257 if ((value = strchr (this_char, '=')) != NULL)
259 +#ifdef EXT3_DELETE_THREAD
260 + if (!strcmp(this_char, "asyncdel"))
261 + set_opt(*mount_options, ASYNCDEL);
262 + else if (!strcmp(this_char, "noasyncdel"))
263 + clear_opt(*mount_options, ASYNCDEL);
267 if (!strcmp (this_char, "bsddf"))
268 clear_opt (*mount_options, MINIX_DF);
269 else if (!strcmp (this_char, "nouid32")) {
270 @@ -1206,6 +1433,7 @@ struct super_block * ext3_read_super (st
273 ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
274 + ext3_start_delete_thread(sb);
276 * akpm: core read_super() calls in here with the superblock locked.
277 * That deadlocks, because orphan cleanup needs to lock the superblock
278 @@ -1648,6 +1876,9 @@ int ext3_remount (struct super_block * s
279 if (!parse_options(data, &tmp, sbi, &tmp, 1))
282 + if (!test_opt(sb, ASYNCDEL) || (*flags & MS_RDONLY))
283 + ext3_stop_delete_thread(sbi);
285 if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
286 ext3_abort(sb, __FUNCTION__, "Abort forced by user");
288 --- linux/fs/ext3/file.c.orig Fri Jan 17 10:57:31 2003
289 +++ linux/fs/ext3/file.c Mon Jun 30 13:28:52 2003
290 @@ -121,7 +121,11 @@ struct file_operations ext3_file_operati
293 struct inode_operations ext3_file_inode_operations = {
294 +#ifdef EXT3_DELETE_THREAD
295 + truncate: ext3_truncate_thread, /* BKL held */
297 truncate: ext3_truncate, /* BKL held */
299 setattr: ext3_setattr, /* BKL held */
302 --- linux-2.4.18-18.8.0-l15/fs/ext3/inode.c~ext3-delete_thread-2.4.18 Wed Jul 2 23:13:58 2003
303 +++ linux-2.4.18-18.8.0-l15-adilger/fs/ext3/inode.c Wed Jul 2 23:50:29 2003
304 @@ -2004,6 +2004,118 @@ out_stop:
305 ext3_journal_stop(handle, inode);
308 +#ifdef EXT3_DELETE_THREAD
309 +/* Move blocks from to-be-truncated inode over to a new inode, and delete
310 + * that one from the delete thread instead. This avoids a lot of latency
311 + * when truncating large files.
313 + * If we have any problem deferring the truncate, just truncate it right away.
314 + * If we defer it, we also mark how many blocks it would free, so that we
315 + * can keep the statfs data correct, and we know if we should sleep on the
316 + * delete thread when we run out of space.
318 +void ext3_truncate_thread(struct inode *old_inode)
320 + struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb);
321 + struct ext3_inode_info *nei, *oei = EXT3_I(old_inode);
322 + struct inode *new_inode;
324 + unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9);
326 + if (!test_opt(old_inode->i_sb, ASYNCDEL) || !sbi->s_delete_list.next)
329 + /* XXX Only truncates to zero length are deferred, for code simplicity.
330 + * We could defer truncates to arbitrary sizes at some later time.
332 + if (old_inode->i_size != 0)
335 + /* We may want to truncate the inode immediately and not defer it */
336 + if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS ||
337 + old_inode->i_size > oei->i_disksize)
340 + /* We can't use the delete thread as-is during real orphan recovery,
341 + * as we add to the orphan list here, causing ext3_orphan_cleanup()
342 + * to loop endlessly. It would be nice to do so, but needs work.
344 + if (oei->i_state & EXT3_STATE_DELETE ||
345 + sbi->s_mount_state & EXT3_ORPHAN_FS) {
346 + ext3_debug("doing deferred inode %lu delete (%lu blocks)\n",
347 + old_inode->i_ino, blocks);
351 + ext3_discard_prealloc(old_inode);
354 + * new_inode = sb + GDT + ibitmap
355 + * orphan list = 1 inode/superblock for add, 2 inodes for del
356 + * quota files = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
358 + handle = ext3_journal_start(old_inode, 7);
359 + if (IS_ERR(handle))
362 + new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
363 + if (IS_ERR(new_inode)) {
364 + ext3_debug("truncate inode %lu directly (no new inodes)\n",
369 + nei = EXT3_I(new_inode);
371 + down_write(&oei->truncate_sem);
372 + new_inode->i_size = old_inode->i_size;
373 + new_inode->i_blocks = old_inode->i_blocks;
374 + new_inode->i_uid = old_inode->i_uid;
375 + new_inode->i_gid = old_inode->i_gid;
376 + new_inode->i_nlink = 0;
378 + /* FIXME when we do arbitrary truncates */
379 + old_inode->i_blocks = oei->i_file_acl ? old_inode->i_blksize / 512 : 0;
380 + old_inode->i_mtime = old_inode->i_ctime = CURRENT_TIME;
382 + memcpy(nei->i_data, oei->i_data, sizeof(nei->i_data));
383 + memset(oei->i_data, 0, sizeof(oei->i_data));
385 + nei->i_disksize = oei->i_disksize;
386 + nei->i_state |= EXT3_STATE_DELETE;
387 + up_write(&oei->truncate_sem);
389 + if (ext3_orphan_add(handle, new_inode) < 0)
392 + if (ext3_orphan_del(handle, old_inode) < 0) {
393 + ext3_orphan_del(handle, new_inode);
398 + ext3_journal_stop(handle, old_inode);
400 + spin_lock(&sbi->s_delete_lock);
401 + J_ASSERT(list_empty(&new_inode->i_dentry));
402 + list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list);
403 + sbi->s_delete_blocks += blocks;
404 + sbi->s_delete_inodes++;
405 + spin_unlock(&sbi->s_delete_lock);
407 + ext3_debug("delete inode %lu (%lu blocks) by thread\n",
408 + new_inode->i_ino, blocks);
410 + wake_up(&sbi->s_delete_thread_queue);
414 + ext3_journal_stop(handle, old_inode);
416 + ext3_truncate(old_inode);
418 +#endif /* EXT3_DELETE_THREAD */
421 * ext3_get_inode_loc returns with an extra refcount against the
422 * inode's underlying buffer_head on success.
423 --- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:20 2003
424 +++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs.h Wed Jul 2 23:19:09 2003
425 @@ -190,6 +190,7 @@ struct ext3_group_desc
427 #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */
428 #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */
429 +#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */
433 @@ -317,6 +318,7 @@ struct ext3_inode {
434 #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
435 #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
436 #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */
437 +#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */
439 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
440 #ifndef _LINUX_EXT2_FS_H
441 @@ -651,6 +653,9 @@ extern void ext3_discard_prealloc (struc
442 extern void ext3_dirty_inode(struct inode *);
443 extern int ext3_change_inode_journal_flag(struct inode *, int);
444 extern void ext3_truncate (struct inode *);
445 +#ifdef EXT3_DELETE_THREAD
446 +extern void ext3_truncate_thread(struct inode *inode);
450 extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
451 --- linux-2.4.18-18.8.0-l15/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18 Tue Jun 3 17:26:21 2003
452 +++ linux-2.4.18-18.8.0-l15-adilger/include/linux/ext3_fs_sb.h Wed Jul 2 23:19:09 2003
455 #define EXT3_MAX_GROUP_LOADED 32
457 +#define EXT3_DELETE_THREAD
460 * third extended-fs super-block data in memory
462 @@ -74,6 +76,14 @@ struct ext3_sb_info {
463 struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
464 wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */
466 +#ifdef EXT3_DELETE_THREAD
467 + spinlock_t s_delete_lock;
468 + struct list_head s_delete_list;
469 + unsigned long s_delete_blocks;
470 + unsigned long s_delete_inodes;
471 + wait_queue_head_t s_delete_thread_queue;
472 + wait_queue_head_t s_delete_waiter_queue;
476 #endif /* _LINUX_EXT3_FS_SB */