2 * OBDFS Super operations - also used for Lustre file system
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7 * Copyright (C) 1999 Seagate Technology Inc.
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
28 #include <asm/uaccess.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
/*
 * pupd_prm - tunable parameters for the page flush code; the visible
 * readers are pupdate() and obdfs_flush_reqs().
 *
 * NOTE(review): the opening line of this struct and some comment
 * continuation lines are not visible in this listing - confirm against the
 * full file.  Going by the visible field order, the initializer sets
 * nfract=40, ndirty=500, nrefill=64, nref_dirt=256, interval=5*HZ,
 * age_buffer=30*HZ and age_super=5*HZ.
 */
39 int nfract; /* Percentage of buffer cache dirty to
41 int ndirty; /* Maximum number of dirty blocks to write out per
43 int nrefill; /* Number of clean buffers to try to obtain
44 each time we call refill */
45 int nref_dirt; /* Dirty buffer threshold for activating bdflush
46 when trying to refill buffers. */
47 int interval; /* jiffies delay between pupdate flushes */
48 int age_buffer; /* Time for normal buffer to age before we flush it */
49 int age_super; /* Time for superblock to age before we flush it */
50 } pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ };
/*
 * obdfs_enqueue_pages - collect an inode's queued page requests into the
 * caller's I/O vector arrays.
 *
 * NOTE(review): this listing omits several original lines (declarations of
 * num and page, braces, the last obdo_fromid() argument, return
 * statements); the notes below cover only what is visible and should be
 * confirmed against the full file.
 *
 * inode:      inode whose request list, obdfs_iplist(inode), is walked
 * obdo:       in/out; *obdo is assigned from obdo_fromid() and then passed
 *             to obdfs_from_inode()
 * nr_slots:   upper bound on the number of vector entries filled
 * pages:      page pointer array (NOTE(review): its store is not visible)
 * bufs:       receives page_address(page) for each entry
 * counts:     receives PAGE_SIZE for each entry
 * offsets:    receives page->index << PAGE_SHIFT for each entry
 * flag:       set to OBD_BRW_CREATE alongside the obdo setup
 * check_time: jiffies cutoff; the walk stops at the first request whose
 *             rq_jiffies is later than this
 *
 * Return: not visible in this listing.  The caller stores the result in
 * bufs_per_obdo[] and its log messages mention both a page count and an
 * error code, so this is presumably the number of entries added or an
 * error value - confirm.
 */
52 /* Called with the superblock list lock */
53 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
54 int nr_slots, struct page **pages, char **bufs,
55 obd_size *counts, obd_off *offsets,
56 obd_flag *flag, unsigned long check_time)
58 struct list_head *page_list = obdfs_iplist(inode);
59 struct list_head *tmp;
65 /* Traverse list in reverse order, so we do FIFO, not LIFO order */
66 while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
67 struct obdfs_pgrq *req;
/* tmp is the rq_plist member embedded in a struct obdfs_pgrq. */
70 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
74 if (req->rq_jiffies > check_time)
75 break; /* pages are in chronological order */
77 /* Only allocate the obdo if we will actually do I/O here */
80 *obdo = obdo_fromid(IID(inode), inode->i_ino,
/* obdo_fromid() reports failure through an error-encoded pointer. */
82 if ( IS_ERR(*obdo) ) {
83 int err = PTR_ERR(*obdo);
90 /* FIXME revisit fromid & from_inode */
91 obdfs_from_inode(*obdo, inode);
92 *flag = OBD_BRW_CREATE;
95 /* Remove request from list before write to avoid conflict.
96 * Note that obdfs_pgrq_del() also deletes the request.
100 CDEBUG(D_CACHE, "no page \n");
104 bufs[num] = (char *)page_address(page);
106 counts[num] = PAGE_SIZE;
107 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
108 CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n",
109 inode->i_ino, page, (char *)page_address(page));
113 if (!list_empty(page_list))
114 CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
115 CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
119 } /* obdfs_enqueue_pages */
/*
 * obdfs_flush_reqs - write back queued pages for the inodes on one
 * superblock's dirty inode list.
 *
 * NOTE(review): several original lines (local declarations, braces, some
 * call arguments, early returns) are missing from this listing; confirm
 * the details below against the full file.
 *
 * inode_list: list head embedded as osi_inodes in struct obdfs_sb_info
 * check_time: time cutoff supplied by the caller; presumably forwarded to
 *             obdfs_enqueue_pages() (that argument line is not visible)
 *
 * Visible flow:
 *  1. With osi_list_mutex held, walk the list from the tail while
 *     total_io is below pupd_prm.ndirty, calling obdfs_enqueue_pages()
 *     for each inode whose page list is not empty.
 *  2. When num_io reaches MAX_IOVEC, release the mutex, call
 *     obdfs_do_vec_wr(), then take the mutex again.
 *  3. After the walk, release the mutex and call obdfs_do_vec_wr() for
 *     whatever is left in the vectors.
 *  4. Take the mutex again and unlink every inode whose page list is
 *     empty (list_del() followed by INIT_LIST_HEAD()).
 *
 * Return: err when it is non-zero, otherwise total_io.
 */
121 /* Remove writeback requests for the superblock */
122 int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
124 struct list_head *tmp;
/* Write vectors for one batch; each array holds MAX_IOVEC entries. */
128 struct inode *inodes[MAX_IOVEC]; /* write data back to these */
129 struct page *pages[MAX_IOVEC]; /* call put_page on these */
130 struct obdo *obdos[MAX_IOVEC];
131 char *bufs[MAX_IOVEC];
132 obd_size counts[MAX_IOVEC];
133 obd_off offsets[MAX_IOVEC];
134 obd_flag flags[MAX_IOVEC];
135 obd_count bufs_per_obdo[MAX_IOVEC];
137 struct obdfs_sb_info *sbi;
141 CDEBUG(D_INODE, "no list\n");
/* Recover the superblock info that embeds inode_list as osi_inodes. */
146 sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
/* The list is only examined with osi_list_mutex held. */
148 obd_down(&sbi->osi_list_mutex);
149 if ( list_empty(inode_list) ) {
150 CDEBUG(D_CACHE, "list empty: memory %ld, inodes %d, pages %d\n",
151 obd_memory, obd_inodes, obd_pages);
152 obd_up(&sbi->osi_list_mutex);
157 /* Add each inode's dirty pages to a write vector, and write it.
158 * Traverse list in reverse order, so we do FIFO, not LIFO order
164 while ( (tmp = tmp->prev) != inode_list && total_io < pupd_prm.ndirty) {
165 struct obdfs_inode_info *ii;
169 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
170 inode = list_entry(ii, struct inode, u);
171 inodes[num_obdos] = inode;
172 obdos[num_obdos] = NULL;
173 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
175 /* Make sure we reference "inode" and not "inodes[num_obdos]",
176 * as num_obdos will change after the loop is run.
178 if (!list_empty(obdfs_iplist(inode))) {
179 res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
181 &pages[num_io], &bufs[num_io],
186 CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
190 "fatal: unable to enqueue inode %ld (err %d)\n",
192 /* XXX Move bad inode to end of list so we can
193 * continue with flushing list. This is a
194 * temporary measure to avoid machine lockups.
197 list_add(tmp, inode_list);
204 bufs_per_obdo[num_obdos] = res;
208 if ( num_io == MAX_IOVEC ) {
209 obd_up(&sbi->osi_list_mutex);
210 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
211 obdos, bufs_per_obdo,
216 "fatal: unable to do vec_wr (err %d)\n", err);
220 obd_down(&sbi->osi_list_mutex);
227 obd_up(&sbi->osi_list_mutex);
229 /* flush any remaining I/Os */
231 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
232 bufs_per_obdo, pages, bufs, counts,
235 CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
240 /* Remove inode from superblock dirty list when no more pages.
241 * Make sure we don't point at the current inode with tmp
242 * when we re-init the list on the inode, or we will loop.
244 obd_down(&sbi->osi_list_mutex);
246 while ( (tmp = tmp->prev) != inode_list ) {
247 struct obdfs_inode_info *ii;
250 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
251 inode = list_entry(ii, struct inode, u);
252 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
253 if (list_empty(obdfs_iplist(inode))) {
254 CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
257 list_del(obdfs_islist(inode));
258 /* decrement inode reference for page cache */
/* Re-initialise the unlinked node so it reads as an empty list. */
260 INIT_LIST_HEAD(obdfs_islist(inode));
263 obd_up(&sbi->osi_list_mutex);
265 CDEBUG(D_INFO, "flushed %d pages in total\n", total_io);
/* A non-zero err takes precedence over the page count. */
268 return err ? err : total_io;
269 } /* obdfs_flush_reqs */
/*
 * obdfs_flush_dirty_pages - call obdfs_flush_reqs() for every superblock
 * on obdfs_super_list, walking the list from the tail.
 *
 * check_time: forwarded unchanged to obdfs_flush_reqs()
 *
 * NOTE(review): the declarations of ret and max and the return statement
 * are not visible in this listing.  obdfs_flush_reqs() returns err instead
 * of a page count when err is non-zero, and the max comparison below
 * treats that value like any other - confirm how errors are meant to be
 * handled and what max starts at.
 */
272 /* Walk all of the superblocks and write out blocks which are too old.
273 * Return the maximum number of blocks written for a single filesystem.
275 int obdfs_flush_dirty_pages(unsigned long check_time)
277 struct list_head *sl;
281 sl = &obdfs_super_list;
282 while ( (sl = sl->prev) != &obdfs_super_list ) {
283 struct obdfs_sb_info *sbi =
284 list_entry(sl, struct obdfs_sb_info, osi_list);
287 /* walk write requests here, use the sb, check the time */
288 ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
289 /* XXX handle error? What to do with it? */
/* Keep the largest value returned so far. */
291 max = ret > max ? ret : max;
295 } /* obdfs_flush_dirty_pages */
/*
 * Task pointer used by pupdate() and obdfs_flushd_cleanup().
 * NOTE(review): the statement that assigns it is not visible in this
 * listing - confirm where it is set before pupdate() dereferences it.
 */
298 static struct task_struct *pupdated;
/*
 * pupdate - thread function handed to kernel_thread() by
 * obdfs_flushd_init().
 *
 * Visible behavior: names the task "pupdated", rebuilds the blocked signal
 * set, sleeps for interval jiffies and calls
 * obdfs_flush_dirty_pages(jiffies - age), presumably in a loop (the loop
 * statement is not visible).  interval and age start from
 * pupd_prm.interval and pupd_prm.age_buffer and are adjusted using
 * obdfs_cache_count, dirty_limit and the previous wrote value.  When
 * SIGTERM is pending it is removed from the pending set,
 * "pupdated stopped..." is printed and the task state is set to
 * TASK_STOPPED.
 *
 * unused: not referenced in the visible lines
 *
 * NOTE(review): braces, else branches, local declarations (dirty_limit,
 * wrote) and the return are missing from this listing, so the exact
 * branch structure of the interval and age adjustment cannot be read from
 * here.
 */
300 static int pupdate(void *unused)
302 int interval = pupd_prm.interval;
303 long age = pupd_prm.age_buffer;
310 pupdated->session = 1;
312 strcpy(pupdated->comm, "pupdated");
314 printk("pupdated activated...\n");
/* Block everything, then reset the blocked set to all signals but SIGTERM. */
316 spin_lock_irq(&pupdated->sigmask_lock);
317 sigfillset(&pupdated->blocked);
318 siginitsetinv(&pupdated->blocked, sigmask(SIGTERM));
319 recalc_sigpending(pupdated);
320 spin_unlock_irq(&pupdated->sigmask_lock);
325 /* update interval */
/* Interruptible sleep for up to interval jiffies. */
327 set_task_state(pupdated, TASK_INTERRUPTIBLE);
328 schedule_timeout(interval);
/* On a pending signal, look for SIGTERM and clear it under sigmask_lock. */
330 if (signal_pending(pupdated))
333 spin_lock_irq(&pupdated->sigmask_lock);
334 if (sigismember(&pupdated->signal, SIGTERM))
336 sigdelset(&pupdated->signal, SIGTERM);
339 recalc_sigpending(pupdated);
340 spin_unlock_irq(&pupdated->sigmask_lock);
342 printk("pupdated stopped...\n");
343 set_task_state(pupdated, TASK_STOPPED);
348 /* asynchronous setattr etc for the future ...
349 obdfs_flush_dirty_inodes(jiffies - pupd_prm.age_super);
351 dirty_limit = nr_free_buffer_pages() * pupd_prm.nfract / 100;
352 CDEBUG(D_CACHE, "dirty_limit %ld, cache_count %ld\n",
353 dirty_limit, obdfs_cache_count);
355 if (obdfs_cache_count > dirty_limit) {
357 if ( wrote < pupd_prm.ndirty )
360 int isave = interval;
363 if ( wrote < pupd_prm.ndirty >> 1 )
364 interval = pupd_prm.interval;
366 interval = isave >> 1;
368 if (obdfs_cache_count > dirty_limit / 3) {
370 interval = isave >> 1;
372 age = pupd_prm.age_buffer;
375 CDEBUG(D_CACHE, "age %ld, interval %d\n", age, interval);
376 wrote = obdfs_flush_dirty_pages(jiffies - age);
/*
 * obdfs_flushd_init - start the flush daemon by running pupdate() in a
 * new kernel thread, then log that flushd is initialised.
 *
 * NOTE(review): the original lines around the bdflush kernel_thread() call
 * are missing from this listing, and bdflush is not defined in the visible
 * code; confirm whether that call is live or sits inside a comment in the
 * full file.  The return statement is not visible either.
 */
382 int obdfs_flushd_init(void)
385 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
387 kernel_thread(pupdate, NULL, 0);
388 CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
392 int obdfs_flushd_cleanup(void)
396 if (pupdated) /* for debugging purposes only */
397 CDEBUG(D_CACHE, "pupdated->state = %lx\n", pupdated->state);
399 /* deliver a signal to pupdated to shut it down */
400 if (pupdated && (pupdated->state == TASK_RUNNING ||
401 pupdated->state == TASK_INTERRUPTIBLE )) {
402 unsigned long timeout = HZ/20;
403 unsigned long count = 0;
404 send_sig_info(SIGTERM, (struct siginfo *)1, pupdated);
406 if ((count % 2*HZ) == timeout)
407 printk(KERN_INFO "wait for pupdated to stop\n");
409 set_current_state(TASK_INTERRUPTIBLE);
410 schedule_timeout(timeout);