2 * OBDFS Super operations - also used for Lustre file system
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
7 * Copyright (C) 1999 Seagate Technology Inc.
10 #define __NO_VERSION__
11 #include <linux/module.h>
12 #include <linux/sched.h>
14 #include <linux/malloc.h>
15 #include <linux/locks.h>
16 #include <linux/errno.h>
17 #include <linux/swap.h>
18 #include <linux/smp_lock.h>
19 #include <linux/vmalloc.h>
20 #include <linux/blkdev.h>
21 #include <linux/sysrq.h>
22 #include <linux/file.h>
23 #include <linux/init.h>
24 #include <linux/quotaops.h>
25 #include <linux/iobuf.h>
26 #include <linux/highmem.h>
28 #include <asm/uaccess.h>
30 #include <asm/bitops.h>
31 #include <asm/mmu_context.h>
33 #include <linux/obd_support.h>
34 #include <linux/obd_class.h>
35 #include <linux/obdfs.h>
/*
 * Tunables for the pupdate write-back daemon.  The field comments below
 * refer to bdflush/kupdate.  Fields read elsewhere in this file:
 *   ndirty     - loop bound on total_io in obdfs_flush_reqs()
 *   interval   - sleep length, in jiffies, used by pupdate()
 *   age_buffer - age threshold compared in obdfs_enqueue_pages()
 * NOTE(review): the line that opens this struct and the tails of the
 * nfract and ndirty comments are missing from this copy of the file.
 */
40 int nfract; /* Percentage of buffer cache dirty to
42 int ndirty; /* Maximum number of dirty blocks to write out per
44 int nrefill; /* Number of clean buffers to try to obtain
45 each time we call refill */
46 int nref_dirt; /* Dirty buffer threshold for activating bdflush
47 when trying to refill buffers. */
48 int interval; /* jiffies delay between kupdate flushes */
49 int age_buffer; /* Time for normal buffer to age before we flush it */
50 int age_super; /* Time for superblock to age before we flush it */
51 /* } pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ }; */
/*
 * Active defaults, positionally: nfract=40, ndirty=500, nrefill=64,
 * nref_dirt=256, interval=10*HZ, age_buffer=30*HZ, age_super=5*HZ
 * (assuming no field declarations are missing above).  The commented-out
 * initializer differs only in interval (5*HZ).
 */
52 } pupd_prm = {40, 500, 64, 256, 10*HZ, 30*HZ, 5*HZ };
55 /* Called with the superblock list lock */
/*
 * Collect queued write-back requests of one inode into parallel I/O arrays.
 *
 * inode      - inode whose page-request list (obdfs_iplist) is walked
 * obdo       - out: receives the result of obdo_fromid(), which is then
 *              passed to obdfs_from_inode()
 * nr_slots   - upper bound on the number of entries taken in this call
 * pages, bufs, counts, offsets
 *            - caller-provided arrays; the visible code stores the page
 *              address, PAGE_SIZE and the byte offset
 *              (page->index << PAGE_SHIFT) at index num
 * flag       - out: set to OBD_BRW_CREATE
 * check_time - presumably gates the age test against pupd_prm.age_buffer;
 *              the first half of that condition is missing from this
 *              copy -- TODO confirm
 *
 * Returns PTR_ERR(*obdo) when obdo_fromid() yields an error pointer.  The
 * success return is not visible here; the caller stores the result as a
 * per-obdo buffer count, so it is presumably num -- TODO confirm.
 */
56 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
57 int nr_slots, struct page **pages, char **bufs,
58 obd_size *counts, obd_off *offsets,
59 obd_flag *flag, int check_time)
61 struct list_head *page_list = obdfs_iplist(inode);
62 struct list_head *tmp;
/* An error pointer from obdo_fromid() is turned into an error-code return. */
68 *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD);
69 if ( IS_ERR(*obdo) ) {
71 return PTR_ERR(*obdo);
74 obdfs_from_inode(*obdo, inode); /* FIXME revisit fromid & from_inode */
75 *flag = OBD_BRW_CREATE;
/*
 * Walk the request list until it wraps back to the list head or nr_slots
 * entries have been taken.  The initialisation of tmp and num is not
 * visible in this copy.
 */
78 while ( ((tmp = tmp->next) != page_list) && (num < nr_slots) ) {
79 struct obdfs_pgrq *req;
82 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
/*
 * Age test: jiffies elapsed since req->rq_jiffies compared with
 * pupd_prm.age_buffer.  What happens to a too-young request is not
 * visible here.
 */
87 (jiffies - req->rq_jiffies) < pupd_prm.age_buffer)
90 /* Remove request from list before write to avoid conflict.
91 * Note that obdfs_pgrq_del() also deletes the request.
95 CDEBUG(D_INODE, "no page \n");
99 bufs[num] = (char *)page_address(page);
101 counts[num] = PAGE_SIZE;
102 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
103 CDEBUG(D_INODE, "ENQ inode %ld, page %p addr %p to vector\n",
104 inode->i_ino, page, (char *)page_address(page));
108 if (!list_empty(page_list))
109 CDEBUG(D_INODE, "inode %ld list not empty\n", inode->i_ino);
110 CDEBUG(D_INODE, "added %d page(s) to vector\n", num);
116 /* dequeue requests for a dying inode */
/*
 * Drop the pending write-back requests of an inode.
 *
 * Runs under the osi_list_mutex of the inode's obdfs_sb_info.  When the
 * inode's superblock-list link is empty the mutex is released right away
 * (presumably followed by an early return, which is not visible here).
 * Otherwise the link is re-initialised and every obdfs_pgrq on the inode's
 * page list is visited; per the existing comments each request is unlinked
 * and freed and its page released, but those calls are missing from this
 * copy of the file.
 */
117 void obdfs_dequeue_reqs(struct inode *inode)
120 struct list_head *tmp;
122 obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
123 tmp = obdfs_islist(inode);
/* Nothing to do if the inode is not on the superblock's list. */
124 if ( list_empty(tmp) ) {
125 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
130 /* take it out of the super list */
132 INIT_LIST_HEAD(obdfs_islist(inode));
/*
 * Circular walk over the inode's page-request list.
 * NOTE(review): requests are removed inside this loop according to the
 * comments below; confirm that tmp is not advanced through a freed entry
 * (the relevant lines are not visible here).
 */
134 tmp = obdfs_iplist(inode);
135 while ( (tmp = tmp->next) != obdfs_iplist(inode) ) {
136 struct obdfs_pgrq *req;
139 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
141 /* take it out of the list and free */
143 /* now put the page away */
147 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
150 /* Remove writeback requests for the superblock */
/*
 * Write back queued pages for every inode on a superblock's dirty list.
 *
 * inode_list - list head of dirty inodes; it is treated as the osi_inodes
 *              member of an obdfs_sb_info
 * check_time - NOTE(review): not referenced in any line visible here; the
 *              visible obdfs_enqueue_pages() call ends with a literal 1
 *
 * Visible outline: under osi_list_mutex, pages are gathered per inode with
 * obdfs_enqueue_pages() into the local vectors; when num_io reaches
 * MAX_IOVEC the mutex is dropped around obdfs_do_vec_wr() and then taken
 * again; leftovers are written after the loop; finally inodes whose page
 * list is empty are unlinked from the dirty list.  Many lines (braces,
 * error handling, the return statement) are missing from this copy.
 */
151 int obdfs_flush_reqs(struct list_head *inode_list, int check_time)
153 struct list_head *tmp;
155 obd_count num_io = 0;
156 obd_count num_obdos = 0;
/* Parallel vectors handed to obdfs_do_vec_wr(); each has MAX_IOVEC slots. */
157 struct inode *inodes[MAX_IOVEC]; /* write data back to these */
158 struct page *pages[MAX_IOVEC]; /* call put_page on these */
159 struct obdo *obdos[MAX_IOVEC];
160 char *bufs[MAX_IOVEC];
161 obd_size counts[MAX_IOVEC];
162 obd_off offsets[MAX_IOVEC];
163 obd_flag flags[MAX_IOVEC];
164 obd_count bufs_per_obdo[MAX_IOVEC];
166 struct obdfs_sb_info *sbi;
172 CDEBUG(D_INODE, "no list\n");
/* Recover the enclosing obdfs_sb_info from its osi_inodes member. */
177 sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
179 obd_down(&sbi->osi_list_mutex);
180 if ( list_empty(inode_list)) {
181 CDEBUG(D_INODE, "list empty\n");
182 obd_up(&sbi->osi_list_mutex);
187 /* add each inode's dirty pages to a write vector, and write it */
/*
 * Visit inodes until the list wraps or total_io reaches pupd_prm.ndirty.
 */
190 while ( (tmp = tmp->next) != inode_list &&
191 total_io < pupd_prm.ndirty) {
192 struct obdfs_inode_info *ii;
/* List link -> obdfs_inode_info -> enclosing struct inode (member u). */
196 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
197 inode = list_entry(ii, struct inode, u);
198 inodes[num_obdos] = inode;
199 CDEBUG(D_INODE, "checking inode %ld pages\n", inode->i_ino);
203 /* Loop on this inode until we can't get more pages from it
204 * (either no more pages, or the pages aren't old enough).
205 * Make sure we reference "inode" and not "inodes[num_obdos]",
206 * as num_obdos will change after the loop is run.
208 while (!list_empty(obdfs_iplist(inode)) && res &&
209 total_io < pupd_prm.ndirty ) {
210 res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
212 &pages[num_io], &bufs[num_io],
215 &flags[num_obdos], 1);
216 CDEBUG(D_INODE, "FLUSHED inode %ld, pages flushed: %d\n",
219 obd_up(&sbi->osi_list_mutex);
226 bufs_per_obdo[num_obdos] = res;
229 if ( num_io == MAX_IOVEC ) {
230 obd_up(&sbi->osi_list_mutex);
231 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
232 obdos, bufs_per_obdo,
242 obd_down(&sbi->osi_list_mutex);
248 obd_up(&sbi->osi_list_mutex);
250 /* flush any remaining I/Os */
252 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
253 bufs_per_obdo, pages, bufs, counts,
257 /* Remove inode from superblock dirty list when no more pages.
258 * Make sure we don't point at the current inode with tmp
259 * when we re-init the list on the inode, or we will loop.
261 obd_down(&sbi->osi_list_mutex);
263 while ( (tmp = tmp->next) != inode_list ) {
264 struct obdfs_inode_info *ii;
267 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
268 inode = list_entry(ii, struct inode, u);
269 CDEBUG(D_INODE, "checking inode %ld empty\n", inode->i_ino);
270 if (list_empty(obdfs_iplist(inode))) {
271 CDEBUG(D_INODE, "remove inode %ld from dirty list\n",
274 list_del(obdfs_islist(inode));
276 INIT_LIST_HEAD(obdfs_islist(inode));
279 obd_up(&sbi->osi_list_mutex);
281 CDEBUG(D_INODE, "flushed %d pages in total\n", total_io);
285 } /* obdfs_flush_reqs */
/*
 * Flush queued write-back requests on every superblock linked on
 * obdfs_super_list.
 *
 * check_time - forwarded unchanged to obdfs_flush_reqs() for each
 *              superblock.  Within this file pupdate() passes 0 just before
 *              it marks itself stopped and 1 on its regular pass.
 */
288 void obdfs_flush_dirty_pages(int check_time)
290 struct list_head *sl;
292 sl = &obdfs_super_list;
/* Circular list walk: stop once we are back at the list head. */
293 while ( (sl = sl->next) != &obdfs_super_list ) {
294 struct obdfs_sb_info *sbi =
295 list_entry(sl, struct obdfs_sb_info, osi_list);
297 /* walk write requests here, use the sb, check the time */
/* Pass the caller's check_time through instead of a hard-coded 0. */
298 obdfs_flush_reqs(&sbi->osi_inodes, check_time);
/*
 * Task pointer for the flush daemon.
 * NOTE(review): no assignment to it is visible in this copy of the file.
 */
304 static struct task_struct *pupdated;
/*
 * Body of the "pupdated" kernel thread (started from flushd_init()).
 *
 * Visible steps: rename the task, adjust its blocked-signal mask, then
 * sleep pupd_prm.interval jiffies at a time and call
 * obdfs_flush_dirty_pages(1).  A pending SIGTERM is cleared under
 * sigmask_lock; obdfs_flush_dirty_pages(0), TASK_STOPPED and the
 * "pupdated stopped..." message form the shutdown path.  The loop and
 * branch structure joining these statements is missing from this copy.
 *
 * unused - thread argument; not referenced in the visible lines.
 */
307 static int pupdate(void *unused)
309 struct task_struct * tsk = current;
319 sprintf(tsk->comm, "pupdated");
322 printk("pupdated activated...\n");
324 /* sigstop and sigcont will stop and wakeup pupdate */
/*
 * NOTE(review): the mask is built with sigfillset() followed by
 * siginitsetinv(..., sigmask(SIGTERM)), so presumably every signal except
 * SIGTERM ends up blocked; the sigstop/sigcont wording above looks stale
 * -- confirm.
 */
325 spin_lock_irq(&tsk->sigmask_lock);
326 sigfillset(&tsk->blocked);
327 siginitsetinv(&tsk->blocked, sigmask(SIGTERM));
328 recalc_sigpending(tsk);
329 spin_unlock_irq(&tsk->sigmask_lock);
332 /* update interval */
333 interval = pupd_prm.interval;
/* Sleep for one interval; the sleep is interruptible. */
336 tsk->state = TASK_INTERRUPTIBLE;
337 schedule_timeout(interval);
/* Shutdown path: one last flush with check_time == 0, then stop. */
342 obdfs_flush_dirty_pages(0);
343 tsk->state = TASK_STOPPED;
344 /* MOD_DEC_USE_COUNT; */
345 printk("pupdated stopped...\n");
348 /* check for sigstop */
/* NOTE(review): the signal actually tested and cleared below is SIGTERM. */
349 if (signal_pending(tsk))
352 spin_lock_irq(&tsk->sigmask_lock);
353 if (sigismember(&tsk->signal, SIGTERM))
355 sigdelset(&tsk->signal, SIGTERM);
358 recalc_sigpending(tsk);
359 spin_unlock_irq(&tsk->sigmask_lock);
363 /* asynchronous setattr etc for the future ... */
364 /* flush_inodes(); */
/* Regular pass: flush with check_time == 1. */
365 obdfs_flush_dirty_pages(1);
/*
 * Start the write-back daemon: spawns pupdate() with kernel_thread() and
 * logs "flushd inited".  The value returned by kernel_thread() is not
 * captured in the visible lines, so a failed spawn would go unnoticed.
 * NOTE(review): the lines directly around the bdflush call are missing from
 * this copy, so that call may be commented out in the real file -- confirm
 * before relying on it.  The return statement is likewise not visible.
 */
370 int flushd_init(void)
373 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
375 /* MOD_INC_USE_COUNT; */
376 kernel_thread(pupdate, NULL, 0);
377 printk("flushd inited\n");
381 int flushd_cleanup(void)
383 /* this should deliver a signal to */
386 /* XXX Andreas, we will do this later, for now, you must kill
387 pupdated with a SIGTERM from userland, before unloading obdfs.o
390 /* then let it run at least once, before continuing */
392 /* XXX need to do something like this here:
393 send_sig(SIGTERM, current, 0);
396 /*obdfs_flush_dirty_pages(0); */