2 * OBDFS Super operations - also used for Lustre file system
5 * This code is issued under the GNU General Public License.
6 * See the file COPYING in this distribution
8 * Copyright (C) 1991, 1992 Linus Torvalds
9 * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
10 * Copyright (C) 1999 Seagate Technology Inc.
13 #define __NO_VERSION__
15 #include <linux/locks.h>
16 #include <linux/swap.h>
18 #include <linux/obd_support.h>
19 #include <linux/obd_class.h>
20 #include <linux/obdfs.h>
23 /* XXX temporary until the real function is available from kernel
24 * XXX set this to memory size in pages for max page cache size
 * NOTE(review): hard-coded stand-in for the kernel helper; pupdate() below
 * uses this value to size dirty_limit as pupd_prm.nfract percent of it.
26 #define nr_free_buffer_pages() 32768
/* Tuning parameters for the pupdate flush daemon, modelled on the kernel
 * bdflush knobs.  The initializer at the closing brace maps to:
 * nfract=40, ndirty=1024, nrefill=64, nref_dirt=256, interval=1*HZ,
 * age_buffer=30*HZ, age_super=5*HZ.
 * NOTE(review): the opening "struct { ... }" line is not visible in this
 * excerpt — confirm against the full file. */
29 int nfract; /* Percentage of buffer cache dirty to
31 int ndirty; /* Maximum number of dirty blocks to write out per
33 int nrefill; /* Number of clean buffers to try to obtain
34 each time we call refill */
35 int nref_dirt; /* Dirty buffer threshold for activating bdflush
36 when trying to refill buffers. */
37 int interval; /* jiffies delay between pupdate flushes */
38 int age_buffer; /* Time for normal buffer to age before we flush it */
39 int age_super; /* Time for superblock to age before we flush it */
40 } pupd_prm = {40, 1024, 64, 256, 1*HZ, 30*HZ, 5*HZ };
42 /* Called with the superblock list lock held */
/*
 * Gather queued dirty-page requests for @inode into the caller's parallel
 * I/O vectors (@pages, @bufs, @counts, @offsets), oldest first, stopping at
 * @nr_slots entries or at the first request newer than @check_time.  An
 * obdo is allocated via obdo_fromid() only when at least one page will be
 * written, and *flag is set to OBD_BRW_CREATE.
 * NOTE(review): several interior lines (including the declaration of the
 * slot counter "num" and the return statement) are missing from this
 * excerpt; presumably the function returns the number of slots filled or a
 * negative errno from obdo_fromid() — confirm against the full file.
 */
43 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
44 int nr_slots, struct page **pages, char **bufs,
45 obd_size *counts, obd_off *offsets,
46 obd_flag *flag, unsigned long check_time)
48 struct list_head *page_list = obdfs_iplist(inode);
49 struct list_head *tmp;
55 /* Traverse list in reverse order, so we do FIFO, not LIFO order */
56 while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
57 struct obdfs_pgrq *req;
60 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
/* Requests are queued in arrival order, so the first one that is too
 * young to flush ends the scan. */
64 if (req->rq_jiffies > check_time)
65 break; /* pages are in chronological order */
67 /* Only allocate the obdo if we will actually do I/O here */
70 *obdo = obdo_fromid(IID(inode), inode->i_ino,
72 if ( IS_ERR(*obdo) ) {
73 int err = PTR_ERR(*obdo);
80 /* FIXME revisit fromid & from_inode */
81 obdfs_from_inode(*obdo, inode);
82 *flag = OBD_BRW_CREATE;
85 /* Remove request from list before write to avoid conflict.
86 * Note that obdfs_pgrq_del() also deletes the request.
90 CDEBUG(D_CACHE, "no page \n");
/* Fill one slot of the caller's parallel vectors: kernel address,
 * full-page byte count, and the page's byte offset in the object. */
94 bufs[num] = (char *)page_address(page);
96 counts[num] = PAGE_SIZE;
97 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
98 CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n",
99 inode->i_ino, page, (char *)page_address(page));
103 if (!list_empty(page_list))
104 CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
105 CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
109 } /* obdfs_enqueue_pages */
111 /* Dequeue cached pages for a dying inode without writing them to disk. */
/*
 * Under the superblock's osi_list_mutex: unlink @inode from the
 * superblock's dirty-inode list, free every queued page request on the
 * inode's page list, then drop the i_count reference that the page cache
 * held on the inode.  Returns early (after releasing the mutex) when the
 * inode has no dirty pages queued.
 */
112 void obdfs_dequeue_pages(struct inode *inode)
114 struct list_head *tmp;
117 obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
118 tmp = obdfs_islist(inode);
119 if ( list_empty(tmp) ) {
120 CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
121 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
126 /* take it out of the super list */
128 INIT_LIST_HEAD(obdfs_islist(inode));
/* Walk the page-request list tail-first (oldest entries first). */
130 tmp = obdfs_iplist(inode);
131 while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) {
132 struct obdfs_pgrq *req;
135 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
137 /* take it out of the list and free */
139 /* now put the page away */
143 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
145 /* decrement inode reference for page cache */
146 atomic_dec(&inode->i_count);
150 /* Remove writeback requests for the superblock */
/*
 * Flush dirty pages for every inode on @inode_list (a superblock's
 * osi_inodes list) that are older than @check_time.  Pages are batched
 * into MAX_IOVEC-sized vectors and written with obdfs_do_vec_wr(); the
 * list mutex is dropped around each write and re-taken afterwards.
 * @check_time == ~0UL forces all dirty pages out (max_io is raised from
 * pupd_prm.ndirty to 1<<31).  After writing, inodes whose page list is
 * now empty are removed from the dirty list and their page-cache i_count
 * reference is dropped.  Returns a negative error, or on success the
 * total number of pages flushed.
 */
151 int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
153 struct list_head *tmp;
154 unsigned long max_io, total_io = 0;
/* Parallel per-slot vectors handed to obdfs_do_vec_wr(). */
157 struct inode *inodes[MAX_IOVEC]; /* write data back to these */
158 struct page *pages[MAX_IOVEC]; /* call put_page on these */
159 struct obdo *obdos[MAX_IOVEC];
160 char *bufs[MAX_IOVEC];
161 obd_size counts[MAX_IOVEC];
162 obd_off offsets[MAX_IOVEC];
163 obd_flag flags[MAX_IOVEC];
164 obd_count bufs_per_obdo[MAX_IOVEC];
166 struct obdfs_sb_info *sbi;
170 CDEBUG(D_INODE, "no list\n");
/* Recover the sb_info from the embedded osi_inodes list head. */
175 sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
177 obd_down(&sbi->osi_list_mutex);
178 if ( list_empty(inode_list) ) {
179 CDEBUG(D_INFO, "list empty\n");
180 obd_up(&sbi->osi_list_mutex);
185 /* If we are forcing a write, write out all dirty pages */
186 max_io = check_time == ~0UL ? 1<<31 : pupd_prm.ndirty;
187 CDEBUG(D_INFO, "max_io = %lu\n", max_io);
189 /* Add each inode's dirty pages to a write vector, and write it.
190 * Traverse list in reverse order, so we do FIFO, not LIFO order
196 while ( (tmp = tmp->prev) != inode_list && total_io < max_io) {
197 struct obdfs_inode_info *ii;
/* The inode_info lives in inode->u, so back out to the inode. */
201 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
202 inode = list_entry(ii, struct inode, u);
203 inodes[num_obdos] = inode;
204 obdos[num_obdos] = NULL;
205 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
207 /* Make sure we reference "inode" and not "inodes[num_obdos]",
208 * as num_obdos will change after the loop is run.
210 if (!list_empty(obdfs_iplist(inode))) {
211 res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
213 &pages[num_io], &bufs[num_io],
218 CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
222 "fatal: unable to enqueue inode %ld (err %d)\n",
224 /* XXX Move bad inode to end of list so we can
225 * continue with flushing list. This is a
226 * temporary measure to avoid machine lockups.
227 * Maybe if we have -ENOENT, simply discard.
230 list_add(tmp, inode_list);
240 bufs_per_obdo[num_obdos] = res;
/* Vector full: release the list mutex (the write may sleep),
 * write the batch, then re-take the mutex and continue. */
243 if ( num_io == MAX_IOVEC ) {
244 obd_up(&sbi->osi_list_mutex);
245 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
246 obdos, bufs_per_obdo,
251 "fatal: do_vec_wr err=%d\n",
256 obd_down(&sbi->osi_list_mutex);
263 obd_up(&sbi->osi_list_mutex);
265 /* flush any remaining I/Os */
267 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
268 bufs_per_obdo, pages, bufs, counts,
271 CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
276 /* Remove inode from superblock dirty list when no more pages.
277 * Make sure we don't point at the current inode with tmp
278 * when we re-init the list on the inode, or we will loop.
280 obd_down(&sbi->osi_list_mutex);
282 while ( (tmp = tmp->prev) != inode_list ) {
283 struct obdfs_inode_info *ii;
286 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
287 inode = list_entry(ii, struct inode, u);
288 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
289 if (list_empty(obdfs_iplist(inode))) {
290 CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
293 list_del(obdfs_islist(inode));
294 /* decrement inode reference for page cache */
295 atomic_dec(&inode->i_count);
296 INIT_LIST_HEAD(obdfs_islist(inode));
299 obd_up(&sbi->osi_list_mutex);
301 CDEBUG(D_INFO, "flushed %ld pages in total\n", total_io);
304 return err ? err : total_io;
305 } /* obdfs_flush_reqs */
308 /* Walk all of the superblocks and write out blocks which are too old.
309 * Return the maximum number of blocks written for a single filesystem.
311 int obdfs_flush_dirty_pages(unsigned long check_time)
313 struct list_head *sl;
/* Walk the global superblock list tail-first and flush each one's
 * dirty-inode list; track the largest per-filesystem flush count.
 * NOTE(review): no lock on obdfs_super_list is visible in this excerpt —
 * confirm the traversal is protected elsewhere. */
317 sl = &obdfs_super_list;
318 while ( (sl = sl->prev) != &obdfs_super_list ) {
319 struct obdfs_sb_info *sbi =
320 list_entry(sl, struct obdfs_sb_info, osi_list);
323 /* walk write requests here, use the sb, check the time */
324 ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
325 /* XXX handle error? What to do with it? */
/* A negative ret (error) never beats a non-negative max here. */
327 max = ret > max ? ret : max;
331 } /* obdfs_flush_dirty_pages */
/* Task struct of the flush daemon; set when the thread starts and used by
 * obdfs_flushd_cleanup() to signal it. */
334 static struct task_struct *pupdated;
/*
 * Kernel-thread body of the "pupdated" flush daemon.  Detaches from the
 * parent session, blocks every signal except SIGTERM, then loops: sleep
 * ~interval jiffies, flush pages older than (jiffies - age) via
 * obdfs_flush_dirty_pages(), and adapt interval/age to cache pressure
 * (obdfs_cache_count vs. dirty_limit = nfract% of buffer pages).
 * SIGTERM breaks the loop and stops the thread.
 */
336 static int pupdate(void *unused)
338 int interval = pupd_prm.interval;
339 long age = pupd_prm.age_buffer;
346 pupdated->session = 1;
348 strcpy(pupdated->comm, "pupdated");
350 printk("pupdated activated...\n");
/* Block all signals except SIGTERM, the shutdown request. */
352 spin_lock_irq(&pupdated->sigmask_lock);
353 sigfillset(&pupdated->blocked);
354 siginitsetinv(&pupdated->blocked, sigmask(SIGTERM));
355 recalc_sigpending(pupdated);
356 spin_unlock_irq(&pupdated->sigmask_lock);
361 /* update interval */
363 set_task_state(pupdated, TASK_INTERRUPTIBLE);
364 schedule_timeout(interval);
/* Woken by a signal: only SIGTERM is deliverable; consume it and
 * shut the daemon down. */
366 if (signal_pending(pupdated))
369 spin_lock_irq(&pupdated->sigmask_lock);
370 if (sigismember(&pupdated->pending.signal, SIGTERM))
372 sigdelset(&pupdated->pending.signal, SIGTERM);
375 recalc_sigpending(pupdated);
376 spin_unlock_irq(&pupdated->sigmask_lock);
378 printk("pupdated stopped...\n");
379 set_task_state(pupdated, TASK_STOPPED);
384 /* asynchronous setattr etc for the future ...
385 obdfs_flush_dirty_inodes(jiffies - pupd_prm.age_super);
387 dirty_limit = nr_free_buffer_pages() * pupd_prm.nfract / 100;
/* Adapt flushing aggressiveness to cache pressure: over the limit
 * flush harder; well under it fall back to the default cadence. */
389 if (obdfs_cache_count > dirty_limit) {
391 if ( wrote < pupd_prm.ndirty )
394 CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n",
395 wrote, age, interval);
397 if ( wrote < pupd_prm.ndirty >> 1 &&
398 obdfs_cache_count < dirty_limit / 2) {
399 interval = pupd_prm.interval;
400 age = pupd_prm.age_buffer;
403 "wrote %d, age %ld, interval %d\n",
404 wrote, age, interval);
405 } else if (obdfs_cache_count > dirty_limit / 2) {
407 if ( wrote < pupd_prm.ndirty )
411 "wrote %d, age %ld, interval %d\n",
412 wrote, age, interval);
416 wrote = obdfs_flush_dirty_pages(jiffies - age);
419 "dirty_limit %ld, cache_count %ld, wrote %d\n",
420 dirty_limit, obdfs_cache_count, wrote);
/* Start the pupdate flush daemon as a kernel thread.
 * NOTE(review): the bdflush kernel_thread() line below appears to be
 * inactive in the original (likely commented out or #if 0) — confirm
 * against the full file before relying on it. */
425 int obdfs_flushd_init(void)
428 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
430 kernel_thread(pupdate, NULL, 0);
431 CDEBUG(D_PSDEV, __FUNCTION__ ": flushd inited\n");
/* Stop the pupdate daemon: send it SIGTERM and wait (sleeping in HZ/20
 * slices) for it to leave the running/interruptible states.
 * NOTE(review): this function is truncated in this excerpt; the tail of
 * the wait loop and the return are not visible. */
435 int obdfs_flushd_cleanup(void)
439 if (pupdated) /* for debugging purposes only */
440 CDEBUG(D_CACHE, "pupdated->state = %lx\n", pupdated->state);
442 /* deliver a signal to pupdated to shut it down */
443 if (pupdated && (pupdated->state == TASK_RUNNING ||
444 pupdated->state == TASK_INTERRUPTIBLE )) {
445 unsigned long timeout = HZ/20;
446 unsigned long count = 0;
/* (struct siginfo *)1 == SEND_SIG_PRIV-style "from kernel" marker. */
447 send_sig_info(SIGTERM, (struct siginfo *)1, pupdated);
/* NOTE(review): "count % 2*HZ" parses as "(count % 2) * HZ" due to
 * precedence — presumably "count % (2*HZ)" was intended; confirm. */
449 if ((count % 2*HZ) == timeout)
450 printk(KERN_INFO "wait for pupdated to stop\n");
452 set_current_state(TASK_INTERRUPTIBLE);
453 schedule_timeout(timeout);