2 * OBDFS Super operations - also used for Lustre file system
5 * This code is issued under the GNU General Public License.
6 * See the file COPYING in this distribution
8 * Copyright (C) 1991, 1992 Linus Torvalds
9 * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
10 * Copyright (C) 1999 Seagate Technology Inc.
13 #define __NO_VERSION__
15 #include <linux/locks.h>
16 #include <linux/swap.h>
18 #include <linux/obd_support.h>
19 #include <linux/obd_class.h>
20 #include <linux/obdfs.h>
/* Stand-in for the kernel's nr_free_buffer_pages(): hard-coded to 32768
 * pages as a max-page-cache estimate, used by pupdate() to compute
 * dirty_limit.  NOTE(review): partial listing -- the terminator of the
 * comment below (original line 25) is missing from this excerpt. */
23 /* XXX temporary until the real function is available from kernel
24 * XXX set this to memory size in pages for max page cache size
26 #define nr_free_buffer_pages() 32768
/* Tunable parameters for the page-buffer flush daemon ("pupdated"),
 * mirroring bdflush's tunables.  NOTE(review): partial listing -- the
 * enclosing "struct pupd_prm {" line and several comment-continuation
 * lines are missing from this excerpt, so the field comments below
 * appear truncated. */
28 /* Defines for page buf daemon */
30 int nfract; /* Percentage of buffer cache dirty to
32 int ndirty; /* Maximum number of dirty blocks to write out per
34 int nrefill; /* Number of clean buffers to try to obtain
35 each time we call refill */
36 int nref_dirt; /* Dirty buffer threshold for activating bdflush
37 when trying to refill buffers. */
38 int interval; /* jiffies delay between pupdate flushes */
39 int age_buffer; /* Time for normal buffer to age before we flush it */
40 int age_super; /* Time for superblock to age before we flush it */
/* Singleton state for the pupdated flush daemon: its wakeup wait queue,
 * the periodic wakeup timer, and the flush tunables.  NOTE(review):
 * partial listing -- other members (e.g. the "active" flag read and
 * written by pupdate() and obdfs_flushd_cleanup()) and the closing
 * brace/initializer lines are missing from this excerpt. */
44 static struct pupdated {
46 wait_queue_head_t waitq;
47 struct timer_list timer;
48 struct pupd_prm parms;
/* Old GCC "field: value" designated-initializer syntax.  Values
 * presumably map in order to struct pupd_prm: nfract=40%, ndirty=1024,
 * nrefill=64, nref_dirt=256, interval=1s, age_buffer=30s, age_super=5s
 * -- TODO confirm against the full struct pupd_prm definition. */
51 parms: {40, 1024, 64, 256, 1*HZ, 30*HZ, 5*HZ }
/* Gather up to @nr_slots of @inode's queued dirty pages -- oldest first,
 * stopping at the first page newer than @check_time -- into the caller's
 * parallel I/O vectors (@pages/@bufs/@counts/@offsets), allocating *obdo
 * on first use via obdo_fromid() and setting *flag to OBD_BRW_CREATE.
 * Presumably returns the number of pages enqueued, or a negative errno
 * from obdo_fromid() -- TODO confirm; the return statements and several
 * body lines (num handling, page lookup, list removal) are missing from
 * this partial listing. */
55 /* Called with the superblock list lock held */
56 static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
57 int nr_slots, struct page **pages, char **bufs,
58 obd_size *counts, obd_off *offsets,
59 obd_flag *flag, unsigned long check_time)
61 struct list_head *page_list = obdfs_iplist(inode);
62 struct list_head *tmp;
68 /* Traverse list in reverse order, so we do FIFO, not LIFO order */
69 while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
70 struct obdfs_pgrq *req;
73 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
/* rq_jiffies records when the page was dirtied; the queue is kept in
 * chronological order, so the first too-new entry ends the scan. */
77 if (req->rq_jiffies > check_time)
78 break; /* pages are in chronological order */
80 /* Only allocate the obdo if we will actually do I/O here */
83 *obdo = obdo_fromid(IID(inode), inode->i_ino,
85 if ( IS_ERR(*obdo) ) {
86 int err = PTR_ERR(*obdo);
93 /* FIXME revisit fromid & from_inode */
94 obdfs_from_inode(*obdo, inode);
95 *flag = OBD_BRW_CREATE;
98 /* Remove request from list before write to avoid conflict.
99 * Note that obdfs_pgrq_del() also deletes the request.
103 CDEBUG(D_CACHE, "no page \n");
107 bufs[num] = (char *)page_address(page);
109 counts[num] = PAGE_SIZE;
110 offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
111 CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n",
112 inode->i_ino, page, (char *)page_address(page));
116 if (!list_empty(page_list))
117 CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
118 CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
122 } /* obdfs_enqueue_pages */
/* Discard (rather than write back) all queued dirty pages of an inode
 * that is being destroyed, then drop the page-cache reference the dirty
 * list held on the inode.  NOTE(review): partial listing -- several
 * statements (list_del of the super-list entry, the per-request free,
 * the page release, braces/returns) are missing from this excerpt. */
124 /* Dequeue cached pages for a dying inode without writing them to disk. */
125 void obdfs_dequeue_pages(struct inode *inode)
127 struct list_head *tmp;
/* Serialize against the flush daemon while we empty the per-inode and
 * per-superblock dirty lists. */
130 obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
131 tmp = obdfs_islist(inode);
/* Nothing queued: drop the mutex and presumably return early -- the
 * return statement itself is not visible in this partial listing. */
132 if ( list_empty(tmp) ) {
133 CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
134 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
139 /* take it out of the super list */
141 INIT_LIST_HEAD(obdfs_islist(inode));
/* Walk the per-inode request list oldest-first and free each entry. */
143 tmp = obdfs_iplist(inode);
144 while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) {
145 struct obdfs_pgrq *req;
148 req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
150 /* take it out of the list and free */
152 /* now put the page away */
156 obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
158 /* decrement inode reference for page cache */
159 atomic_dec(&inode->i_count);
163 /* This value is not arbitrarily chosen. KIO_STATIC_PAGES from linux/iobuf.h */
/* Max pages per batched write: the static page slots of one kiobuf,
 * minus one -- presumably to leave room for an unaligned first page;
 * TODO confirm against linux/iobuf.h. */
164 #define MAX_IOVEC (KIO_STATIC_PAGES - 1)
/* Flush aged dirty pages for every inode on @inode_list (the dirty-inode
 * list of one superblock), batching up to MAX_IOVEC pages per
 * obdfs_do_vec_wr() call, then prune inodes whose page lists are now
 * empty.  Passing check_time == ~0UL forces all dirty pages out.
 * Presumably returns a negative error or the number of pages flushed
 * (see the final return) -- NOTE(review): partial listing, many body
 * lines (declarations of num_io/num_obdos/res/err/inode, loop braces,
 * error paths) are missing from this excerpt. */
166 /* Remove writeback requests for the superblock */
167 int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
169 struct list_head *tmp;
170 unsigned long max_io, total_io = 0;
/* Parallel arrays describing one batched vectored write. */
173 struct inode *inodes[MAX_IOVEC]; /* write data back to these */
174 struct page *pages[MAX_IOVEC]; /* call put_page on these */
175 struct obdo *obdos[MAX_IOVEC];
176 char *bufs[MAX_IOVEC];
177 obd_size counts[MAX_IOVEC];
178 obd_off offsets[MAX_IOVEC];
179 obd_flag flags[MAX_IOVEC];
180 obd_count bufs_per_obdo[MAX_IOVEC];
182 struct obdfs_sb_info *sbi;
186 CDEBUG(D_INODE, "no list\n");
/* Recover the owning sb_info from the list head embedded in it. */
191 sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
193 obd_down(&sbi->osi_list_mutex);
194 if ( list_empty(inode_list) ) {
195 CDEBUG(D_INFO, "list empty\n");
196 obd_up(&sbi->osi_list_mutex);
201 /* If we are forcing a write, write out all dirty pages */
/* NOTE(review): "1<<31" left-shifts into the sign bit of a 32-bit int
 * (undefined behaviour); this should likely be "1UL << 31" or "~0UL". */
202 max_io = check_time == ~0UL ? 1<<31 : pupdated.parms.ndirty;
203 CDEBUG(D_INFO, "max_io = %lu\n", max_io);
205 /* Add each inode's dirty pages to a write vector, and write it.
206 * Traverse list in reverse order, so we do FIFO, not LIFO order
212 while ( (tmp = tmp->prev) != inode_list && total_io < max_io) {
213 struct obdfs_inode_info *ii;
217 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
218 inode = list_entry(ii, struct inode, u);
219 inodes[num_obdos] = inode;
220 obdos[num_obdos] = NULL;
221 CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
223 /* Make sure we reference "inode" and not "inodes[num_obdos]",
224 * as num_obdos will change after the loop is run.
226 if (!list_empty(obdfs_iplist(inode))) {
227 res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
229 &pages[num_io], &bufs[num_io],
234 CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
238 "fatal: unable to enqueue inode %ld (err %d)\n",
240 /* XXX Move bad inode to end of list so we can
241 * continue with flushing list. This is a
242 * temporary measure to avoid machine lockups.
243 * Maybe if we have -ENOENT, simply discard.
246 list_add(tmp, inode_list);
256 bufs_per_obdo[num_obdos] = res;
259 if ( num_io == MAX_IOVEC ) {
260 obd_up(&sbi->osi_list_mutex);
261 err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
262 obdos, bufs_per_obdo,
267 "fatal: do_vec_wr err=%d\n",
272 obd_down(&sbi->osi_list_mutex);
279 obd_up(&sbi->osi_list_mutex);
281 /* flush any remaining I/Os */
/* Final partial batch; the list mutex is dropped above, so this write
 * runs unlocked like the in-loop full-batch writes. */
283 err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
284 bufs_per_obdo, pages, bufs, counts,
287 CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
292 /* Remove inode from superblock dirty list when no more pages.
293 * Make sure we don't point at the current inode with tmp
294 * when we re-init the list on the inode, or we will loop.
296 obd_down(&sbi->osi_list_mutex);
298 while ( (tmp = tmp->prev) != inode_list ) {
299 struct obdfs_inode_info *ii;
302 ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
303 inode = list_entry(ii, struct inode, u);
304 CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
305 if (list_empty(obdfs_iplist(inode))) {
306 CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
309 list_del(obdfs_islist(inode));
310 /* decrement inode reference for page cache */
311 atomic_dec(&inode->i_count);
312 INIT_LIST_HEAD(obdfs_islist(inode));
315 obd_up(&sbi->osi_list_mutex);
317 CDEBUG(D_INFO, "flushed %ld pages in total\n", total_io);
320 return err ? err : total_io;
321 } /* obdfs_flush_reqs */
/* Walk the global obdfs_super_list (via ->prev, i.e. FIFO order) and
 * flush each superblock's requests older than @check_time; returns the
 * largest per-filesystem flush count -- TODO confirm: the declarations
 * of ret/max and the return statement are missing from this partial
 * listing. */
324 /* Walk all of the superblocks and write out blocks which are too old.
325 * Return the maximum number of blocks written for a single filesystem.
327 int obdfs_flush_dirty_pages(unsigned long check_time)
329 struct list_head *sl;
333 sl = &obdfs_super_list;
334 while ( (sl = sl->prev) != &obdfs_super_list ) {
335 struct obdfs_sb_info *sbi =
336 list_entry(sl, struct obdfs_sb_info, osi_list);
339 /* walk write requests here, use the sb, check the time */
340 ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
341 /* XXX handle error? What to do with it? */
/* Track the maximum flushed by any single filesystem. */
343 max = ret > max ? ret : max;
347 } /* obdfs_flush_dirty_pages */
/* Timer callback: wake the sleeping pupdated daemon for its next
 * periodic flush pass.  The timer "data" argument @l is unused. */
350 static void pupdate_wakeup(unsigned long l)
352 wake_up(&pupdated.waitq);
/* Kernel-thread body of the "pupdated" daemon: sleeps "interval"
 * jiffies at a time, flushing pages older than "age" each pass and
 * adapting both knobs to cache pressure, until pupdated.active is
 * cleared by obdfs_flushd_cleanup(); it then acknowledges shutdown by
 * setting active = -1 and waking the waiter.  NOTE(review): partial
 * listing -- the main loop opening, several declarations (flags, wrote,
 * dirty_limit) and other statements are missing from this excerpt. */
356 static int pupdate(void *unused)
359 int interval = pupdated.parms.interval;
360 long age = pupdated.parms.age_buffer;
/* Refuse to start a second instance of the daemon. */
363 if (pupdated.active >= 0) {
364 CDEBUG(D_CACHE, "attempted to run multiple pupdates\n");
368 init_timer(&pupdated.timer);
369 init_waitqueue_head(&pupdated.waitq);
370 pupdated.timer.function = pupdate_wakeup;
/* Detach from the controlling session and name the kernel thread. */
376 current->session = 1;
378 strcpy(current->comm, "pupdated");
380 CDEBUG(D_CACHE, "pupdated activated...\n");
/* NOTE(review): the "&curren" sequences below are mojibake -- the text
 * "&current" was mis-decoded as an HTML currency-sign entity.  These
 * lines should read spin_lock_irqsave(&current->sigmask_lock, flags),
 * sigfillset(&current->blocked), and so on; block all signals so the
 * daemon cannot be killed.  Confirm against the original source. */
383 spin_lock_irqsave(¤t->sigmask_lock, flags);
384 flush_signals(current);
385 sigfillset(¤t->blocked);
386 recalc_sigpending(current);
387 spin_unlock_irqrestore(¤t->sigmask_lock, flags);
392 /* update interval */
/* Sleep until the wakeup timer fires (or we are told to stop). */
393 if (pupdated.active == 1 && interval) {
394 mod_timer(&pupdated.timer, jiffies + interval);
395 interruptible_sleep_on(&pupdated.waitq);
397 if (pupdated.active == 0) {
398 del_timer(&pupdated.timer);
399 /* If stopped, we flush one last time...
402 /* asynchronous setattr etc for the future ...
403 obdfs_flush_dirty_inodes(jiffies - pupdated.parms.age_super);
405 dirty_limit = nr_free_buffer_pages() * pupdated.parms.nfract / 100;
407 if (obdfs_cache_count > dirty_limit) {
409 if (wrote < pupdated.parms.ndirty)
412 CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n",
413 wrote, age, interval);
415 if (wrote < pupdated.parms.ndirty >> 1 &&
416 obdfs_cache_count < dirty_limit / 2) {
417 interval = pupdated.parms.interval;
418 age = pupdated.parms.age_buffer;
421 "wrote %d, age %ld, interval %d\n",
422 wrote, age, interval);
423 } else if (obdfs_cache_count > dirty_limit / 2) {
425 if (wrote < pupdated.parms.ndirty)
429 "wrote %d, age %ld, interval %d\n",
430 wrote, age, interval);
434 wrote = obdfs_flush_dirty_pages(jiffies - age);
437 "dirty_limit %ld, cache_count %ld, wrote %d\n",
438 dirty_limit, obdfs_cache_count, wrote);
439 run_task_queue(&tq_disk);
441 } while (pupdated.active == 1);
443 CDEBUG(D_CACHE, "pupdated stopped...\n");
444 pupdated.active = -1;
445 wake_up(&pupdated.waitq);
/* Spawn the pupdated kernel thread at filesystem setup.  NOTE(review):
 * partial listing -- braces and the return statement are missing, and
 * the bdflush kernel_thread line below may be disabled (commented or
 * #if 0) in the full source; TODO confirm before relying on it. */
450 int obdfs_flushd_init(void)
453 kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
455 kernel_thread(pupdate, NULL, 0);
456 CDEBUG(D_PSDEV, "flushd inited\n");
/* Stop the pupdated daemon at teardown: clear its active flag (the
 * assignment is not visible in this partial listing), wake it, then
 * sleep until pupdate() acknowledges by setting pupdated.active to -1.
 * NOTE(review): the function continues past the end of this excerpt. */
460 int obdfs_flushd_cleanup(void)
464 /* Shut down pupdated. */
465 if (pupdated.active > 0) {
466 CDEBUG(D_CACHE, "inform pupdated\n");
468 wake_up(&pupdated.waitq);
470 CDEBUG(D_CACHE, "wait for pupdated\n");
/* active == 0 means the stop request is posted but the daemon has not
 * yet exited; pupdate() sets it to -1 just before returning. */
471 while (pupdated.active == 0) {
472 interruptible_sleep_on(&pupdated.waitq);
474 CDEBUG(D_CACHE, "done waiting for pupdated\n");