/*
* OBDFS Super operations - also used for Lustre file system
*
- *
+ *
+ * This code is issued under the GNU General Public License.
+ * See the file COPYING in this distribution
+ *
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
* Copyright (C) 1999 Seagate Technology Inc.
*
*/
#define __NO_VERSION__
-#include <linux/module.h>
-#include <linux/sched.h>
#include <linux/fs.h>
-#include <linux/malloc.h>
#include <linux/locks.h>
-#include <linux/errno.h>
#include <linux/swap.h>
-#include <linux/smp_lock.h>
-#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
-#include <linux/sysrq.h>
-#include <linux/file.h>
-#include <linux/init.h>
-#include <linux/quotaops.h>
-#include <linux/iobuf.h>
-#include <linux/highmem.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/bitops.h>
-#include <asm/mmu_context.h>
#include <linux/obd_support.h>
#include <linux/obd_class.h>
#include <linux/obdfs.h>
+/* XXX temporary until the real function is available from kernel
+ * XXX set this to memory size in pages for max page cache size
+ */
+#define nr_free_buffer_pages() 32768
+
+/* Defines for page buf daemon */
+struct pupd_prm {
+ int nfract; /* Percentage of buffer cache dirty to
+ activate bdflush */
+ int ndirty; /* Maximum number of dirty blocks to write out per
+ wake-cycle */
+ int nrefill; /* Number of clean buffers to try to obtain
+ each time we call refill */
+ int nref_dirt; /* Dirty buffer threshold for activating bdflush
+ when trying to refill buffers. */
+ int interval; /* jiffies delay between pupdate flushes */
+ int age_buffer; /* Time for normal buffer to age before we flush it */
+ int age_super; /* Time for superblock to age before we flush it */
+};
+
-struct {
- int nfract; /* Percentage of buffer cache dirty to
- activate bdflush */
- int ndirty; /* Maximum number of dirty blocks to write out per
- wake-cycle */
- int nrefill; /* Number of clean buffers to try to obtain
- each time we call refill */
- int nref_dirt; /* Dirty buffer threshold for activating bdflush
- when trying to refill buffers. */
- int interval; /* jiffies delay between kupdate flushes */
- int age_buffer; /* Time for normal buffer to age before we flush it */
- int age_super; /* Time for superblock to age before we flush it */
-} pupd_prm = {40, 500, 64, 256, 5*HZ, 30*HZ, 5*HZ };
-
-/* static void obdfs_flush_reqs(struct obdfs_super_info *sbi, int wait,
-
-*/
-static void obdfs_flush_reqs(struct obdfs_super_info *sbi, int check_time)
+static struct pupdated {
+ int active;
+ wait_queue_head_t waitq;
+ struct timer_list timer;
+ struct pupd_prm parms;
+} pupdated = {
+ active: -1,
+ parms: {40, 1024, 64, 256, 1*HZ, 30*HZ, 5*HZ }
+};
+
+
+/* Called with the superblock list lock held */
+static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo,
+ int nr_slots, struct page **pages, char **bufs,
+ obd_size *counts, obd_off *offsets,
+ obd_flag *flag, unsigned long check_time)
{
- struct list_head *wr;
- struct obdfs_pgrq *req;
-
- wr = &sbi->s_wr_head;
- while ( (wr = wr->next) != &sbi->s_wr_head ) {
- req = list_entry(wr, struct obdfs_pgrq, rq_list);
-
- if (!check_time ||
- req->rq_jiffies <= (jiffies - pupd_prm.age_buffer)) {
- /* write request out to disk */
- obdfs_do_writepage(req->rq_inode, req->rq_page, 1);
- }
+ struct list_head *page_list = obdfs_iplist(inode);
+ struct list_head *tmp;
+ int num = 0;
- }
+ ENTRY;
+
+ tmp = page_list;
+ /* Traverse list in reverse order, so we do FIFO, not LIFO order */
+ while ( (tmp = tmp->prev) != page_list && num < nr_slots ) {
+ struct obdfs_pgrq *req;
+ struct page *page;
+
+ req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
+ page = req->rq_page;
+
+
+ if (req->rq_jiffies > check_time)
+ break; /* pages are in chronological order */
+
+ /* Only allocate the obdo if we will actually do I/O here */
+ if ( !*obdo ) {
+ OIDEBUG(inode);
+ *obdo = obdo_fromid(IID(inode), inode->i_ino,
+ OBD_MD_FLNOTOBD);
+ if ( IS_ERR(*obdo) ) {
+ int err = PTR_ERR(*obdo);
+ *obdo = NULL;
+
+ EXIT;
+ return err;
+ }
+
+ /* FIXME revisit fromid & from_inode */
+ obdfs_from_inode(*obdo, inode);
+ *flag = OBD_BRW_CREATE;
+ }
+
+ /* Remove request from list before write to avoid conflict.
+ * Note that obdfs_pgrq_del() also deletes the request.
+ */
+ obdfs_pgrq_del(req);
+ if ( !page ) {
+ CDEBUG(D_CACHE, "no page \n");
+ continue;
+ }
+
+ bufs[num] = (char *)page_address(page);
+ pages[num] = page;
+ counts[num] = PAGE_SIZE;
+ offsets[num] = ((obd_off)page->index) << PAGE_SHIFT;
+ CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n",
+ inode->i_ino, page, (char *)page_address(page));
+ num++;
+ }
+
+ if (!list_empty(page_list))
+ CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino);
+ CDEBUG(D_INFO, "added %d page(s) to vector\n", num);
+ EXIT;
+ return num;
+} /* obdfs_enqueue_pages */
+
+/* Dequeue cached pages for a dying inode without writing them to disk. */
+void obdfs_dequeue_pages(struct inode *inode)
+{
+ struct list_head *tmp;
+
+ ENTRY;
+ obd_down(&obdfs_i2sbi(inode)->osi_list_mutex);
+ tmp = obdfs_islist(inode);
+ if ( list_empty(tmp) ) {
+ CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino);
+ obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
+ EXIT;
+ return;
+ }
+
+ /* take it out of the super list */
+ list_del(tmp);
+ INIT_LIST_HEAD(obdfs_islist(inode));
+
+ tmp = obdfs_iplist(inode);
+ while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) {
+ struct obdfs_pgrq *req;
+ struct page *page;
+
+ req = list_entry(tmp, struct obdfs_pgrq, rq_plist);
+ page = req->rq_page;
+ /* take it out of the list and free */
+ obdfs_pgrq_del(req);
+ /* now put the page away */
+ put_page(page);
+ }
+
+ obd_up(&obdfs_i2sbi(inode)->osi_list_mutex);
+
+ /* decrement inode reference for page cache */
+ atomic_dec(&inode->i_count);
+ EXIT;
}
+/* This value is not arbitrarily chosen. KIO_STATIC_PAGES from linux/iobuf.h */
+#define MAX_IOVEC (KIO_STATIC_PAGES - 1)
-static void obdfs_flush_dirty_pages(int check_time)
+/* Remove writeback requests for the superblock */
+int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time)
{
- struct list_head *sl;
- struct obdfs_super_info *sbi;
+ struct list_head *tmp;
+ unsigned long max_io, total_io = 0;
+ obd_count num_io;
+ obd_count num_obdos;
+ struct inode *inodes[MAX_IOVEC]; /* write data back to these */
+ struct page *pages[MAX_IOVEC]; /* call put_page on these */
+ struct obdo *obdos[MAX_IOVEC];
+ char *bufs[MAX_IOVEC];
+ obd_size counts[MAX_IOVEC];
+ obd_off offsets[MAX_IOVEC];
+ obd_flag flags[MAX_IOVEC];
+ obd_count bufs_per_obdo[MAX_IOVEC];
+ int err = 0;
+ struct obdfs_sb_info *sbi;
- sl = &obdfs_super_list;
- while ( (sl = sl->next) != &obdfs_super_list ) {
- struct obdfs_super_entry *entry =
- list_entry(sl, struct obdfs_super_entry, sl_chain);
- struct obdfs_super_info *sbi = entry->sl_sbi;
+ ENTRY;
+ if (!inode_list) {
+ CDEBUG(D_INODE, "no list\n");
+ EXIT;
+ return 0;
+ }
- /* walk write requests here */
- obdfs_flush_reqs(sbi, jiffies);
- }
+ sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes);
- /* again, but now we wait for completion */
- sl = &obdfs_super_list;
- while ( (sl = sl->next) != &obdfs_super_list ) {
- struct obdfs_super_entry *entry =
- list_entry(sl, struct obdfs_super_entry, sl_chain);
- sbi = entry->sl_sbi;
+ obd_down(&sbi->osi_list_mutex);
+ if ( list_empty(inode_list) ) {
+ CDEBUG(D_INFO, "list empty\n");
+ obd_up(&sbi->osi_list_mutex);
+ EXIT;
+ return 0;
+ }
- /* walk write requests here */
- obdfs_flush_reqs(sbi, jiffies);
- }
+ /* If we are forcing a write, write out all dirty pages */
+ max_io = check_time == ~0UL ? 1<<31 : pupdated.parms.ndirty;
+ CDEBUG(D_INFO, "max_io = %lu\n", max_io);
+
+ /* Add each inode's dirty pages to a write vector, and write it.
+ * Traverse list in reverse order, so we do FIFO, not LIFO order
+ */
+ again:
+ tmp = inode_list;
+ num_io = 0;
+ num_obdos = 0;
+ while ( (tmp = tmp->prev) != inode_list && total_io < max_io) {
+ struct obdfs_inode_info *ii;
+ struct inode *inode;
+ int res;
+
+ ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
+ inode = list_entry(ii, struct inode, u);
+ inodes[num_obdos] = inode;
+ obdos[num_obdos] = NULL;
+ CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino);
+
+ /* Make sure we reference "inode" and not "inodes[num_obdos]",
+ * as num_obdos will change after the loop is run.
+ */
+ if (!list_empty(obdfs_iplist(inode))) {
+ res = obdfs_enqueue_pages(inode, &obdos[num_obdos],
+ MAX_IOVEC - num_io,
+ &pages[num_io], &bufs[num_io],
+ &counts[num_io],
+ &offsets[num_io],
+ &flags[num_obdos],
+ check_time);
+ CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n",
+ inode->i_ino, res);
+ if ( res < 0 ) {
+ CDEBUG(D_INODE,
+ "fatal: unable to enqueue inode %ld (err %d)\n",
+ inode->i_ino, res);
+ /* XXX Move bad inode to end of list so we can
+ * continue with flushing list. This is a
+ * temporary measure to avoid machine lockups.
+ * Maybe if we have -ENOENT, simply discard.
+ */
+ list_del(tmp);
+ list_add(tmp, inode_list);
+ err = res;
+ EXIT;
+ goto BREAK;
+ }
+ if (res == 0)
+ continue;
+
+ num_io += res;
+ total_io += res;
+ bufs_per_obdo[num_obdos] = res;
+ num_obdos++;
+
+ if ( num_io == MAX_IOVEC ) {
+ obd_up(&sbi->osi_list_mutex);
+ err = obdfs_do_vec_wr(inodes, num_io, num_obdos,
+ obdos, bufs_per_obdo,
+ pages, bufs, counts,
+ offsets, flags);
+ if ( err ) {
+ CDEBUG(D_INODE,
+ "fatal: do_vec_wr err=%d\n",
+ err);
+ EXIT;
+ goto ERR;
+ }
+ obd_down(&sbi->osi_list_mutex);
+ goto again;
+ }
+ }
+ }
+
+BREAK:
+ obd_up(&sbi->osi_list_mutex);
+
+ /* flush any remaining I/Os */
+ if ( num_io ) {
+ err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos,
+ bufs_per_obdo, pages, bufs, counts,
+ offsets, flags);
+ if (err)
+ CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err);
+ num_io = 0;
+ num_obdos = 0;
+ }
+
+ /* Remove inode from superblock dirty list when no more pages.
+ * Make sure we don't point at the current inode with tmp
+ * when we re-init the list on the inode, or we will loop.
+ */
+ obd_down(&sbi->osi_list_mutex);
+ tmp = inode_list;
+ while ( (tmp = tmp->prev) != inode_list ) {
+ struct obdfs_inode_info *ii;
+ struct inode *inode;
+
+ ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes);
+ inode = list_entry(ii, struct inode, u);
+ CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino);
+ if (list_empty(obdfs_iplist(inode))) {
+ CDEBUG(D_INFO, "remove inode %ld from dirty list\n",
+ inode->i_ino);
+ tmp = tmp->next;
+ list_del(obdfs_islist(inode));
+ /* decrement inode reference for page cache */
+ atomic_dec(&inode->i_count);
+ INIT_LIST_HEAD(obdfs_islist(inode));
+ }
+ }
+ obd_up(&sbi->osi_list_mutex);
+
+ CDEBUG(D_INFO, "flushed %ld pages in total\n", total_io);
+ EXIT;
+ERR:
+ return err ? err : total_io;
+} /* obdfs_flush_reqs */
+
+
+/* Walk all of the superblocks and write out blocks which are too old.
+ * Return the maximum number of blocks written for a single filesystem.
+ */
+int obdfs_flush_dirty_pages(unsigned long check_time)
+{
+ struct list_head *sl;
+ int max = 0;
+
+ /* ENTRY; */
+ sl = &obdfs_super_list;
+ while ( (sl = sl->prev) != &obdfs_super_list ) {
+ struct obdfs_sb_info *sbi =
+ list_entry(sl, struct obdfs_sb_info, osi_list);
+ int ret;
+
+ /* walk write requests here, use the sb, check the time */
+ ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time);
+ /* XXX handle error? What to do with it? */
+
+ max = ret > max ? ret : max;
+ }
+ if (max) { EXIT; }
+ return max;
+} /* obdfs_flush_dirty_pages */
+
+
+static void pupdate_wakeup(unsigned long l)
+{
+ wake_up(&pupdated.waitq);
}
-static struct task_struct *pupdated;
static int pupdate(void *unused)
{
- struct task_struct * tsk = current;
- int interval;
-
- pupdated = current;
- tsk->session = 1;
- tsk->pgrp = 1;
- strcpy(tsk->comm, "pupdate");
-
- /* sigstop and sigcont will stop and wakeup kupdate */
- spin_lock_irq(&tsk->sigmask_lock);
- sigfillset(&tsk->blocked);
- siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
- recalc_sigpending(tsk);
- spin_unlock_irq(&tsk->sigmask_lock);
-
- for (;;) {
- /* update interval */
- interval = pupd_prm.interval;
- if (interval)
- {
- tsk->state = TASK_INTERRUPTIBLE;
- schedule_timeout(interval);
- }
- else
- {
- stop_pupdate:
- tsk->state = TASK_STOPPED;
- MOD_DEC_USE_COUNT;
- schedule(); /* wait for SIGCONT */
+ u_long flags;
+ int interval = pupdated.parms.interval;
+ long age = pupdated.parms.age_buffer;
+ int wrote = 0;
+
+ if (pupdated.active >= 0) {
+ CDEBUG(D_CACHE, "attempted to run multiple pupdates\n");
+ return 1;
+ }
+
+ init_timer(&pupdated.timer);
+ init_waitqueue_head(&pupdated.waitq);
+ pupdated.timer.function = pupdate_wakeup;
+
+ exit_files(current);
+ exit_mm(current);
+ daemonize();
+
+ current->session = 1;
+ current->pgrp = 1;
+ strcpy(current->comm, "pupdated");
+
+ CDEBUG(D_CACHE, "pupdated activated...\n");
+ pupdated.active = 1;
+
+ spin_lock_irqsave(&current->sigmask_lock, flags);
+ flush_signals(current);
+ sigfillset(&current->blocked);
+ recalc_sigpending(current);
+ spin_unlock_irqrestore(&current->sigmask_lock, flags);
+
+ do {
+ long dirty_limit;
+
+ /* update interval */
+ if (pupdated.active == 1 && interval) {
+ mod_timer(&pupdated.timer, jiffies + interval);
+ interruptible_sleep_on(&pupdated.waitq);
+ }
+ if (pupdated.active == 0) {
+ del_timer(&pupdated.timer);
+ /* If stopped, we flush one last time... */
}
- /* check for sigstop */
- if (signal_pending(tsk))
- {
- int stopped = 0;
- spin_lock_irq(&tsk->sigmask_lock);
- if (sigismember(&tsk->signal, SIGSTOP))
- {
- sigdelset(&tsk->signal, SIGSTOP);
- stopped = 1;
- }
- recalc_sigpending(tsk);
- spin_unlock_irq(&tsk->sigmask_lock);
- if (stopped)
- goto stop_pupdate;
+
+ /* asynchronous setattr etc for the future ...
+ obdfs_flush_dirty_inodes(jiffies - pupdated.parms.age_super);
+ */
+ dirty_limit = nr_free_buffer_pages() * pupdated.parms.nfract / 100;
+
+ if (obdfs_cache_count > dirty_limit) {
+ interval = 0;
+ if (wrote < pupdated.parms.ndirty)
+ age >>= 1;
+ if (wrote)
+ CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n",
+ wrote, age, interval);
+ } else {
+ if (wrote < pupdated.parms.ndirty >> 1 &&
+ obdfs_cache_count < dirty_limit / 2) {
+ interval = pupdated.parms.interval;
+ age = pupdated.parms.age_buffer;
+ if (wrote)
+ CDEBUG(D_INFO,
+ "wrote %d, age %ld, interval %d\n",
+ wrote, age, interval);
+ } else if (obdfs_cache_count > dirty_limit / 2) {
+ interval >>= 1;
+ if (wrote < pupdated.parms.ndirty)
+ age >>= 1;
+ if (wrote)
+ CDEBUG(D_CACHE,
+ "wrote %d, age %ld, interval %d\n",
+ wrote, age, interval);
+ }
+ }
+
+ wrote = obdfs_flush_dirty_pages(jiffies - age);
+ if (wrote) {
+ CDEBUG(D_CACHE,
+ "dirty_limit %ld, cache_count %ld, wrote %d\n",
+ dirty_limit, obdfs_cache_count, wrote);
+ run_task_queue(&tq_disk);
}
- printk("pupdate() activated...\n");
- /* flush_inodes(); */
- obdfs_flush_dirty_pages(1);
- }
+ } while (pupdated.active == 1);
+
+ CDEBUG(D_CACHE, "pupdated stopped...\n");
+ pupdated.active = -1;
+ wake_up(&pupdated.waitq);
+ return 0;
}
-int flushd_init(void)
+int obdfs_flushd_init(void)
{
- /* kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); */
- MOD_INC_USE_COUNT;
- kernel_thread(pupdate, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
- return 0;
+ /*
+ kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+ */
+ kernel_thread(pupdate, NULL, 0);
+ CDEBUG(D_PSDEV, "flushd inited\n");
+ return 0;
}
-int flushd_cleanup(void)
+int obdfs_flushd_cleanup(void)
{
- /* this should deliver a signal to */
-
+ ENTRY;
- /* XXX Andreas, we will do this later, for now, you must kill
- pupdated with a SIGSTOP from userland, before unloading obdfs.o
- */
- if (pupdated) {
- /* send updated a STOP signal */
- /* then let it run at least once, before continuing */
+ /* Shut down pupdated. */
+ if (pupdated.active > 0) {
+ CDEBUG(D_CACHE, "inform pupdated\n");
+ pupdated.active = 0;
+ wake_up(&pupdated.waitq);
- 1;
- }
-
- /* not reached */
- return 0;
+ CDEBUG(D_CACHE, "wait for pupdated\n");
+ while (pupdated.active == 0) {
+ interruptible_sleep_on(&pupdated.waitq);
+ }
+ CDEBUG(D_CACHE, "done waiting for pupdated\n");
+ }
+ EXIT;
+ return 0;
}