/* * OBDFS Super operations - also used for Lustre file system * * * This code is issued under the GNU General Public License. * See the file COPYING in this distribution * * Copyright (C) 1991, 1992 Linus Torvalds * Copryright (C) 1999 Stelias Computing Inc. * Copryright (C) 1999 Seagate Technology Inc. * */ #define __NO_VERSION__ #include #include #include #include #include #include /* XXX temporary until the real function is available from kernel * XXX set this to memory size in pages for max page cache size */ #define nr_free_buffer_pages() 32768 /* Defines for page buf daemon */ struct pupd_prm { int nfract; /* Percentage of buffer cache dirty to activate bdflush */ int ndirty; /* Maximum number of dirty blocks to write out per wake-cycle */ int nrefill; /* Number of clean buffers to try to obtain each time we call refill */ int nref_dirt; /* Dirty buffer threshold for activating bdflush when trying to refill buffers. */ int interval; /* jiffies delay between pupdate flushes */ int age_buffer; /* Time for normal buffer to age before we flush it */ int age_super; /* Time for superblock to age before we flush it */ }; static struct pupdated { int active; wait_queue_head_t waitq; struct timer_list timer; struct pupd_prm parms; } pupdated = { active: -1, parms: {40, 1024, 64, 256, 1*HZ, 30*HZ, 5*HZ } }; /* Called with the superblock list lock held */ static int obdfs_enqueue_pages(struct inode *inode, struct obdo **obdo, int nr_slots, struct page **pages, char **bufs, obd_size *counts, obd_off *offsets, obd_flag *flag, unsigned long check_time) { struct list_head *page_list = obdfs_iplist(inode); struct list_head *tmp; int num = 0; ENTRY; tmp = page_list; /* Traverse list in reverse order, so we do FIFO, not LIFO order */ while ( (tmp = tmp->prev) != page_list && num < nr_slots ) { struct obdfs_pgrq *req; struct page *page; req = list_entry(tmp, struct obdfs_pgrq, rq_plist); page = req->rq_page; if (req->rq_jiffies > check_time) break; /* pages are in chronological order */ /* Only allocate the obdo if we will actually do I/O here */ if ( !*obdo ) { OIDEBUG(inode); *obdo = obdo_fromid(IID(inode), inode->i_ino, OBD_MD_FLNOTOBD); if ( IS_ERR(*obdo) ) { int err = PTR_ERR(*obdo); *obdo = NULL; EXIT; return err; } /* FIXME revisit fromid & from_inode */ obdfs_from_inode(*obdo, inode); *flag = OBD_BRW_CREATE; } /* Remove request from list before write to avoid conflict. * Note that obdfs_pgrq_del() also deletes the request. */ obdfs_pgrq_del(req); if ( !page ) { CDEBUG(D_CACHE, "no page \n"); continue; } bufs[num] = (char *)page_address(page); pages[num] = page; counts[num] = PAGE_SIZE; offsets[num] = ((obd_off)page->index) << PAGE_SHIFT; CDEBUG(D_INFO, "ENQ inode %ld, page %p addr %p to vector\n", inode->i_ino, page, (char *)page_address(page)); num++; } if (!list_empty(page_list)) CDEBUG(D_INFO, "inode %ld list not empty\n", inode->i_ino); CDEBUG(D_INFO, "added %d page(s) to vector\n", num); EXIT; return num; } /* obdfs_enqueue_pages */ /* Dequeue cached pages for a dying inode without writing them to disk. */ void obdfs_dequeue_pages(struct inode *inode) { struct list_head *tmp; ENTRY; obd_down(&obdfs_i2sbi(inode)->osi_list_mutex); tmp = obdfs_islist(inode); if ( list_empty(tmp) ) { CDEBUG(D_INFO, "no dirty pages for inode %ld\n", inode->i_ino); obd_up(&obdfs_i2sbi(inode)->osi_list_mutex); EXIT; return; } /* take it out of the super list */ list_del(tmp); INIT_LIST_HEAD(obdfs_islist(inode)); tmp = obdfs_iplist(inode); while ( (tmp = tmp->prev) != obdfs_iplist(inode) ) { struct obdfs_pgrq *req; struct page *page; req = list_entry(tmp, struct obdfs_pgrq, rq_plist); page = req->rq_page; /* take it out of the list and free */ obdfs_pgrq_del(req); /* now put the page away */ put_page(page); } obd_up(&obdfs_i2sbi(inode)->osi_list_mutex); /* decrement inode reference for page cache */ atomic_dec(&inode->i_count); EXIT; } /* This value is not arbitrarily chosen. KIO_STATIC_PAGES from linux/iobuf.h */ #define MAX_IOVEC (KIO_STATIC_PAGES - 1) /* Remove writeback requests for the superblock */ int obdfs_flush_reqs(struct list_head *inode_list, unsigned long check_time) { struct list_head *tmp; unsigned long max_io, total_io = 0; obd_count num_io; obd_count num_obdos; struct inode *inodes[MAX_IOVEC]; /* write data back to these */ struct page *pages[MAX_IOVEC]; /* call put_page on these */ struct obdo *obdos[MAX_IOVEC]; char *bufs[MAX_IOVEC]; obd_size counts[MAX_IOVEC]; obd_off offsets[MAX_IOVEC]; obd_flag flags[MAX_IOVEC]; obd_count bufs_per_obdo[MAX_IOVEC]; int err = 0; struct obdfs_sb_info *sbi; ENTRY; if (!inode_list) { CDEBUG(D_INODE, "no list\n"); EXIT; return 0; } sbi = list_entry(inode_list, struct obdfs_sb_info, osi_inodes); obd_down(&sbi->osi_list_mutex); if ( list_empty(inode_list) ) { CDEBUG(D_INFO, "list empty\n"); obd_up(&sbi->osi_list_mutex); EXIT; return 0; } /* If we are forcing a write, write out all dirty pages */ max_io = check_time == ~0UL ? 1<<31 : pupdated.parms.ndirty; CDEBUG(D_INFO, "max_io = %lu\n", max_io); /* Add each inode's dirty pages to a write vector, and write it. * Traverse list in reverse order, so we do FIFO, not LIFO order */ again: tmp = inode_list; num_io = 0; num_obdos = 0; while ( (tmp = tmp->prev) != inode_list && total_io < max_io) { struct obdfs_inode_info *ii; struct inode *inode; int res; ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes); inode = list_entry(ii, struct inode, u); inodes[num_obdos] = inode; obdos[num_obdos] = NULL; CDEBUG(D_INFO, "checking inode %ld pages\n", inode->i_ino); /* Make sure we reference "inode" and not "inodes[num_obdos]", * as num_obdos will change after the loop is run. */ if (!list_empty(obdfs_iplist(inode))) { res = obdfs_enqueue_pages(inode, &obdos[num_obdos], MAX_IOVEC - num_io, &pages[num_io], &bufs[num_io], &counts[num_io], &offsets[num_io], &flags[num_obdos], check_time); CDEBUG(D_INFO, "FLUSH inode %ld, pages flushed: %d\n", inode->i_ino, res); if ( res < 0 ) { CDEBUG(D_INODE, "fatal: unable to enqueue inode %ld (err %d)\n", inode->i_ino, res); /* XXX Move bad inode to end of list so we can * continue with flushing list. This is a * temporary measure to avoid machine lockups. * Maybe if we have -ENOENT, simply discard. */ list_del(tmp); list_add(tmp, inode_list); err = res; EXIT; goto BREAK; } if (res == 0) continue; num_io += res; total_io += res; bufs_per_obdo[num_obdos] = res; num_obdos++; if ( num_io == MAX_IOVEC ) { obd_up(&sbi->osi_list_mutex); err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos, bufs_per_obdo, pages, bufs, counts, offsets, flags); if ( err ) { CDEBUG(D_INODE, "fatal: do_vec_wr err=%d\n", err); EXIT; goto ERR; } obd_down(&sbi->osi_list_mutex); goto again; } } } BREAK: obd_up(&sbi->osi_list_mutex); /* flush any remaining I/Os */ if ( num_io ) { err = obdfs_do_vec_wr(inodes, num_io, num_obdos, obdos, bufs_per_obdo, pages, bufs, counts, offsets, flags); if (err) CDEBUG(D_INODE, "fatal: unable to do vec_wr (err %d)\n", err); num_io = 0; num_obdos = 0; } /* Remove inode from superblock dirty list when no more pages. * Make sure we don't point at the current inode with tmp * when we re-init the list on the inode, or we will loop. */ obd_down(&sbi->osi_list_mutex); tmp = inode_list; while ( (tmp = tmp->prev) != inode_list ) { struct obdfs_inode_info *ii; struct inode *inode; ii = list_entry(tmp, struct obdfs_inode_info, oi_inodes); inode = list_entry(ii, struct inode, u); CDEBUG(D_INFO, "checking inode %ld empty\n", inode->i_ino); if (list_empty(obdfs_iplist(inode))) { CDEBUG(D_INFO, "remove inode %ld from dirty list\n", inode->i_ino); tmp = tmp->next; list_del(obdfs_islist(inode)); /* decrement inode reference for page cache */ atomic_dec(&inode->i_count); INIT_LIST_HEAD(obdfs_islist(inode)); } } obd_up(&sbi->osi_list_mutex); CDEBUG(D_INFO, "flushed %ld pages in total\n", total_io); EXIT; ERR: return err ? err : total_io; } /* obdfs_flush_reqs */ /* Walk all of the superblocks and write out blocks which are too old. * Return the maximum number of blocks written for a single filesystem. */ int obdfs_flush_dirty_pages(unsigned long check_time) { struct list_head *sl; int max = 0; /* ENTRY; */ sl = &obdfs_super_list; while ( (sl = sl->prev) != &obdfs_super_list ) { struct obdfs_sb_info *sbi = list_entry(sl, struct obdfs_sb_info, osi_list); int ret; /* walk write requests here, use the sb, check the time */ ret = obdfs_flush_reqs(&sbi->osi_inodes, check_time); /* XXX handle error? What to do with it? */ max = ret > max ? ret : max; } if (max) { EXIT; } return max; } /* obdfs_flush_dirty_pages */ static void pupdate_wakeup(unsigned long l) { wake_up_interruptible(&pupdated.waitq); } static int pupdate(void *unused) { u_long flags; int interval = pupdated.parms.interval; long age = pupdated.parms.age_buffer; int wrote = 0; if (pupdated.active >= 0) { CDEBUG(D_CACHE, "attempted to run multiple pupdates\n"); return 1; } init_timer(&pupdated.timer); init_waitqueue_head(&pupdated.waitq); pupdated.timer.function = pupdate_wakeup; exit_files(current); exit_mm(current); daemonize(); current->session = 1; current->pgrp = 1; strcpy(current->comm, "pupdated"); CDEBUG(D_CACHE, "pupdated activated...\n"); pupdated.active = 1; spin_lock_irqsave(¤t->sigmask_lock, flags); flush_signals(current); sigfillset(¤t->blocked); recalc_sigpending(current); spin_unlock_irqrestore(¤t->sigmask_lock, flags); do { long dirty_limit; /* update interval */ if (pupdated.active == 1 && interval) { mod_timer(&pupdated.timer, jiffies + interval); interruptible_sleep_on(&pupdated.waitq); } if (pupdated.active == 0) { del_timer(&pupdated.timer); /* If stopped, we flush one last time... */ } /* asynchronous setattr etc for the future ... obdfs_flush_dirty_inodes(jiffies - pupdated.parms.age_super); */ dirty_limit = nr_free_buffer_pages() * pupdated.parms.nfract / 100; if (obdfs_cache_count > dirty_limit) { interval = 0; if (wrote < pupdated.parms.ndirty) age >>= 1; if (wrote) CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n", wrote, age, interval); } else { if (wrote < pupdated.parms.ndirty >> 1 && obdfs_cache_count < dirty_limit / 2) { interval = pupdated.parms.interval; age = pupdated.parms.age_buffer; if (wrote) CDEBUG(D_INFO, "wrote %d, age %ld, interval %d\n", wrote, age, interval); } else if (obdfs_cache_count > dirty_limit / 2) { interval >>= 1; if (wrote < pupdated.parms.ndirty) age >>= 1; if (wrote) CDEBUG(D_CACHE, "wrote %d, age %ld, interval %d\n", wrote, age, interval); } } wrote = obdfs_flush_dirty_pages(jiffies - age); if (wrote) { CDEBUG(D_CACHE, "dirty_limit %ld, cache_count %ld, wrote %d\n", dirty_limit, obdfs_cache_count, wrote); run_task_queue(&tq_disk); } } while (pupdated.active == 1); CDEBUG(D_CACHE, "pupdated stopped...\n"); pupdated.active = -1; wake_up_interruptible (&pupdated.waitq); return 0; } int obdfs_flushd_init(void) { /* kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); */ kernel_thread(pupdate, NULL, 0); CDEBUG(D_PSDEV, "flushd inited\n"); return 0; } int obdfs_flushd_cleanup(void) { ENTRY; /* Shut down pupdated. */ if (pupdated.active > 0) { CDEBUG(D_CACHE, "inform pupdated\n"); pupdated.active = 0; wake_up_interruptible(&pupdated.waitq); CDEBUG(D_CACHE, "wait for pupdated\n"); while (pupdated.active == 0) { interruptible_sleep_on(&pupdated.waitq); } CDEBUG(D_CACHE, "done waiting for pupdated\n"); } EXIT; return 0; }