--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Copyright (C) 2002, 2003 Cluster File Systems, Inc
+ *
+ * This started as an implementation of an I/O daemon that woke regularly
+ * to force writeback.  The throttling in prepare_write and kupdate's usual
+ * writeback pressure got rid of our thread, but the file name remains.
+ */
+#include <linux/version.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/kmod.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+
+/* PG_inactive_clean is shorthand for rmap, we want free_high/low here.. */
+#ifdef PG_inactive_clean
+#include <linux/mm_inline.h>
+#endif
+
+#define DEBUG_SUBSYSTEM S_LLITE
+#include <linux/lustre_lite.h>
+
+/*
+ * Fallback for kernels whose headers do not provide list_for_each_prev_safe:
+ * walk a list from its tail towards its head, remembering the predecessor
+ * in 'n' before the body runs so the body may unlink 'pos'.
+ */
+#ifndef list_for_each_prev_safe
+#define list_for_each_prev_safe(pos, n, head) \
+ for (pos = (head)->prev, n = pos->prev; pos != (head); \
+ pos = n, n = pos->prev )
+#endif
+
+/* taken by ll_check_dirty() around its walks of the superblock inode lists */
+extern spinlock_t inode_lock;
+
+/*
+ * upper bound on the pages gathered into one obd_brw() call; tied to
+ * PTL_MD_MAX_IOV (presumably the portals per-MD iovec limit -- confirm)
+ */
+#define LLWP_MAX_PAGES (PTL_MD_MAX_IOV)
+/*
+ * one batch of pages being collected for a single write:
+ *
+ * has_whole_pages - set once the batch holds at least one full PAGE_SIZE page
+ * num_frags       - all whole pages together count as one fragment, every
+ *                   partial page counts as one more; llwp_consume_page()
+ *                   ends the batch at 3, the largest value 2 bits can hold
+ * num_pages       - number of valid entries in pgs[]
+ * pgs             - the pages themselves, filled in by llwp_consume_page()
+ */
+struct ll_writeback_pages {
+ unsigned has_whole_pages:1,
+ num_frags:2,
+ num_pages:29;
+ struct brw_page pgs[LLWP_MAX_PAGES];
+};
+
+
+/*
+ * Sort 'num' brw_pages in place by ascending file offset so that disk
+ * allocation on the target happens in offset order.
+ *
+ * This is a plain shellsort: an insertion sort over elements that are
+ * 'gap' apart, repeated with the gap shrinking through the 3x+1 sequence
+ * (..., 13, 4, 1) until the final pass with gap 1 leaves the array sorted.
+ * It needs no allocation and is plenty for our small page arrays.
+ */
+void sort_brw_pages(struct brw_page *array, int num)
+{
+        struct brw_page hold;
+        int gap = 1;
+        int cur, slot;
+
+        if (num == 1)
+                return;
+
+        /* find the first gap in the sequence that is not below num */
+        while (gap < num)
+                gap = gap * 3 + 1;
+
+        do {
+                gap /= 3;
+                for (cur = gap; cur < num; cur++) {
+                        hold = array[cur];
+
+                        /* shift larger elements of this chain up by one gap */
+                        for (slot = cur;
+                             slot >= gap && array[slot - gap].off > hold.off;
+                             slot -= gap)
+                                array[slot] = array[slot - gap];
+
+                        array[slot] = hold;
+                }
+        } while (gap > 1);
+}
+
+/*
+ * Offer one locked page to the writeback batch.
+ *
+ * If the page starts at or beyond i_size we raced with truncate: the page
+ * is unlocked and NOT added, because truncate_inode_pages will be waiting
+ * to clean it up.  Otherwise a page reference is taken and the page is
+ * appended to llwp->pgs, with its byte count trimmed when the file ends
+ * mid-page.
+ *
+ * Returns 0 when the caller may keep offering pages (this covers both a
+ * successful insert and the truncate race), and -1 when the batch is now
+ * complete -- it holds three fragments or pgs[] is full.  The page that
+ * produced the -1 has been added to the batch.
+ *
+ * The caller must not call this on a batch that already returned -1;
+ * there is no room check on entry.
+ */
+static int llwp_consume_page(struct ll_writeback_pages *llwp,
+                             struct inode *inode, struct page *page)
+{
+        obd_off off = ((obd_off)page->index) << PAGE_SHIFT;
+        /*
+         * read i_size once: truncate can change it under us, and the range
+         * check, the partial-page length and the assertion below must all
+         * agree on the same value
+         */
+        loff_t isize = inode->i_size;
+        struct brw_page *pg;
+
+        /* we raced with truncate? */
+        if (off >= isize) {
+                unlock_page(page);
+                return 0;
+        }
+
+        page_cache_get(page);
+        pg = &llwp->pgs[llwp->num_pages];
+        llwp->num_pages++;
+
+        pg->pg = page;
+        pg->off = off;
+        pg->flag = OBD_BRW_CREATE;
+        pg->count = PAGE_SIZE;
+
+        /* catch partial writes for files that end mid-page */
+        if (pg->off + pg->count > isize)
+                pg->count = isize & ~PAGE_MASK;
+
+        /*
+         * every partial page is its own fragment, all the whole pages in
+         * the batch share a single one
+         */
+        if (pg->count == PAGE_SIZE) {
+                if (!llwp->has_whole_pages) {
+                        llwp->has_whole_pages = 1;
+                        llwp->num_frags++;
+                }
+        } else {
+                llwp->num_frags++;
+        }
+
+        /*
+         * matches ptlrpc_bulk_get assert that trickles down
+         * from a 0 page length going through niobuf and into
+         * the buffer regions being posted.  off < isize < off + PAGE_SIZE
+         * on the partial path, so the length can never be zero here.
+         */
+        LASSERT(pg->count > 0);
+
+        CDEBUG(D_CACHE, "brw_page %p: off "LPU64" cnt %d, page %p: ind %ld"
+               " i_size: "LPU64"\n", pg, pg->off, pg->count, page,
+               page->index, isize);
+
+        /* num_frags is only 2 bits wide, so 3 is as far as it can count */
+        if (llwp->num_frags == 3 || llwp->num_pages == LLWP_MAX_PAGES)
+                return -1;
+
+        return 0;
+}
+
+/*
+ * Collect dirty pages of 'inode' into the batch 'llwp'.
+ *
+ * Nothing is returned; the caller reads llwp->num_pages to learn how many
+ * pages were gathered.
+ *
+ * This duplicates filemap_fdatasync and gives us an opportunity to grab
+ * lots of dirty pages at once.  With pagecache_lock held, the mapping's
+ * dirty list is walked from its tail.  Pages whose lock cannot be taken
+ * immediately are skipped.  Every page we do lock is moved to the
+ * mapping's locked_pages list; if it turns out not to be dirty it is
+ * simply unlocked again, otherwise its dirty bit is cleared and it is
+ * handed to llwp_consume_page().  The walk ends when the list is
+ * exhausted or llwp_consume_page() reports that the batch is complete.
+ */
+static void ll_get_dirty_pages(struct inode *inode,
+                               struct ll_writeback_pages *llwp)
+{
+        struct address_space *mapping = inode->i_mapping;
+        struct list_head *dirty = &mapping->dirty_pages;
+        struct list_head *cur, *saved;
+        struct page *page;
+        ENTRY;
+
+        spin_lock(&pagecache_lock);
+
+        list_for_each_prev_safe(cur, saved, dirty) {
+                page = list_entry(cur, struct page, list);
+
+                /* never sleep on a page lock while holding the spinlock */
+                if (TryLockPage(page))
+                        continue;
+
+                list_del(&page->list);
+                list_add(&page->list, &mapping->locked_pages);
+
+                if (PageDirty(page)) {
+                        ClearPageDirty(page);
+
+                        if (llwp_consume_page(llwp, inode, page) != 0)
+                                break;
+                } else {
+                        unlock_page(page);
+                }
+        }
+
+        spin_unlock(&pagecache_lock);
+        EXIT;
+}
+
+/*
+ * Write out the batch gathered in 'llwp' and then let go of its pages.
+ *
+ * The pages are sorted by file offset, submitted as one obd_brw() write
+ * and waited for through ll_brw_sync_wait().  Whether or not the write
+ * could be issued, every page in the batch is unlocked and the reference
+ * taken by llwp_consume_page() is dropped before returning -- bailing out
+ * early would leave the pages locked forever.
+ *
+ * Failures are only logged; the function is void and reports nothing to
+ * its caller.
+ */
+static void ll_brw_pages_unlock( struct inode *inode,
+                                 struct ll_writeback_pages *llwp)
+{
+        int rc, i;
+        struct obd_brw_set *set;
+        ENTRY;
+
+        sort_brw_pages(llwp->pgs, llwp->num_pages);
+
+        set = obd_brw_set_new();
+        if (set == NULL) {
+                /* no write is possible, but the pages must still be freed */
+                CERROR("can't allocate brw set, %d pages not written\n",
+                       llwp->num_pages);
+                GOTO(release, rc = -ENOMEM);
+        }
+        set->brw_callback = ll_brw_sync_wait;
+
+        rc = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode),
+                     ll_i2info(inode)->lli_smd, llwp->num_pages, llwp->pgs,
+                     set, NULL);
+        if (rc) {
+                CERROR("error from obd_brw: rc = %d\n", rc);
+        } else {
+                rc = ll_brw_sync_wait(set, CB_PHASE_START);
+                if (rc)
+                        CERROR("error from callback: rc = %d\n", rc);
+        }
+        obd_brw_set_decref(set);
+
+release:
+        for ( i = 0 ; i < llwp->num_pages ; i++) {
+                struct page *page = llwp->pgs[i].pg;
+
+                CDEBUG(D_CACHE, "cleaning page %p\n", page);
+                LASSERT(PageLocked(page));
+                unlock_page(page);
+                page_cache_release(page);
+        }
+
+        EXIT;
+}
+
+#ifndef PG_inactive_clean
+#ifdef CONFIG_DISCONTIGMEM
+#error "sorry, we don't support DISCONTIGMEM yet"
+#endif
+/*
+ * Returns 1 if any zone of any node on pgdat_list has its need_balance
+ * flag set, 0 otherwise.
+ *
+ * Background from the original author: __alloc_pages marks a zone as
+ * needing balancing when an allocation is performed while the zone has
+ * fewer free pages than its 'low' water mark, and the mark is cleared
+ * when try_to_free_pages makes progress.
+ */
+static int zones_need_balancing(void)
+{
+        pg_data_t *node = pgdat_list;
+
+        while (node != NULL) {
+                int idx = node->nr_zones;
+
+                /* look at the zones from the last one down to the first */
+                while (--idx >= 0) {
+                        if (node->node_zones[idx].need_balance)
+                                return 1;
+                }
+                node = node->node_next;
+        }
+        return 0;
+}
+#endif
+/*
+ * Decide whether we should be pushing dirty data out right now; returns 1
+ * if so and 0 if not.
+ *
+ * 2.4 gives us no way to find out how many pages we have cached because
+ * we are not using buffer_heads.  So we are very conservative and flush
+ * whenever the VM thinks it is running low and kswapd would have done
+ * work: on rmap kernels (detected via PG_inactive_clean) that is when
+ * free_high(ALL_ZONES) or free_low(ANY_ZONE) is positive, on stock kernels
+ * when zones_need_balancing() says so.  kupdated alone isn't good enough
+ * because writers (dbench) can dirty _very quickly_, and we allocate under
+ * writepage.
+ *
+ * 2.5 gets this right, see the {inc,dec}_page_state(nr_dirty, )
+ */
+static int should_writeback(void)
+{
+        int pressure;
+
+#ifdef PG_inactive_clean
+        pressure = (free_high(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0);
+#else
+        pressure = zones_need_balancing();
+#endif
+        return pressure ? 1 : 0;
+}
+
+/*
+ * Write back dirty pages of superblock 'sb' for as long as
+ * should_writeback() reports memory pressure.
+ *
+ * Phase one repeatedly picks the inode nearest the tail of sb->s_dirty
+ * that has I_DIRTY_PAGES set, marks it I_LOCK the way __sync_one does,
+ * and pushes out batches of its dirty pages.  Phase two, entered if
+ * pressure persists, sleeps on inodes found on sb->s_locked_inodes until
+ * their I_LOCK bit clears.
+ *
+ * The task runs with PF_MEMALLOC set for the duration of the call.
+ *
+ * Returns the number of pages handed to ll_brw_pages_unlock(), 0 if no
+ * writeback was called for, or -ENOMEM if the batch structure could not
+ * be allocated.
+ */
+int ll_check_dirty( struct super_block *sb)
+{
+        unsigned long old_flags; /* hack? */
+        int making_progress;
+        struct ll_writeback_pages *llwp;
+        struct inode *inode;
+        int rc = 0;
+        ENTRY;
+
+        if ( ! should_writeback() )
+                RETURN(0);
+
+        old_flags = current->flags;
+        current->flags |= PF_MEMALLOC;
+        llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
+        if ( llwp == NULL )
+                GOTO(cleanup, rc = -ENOMEM);
+        memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));
+
+        spin_lock(&inode_lock);
+
+        /*
+         * first we try and write back dirty pages from dirty inodes
+         * until the VM thinks we're ok again..
+         */
+        do {
+                struct list_head *pos;
+                inode = NULL;
+                making_progress = 0;
+
+                /* find the dirty inode closest to the tail with dirty pages */
+                list_for_each_prev(pos, &sb->s_dirty) {
+                        inode = list_entry(pos, struct inode, i_list);
+
+                        if ( ! (inode->i_state & I_DIRTY_PAGES) ) {
+                                inode = NULL;
+                                continue;
+                        }
+                        break;
+                }
+
+                if ( inode == NULL )
+                        break;
+
+                /* duplicate __sync_one, *sigh* */
+                list_del(&inode->i_list);
+                list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
+                inode->i_state |= I_LOCK;
+                inode->i_state &= ~I_DIRTY_PAGES;
+
+                /* the write below sleeps, so it must run without inode_lock */
+                spin_unlock(&inode_lock);
+
+                do {
+                        memset(llwp, 0, sizeof(*llwp));
+                        ll_get_dirty_pages(inode, llwp);
+                        if ( llwp->num_pages ) {
+                                ll_brw_pages_unlock(inode, llwp);
+                                rc += llwp->num_pages;
+                                making_progress = 1;
+                        }
+                } while (llwp->num_pages && should_writeback() );
+
+                spin_lock(&inode_lock);
+
+                /* pages we skipped or that were redirtied keep it dirty */
+                if ( ! list_empty(&inode->i_mapping->dirty_pages) )
+                        inode->i_state |= I_DIRTY_PAGES;
+
+                inode->i_state &= ~I_LOCK;
+                /*
+                 * we are sneaky and leave the inode on the dirty list,
+                 * even though it might not still be..
+                 */
+                if (!(inode->i_state & I_FREEING)) {
+                        list_del(&inode->i_list);
+                        list_add(&inode->i_list, &inode->i_sb->s_dirty);
+                }
+                wake_up(&inode->i_wait);
+
+        } while ( making_progress && should_writeback() );
+
+        /*
+         * and if that didn't work, we sleep on any data that might
+         * be under writeback..
+         */
+        while ( should_writeback() ) {
+                if ( list_empty(&sb->s_locked_inodes) )
+                        break;
+
+                inode = list_entry(sb->s_locked_inodes.next, struct inode,
+                                   i_list);
+
+                /* hold a reference so the inode survives our sleep */
+                atomic_inc(&inode->i_count); /* XXX hack? */
+                spin_unlock(&inode_lock);
+                wait_event(inode->i_wait, !(inode->i_state & I_LOCK));
+                iput(inode);
+                spin_lock(&inode_lock);
+        }
+
+        spin_unlock(&inode_lock);
+
+cleanup:
+        if ( llwp != NULL )
+                kfree(llwp);
+        /*
+         * undo only our own change: we may have slept, and writing the
+         * saved word back would also overwrite every other flag bit that
+         * changed in the meantime
+         */
+        if ( ! (old_flags & PF_MEMALLOC) )
+                current->flags &= ~PF_MEMALLOC;
+
+        RETURN(rc);
+}
+
+/*
+ * Write back 'page' together with as many other dirty pages of 'inode'
+ * as fit into one batch.
+ *
+ * 'page' is expected to be locked: it is either unlocked by
+ * llwp_consume_page() (truncate race) or by ll_brw_pages_unlock(), which
+ * asserts that it is locked.  The task runs with PF_MEMALLOC set for the
+ * duration of the call.
+ *
+ * Returns 0, or -ENOMEM if the batch structure could not be allocated.
+ * On -ENOMEM 'page' has not been touched and is still locked.
+ * NOTE(review): confirm that callers unlock the page in that case.
+ */
+int ll_batch_writepage( struct inode *inode, struct page *page )
+{
+        unsigned long old_flags; /* hack? */
+        struct ll_writeback_pages *llwp;
+        int rc = 0;
+        ENTRY;
+
+        old_flags = current->flags;
+        current->flags |= PF_MEMALLOC;
+        llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
+        if ( llwp == NULL )
+                GOTO(cleanup, rc = -ENOMEM);
+        memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));
+
+        /*
+         * only look for more pages while the batch still has room; a
+         * non-zero return means it is already complete and adding to it
+         * would run past pgs[]
+         */
+        if ( llwp_consume_page(llwp, inode, page) == 0 )
+                ll_get_dirty_pages(inode, llwp);
+
+        if ( llwp->num_pages )
+                ll_brw_pages_unlock(inode, llwp);
+
+cleanup:
+        if ( llwp != NULL )
+                kfree(llwp);
+        /*
+         * undo only our own change: we may have slept, and writing the
+         * saved word back would also overwrite every other flag bit that
+         * changed in the meantime
+         */
+        if ( ! (old_flags & PF_MEMALLOC) )
+                current->flags &= ~PF_MEMALLOC;
+        RETURN(rc);
+}