/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
 *
 * This file is part of Lustre, http://www.lustre.org.
 *
 * Lustre is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * Lustre is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Lustre; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Copyright (C) 2002, 2003 Cluster File Systems, Inc
 *
 * this started as an implementation of an io daemon that woke regularly
 * to force writeback.. the throttling in prepare_write and kupdate's usual
 * writeback pressure got rid of our thread, but the file name remains.
 */

#include <linux/version.h>
#include <linux/config.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/kmod.h>
#include <linux/pagemap.h>

/* PG_inactive_clean is shorthand for rmap, we want free_high/low here.. */
#ifdef PG_inactive_clean
#include <linux/mm_inline.h>
#endif

#define DEBUG_SUBSYSTEM S_LLITE
#include <linux/lustre_lite.h>

/* older 2.4 headers don't have this: walk a list backwards, safely
 * against removal of the current entry */
#ifndef list_for_each_prev_safe
#define list_for_each_prev_safe(pos, n, head) \
        for (pos = (head)->prev, n = pos->prev; pos != (head); \
             pos = n, n = pos->prev)
#endif

extern spinlock_t inode_lock;

#define LLWP_MAX_PAGES (PTL_MD_MAX_IOV)
struct ll_writeback_pages {
        unsigned has_whole_pages:1,
                 num_frags:2,   /* exact bitfield widths here are an
                                 * assumption; num_frags only needs to
                                 * count to 3 (see llwp_consume_page) */
                 num_pages:29;
        struct brw_page pgs[LLWP_MAX_PAGES];
};

/*
 * ugh, we want disk allocation on the target to happen in offset order.  we'll
 * follow Sedgewick's advice and stick to the dead simple shellsort -- it'll do
 * fine for our small page arrays and doesn't require allocation.  it's an
 * insertion sort that swaps elements that are strides apart, shrinking the
 * stride down until it's '1' and the array is sorted.
 */
void sort_brw_pages(struct brw_page *array, int num)
{
        int stride, i, j;
        struct brw_page tmp;

        if (num == 1)
                return;

        /* find the largest stride in the 3h+1 sequence that covers num */
        for (stride = 1; stride < num; stride = (stride * 3) + 1)
                ;

        do {
                stride /= 3;
                for (i = stride; i < num; i++) {
                        tmp = array[i];
                        j = i;
                        while (j >= stride &&
                               array[j - stride].off > tmp.off) {
                                array[j] = array[j - stride];
                                j -= stride;
                        }
                        array[j] = tmp;
                }
        } while (stride > 1);
}
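
/*
 * to make the stride schedule concrete: for num = 32 the seed loop in
 * sort_brw_pages walks the 3h+1 sequence 1, 4, 13, 40 and stops at 40,
 * so the do-while then runs insertion passes with strides 13, 4 and
 * finally 1 -- the last pass is a plain insertion sort over an array
 * that is already nearly in offset order.
 */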

/*
 * returns 0 if the page was consumed: either inserted in the array
 * because it was within i_size, or unlocked and skipped because we
 * raced with truncate and i_size was less than the page --
 * truncate_inode_pages will be waiting to clean up the page.
 * returns non-zero once the llwp can't take any more pages.
 */
static int llwp_consume_page(struct ll_writeback_pages *llwp,
                             struct inode *inode, struct page *page)
{
        obd_off off = ((obd_off)page->index) << PAGE_SHIFT;
        struct brw_page *pg;

        /* we raced with truncate? */
        if (off >= inode->i_size) {
                unlock_page(page);
                return 0;
        }

        page_cache_get(page);
        pg = &llwp->pgs[llwp->num_pages];
        llwp->num_pages++;

        pg->pg = page;
        pg->off = off;
        pg->flag = OBD_BRW_CREATE;
        pg->count = PAGE_SIZE;

        /* catch partial writes for files that end mid-page */
        if (pg->off + pg->count > inode->i_size)
                pg->count = inode->i_size & ~PAGE_MASK;
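
        /*
         * e.g. (hypothetical numbers): with 4096-byte pages and
         * i_size = 10000, the page at index 2 covers [8192, 12288)
         * and gets count = 10000 & ~PAGE_MASK = 1808, the bytes that
         * actually fall within the file.
         */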

        /* all whole pages together count as a single fragment;
         * each partial page is its own fragment */
        if (pg->count == PAGE_SIZE) {
                if (!llwp->has_whole_pages) {
                        llwp->has_whole_pages = 1;
                        llwp->num_frags++;
                }
        } else {
                llwp->num_frags++;
        }

        /*
         * matches ptlrpc_bulk_get assert that trickles down
         * from a 0 page length going through niobuf and into
         * the buffer regions being posted
         */
        LASSERT(pg->count >= 0);

        CDEBUG(D_CACHE, "brw_page %p: off "LPU64" cnt %d, page %p: ind %ld"
               " i_size: "LPU64"\n", pg, pg->off, pg->count, page,
               page->index, inode->i_size);

        if (llwp->num_frags == 3 || llwp->num_pages == LLWP_MAX_PAGES)
                return -1;

        return 0;
}

/*
 * fills llwp with locked dirty pages from the inode's mapping; the
 * count of pages it added to the pgs array ends up in llwp->num_pages.
 *
 * this duplicates filemap_fdatasync and gives us an opportunity to grab lots
 * of dirty pages at once..
 */
static void ll_get_dirty_pages(struct inode *inode,
                               struct ll_writeback_pages *llwp)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *page;
        struct list_head *pos, *n;

        spin_lock(&pagecache_lock);

        list_for_each_prev_safe(pos, n, &mapping->dirty_pages) {
                page = list_entry(pos, struct page, list);

                if (TryLockPage(page))
                        continue;

                list_del(&page->list);
                list_add(&page->list, &mapping->locked_pages);

                if (!PageDirty(page)) {
                        unlock_page(page);
                        continue;
                }
                ClearPageDirty(page);

                if (llwp_consume_page(llwp, inode, page) != 0)
                        break;
        }

        spin_unlock(&pagecache_lock);
}

static void ll_brw_pages_unlock(struct inode *inode,
                                struct ll_writeback_pages *llwp)
{
        int rc, i;
        struct obd_brw_set *set;

        sort_brw_pages(llwp->pgs, llwp->num_pages);

        set = obd_brw_set_new();
        if (set == NULL)
                return;
        set->brw_callback = ll_brw_sync_wait;

        rc = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode),
                     ll_i2info(inode)->lli_smd, llwp->num_pages, llwp->pgs,
                     set, NULL);
        if (rc)
                CERROR("error from obd_brw: rc = %d\n", rc);

        rc = ll_brw_sync_wait(set, CB_PHASE_START);
        if (rc)
                CERROR("error from callback: rc = %d\n", rc);

        obd_brw_set_decref(set);

        /* XXX this doesn't make sense to me */
        rc = 0;

        for (i = 0; i < llwp->num_pages; i++) {
                struct page *page = llwp->pgs[i].pg;

                CDEBUG(D_CACHE, "cleaning page %p\n", page);
                LASSERT(PageLocked(page));
                unlock_page(page);
                page_cache_release(page);
        }
}

#ifndef PG_inactive_clean
#ifdef CONFIG_DISCONTIGMEM
#error "sorry, we don't support DISCONTIGMEM yet"
#endif

/*
 * __alloc_pages marks a zone as needing balancing if an allocation is
 * performed when the zone has fewer free pages than its 'low' water
 * mark.  it's cleared when try_to_free_pages makes progress.
 */
static int zones_need_balancing(void)
{
        pg_data_t *pgdat;
        zone_t *zone;
        int i;

        for (pgdat = pgdat_list; pgdat != NULL; pgdat = pgdat->node_next) {
                for (i = pgdat->nr_zones - 1; i >= 0; i--) {
                        zone = &pgdat->node_zones[i];

                        if (zone->need_balance)
                                return 1;
                }
        }
        return 0;
}
#endif

/* 2.4 doesn't give us a way to find out how many pages we have
 * cached 'cause we're not using buffer_heads.  we are very
 * conservative here and flush the superblock of all dirty data
 * when the vm (rmap or stock) thinks that it is running low
 * and kswapd would have done work.  kupdated isn't good enough
 * because writers (dbench) can dirty pages _very quickly_, and we
 * allocate under writepage..
 *
 * 2.5 gets this right, see the {inc,dec}_page_state(nr_dirty, )
 */
static int should_writeback(void)
{
#ifdef PG_inactive_clean
        if (free_high(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
#else
        if (zones_need_balancing())
#endif
                return 1;
        return 0;
}
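
/*
 * entry point for VM pressure on a lustre superblock: write back
 * dirty pages from the sb's dirty inodes until should_writeback()
 * is satisfied, then wait on inodes still under writeback.  returns
 * the number of pages that were sent.
 */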
int ll_check_dirty(struct super_block *sb)
{
        unsigned long old_flags; /* hack? */
        int making_progress;
        struct ll_writeback_pages *llwp;
        struct inode *inode;
        int rc = 0;
        ENTRY;

        if (!should_writeback())
                RETURN(0);

        old_flags = current->flags;
        current->flags |= PF_MEMALLOC;
        llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
        if (llwp == NULL)
                GOTO(cleanup, rc = -ENOMEM);
        memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));

        spin_lock(&inode_lock);

        /*
         * first we try and write back dirty pages from dirty inodes
         * until the VM thinks we're ok again..
         */
        do {
                struct list_head *pos;
                inode = NULL;
                making_progress = 0;

                list_for_each_prev(pos, &sb->s_dirty) {
                        inode = list_entry(pos, struct inode, i_list);

                        if (!(inode->i_state & I_DIRTY_PAGES)) {
                                inode = NULL;
                                continue;
                        }
                        break;
                }

                if (inode == NULL)
                        break;

                /* duplicate __sync_one, *sigh* */
                list_del(&inode->i_list);
                list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
                inode->i_state |= I_LOCK;
                inode->i_state &= ~I_DIRTY_PAGES;

                spin_unlock(&inode_lock);

                do {
                        memset(llwp, 0, sizeof(*llwp));
                        ll_get_dirty_pages(inode, llwp);
                        if (llwp->num_pages) {
                                ll_brw_pages_unlock(inode, llwp);
                                rc += llwp->num_pages;
                                making_progress = 1;
                        }
                } while (llwp->num_pages && should_writeback());

                spin_lock(&inode_lock);

                if (!list_empty(&inode->i_mapping->dirty_pages))
                        inode->i_state |= I_DIRTY_PAGES;

                inode->i_state &= ~I_LOCK;
                /*
                 * we are sneaky and leave the inode on the dirty list,
                 * even though it might not still be..
                 */
                if (!(inode->i_state & I_FREEING)) {
                        list_del(&inode->i_list);
                        list_add(&inode->i_list, &inode->i_sb->s_dirty);
                }
                wake_up(&inode->i_wait);
        } while (making_progress && should_writeback());

        /*
         * and if that didn't work, we sleep on any data that might
         * be under writeback..
         */
        while (should_writeback()) {
                if (list_empty(&sb->s_locked_inodes))
                        break;

                inode = list_entry(sb->s_locked_inodes.next, struct inode,
                                   i_list);

                atomic_inc(&inode->i_count); /* XXX hack? */
                spin_unlock(&inode_lock);
                wait_event(inode->i_wait, !(inode->i_state & I_LOCK));
                iput(inode);
                spin_lock(&inode_lock);
        }

        spin_unlock(&inode_lock);

cleanup:
        if (llwp != NULL)
                kfree(llwp);
        current->flags = old_flags;
        RETURN(rc);
}
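
/*
 * meant to be called from the writepage path with a single locked
 * dirty page: it consumes that page and then batches in whatever
 * other dirty pages the inode has, so the resulting brw carries
 * more than one page.
 */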
int ll_batch_writepage(struct inode *inode, struct page *page)
{
        unsigned long old_flags; /* hack? */
        struct ll_writeback_pages *llwp;
        int rc = 0;
        ENTRY;

        old_flags = current->flags;
        current->flags |= PF_MEMALLOC;
        llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
        if (llwp == NULL)
                GOTO(cleanup, rc = -ENOMEM);
        memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));

        llwp_consume_page(llwp, inode, page);

        ll_get_dirty_pages(inode, llwp);
        if (llwp->num_pages)
                ll_brw_pages_unlock(inode, llwp);

cleanup:
        if (llwp != NULL)
                kfree(llwp);
        current->flags = old_flags;
        RETURN(rc);
}
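
/*
 * a sketch of how this entry point would be wired up as a 2.4
 * address_space writepage method -- ll_writepage here is illustrative,
 * the real hookup lives elsewhere in llite:
 *
 *      static int ll_writepage(struct page *page)
 *      {
 *              return ll_batch_writepage(page->mapping->host, page);
 *      }
 */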