lustre/llite/iod.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
   5  *
   6  *   This file is part of Lustre, http://www.lustre.org.
   7  *
   8  *   Lustre is free software; you can redistribute it and/or
   9  *   modify it under the terms of version 2 of the GNU General Public
  10  *   License as published by the Free Software Foundation.
  11  *
  12  *   Lustre is distributed in the hope that it will be useful,
  13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  *   GNU General Public License for more details.
  16  *
  17  *   You should have received a copy of the GNU General Public License
  18  *   along with Lustre; if not, write to the Free Software
  19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20  *
  21  *  Copyright (C) 2002, 2003  Cluster File Systems, Inc
  22  *
  23  *  this started as an implementation of an io daemon that woke regularly
  24  *  to force writeback.. the throttling in prepare_write and kupdate's usual
  25  *  writeback pressure got rid of our thread, but the file name remains.
  26  */
  27 #include <linux/version.h>
  28 #include <linux/config.h>
  29 #include <linux/module.h>
  30 #include <linux/fs.h>
  31 #include <linux/stat.h>
  32 #include <linux/sched.h>
  33 #include <linux/smp_lock.h>
  34 #include <linux/kmod.h>
  35 #include <linux/pagemap.h>
  36 #include <linux/mm.h>
  37
  38 /* PG_inactive_clean is shorthand for rmap, we want free_high/low here.. */
  39 #ifdef PG_inactive_clean
  40 #include <linux/mm_inline.h>
  41 #endif
  42
  43 #define DEBUG_SUBSYSTEM S_LLITE
  44 #include <linux/lustre_lite.h>
  45
  46 #ifndef list_for_each_prev_safe
  47 #define list_for_each_prev_safe(pos, n, head) \
  48         for (pos = (head)->prev, n = pos->prev; pos != (head); \
  49                 pos = n, n = pos->prev )
  50 #endif
  51
  52 extern spinlock_t inode_lock;
  53
  54 #define LLWP_MAX_PAGES (PTL_MD_MAX_IOV)
  55 struct ll_writeback_pages {
  56         unsigned        has_whole_pages:1,
  57                         num_frags:2,
  58                         num_pages:29;
  59         struct brw_page pgs[LLWP_MAX_PAGES];
  60 };
  61
  62
  63 /*
  64  * ugh, we want disk allocation on the target to happen in offset order.  we'll
  65  * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do
  66  * fine for our small page arrays and doesn't require allocation.  its an
  67  * insertion sort that swaps elements that are strides apart, shrinking the
  68  * stride down until its '1' and the array is sorted.
  69  */
  70 void sort_brw_pages(struct brw_page *array, int num)
  71 {
  72         int stride, i, j;
  73         struct brw_page tmp;
  74
  75         if ( num == 1 )
  76                 return;
  77
  78         for( stride = 1; stride < num ; stride = (stride*3) +1  )
  79                 ;
  80
  81         do {
  82                 stride /= 3;
  83                 for ( i = stride ; i < num ; i++ ) {
  84                         tmp = array[i];
  85                         j = i;
  86                         while ( j >= stride &&
  87                                         array[j - stride].off > tmp.off ) {
  88                                 array[j] = array[j - stride];
  89                                 j -= stride;
  90                         }
  91                         array[j] = tmp;
  92                 }
  93         } while ( stride > 1 );
  94 }
  95
  96 /*
  97  * returns 0 if the page was inserted in the array because it was
  98  * within i_size.  if we raced with truncate and i_size was less
  99  * than the page we can unlock the page because truncate_inode_pages will
 100  * be waiting to cleanup the page
 101  */
 102 static int llwp_consume_page(struct ll_writeback_pages *llwp,
 103                              struct inode *inode, struct page *page)
 104 {
 105         obd_off off = ((obd_off)page->index) << PAGE_SHIFT;
 106         struct brw_page *pg;
 107
 108         /* we raced with truncate? */
 109         if ( off >= inode->i_size ) {
 110                 unlock_page(page);
 111                 goto out;
 112         }
 113
 114         page_cache_get(page);
 115         pg = &llwp->pgs[llwp->num_pages];
 116         llwp->num_pages++;
 117
 118         pg->pg = page;
 119         pg->off = off;
 120         pg->flag = OBD_BRW_CREATE;
 121         pg->count = PAGE_SIZE;
 122
 123         /* catch partial writes for files that end mid-page */
 124         if ( pg->off + pg->count > inode->i_size )
 125                 pg->count = inode->i_size & ~PAGE_MASK;
 126
 127         if ( pg->count == PAGE_SIZE ) {
 128                 if ( ! llwp->has_whole_pages ) {
 129                         llwp->has_whole_pages = 1;
 130                         llwp->num_frags++;
 131                 }
 132         } else {
 133                 llwp->num_frags++;
 134         }
 135
 136         /*
 137          * matches ptlrpc_bulk_get assert that trickles down
 138          * from a 0 page length going through niobuf and into
 139          * the buffer regions being posted
 140          */
 141         LASSERT(pg->count >= 0);
 142
 143         CDEBUG(D_CACHE, "brw_page %p: off "LPU64" cnt %d, page %p: ind %ld"
 144                         " i_size: "LPU64"\n", pg, pg->off, pg->count, page,
 145                         page->index, inode->i_size);
 146
 147         if ( llwp->num_frags == 3 || llwp->num_pages == LLWP_MAX_PAGES )
 148                 return -1;
 149
 150 out:
 151         return 0;
 152 }
 153
 154 /*
 155  * returns the number of pages that it added to the pgs array
 156  *
 157  * this duplicates filemap_fdatasync and gives us an opportunity to grab lots
 158  * of dirty pages..
 159  */
 160 static void ll_get_dirty_pages(struct inode *inode,
 161                                struct ll_writeback_pages *llwp)
 162 {
 163         struct address_space *mapping = inode->i_mapping;
 164         struct page *page;
 165         struct list_head *pos, *n;
 166         ENTRY;
 167
 168         spin_lock(&pagecache_lock);
 169
 170         list_for_each_prev_safe(pos, n, &mapping->dirty_pages) {
 171                 page = list_entry(pos, struct page, list);
 172
 173                 if (TryLockPage(page))
 174                         continue;
 175
 176                 list_del(&page->list);
 177                 list_add(&page->list, &mapping->locked_pages);
 178
 179                 if ( ! PageDirty(page) ) {
 180                         unlock_page(page);
 181                         continue;
 182                 }
 183                 ClearPageDirty(page);
 184
 185                 if ( llwp_consume_page(llwp, inode, page) != 0)
 186                         break;
 187         }
 188
 189         spin_unlock(&pagecache_lock);
 190         EXIT;
 191 }
 192
 193 static void ll_brw_pages_unlock( struct inode *inode,
 194                                  struct ll_writeback_pages *llwp)
 195 {
 196         int rc, i;
 197         struct obd_brw_set *set;
 198         ENTRY;
 199
 200         sort_brw_pages(llwp->pgs, llwp->num_pages);
 201
 202         set = obd_brw_set_new();
 203         if (set == NULL) {
 204                 EXIT;
 205                 return;
 206         }
 207         set->brw_callback = ll_brw_sync_wait;
 208
 209         rc = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode),
 210                      ll_i2info(inode)->lli_smd, llwp->num_pages, llwp->pgs,
 211                      set, NULL);
 212         if (rc) {
 213                 CERROR("error from obd_brw: rc = %d\n", rc);
 214         } else {
 215                 rc = ll_brw_sync_wait(set, CB_PHASE_START);
 216                 if (rc)
 217                         CERROR("error from callback: rc = %d\n", rc);
 218         }
 219         obd_brw_set_decref(set);
 220
 221         /* XXX this doesn't make sense to me */
 222         rc = 0;
 223
 224         for ( i = 0 ; i < llwp->num_pages ; i++) {
 225                 struct page *page = llwp->pgs[i].pg;
 226
 227                 CDEBUG(D_CACHE, "cleaning page %p\n", page);
 228                 LASSERT(PageLocked(page));
 229                 unlock_page(page);
 230                 page_cache_release(page);
 231         }
 232
 233         EXIT;
 234 }
 235
 236 #ifndef PG_inactive_clean
 237 #ifdef CONFIG_DISCONTIGMEM
 238 #error "sorry, we don't support DISCONTIGMEM yet"
 239 #endif
 240 /*
 241  * __alloc_pages marks a zone as needing balancing if an allocation is
 242  * performed when the zone has fewer free pages than its 'low' water
 243  * mark.  its cleared when try_to_free_pages makes progress.
 244  */
 245 static int zones_need_balancing(void)
 246 {
 247         pg_data_t * pgdat;
 248         zone_t *zone;
 249         int i;
 250
 251         for ( pgdat = pgdat_list ; pgdat != NULL ; pgdat = pgdat->node_next ) {
 252                 for ( i = pgdat->nr_zones-1 ; i >= 0 ; i-- ) {
 253                         zone = &pgdat->node_zones[i];
 254
 255                         if ( zone->need_balance )
 256                                 return 1;
 257                 }
 258         }
 259         return 0;
 260 }
 261 #endif
 262 /* 2.4 doesn't give us a way to find out how many pages we have
 263  * cached 'cause we're not using buffer_heads.  we are very
 264  * conservative here and flush the superblock of all dirty data
 265  * when the vm (rmap or stock) thinks that it is running low
 266  * and kswapd would have done work.  kupdated isn't good enough
 267  * because writers (dbench) can dirty _very quickly_, and we
 268  * allocate under writepage..
 269  *
 270  * 2.5 gets this right, see the {inc,dec}_page_state(nr_dirty, )
 271  */
 272 static int should_writeback(void)
 273 {
 274 #ifdef PG_inactive_clean
 275         if (free_high(ALL_ZONES) > 0 || free_low(ANY_ZONE) > 0)
 276 #else
 277         if (zones_need_balancing())
 278 #endif
 279                 return 1;
 280         return 0;
 281 }
 282
 283 int ll_check_dirty( struct super_block *sb)
 284 {
 285         unsigned long old_flags; /* hack? */
 286         int making_progress;
 287         struct ll_writeback_pages *llwp;
 288         struct inode *inode;
 289         int rc = 0;
 290         ENTRY;
 291
 292         if ( ! should_writeback() )
 293                 return 0;
 294
 295         old_flags = current->flags;
 296         current->flags |= PF_MEMALLOC;
 297         llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
 298         if ( llwp == NULL )
 299                 GOTO(cleanup, rc = -ENOMEM);
 300         memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));
 301
 302         spin_lock(&inode_lock);
 303
 304         /*
 305          * first we try and write back dirty pages from dirty inodes
 306          * until the VM thinkgs we're ok again..
 307          */
 308         do {
 309                 struct list_head *pos;
 310                 inode = NULL;
 311                 making_progress = 0;
 312
 313                 list_for_each_prev(pos, &sb->s_dirty) {
 314                         inode = list_entry(pos, struct inode, i_list);
 315
 316                         if ( ! (inode->i_state & I_DIRTY_PAGES) ) {
 317                                 inode = NULL;
 318                                 continue;
 319                         }
 320                         break;
 321                 }
 322
 323                 if ( inode == NULL )
 324                         break;
 325
 326                 /* duplicate __sync_one, *sigh* */
 327                 list_del(&inode->i_list);
 328                 list_add(&inode->i_list, &inode->i_sb->s_locked_inodes);
 329                 inode->i_state |= I_LOCK;
 330                 inode->i_state &= ~I_DIRTY_PAGES;
 331
 332                 spin_unlock(&inode_lock);
 333
 334                 do {
 335                         memset(llwp, 0, sizeof(*llwp));
 336                         ll_get_dirty_pages(inode, llwp);
 337                         if ( llwp->num_pages ) {
 338                                 ll_brw_pages_unlock(inode, llwp);
 339                                 rc += llwp->num_pages;
 340                                 making_progress = 1;
 341                         }
 342                 } while (llwp->num_pages && should_writeback() );
 343
 344                 spin_lock(&inode_lock);
 345
 346                 if ( ! list_empty(&inode->i_mapping->dirty_pages) )
 347                         inode->i_state |= I_DIRTY_PAGES;
 348
 349                 inode->i_state &= ~I_LOCK;
 350                 /*
 351                  * we are sneaky and leave the inode on the dirty list,
 352                  * even though it might not still be..
 353                  */
 354                 if (!(inode->i_state & I_FREEING)) {
 355                         list_del(&inode->i_list);
 356                         list_add(&inode->i_list, &inode->i_sb->s_dirty);
 357                 }
 358                 wake_up(&inode->i_wait);
 359
 360         } while ( making_progress && should_writeback() );
 361
 362         /*
 363          * and if that didn't work, we sleep on any data that might
 364          * be under writeback..
 365          */
 366         while ( should_writeback() ) {
 367                 if ( list_empty(&sb->s_locked_inodes) )
 368                         break;
 369
 370                 inode = list_entry(sb->s_locked_inodes.next, struct inode,
 371                                 i_list);
 372
 373                 atomic_inc(&inode->i_count); /* XXX hack? */
 374                 spin_unlock(&inode_lock);
 375                 wait_event(inode->i_wait, !(inode->i_state & I_LOCK));
 376                 iput(inode);
 377                 spin_lock(&inode_lock);
 378         }
 379
 380         spin_unlock(&inode_lock);
 381
 382 cleanup:
 383         if ( llwp != NULL )
 384                 kfree(llwp);
 385         current->flags = old_flags;
 386
 387         RETURN(rc);
 388 }
 389
 390 int ll_batch_writepage( struct inode *inode, struct page *page )
 391 {
 392         unsigned long old_flags; /* hack? */
 393         struct ll_writeback_pages *llwp;
 394         int rc = 0;
 395         ENTRY;
 396
 397         old_flags = current->flags;
 398         current->flags |= PF_MEMALLOC;
 399         llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC);
 400         if ( llwp == NULL )
 401                 GOTO(cleanup, rc = -ENOMEM);
 402         memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs));
 403
 404         llwp_consume_page(llwp, inode, page);
 405
 406         ll_get_dirty_pages(inode, llwp);
 407         if ( llwp->num_pages )
 408                 ll_brw_pages_unlock(inode, llwp);
 409
 410 cleanup:
 411         if ( llwp != NULL )
 412                 kfree(llwp);
 413         current->flags = old_flags;
 414         RETURN(rc);
 415 }