lustre/llite/rw.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Lustre Lite I/O page cache routines shared by different kernel revs
5  *
6  *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
7  *
8  *   This file is part of Lustre, http://www.lustre.org.
9  *
10  *   Lustre is free software; you can redistribute it and/or
11  *   modify it under the terms of version 2 of the GNU General Public
12  *   License as published by the Free Software Foundation.
13  *
14  *   Lustre is distributed in the hope that it will be useful,
15  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *   GNU General Public License for more details.
18  *
19  *   You should have received a copy of the GNU General Public License
20  *   along with Lustre; if not, write to the Free Software
21  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22  */
23
24 #include <linux/config.h>
25 #include <linux/kernel.h>
26 #include <linux/mm.h>
27 #include <linux/string.h>
28 #include <linux/stat.h>
29 #include <linux/errno.h>
30 #include <linux/smp_lock.h>
31 #include <linux/unistd.h>
32 #include <linux/version.h>
33 #include <asm/system.h>
34 #include <asm/uaccess.h>
35
36 #include <linux/fs.h>
37 #include <linux/stat.h>
38 #include <asm/uaccess.h>
39 #include <asm/segment.h>
40 #include <linux/mm.h>
41 #include <linux/pagemap.h>
42 #include <linux/smp_lock.h>
43
44 #define DEBUG_SUBSYSTEM S_LLITE
45
46 #include <lustre_mdc.h>
47 #include <lustre_lite.h>
48 #include "llite_internal.h"
49 #include <linux/lustre_compat25.h>
50
51 #ifndef list_for_each_prev_safe
52 #define list_for_each_prev_safe(pos, n, head) \
53         for (pos = (head)->prev, n = pos->prev; pos != (head); \
54                 pos = n, n = pos->prev )
55 #endif
56
57 kmem_cache_t *ll_async_page_slab = NULL;
58 size_t ll_async_page_slab_size = 0;
59
60 /* SYNCHRONOUS I/O to object storage for an inode */
61 static int ll_brw(int cmd, struct inode *inode, struct obdo *oa,
62                   struct page *page, int flags)
63 {
64         struct ll_inode_info *lli = ll_i2info(inode);
65         struct lov_stripe_md *lsm = lli->lli_smd;
66         struct obd_info oinfo = { { { 0 } } };
67         struct brw_page pg;
68         int opc, rc;
69         ENTRY;
70
71         pg.pg = page;
72         pg.off = ((obd_off)page->index) << PAGE_SHIFT;
73
74         if ((cmd & OBD_BRW_WRITE) && (pg.off + PAGE_SIZE > inode->i_size))
75                 pg.count = inode->i_size % PAGE_SIZE;
76         else
77                 pg.count = PAGE_SIZE;
78
79         LL_CDEBUG_PAGE(D_PAGE, page, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n",
80                        cmd & OBD_BRW_WRITE ? "write" : "read", pg.count,
81                        inode->i_ino, pg.off, pg.off);
82         if (pg.count == 0) {
83                 CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off "
84                        LPU64"\n",
85                        inode->i_ino, inode, inode->i_size, page->mapping->host,
86                        page->mapping->host->i_size, page->index, pg.off);
87         }
88
89         pg.flag = flags;
90
91         if (cmd & OBD_BRW_WRITE)
92                 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
93                                     LPROC_LL_BRW_WRITE, pg.count);
94         else
95                 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
96                                     LPROC_LL_BRW_READ, pg.count);
97         oinfo.oi_oa = oa;
98         oinfo.oi_md = lsm;
99         /* NB partial write, so we might not have CAPA_OPC_OSS_READ capa */
100         opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE :
101                                     CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ;
102         oinfo.oi_capa = ll_osscapa_get(inode, opc);
103         rc = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, 1, &pg, NULL);
104         capa_put(oinfo.oi_capa);
105         if (rc == 0)
106                 obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
107         else if (rc != -EIO)
108                 CERROR("error from obd_brw: rc = %d\n", rc);
109         RETURN(rc);
110 }
111
112 /* This isn't where truncate starts.  Roughly:
113  * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate.  setattr_raw grabs a
114  * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
115  * avoid races.
116  *
117  * Must be called under ->lli_size_sem. */
118 void ll_truncate(struct inode *inode)
119 {
120         struct ll_inode_info *lli = ll_i2info(inode);
121         struct obd_info oinfo = { { { 0 } } };
122         struct ost_lvb lvb;
123         struct obdo oa;
124         int rc;
125         ENTRY;
126         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino,
127                inode->i_generation, inode, inode->i_size, inode->i_size);
128
129         if (lli->lli_size_sem_owner != current) {
130                 EXIT;
131                 return;
132         }
133
134         if (!lli->lli_smd) {
135                 CDEBUG(D_INODE, "truncate on inode %lu with no objects\n",
136                        inode->i_ino);
137                 GOTO(out_unlock, 0);
138         }
139
140         LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
141
142         /* XXX I'm pretty sure this is a hack to paper over a more fundamental
143          * race condition. */
144         lov_stripe_lock(lli->lli_smd);
145         inode_init_lvb(inode, &lvb);
146         obd_merge_lvb(ll_i2dtexp(inode), lli->lli_smd, &lvb, 0);
147         if (lvb.lvb_size == inode->i_size) {
148                 CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64", %Lu=%#Lx\n",
149                        lli->lli_smd->lsm_object_id, inode->i_size, inode->i_size);
150                 lov_stripe_unlock(lli->lli_smd);
151                 GOTO(out_unlock, 0);
152         }
153
154         obd_adjust_kms(ll_i2dtexp(inode), lli->lli_smd, inode->i_size, 1);
155         lov_stripe_unlock(lli->lli_smd);
156
157         if (unlikely((ll_i2sbi(inode)->ll_flags & LL_SBI_CHECKSUM) &&
158                      (inode->i_size & ~PAGE_MASK))) {
159                 /* If the truncate leaves behind a partial page, update its
160                  * checksum. */
161                 struct page *page = find_get_page(inode->i_mapping,
162                                                   inode->i_size >> PAGE_CACHE_SHIFT);
163                 if (page != NULL) {
164                         struct ll_async_page *llap = llap_cast_private(page);
165                         if (llap != NULL) {
166                                 llap->llap_checksum =
167                                         crc32_le(0, kmap(page), PAGE_SIZE);
168                                 kunmap(page);
169                         }
170                         page_cache_release(page);
171                 }
172         }
173
174         CDEBUG(D_INFO, "calling punch for "LPX64" (new size %Lu=%#Lx)\n",
175                lli->lli_smd->lsm_object_id, inode->i_size, inode->i_size);
176
177         oinfo.oi_md = lli->lli_smd;
178         oinfo.oi_policy.l_extent.start = inode->i_size;
179         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
180         oinfo.oi_oa = &oa;
181         oa.o_id = lli->lli_smd->lsm_object_id;
182         oa.o_gr = lli->lli_smd->lsm_object_gr;
183         oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
184
185         obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
186                         OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME |
187                         OBD_MD_FLFID | OBD_MD_FLGENER);
188
189         ll_inode_size_unlock(inode, 0);
190
191         oinfo.oi_capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC);
192         rc = obd_punch_rqset(ll_i2dtexp(inode), &oinfo, NULL);
193         ll_truncate_free_capa(oinfo.oi_capa);
194         if (rc)
195                 CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino);
196         else
197                 obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
198                               OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
199         EXIT;
200         return;
201
202  out_unlock:
203         ll_inode_size_unlock(inode, 0);
204 } /* ll_truncate */
205
206 int ll_prepare_write(struct file *file, struct page *page, unsigned from,
207                      unsigned to)
208 {
209         struct inode *inode = page->mapping->host;
210         struct ll_inode_info *lli = ll_i2info(inode);
211         struct lov_stripe_md *lsm = lli->lli_smd;
212         obd_off offset = ((obd_off)page->index) << PAGE_SHIFT;
213         struct obd_info oinfo = { { { 0 } } };
214         struct brw_page pga;
215         struct obdo oa;
216         struct ost_lvb lvb;
217         int rc = 0;
218         ENTRY;
219
220         LASSERT(PageLocked(page));
221         (void)llap_cast_private(page); /* assertion */
222
223         /* Check to see if we should return -EIO right away */
224         pga.pg = page;
225         pga.off = offset;
226         pga.count = PAGE_SIZE;
227         pga.flag = 0;
228
229         oa.o_mode = inode->i_mode;
230         oa.o_id = lsm->lsm_object_id;
231         oa.o_gr = lsm->lsm_object_gr;
232         oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | 
233                      OBD_MD_FLTYPE | OBD_MD_FLGROUP;
234         obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
235
236         oinfo.oi_oa = &oa;
237         oinfo.oi_md = lsm;
238         rc = obd_brw(OBD_BRW_CHECK, ll_i2dtexp(inode), &oinfo, 1, &pga, NULL);
239         if (rc)
240                 RETURN(rc);
241
242         if (PageUptodate(page)) {
243                 LL_CDEBUG_PAGE(D_PAGE, page, "uptodate\n");
244                 RETURN(0);
245         }
246
247         /* We're completely overwriting an existing page, so _don't_ mark it
248          * uptodate until commit_write */
249         if (from == 0 && to == PAGE_SIZE) {
250                 LL_CDEBUG_PAGE(D_PAGE, page, "full page write\n");
251                 POISON_PAGE(page, 0x11);
252                 RETURN(0);
253         }
254
255         /* If we are writing to a new page, no need to read old data.  The extent
256          * locking will have updated the KMS, and for our purposes here we can
257          * treat it like i_size. */
258         lov_stripe_lock(lsm);
259         inode_init_lvb(inode, &lvb);
260         obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 0);
261         lov_stripe_unlock(lsm);
262         if (lvb.lvb_size <= offset) {
263                 LL_CDEBUG_PAGE(D_PAGE, page, "kms "LPU64" <= offset "LPU64"\n",
264                                lvb.lvb_size, offset);
265                 memset(kmap(page), 0, PAGE_SIZE);
266                 kunmap(page);
267                 GOTO(prepare_done, rc = 0);
268         }
269
270         /* XXX could be an async ocp read.. read-ahead? */
271         rc = ll_brw(OBD_BRW_READ, inode, &oa, page, 0);
272         if (rc == 0) {
273                 /* bug 1598: don't clobber blksize */
274                 oa.o_valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLKSZ);
275                 obdo_refresh_inode(inode, &oa, oa.o_valid);
276         }
277
278         EXIT;
279  prepare_done:
280         if (rc == 0)
281                 SetPageUptodate(page);
282
283         return rc;
284 }
285
286 static int ll_ap_make_ready(void *data, int cmd)
287 {
288         struct ll_async_page *llap;
289         struct page *page;
290         ENTRY;
291
292         llap = LLAP_FROM_COOKIE(data);
293         page = llap->llap_page;
294
295         LASSERTF(!(cmd & OBD_BRW_READ), "cmd %x page %p ino %lu index %lu\n", cmd, page,
296                  page->mapping->host->i_ino, page->index);
297
298         /* we're trying to write, but the page is locked.. come back later */
299         if (TryLockPage(page))
300                 RETURN(-EAGAIN);
301
302         LL_CDEBUG_PAGE(D_PAGE, page, "made ready\n");
303         page_cache_get(page);
304
305         /* if we left PageDirty we might get another writepage call
306          * in the future.  list walkers are bright enough
307          * to check page dirty so we can leave it on whatever list
308          * it's on.  XXX also, we're called with the cli list lock held, so if
309          * we took the page cache list lock we'd create a lock inversion
310          * with the removepage path, which takes the page lock and then the
311          * cli lock */
312         clear_page_dirty(page);
313         RETURN(0);
314 }
315
316 /* We have two reasons for giving llite the opportunity to change the
317  * write length of a given queued page as it builds the RPC containing
318  * the page:
319  *
320  * 1) Further extending writes may have landed in the page cache
321  *    since a partial write first queued this page requiring us
322  *    to write more from the page cache.  (No further races are possible, since
323  *    by the time this is called, the page is locked.)
324  * 2) We might have raced with truncate and want to avoid performing
325  *    write RPCs that are just going to be thrown away by the
326  *    truncate's punch on the storage targets.
327  *
328  * The kms serves these purposes as it is set at both truncate and extending
329  * writes.
330  */
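/*
 * Worked example for the partial-page case above (illustrative numbers,
 * assuming 4 KiB pages, i.e. PAGE_SIZE == 4096): if kms is, say, 10000 bytes
 * and a write is queued against page index 2 (bytes 8192..12287), then
 * 8192 < kms so the write is kept, but 8192 + 4096 > kms, so
 * ll_ap_refresh_count() below returns kms % PAGE_SIZE == 1808 rather than a
 * full page.  A page wholly past kms would get a count of 0 instead.
 */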
331 static int ll_ap_refresh_count(void *data, int cmd)
332 {
333         struct ll_inode_info *lli;
334         struct ll_async_page *llap;
335         struct lov_stripe_md *lsm;
336         struct page *page;
337         struct inode *inode;
338         struct ost_lvb lvb;
339         __u64 kms;
340         ENTRY;
341
342         /* readpage queues with _COUNT_STABLE, shouldn't get here. */
343         LASSERT(cmd != OBD_BRW_READ);
344
345         llap = LLAP_FROM_COOKIE(data);
346         page = llap->llap_page;
347         inode = page->mapping->host;
348         lli = ll_i2info(inode);
349         lsm = lli->lli_smd;
350
351         lov_stripe_lock(lsm);
352         inode_init_lvb(inode, &lvb);
353         obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
354         kms = lvb.lvb_size;
355         lov_stripe_unlock(lsm);
356
357         /* catch race with truncate */
358         if (((__u64)page->index << PAGE_SHIFT) >= kms)
359                 return 0;
360
361         /* catch sub-page write at end of file */
362         if (((__u64)page->index << PAGE_SHIFT) + PAGE_SIZE > kms)
363                 return kms % PAGE_SIZE;
364
365         return PAGE_SIZE;
366 }
367
368 void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa)
369 {
370         struct lov_stripe_md *lsm;
371         obd_flag valid_flags;
372
373         lsm = ll_i2info(inode)->lli_smd;
374
375         oa->o_id = lsm->lsm_object_id;
376         oa->o_gr = lsm->lsm_object_gr;
377         oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
378         valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
379         if (cmd & OBD_BRW_WRITE) {
380                 oa->o_valid |= OBD_MD_FLEPOCH;
381                 oa->o_easize = ll_i2info(inode)->lli_ioepoch;
382
383                 valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
384                         OBD_MD_FLUID | OBD_MD_FLGID |
385                         OBD_MD_FLFID | OBD_MD_FLGENER;
386         }
387
388         obdo_from_inode(oa, inode, valid_flags);
389 }
390
391 static void ll_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
392 {
393         struct ll_async_page *llap;
394         ENTRY;
395
396         llap = LLAP_FROM_COOKIE(data);
397         ll_inode_fill_obdo(llap->llap_page->mapping->host, cmd, oa);
398
399         EXIT;
400 }
401
402 static void ll_ap_update_obdo(void *data, int cmd, struct obdo *oa,
403                               obd_valid valid)
404 {
405         struct ll_async_page *llap;
406         ENTRY;
407
408         llap = LLAP_FROM_COOKIE(data);
409         obdo_from_inode(oa, llap->llap_page->mapping->host, valid);
410
411         EXIT;
412 }
413
414 static struct obd_capa *ll_ap_lookup_capa(void *data, int cmd)
415 {
416         struct ll_async_page *llap = LLAP_FROM_COOKIE(data);
417         int opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE :
418                                         CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ;
419
420         return ll_osscapa_get(llap->llap_page->mapping->host, opc);
421 }
422
423 static struct obd_async_page_ops ll_async_page_ops = {
424         .ap_make_ready =        ll_ap_make_ready,
425         .ap_refresh_count =     ll_ap_refresh_count,
426         .ap_fill_obdo =         ll_ap_fill_obdo,
427         .ap_update_obdo =       ll_ap_update_obdo,
428         .ap_completion =        ll_ap_completion,
429         .ap_lookup_capa =       ll_ap_lookup_capa,
430 };
431
432 struct ll_async_page *llap_cast_private(struct page *page)
433 {
434         struct ll_async_page *llap = (struct ll_async_page *)page_private(page);
435
436         LASSERTF(llap == NULL || llap->llap_magic == LLAP_MAGIC,
437                  "page %p private %lu gave magic %d which != %d\n",
438                  page, page_private(page), llap->llap_magic, LLAP_MAGIC);
439
440         return llap;
441 }
442
443 /* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction.
444  *
445  * There is an llap attached onto every page in lustre, linked off @sbi.
446  * We add a dummy llap to the list so we don't lose our place during the walk.
447  * If llaps in the list are being moved they will only move to the end
448  * of the LRU, and we aren't terribly interested in those pages here (we
449  * start at the beginning of the list, where the least-used llaps are).
450  */
451 int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
452 {
453         struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a };
454         unsigned long total, want, count = 0;
455
456         total = sbi->ll_async_page_count;
457
458         /* There can be a large number of llaps (600k or more in a large
459          * memory machine) so the VM 1/6 shrink ratio is likely too much.
460          * Since we are freeing pages also, we don't necessarily want to
461          * shrink so much.  Limit to 40MB of pages + llaps per call. */
462         if (shrink_fraction == 0)
463                 want = sbi->ll_async_page_count - sbi->ll_async_page_max + 32;
464         else
465                 want = (total + shrink_fraction - 1) / shrink_fraction;
466
467         if (want > 40 << (20 - PAGE_CACHE_SHIFT))
468                 want = 40 << (20 - PAGE_CACHE_SHIFT);
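        /*
         * Sizing example (illustrative numbers, assuming 4 KiB pages): with
         * 600000 cached llaps and shrink_fraction == 6, want is
         * (600000 + 5) / 6 == 100000 pages, which the cap above reduces to
         * 40 << (20 - 12) == 10240 pages, i.e. 40MB per call.
         */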
469
470         CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n",
471                want, total, shrink_fraction);
472
473         spin_lock(&sbi->ll_lock);
474         list_add(&dummy_llap.llap_pglist_item, &sbi->ll_pglist);
475
476         while (--total >= 0 && count < want) {
477                 struct page *page;
478                 int keep;
479
480                 if (unlikely(need_resched())) {
481                         spin_unlock(&sbi->ll_lock);
482                         cond_resched();
483                         spin_lock(&sbi->ll_lock);
484                 }
485
486                 llap = llite_pglist_next_llap(sbi,&dummy_llap.llap_pglist_item);
487                 list_del_init(&dummy_llap.llap_pglist_item);
488                 if (llap == NULL)
489                         break;
490
491                 page = llap->llap_page;
492                 LASSERT(page != NULL);
493
494                 list_add(&dummy_llap.llap_pglist_item, &llap->llap_pglist_item);
495
496                 /* Page is locked: it needs or is undergoing IO */
497                 if (TryLockPage(page)) {
498                         LL_CDEBUG_PAGE(D_PAGE, page, "can't lock\n");
499                         continue;
500                 }
501
502                 if (llap->llap_write_queued || PageDirty(page) ||
503                     (!PageUptodate(page) &&
504                      llap->llap_origin != LLAP_ORIGIN_READAHEAD))
505                         keep = 1;
506                 else
507                         keep = 0;
508
509                 LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s origin %s\n",
510                                keep ? "keep" : "drop",
511                                llap->llap_write_queued ? "wq " : "",
512                                PageDirty(page) ? "pd " : "",
513                                PageUptodate(page) ? "" : "!pu ",
514                                llap->llap_defer_uptodate ? "" : "!du",
515                                llap_origins[llap->llap_origin]);
516
517                 /* If page is dirty or undergoing IO don't discard it */
518                 if (keep) {
519                         unlock_page(page);
520                         continue;
521                 }
522
523                 page_cache_get(page);
524                 spin_unlock(&sbi->ll_lock);
525
526                 if (page->mapping != NULL) {
527                         ll_teardown_mmaps(page->mapping,
528                                          (__u64)page->index<<PAGE_CACHE_SHIFT,
529                                          ((__u64)page->index<<PAGE_CACHE_SHIFT)|
530                                           ~PAGE_CACHE_MASK);
531                         if (!PageDirty(page) && !page_mapped(page)) {
532                                 ll_ra_accounting(llap, page->mapping);
533                                 ll_truncate_complete_page(page);
534                                 ++count;
535                         } else {
536                                 LL_CDEBUG_PAGE(D_PAGE, page, "Not dropping page"
537                                                              " because it is "
538                                                              "%s\n",
539                                                               PageDirty(page)?
540                                                               "dirty":"mapped");
541                         }
542                 }
543                 unlock_page(page);
544                 page_cache_release(page);
545
546                 spin_lock(&sbi->ll_lock);
547         }
548         list_del(&dummy_llap.llap_pglist_item);
549         spin_unlock(&sbi->ll_lock);
550
551         CDEBUG(D_CACHE, "shrank %lu/%lu and left %lu unscanned\n",
552                count, want, total);
553
554         return count;
555 }
556
557 struct ll_async_page *llap_from_page(struct page *page, unsigned origin)
558 {
559         struct ll_async_page *llap;
560         struct obd_export *exp;
561         struct inode *inode = page->mapping->host;
562         struct ll_sb_info *sbi;
563         int rc;
564         ENTRY;
565
566         if (!inode) {
567                 static int triggered;
568
569                 if (!triggered) {
570                         LL_CDEBUG_PAGE(D_ERROR, page, "Bug 10047. Wrong anon "
571                                        "page received\n");
572                         libcfs_debug_dumpstack(NULL);
573                         triggered = 1;
574                 }
575                 RETURN(ERR_PTR(-EINVAL));
576         }
577         sbi = ll_i2sbi(inode);
578         LASSERT(ll_async_page_slab);
579         LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin);
580
581         llap = llap_cast_private(page);
582         if (llap != NULL) {
583                 /* move to end of LRU list, except when page is just about to
584                  * die */
585                 if (origin != LLAP_ORIGIN_REMOVEPAGE) {
586                         spin_lock(&sbi->ll_lock);
587                         sbi->ll_pglist_gen++;
588                         list_del_init(&llap->llap_pglist_item);
589                         list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist);
590                         spin_unlock(&sbi->ll_lock);
591                 }
592                 GOTO(out, llap);
593         }
594
595         exp = ll_i2dtexp(page->mapping->host);
596         if (exp == NULL)
597                 RETURN(ERR_PTR(-EINVAL));
598
599         /* limit the number of lustre-cached pages */
600         if (sbi->ll_async_page_count >= sbi->ll_async_page_max)
601                 llap_shrink_cache(sbi, 0);
602
603         OBD_SLAB_ALLOC(llap, ll_async_page_slab, SLAB_KERNEL,
604                        ll_async_page_slab_size);
605         if (llap == NULL)
606                 RETURN(ERR_PTR(-ENOMEM));
607         llap->llap_magic = LLAP_MAGIC;
608         llap->llap_cookie = (void *)llap + size_round(sizeof(*llap));
609
610         rc = obd_prep_async_page(exp, ll_i2info(inode)->lli_smd, NULL, page,
611                                  (obd_off)page->index << PAGE_SHIFT,
612                                  &ll_async_page_ops, llap, &llap->llap_cookie);
613         if (rc) {
614                 OBD_SLAB_FREE(llap, ll_async_page_slab,
615                               ll_async_page_slab_size);
616                 RETURN(ERR_PTR(rc));
617         }
618
619         CDEBUG(D_CACHE, "llap %p page %p cookie %p obj off "LPU64"\n", llap,
620                page, llap->llap_cookie, (obd_off)page->index << PAGE_SHIFT);
621         /* also zeroing the PRIVBITS low order bitflags */
622         __set_page_ll_data(page, llap);
623         llap->llap_page = page;
624         spin_lock(&sbi->ll_lock);
625         sbi->ll_pglist_gen++;
626         sbi->ll_async_page_count++;
627         list_add_tail(&llap->llap_pglist_item, &sbi->ll_pglist);
628         INIT_LIST_HEAD(&llap->llap_pending_write);
629         spin_unlock(&sbi->ll_lock);
630
631  out:
632         if (unlikely(sbi->ll_flags & LL_SBI_CHECKSUM)) {
633                 __u32 csum = 0;
634                 csum = crc32_le(csum, kmap(page), PAGE_SIZE);
635                 kunmap(page);
636                 if (origin == LLAP_ORIGIN_READAHEAD ||
637                     origin == LLAP_ORIGIN_READPAGE) {
638                         llap->llap_checksum = 0;
639                 } else if (origin == LLAP_ORIGIN_COMMIT_WRITE ||
640                            llap->llap_checksum == 0) {
641                         llap->llap_checksum = csum;
642                         CDEBUG(D_PAGE, "page %p cksum %x\n", page, csum);
643                 } else if (llap->llap_checksum == csum) {
644                         /* origin == LLAP_ORIGIN_WRITEPAGE */
645                         CDEBUG(D_PAGE, "page %p cksum %x confirmed\n",
646                                page, csum);
647                 } else {
648                         /* origin == LLAP_ORIGIN_WRITEPAGE */
649                         LL_CDEBUG_PAGE(D_ERROR, page, "old cksum %x != new "
650                                        "%x!\n", llap->llap_checksum, csum);
651                 }
652         }
653
654         llap->llap_origin = origin;
655         RETURN(llap);
656 }
657
658 static int queue_or_sync_write(struct obd_export *exp, struct inode *inode,
659                                struct ll_async_page *llap,
660                                unsigned to, obd_flag async_flags)
661 {
662         unsigned long size_index = inode->i_size >> PAGE_SHIFT;
663         struct obd_io_group *oig;
664         struct ll_sb_info *sbi = ll_i2sbi(inode);
665         int rc, noquot = llap->llap_ignore_quota ? OBD_BRW_NOQUOTA : 0;
666         ENTRY;
667
668         /* _make_ready only sees llap once we've unlocked the page */
669         llap->llap_write_queued = 1;
670         rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
671                                 llap->llap_cookie, OBD_BRW_WRITE | noquot,
672                                 0, 0, 0, async_flags);
673         if (rc == 0) {
674                 LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n");
675                 GOTO(out, 0);
676         }
677
678         llap->llap_write_queued = 0;
679         /* Do not pass llap here as this is a sync write. */
680         llap_write_pending(inode, NULL);
681         
682         rc = oig_init(&oig);
683         if (rc)
684                 GOTO(out, rc);
685
686         /* make full-page requests if we are not at EOF (bug 4410) */
687         if (to != PAGE_SIZE && llap->llap_page->index < size_index) {
688                 LL_CDEBUG_PAGE(D_PAGE, llap->llap_page,
689                                "sync write before EOF: size_index %lu, to %d\n",
690                                size_index, to);
691                 to = PAGE_SIZE;
692         } else if (to != PAGE_SIZE && llap->llap_page->index == size_index) {
693                 int size_to = inode->i_size & ~PAGE_MASK;
694                 LL_CDEBUG_PAGE(D_PAGE, llap->llap_page,
695                                "sync write at EOF: size_index %lu, to %d/%d\n",
696                                size_index, to, size_to);
697                 if (to < size_to)
698                         to = size_to;
699         }
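        /*
         * Example of the widening above (illustrative numbers, assuming 4 KiB
         * pages and i_size == 10000, so size_index == 2): a 100-byte sync
         * write to page 1 is widened to a full 4096-byte request, while the
         * same write to page 2 (the EOF page) is widened only to
         * i_size % PAGE_SIZE == 1808 bytes.
         */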
700
701         /* compare the checksum once before the page leaves llite */
702         if (unlikely((sbi->ll_flags & LL_SBI_CHECKSUM) &&
703                      llap->llap_checksum != 0)) {
704                 __u32 csum = 0;
705                 struct page *page = llap->llap_page;
706                 csum = crc32_le(csum, kmap(page), PAGE_SIZE);
707                 kunmap(page);
708                 if (llap->llap_checksum == csum) {
709                         CDEBUG(D_PAGE, "page %p cksum %x confirmed\n",
710                                page, csum);
711                 } else {
712                         CERROR("page %p old cksum %x != new cksum %x!\n",
713                                page, llap->llap_checksum, csum);
714                 }
715         }
716
717         rc = obd_queue_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig,
718                                 llap->llap_cookie, OBD_BRW_WRITE | noquot,
719                                 0, to, 0, ASYNC_READY | ASYNC_URGENT |
720                                 ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
721         if (rc)
722                 GOTO(free_oig, rc);
723
724         rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
725         if (rc)
726                 GOTO(free_oig, rc);
727
728         rc = oig_wait(oig);
729
730         if (!rc && async_flags & ASYNC_READY)
731                 unlock_page(llap->llap_page);
732
733         LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "sync write returned %d\n", rc);
734
735 free_oig:
736         oig_release(oig);
737 out:
738         RETURN(rc);
739 }
740
741 /* update our write count to account for i_size increases that may have
742  * happened since we've queued the page for io. */
743
744 /* be careful not to return success without setting the page Uptodate or
745  * the next pass through prepare_write will read in stale data from disk. */
746 int ll_commit_write(struct file *file, struct page *page, unsigned from,
747                     unsigned to)
748 {
749         struct inode *inode = page->mapping->host;
750         struct ll_inode_info *lli = ll_i2info(inode);
751         struct lov_stripe_md *lsm = lli->lli_smd;
752         struct obd_export *exp;
753         struct ll_async_page *llap;
754         loff_t size;
755         int rc = 0;
756         ENTRY;
757
758         SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
759         LASSERT(inode == file->f_dentry->d_inode);
760         LASSERT(PageLocked(page));
761
762         CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n",
763                inode, page, from, to, page->index);
764
765         llap = llap_from_page(page, LLAP_ORIGIN_COMMIT_WRITE);
766         if (IS_ERR(llap))
767                 RETURN(PTR_ERR(llap));
768
769         exp = ll_i2dtexp(inode);
770         if (exp == NULL)
771                 RETURN(-EINVAL);
772
773         llap->llap_ignore_quota = capable(CAP_SYS_RESOURCE);
774
775         /*
776          * queue a write for some time in the future the first time we
777          * dirty the page.
778          *
779          * This is different from what other file systems do: they usually
780          * just mark page (and some of its buffers) dirty and rely on
781          * balance_dirty_pages() to start a write-back. Lustre wants write-back
782          * to be started earlier for the following reasons:
783          *
784          *     (1) with a large number of clients we need to limit the amount
785          *     of cached data on the clients a lot;
786          *
787          *     (2) large compute jobs generally want compute-only then io-only
788          *     and the IO should complete as quickly as possible;
789          *
790          *     (3) IO is batched up to the RPC size and is async until the
791          *     client max cache is hit
792          *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
793          *
794          */
795         if (!PageDirty(page)) {
796                 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
797                                      LPROC_LL_DIRTY_MISSES);
798
799                 rc = queue_or_sync_write(exp, inode, llap, to, 0);
800                 if (rc)
801                         GOTO(out, rc);
802         } else {
803                 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
804                                      LPROC_LL_DIRTY_HITS);
805         }
806
807         /* put the page in the page cache; from now on ll_removepage is
808          * responsible for cleaning up the llap.
809          * only set page dirty when it's queued to be written out */
810         if (llap->llap_write_queued)
811                 set_page_dirty(page);
812
813 out:
814         size = (((obd_off)page->index) << PAGE_SHIFT) + to;
815         ll_inode_size_lock(inode, 0);
816         if (rc == 0) {
817                 lov_stripe_lock(lsm);
818                 obd_adjust_kms(exp, lsm, size, 0);
819                 lov_stripe_unlock(lsm);
820                 if (size > inode->i_size)
821                         inode->i_size = size;
822                 SetPageUptodate(page);
823         } else if (size > inode->i_size) {
824                 /* this page lies beyond i_size, so it can't be
825                  * truncated in ll_p_r_e during lock revoking.  we must
826                  * tear down our book-keeping here. */
827                 ll_removepage(page);
828         }
829         ll_inode_size_unlock(inode, 0);
830         RETURN(rc);
831 }
832
833 static unsigned long ll_ra_count_get(struct ll_sb_info *sbi, unsigned long len)
834 {
835         struct ll_ra_info *ra = &sbi->ll_ra_info;
836         unsigned long ret;
837         ENTRY;
838
839         spin_lock(&sbi->ll_lock);
840         ret = min(ra->ra_max_pages - ra->ra_cur_pages, len);
841         ra->ra_cur_pages += ret;
842         spin_unlock(&sbi->ll_lock);
843
844         RETURN(ret);
845 }
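/*
 * Example (illustrative numbers): with ra_max_pages == 256 and
 * ra_cur_pages == 200, a request to reserve 100 read-ahead pages returns
 * min(256 - 200, 100) == 56; ll_readahead() below accounts for the shortfall
 * via RA_STAT_MAX_IN_FLIGHT.
 */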
846
847 static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
848 {
849         struct ll_ra_info *ra = &sbi->ll_ra_info;
850         spin_lock(&sbi->ll_lock);
851         LASSERTF(ra->ra_cur_pages >= len, "r_c_p %lu len %lu\n",
852                  ra->ra_cur_pages, len);
853         ra->ra_cur_pages -= len;
854         spin_unlock(&sbi->ll_lock);
855 }
856
857 /* called for each page in a completed rpc. */
858 int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
859 {
860         struct ll_async_page *llap;
861         struct page *page;
862         int ret = 0;
863         ENTRY;
864
865         llap = LLAP_FROM_COOKIE(data);
866         page = llap->llap_page;
867         LASSERT(PageLocked(page));
868
869         LL_CDEBUG_PAGE(D_PAGE, page, "completing cmd %d with %d\n", cmd, rc);
870
871         if (cmd & OBD_BRW_READ && llap->llap_defer_uptodate)
872                 ll_ra_count_put(ll_i2sbi(page->mapping->host), 1);
873
874         if (rc == 0)  {
875                 if (cmd & OBD_BRW_READ) {
876                         if (!llap->llap_defer_uptodate)
877                                 SetPageUptodate(page);
878                 } else {
879                         llap->llap_write_queued = 0;
880                 }
881                 ClearPageError(page);
882         } else {
883                 if (cmd & OBD_BRW_READ) {
884                         llap->llap_defer_uptodate = 0;
885                 } else {
886                         ll_redirty_page(page);
887                         ret = 1;
888                 }
889                 SetPageError(page);
890         }
891
892         unlock_page(page);
893
894         if (cmd & OBD_BRW_WRITE) {
895                 if (llap_write_complete(page->mapping->host, llap))
896                         ll_queue_done_writing(page->mapping->host, 0);
897         }
898
899         if (PageWriteback(page)) {
900                 end_page_writeback(page);
901         }
902         page_cache_release(page);
903
904         RETURN(ret);
905 }
906
907 /* the kernel calls us here when a page is unhashed from the page cache.
908  * the page will be locked and the kernel is holding a spinlock, so
909  * we need to be careful.  we're just tearing down our book-keeping
910  * here. */
911 void ll_removepage(struct page *page)
912 {
913         struct inode *inode = page->mapping->host;
914         struct obd_export *exp;
915         struct ll_async_page *llap;
916         struct ll_sb_info *sbi = ll_i2sbi(inode);
917         int rc;
918         ENTRY;
919
920         LASSERT(!in_interrupt());
921
922         /* sync pages or failed read pages can leave pages in the page
923          * cache that don't have our data associated with them anymore */
924         if (page_private(page) == 0) {
925                 EXIT;
926                 return;
927         }
928
929         LL_CDEBUG_PAGE(D_PAGE, page, "being evicted\n");
930
931         exp = ll_i2dtexp(inode);
932         if (exp == NULL) {
933                 CERROR("page %p ind %lu gave null export\n", page, page->index);
934                 EXIT;
935                 return;
936         }
937
938         llap = llap_from_page(page, 0);
939         if (IS_ERR(llap)) {
940                 CERROR("page %p ind %lu couldn't find llap: %ld\n", page,
941                        page->index, PTR_ERR(llap));
942                 EXIT;
943                 return;
944         }
945
946         if (llap_write_complete(inode, llap))
947                 ll_queue_done_writing(inode, 0);
948
949         rc = obd_teardown_async_page(exp, ll_i2info(inode)->lli_smd, NULL,
950                                      llap->llap_cookie);
951         if (rc != 0)
952                 CERROR("page %p ind %lu failed: %d\n", page, page->index, rc);
953
954         /* this unconditional free is only safe because the page lock
955          * is providing exclusivity to memory pressure/truncate/writeback. */
956         __clear_page_ll_data(page);
957
958         spin_lock(&sbi->ll_lock);
959         if (!list_empty(&llap->llap_pglist_item))
960                 list_del_init(&llap->llap_pglist_item);
961         sbi->ll_pglist_gen++;
962         sbi->ll_async_page_count--;
963         spin_unlock(&sbi->ll_lock);
964         OBD_SLAB_FREE(llap, ll_async_page_slab, ll_async_page_slab_size);
965         EXIT;
966 }
967
968 static int ll_page_matches(struct page *page, int fd_flags)
969 {
970         struct lustre_handle match_lockh = {0};
971         struct inode *inode = page->mapping->host;
972         ldlm_policy_data_t page_extent;
973         int flags, matches;
974         ENTRY;
975
976         if (unlikely(fd_flags & LL_FILE_GROUP_LOCKED))
977                 RETURN(1);
978
979         page_extent.l_extent.start = (__u64)page->index << PAGE_CACHE_SHIFT;
980         page_extent.l_extent.end =
981                 page_extent.l_extent.start + PAGE_CACHE_SIZE - 1;
982         flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED;
983         if (!(fd_flags & LL_FILE_READAHEAD))
984                 flags |= LDLM_FL_CBPENDING;
985         matches = obd_match(ll_i2sbi(inode)->ll_dt_exp,
986                             ll_i2info(inode)->lli_smd, LDLM_EXTENT,
987                             &page_extent, LCK_PR | LCK_PW, &flags, inode,
988                             &match_lockh);
989         RETURN(matches);
990 }
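/*
 * Example (illustrative, assuming 4 KiB pages): for page index 3 the extent
 * checked above is [12288, 16383], and the match succeeds only if a granted
 * PR or PW extent lock already covers that whole range.
 */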
991
992 static int ll_issue_page_read(struct obd_export *exp,
993                               struct ll_async_page *llap,
994                               struct obd_io_group *oig, int defer)
995 {
996         struct page *page = llap->llap_page;
997         int rc;
998
999         page_cache_get(page);
1000         llap->llap_defer_uptodate = defer;
1001         llap->llap_ra_used = 0;
1002         rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd,
1003                                 NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0,
1004                                 PAGE_SIZE, 0, ASYNC_COUNT_STABLE | ASYNC_READY |
1005                                               ASYNC_URGENT);
1006         if (rc) {
1007                 LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc);
1008                 page_cache_release(page);
1009         }
1010         RETURN(rc);
1011 }
1012
1013 static void ll_ra_stats_inc_unlocked(struct ll_ra_info *ra, enum ra_stat which)
1014 {
1015         LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
1016         ra->ra_stats[which]++;
1017 }
1018
1019 static void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
1020 {
1021         struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
1022         struct ll_ra_info *ra = &ll_i2sbi(mapping->host)->ll_ra_info;
1023
1024         spin_lock(&sbi->ll_lock);
1025         ll_ra_stats_inc_unlocked(ra, which);
1026         spin_unlock(&sbi->ll_lock);
1027 }
1028
1029 void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping)
1030 {
1031         if (!llap->llap_defer_uptodate || llap->llap_ra_used)
1032                 return;
1033
1034         ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
1035 }
1036
1037 #define RAS_CDEBUG(ras) \
1038         CDEBUG(D_READA,                                                      \
1039                "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu\n", \
1040                ras->ras_last_readpage, ras->ras_consecutive_requests,        \
1041                ras->ras_consecutive_pages, ras->ras_window_start,            \
1042                ras->ras_window_len, ras->ras_next_readahead,                 \
1043                ras->ras_requests, ras->ras_request_index);
1044
1045 static int index_in_window(unsigned long index, unsigned long point,
1046                            unsigned long before, unsigned long after)
1047 {
1048         unsigned long start = point - before, end = point + after;
1049
1050         if (start > point)
1051                start = 0;
1052         if (end < point)
1053                end = ~0;
1054
1055         return start <= index && index <= end;
1056 }
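/*
 * Example (illustrative values): with point == 5 and before == 8,
 * "point - before" would wrap around on an unsigned long, so start is
 * clamped to 0 above; index_in_window(2, 5, 8, 8) is then true
 * (0 <= 2 <= 13) while index_in_window(20, 5, 8, 8) is false.
 */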
1057
1058 static struct ll_readahead_state *ll_ras_get(struct file *f)
1059 {
1060         struct ll_file_data       *fd;
1061
1062         fd = LUSTRE_FPRIVATE(f);
1063         return &fd->fd_ras;
1064 }
1065
1066 void ll_ra_read_in(struct file *f, struct ll_ra_read *rar)
1067 {
1068         struct ll_readahead_state *ras;
1069
1070         ras = ll_ras_get(f);
1071
1072         spin_lock(&ras->ras_lock);
1073         ras->ras_requests++;
1074         ras->ras_request_index = 0;
1075         ras->ras_consecutive_requests++;
1076         rar->lrr_reader = current;
1077
1078         list_add(&rar->lrr_linkage, &ras->ras_read_beads);
1079         spin_unlock(&ras->ras_lock);
1080 }
1081
1082 void ll_ra_read_ex(struct file *f, struct ll_ra_read *rar)
1083 {
1084         struct ll_readahead_state *ras;
1085
1086         ras = ll_ras_get(f);
1087
1088         spin_lock(&ras->ras_lock);
1089         list_del_init(&rar->lrr_linkage);
1090         spin_unlock(&ras->ras_lock);
1091 }
1092
1093 static struct ll_ra_read *ll_ra_read_get_locked(struct ll_readahead_state *ras)
1094 {
1095         struct ll_ra_read *scan;
1096
1097         list_for_each_entry(scan, &ras->ras_read_beads, lrr_linkage) {
1098                 if (scan->lrr_reader == current)
1099                         return scan;
1100         }
1101         return NULL;
1102 }
1103
1104 struct ll_ra_read *ll_ra_read_get(struct file *f)
1105 {
1106         struct ll_readahead_state *ras;
1107         struct ll_ra_read         *bead;
1108
1109         ras = ll_ras_get(f);
1110
1111         spin_lock(&ras->ras_lock);
1112         bead = ll_ra_read_get_locked(ras);
1113         spin_unlock(&ras->ras_lock);
1114         return bead;
1115 }
1116
1117 static int ll_readahead(struct ll_readahead_state *ras,
1118                          struct obd_export *exp, struct address_space *mapping,
1119                          struct obd_io_group *oig, int flags)
1120 {
1121         unsigned long i, start = 0, end = 0, reserved;
1122         struct ll_async_page *llap;
1123         struct page *page;
1124         int rc, ret = 0, match_failed = 0;
1125         __u64 kms;
1126         unsigned int gfp_mask;
1127         struct inode *inode;
1128         struct lov_stripe_md *lsm;
1129         struct ll_ra_read *bead;
1130         struct ost_lvb lvb;
1131         ENTRY;
1132
1133         inode = mapping->host;
1134         lsm = ll_i2info(inode)->lli_smd;
1135
1136         lov_stripe_lock(lsm);
1137         inode_init_lvb(inode, &lvb);
1138         obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
1139         kms = lvb.lvb_size;
1140         lov_stripe_unlock(lsm);
1141         if (kms == 0) {
1142                 ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
1143                 RETURN(0);
1144         }
1145
1146         spin_lock(&ras->ras_lock);
1147         bead = ll_ra_read_get_locked(ras);
1148         /* Enlarge the RA window to encompass the full read */
1149         if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
1150             bead->lrr_start + bead->lrr_count) {
1151                 ras->ras_window_len = bead->lrr_start + bead->lrr_count -
1152                                       ras->ras_window_start;
1153         }
1154         /* Reserve a part of the read-ahead window that we'll be issuing */
1155         if (ras->ras_window_len) {
1156                 start = ras->ras_next_readahead;
1157                 end = ras->ras_window_start + ras->ras_window_len - 1;
1158         }
1159         if (end != 0) {
1160                 /* Truncate RA window to end of file */
1161                 end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
1162                 ras->ras_next_readahead = max(end, end + 1);
1163                 RAS_CDEBUG(ras);
1164         }
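        /*
         * Example (illustrative numbers): with ras_window_start == 256,
         * ras_window_len == 64 and ras_next_readahead == 260, the window
         * reserved above is pages 260..319 (60 pages), further truncated if
         * (kms - 1) >> PAGE_CACHE_SHIFT falls before page 319.
         */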
1165         spin_unlock(&ras->ras_lock);
1166
1167         if (end == 0) {
1168                 ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
1169                 RETURN(0);
1170         }
1171
1172         reserved = ll_ra_count_get(ll_i2sbi(inode), end - start + 1);
1173         if (reserved < end - start + 1)
1174                 ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
1175
1176         gfp_mask = GFP_HIGHUSER & ~__GFP_WAIT;
1177 #ifdef __GFP_NOWARN
1178         gfp_mask |= __GFP_NOWARN;
1179 #endif
1180
1181         for (i = start; reserved > 0 && !match_failed && i <= end; i++) {
1182                 /* skip locked pages from previous readpage calls */
1183                 page = grab_cache_page_nowait_gfp(mapping, i, gfp_mask);
1184                 if (page == NULL) {
1185                         ll_ra_stats_inc(mapping, RA_STAT_FAILED_GRAB_PAGE);
1186                         CDEBUG(D_READA, "g_c_p_n failed\n");
1187                         continue;
1188                 }
1189
1190                 /* Check if page was truncated or reclaimed */
1191                 if (page->mapping != mapping) {
1192                         ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
1193                         CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
1194                         goto next_page;
1195                 }
1196
1197                 /* we do this first so that we can see the page in the /proc
1198                  * accounting */
1199                 llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD);
1200                 if (IS_ERR(llap) || llap->llap_defer_uptodate)
1201                         goto next_page;
1202
1203                 /* skip completed pages */
1204                 if (Page_Uptodate(page))
1205                         goto next_page;
1206
1207                 /* bail when we hit the end of the lock. */
1208                 if ((rc = ll_page_matches(page, flags|LL_FILE_READAHEAD)) <= 0){
1209                         LL_CDEBUG_PAGE(D_READA | D_PAGE, page,
1210                                        "lock match failed: rc %d\n", rc);
1211                         ll_ra_stats_inc(mapping, RA_STAT_FAILED_MATCH);
1212                         match_failed = 1;
1213                         goto next_page;
1214                 }
1215
1216                 rc = ll_issue_page_read(exp, llap, oig, 1);
1217                 if (rc == 0) {
1218                         reserved--;
1219                         ret++;
1220                         LL_CDEBUG_PAGE(D_READA| D_PAGE, page,
1221                                        "started read-ahead\n");
1222                 }
1223                 if (rc) {
1224         next_page:
1225                         LL_CDEBUG_PAGE(D_READA | D_PAGE, page,
1226                                        "skipping read-ahead\n");
1227
1228                         unlock_page(page);
1229                 }
1230                 page_cache_release(page);
1231         }
1232
1233         LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
1234         if (reserved != 0)
1235                 ll_ra_count_put(ll_i2sbi(inode), reserved);
1236         if (i == end + 1 && end == (kms >> PAGE_CACHE_SHIFT))
1237                 ll_ra_stats_inc(mapping, RA_STAT_EOF);
1238
1239         /* if we didn't get to the end of the region we reserved from
1240          * the ras we need to go back and update the ras so that the
1241          * next read-ahead tries from where we left off.  we only do so
1242          * if the region we failed to issue read-ahead on is still ahead
1243          * of the app and behind the next index to start read-ahead from */
1244         if (i != end + 1) {
1245                 spin_lock(&ras->ras_lock);
1246                 if (i < ras->ras_next_readahead &&
1247                     index_in_window(i, ras->ras_window_start, 0,
1248                                     ras->ras_window_len)) {
1249                         ras->ras_next_readahead = i;
1250                         RAS_CDEBUG(ras);
1251                 }
1252                 spin_unlock(&ras->ras_lock);
1253         }
1254
1255         RETURN(ret);
1256 }
1257
1258 static void ras_set_start(struct ll_readahead_state *ras, unsigned long index)
1259 {
1260         ras->ras_window_start = index & (~((1024 * 1024 >> PAGE_SHIFT) - 1));
1261 }
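/*
 * Example (illustrative, assuming 4 KiB pages): 1024 * 1024 >> PAGE_SHIFT is
 * 256, so the window start is rounded down to a 1MB boundary;
 * ras_set_start(ras, 1000) sets ras_window_start to 1000 & ~255 == 768.
 */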
1262
1263 /* called with the ras_lock held or from places where it doesn't matter */
1264 static void ras_reset(struct ll_readahead_state *ras, unsigned long index)
1265 {
1266         ras->ras_last_readpage = index;
1267         ras->ras_consecutive_requests = 0;
1268         ras->ras_consecutive_pages = 0;
1269         ras->ras_window_len = 0;
1270         ras_set_start(ras, index);
1271         ras->ras_next_readahead = max(ras->ras_window_start, index);
1272
1273         RAS_CDEBUG(ras);
1274 }
1275
1276 void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
1277 {
1278         spin_lock_init(&ras->ras_lock);
1279         ras_reset(ras, 0);
1280         ras->ras_requests = 0;
1281         INIT_LIST_HEAD(&ras->ras_read_beads);
1282 }
1283
1284 static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
1285                        struct ll_readahead_state *ras, unsigned long index,
1286                        unsigned hit)
1287 {
1288         struct ll_ra_info *ra = &sbi->ll_ra_info;
1289         int zero = 0;
1290         ENTRY;
1291
1292         spin_lock(&sbi->ll_lock);
1293         spin_lock(&ras->ras_lock);
1294
1295         ll_ra_stats_inc_unlocked(ra, hit ? RA_STAT_HIT : RA_STAT_MISS);
1296
1297         /* reset the read-ahead window in two cases.  First when the app seeks
1298          * or reads to some other part of the file.  Secondly if we get a
1299          * read-ahead miss that we think we've previously issued.  This can
1300          * be a symptom of there being so many read-ahead pages that the VM is
1301          * reclaiming it before we get to it. */
1302         if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) {
1303                 zero = 1;
1304                 ll_ra_stats_inc_unlocked(ra, RA_STAT_DISTANT_READPAGE);
1305         } else if (!hit && ras->ras_window_len &&
1306                    index < ras->ras_next_readahead &&
1307                    index_in_window(index, ras->ras_window_start, 0,
1308                                    ras->ras_window_len)) {
1309                 zero = 1;
1310                 ll_ra_stats_inc_unlocked(ra, RA_STAT_MISS_IN_WINDOW);
1311         }
1312
1313         /* On the second access to a file smaller than the tunable
1314          * ra_max_read_ahead_whole_pages, trigger RA on all pages in the
1315          * file up to ra_max_pages.  This is simply a best effort and
1316          * only occurs once per open file.  Normal RA behavior resumes for
1317          * subsequent IO.  The mmap case does not increment
1318          * ras_requests and thus can never trigger this behavior. */
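        /*
         * Example (illustrative, assuming 4 KiB pages): on the second read
         * of a 256 KiB file, kms_pages == 64; if the tunable
         * ra_max_read_ahead_whole_pages is at least 64, the branch below
         * widens the window to cover the whole file (up to ra_max_pages).
         */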
1319         if (ras->ras_requests == 2 && !ras->ras_request_index) {
1320                 __u64 kms_pages;
1321
1322                 kms_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1323
1324                 CDEBUG(D_READA, "kmsp "LPU64" mwp %lu mp %lu\n", kms_pages,
1325                        ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages);
1326
1327                 if (kms_pages &&
1328                     kms_pages <= ra->ra_max_read_ahead_whole_pages) {
1329                         ras->ras_window_start = 0;
1330                         ras->ras_last_readpage = 0;
1331                         ras->ras_next_readahead = 0;
1332                         ras->ras_window_len = min(ra->ra_max_pages,
1333                                 ra->ra_max_read_ahead_whole_pages);
1334                         GOTO(out_unlock, 0);
1335                 }
1336         }
1337
1338         if (zero) {
1339                 ras_reset(ras, index);
1340                 GOTO(out_unlock, 0);
1341         }
1342
1343         ras->ras_last_readpage = index;
1344         ras->ras_consecutive_pages++;
1345         ras_set_start(ras, index);
1346         ras->ras_next_readahead = max(ras->ras_window_start,
1347                                       ras->ras_next_readahead);
1348
1349         /* Trigger RA in the mmap case where ras_consecutive_requests
1350          * is not incremented and thus can't be used to trigger RA */
1351         if (!ras->ras_window_len && ras->ras_consecutive_pages == 3) {
1352                 ras->ras_window_len = 1024 * 1024 >> PAGE_SHIFT;
1353                 GOTO(out_unlock, 0);
1354         }
1355
1356         /* The initial ras_window_len is set to the request size.  To avoid
1357          * uselessly reading and discarding pages for random IO the window is
1358          * only increased once per consecutive request received. */
1359         if (ras->ras_consecutive_requests > 1 && !ras->ras_request_index) {
1360                 ras->ras_window_len = min(ras->ras_window_len +
1361                                           (1024 * 1024 >> PAGE_SHIFT),
1362                                           ra->ra_max_pages);
1363         }
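        /*
         * Example of the growth above (illustrative, assuming 4 KiB pages):
         * 1024 * 1024 >> PAGE_SHIFT == 256, so each additional consecutive
         * request widens the window by 256 pages (1MB), capped at
         * ra_max_pages.
         */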
1364
1365         EXIT;
1366 out_unlock:
1367         RAS_CDEBUG(ras);
1368         ras->ras_request_index++;
1369         spin_unlock(&ras->ras_lock);
1370         spin_unlock(&sbi->ll_lock);
1371         return;
1372 }
1373
1374 int ll_writepage(struct page *page)
1375 {
1376         struct inode *inode = page->mapping->host;
1377         struct ll_inode_info *lli = ll_i2info(inode);
1378         struct obd_export *exp;
1379         struct ll_async_page *llap;
1380         int rc = 0;
1381         ENTRY;
1382
1383         LASSERT(!PageDirty(page));
1384         LASSERT(PageLocked(page));
1385
1386         exp = ll_i2dtexp(inode);
1387         if (exp == NULL)
1388                 GOTO(out, rc = -EINVAL);
1389
1390         llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
1391         if (IS_ERR(llap))
1392                 GOTO(out, rc = PTR_ERR(llap));
1393
1394         page_cache_get(page);
1395         if (llap->llap_write_queued) {
1396                 LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
1397                 rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
1398                                          llap->llap_cookie,
1399                                          ASYNC_READY | ASYNC_URGENT);
1400         } else {
1401                 rc = queue_or_sync_write(exp, inode, llap, PAGE_SIZE,
1402                                          ASYNC_READY | ASYNC_URGENT);
1403         }
1404         if (rc)
1405                 page_cache_release(page);
1406 out:
1407         if (rc) {
1408                 if (!lli->lli_async_rc)
1409                         lli->lli_async_rc = rc;
1410                 /* re-dirty page on error so it retries write */
1411                 ll_redirty_page(page);
1412                 unlock_page(page);
1413         }
1414         RETURN(rc);
1415 }
1416
1417 /*
1418  * for now we do our readpage the same on both 2.4 and 2.5.  The kernel's
1419  * read-ahead assumes it is valid to issue readpage all the way up to
1420  * i_size, but our dlm locks make that not the case.  We disable the
1421  * kernel's read-ahead and do our own by walking ahead in the page cache
1422  * checking for dlm lock coverage.  the main difference between 2.4 and
1423  * 2.6 is how read-ahead gets batched and issued, but we're using our own,
1424  * so they look the same.
1425  */
1426 int ll_readpage(struct file *filp, struct page *page)
1427 {
1428         struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
1429         struct inode *inode = page->mapping->host;
1430         struct obd_export *exp;
1431         struct ll_async_page *llap;
1432         struct obd_io_group *oig = NULL;
1433         int rc;
1434         ENTRY;
1435
1436         LASSERT(PageLocked(page));
1437         LASSERT(!PageUptodate(page));
1438         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),offset=%Lu=%#Lx\n",
1439                inode->i_ino, inode->i_generation, inode,
1440                (((loff_t)page->index) << PAGE_SHIFT),
1441                (((loff_t)page->index) << PAGE_SHIFT));
1442         LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0);
1443
1444         if (!ll_i2info(inode)->lli_smd) {
1445                 /* File with no objects - one big hole */
1446                 /* We use this only because remove_from_page_cache is not
1447                  * exported; the page is brought back up to date below. */
1448                 ll_truncate_complete_page(page);
1449                 clear_page(page);
1450                 SetPageUptodate(page);
1451                 RETURN(0);
1452         }
1453
1454         rc = oig_init(&oig);
1455         if (rc < 0)
1456                 GOTO(out, rc);
1457
1458         exp = ll_i2dtexp(inode);
1459         if (exp == NULL)
1460                 GOTO(out, rc = -EINVAL);
1461
1462         llap = llap_from_page(page, LLAP_ORIGIN_READPAGE);
1463         if (IS_ERR(llap))
1464                 GOTO(out, rc = PTR_ERR(llap));
1465
1466         if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
1467                 ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
1468                            llap->llap_defer_uptodate);
1469
1470         if (llap->llap_defer_uptodate) {
1471                 llap->llap_ra_used = 1;
1472                 rc = ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
1473                                   fd->fd_flags);
1474                 if (rc > 0)
1475                         obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd,
1476                                              NULL, oig);
1477                 LL_CDEBUG_PAGE(D_PAGE, page, "marking uptodate from defer\n");
1478                 SetPageUptodate(page);
1479                 unlock_page(page);
1480                 GOTO(out_oig, rc = 0);
1481         }
1482
1483         if (likely((fd->fd_flags & LL_FILE_IGNORE_LOCK) == 0)) {
1484                 rc = ll_page_matches(page, fd->fd_flags);
1485                 if (rc < 0) {
1486                         LL_CDEBUG_PAGE(D_ERROR, page, "lock match failed: rc %d\n", rc);
1487                         GOTO(out, rc);
1488                 }
1489
1490                 if (rc == 0) {
1491                         CWARN("ino %lu page %lu (%llu) not covered by "
1492                               "a lock (mmap?).  check debug logs.\n",
1493                               inode->i_ino, page->index,
1494                               (long long)page->index << PAGE_CACHE_SHIFT);
1495                 }
1496         }
1497
1498         rc = ll_issue_page_read(exp, llap, oig, 0);
1499         if (rc)
1500                 GOTO(out, rc);
1501
1502         LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
1503         if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
1504                 ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
1505                              fd->fd_flags);
1506
1507         rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
1508
1509 out:
1510         if (rc)
1511                 unlock_page(page);
1512 out_oig:
1513         if (oig != NULL)
1514                 oig_release(oig);
1515         RETURN(rc);
1516 }