lustre/obdfilter/filter_io.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  * GPL HEADER START
   5  *
   6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License version 2 only,
  10  * as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * General Public License version 2 for more details (a copy is included
  16  * in the LICENSE file that accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * version 2 along with this program; If not, see
  20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  21  *
  22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  23  * CA 95054 USA or visit www.sun.com if you need additional information or
  24  * have any questions.
  25  *
  26  * GPL HEADER END
  27  */
  28 /*
  29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
  30  * Use is subject to license terms.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lustre/obdfilter/filter_io.c
  37  *
  38  * Author: Peter Braam <braam@clusterfs.com>
  39  * Author: Andreas Dilger <adilger@clusterfs.com>
  40  * Author: Phil Schwan <phil@clusterfs.com>
  41  */
  42
  43 #define DEBUG_SUBSYSTEM S_FILTER
  44
  45 #ifndef AUTOCONF_INCLUDED
  46 #include <linux/config.h>
  47 #endif
  48 #include <linux/module.h>
  49 #include <linux/pagemap.h> // XXX kill me soon
  50 #include <linux/version.h>
  51
  52 #include <obd_class.h>
  53 #include <obd_ost.h>
  54 #include <lustre_fsfilt.h>
  55 #include "filter_internal.h"
  56
  57 int *obdfilter_created_scratchpad;
  58
  59 static int filter_alloc_dio_page(struct obd_device *obd, struct inode *inode,
  60                                  struct niobuf_local *lnb)
  61 {
  62         struct page *page;
  63
  64         LASSERT(lnb->page != NULL);
  65
  66         page = lnb->page;
  67 #if 0
  68         POISON_PAGE(page, 0xf1);
  69         if (lnb->len != CFS_PAGE_SIZE) {
  70                 memset(kmap(page) + lnb->len, 0, CFS_PAGE_SIZE - lnb->len);
  71                 kunmap(page);
  72         }
  73 #endif
  74         page->index = lnb->offset >> CFS_PAGE_SHIFT;
  75
  76         RETURN(0);
  77 }
  78
  79 static void filter_free_dio_pages(int objcount, struct obd_ioobj *obj,
  80                            int niocount, struct niobuf_local *res)
  81 {
  82         int i, j;
  83
  84         for (i = 0; i < objcount; i++, obj++) {
  85                 for (j = 0 ; j < obj->ioo_bufcnt ; j++, res++)
  86                                 res->page = NULL;
  87         }
  88 }
  89
  90 /* Grab the dirty and seen grant announcements from the incoming obdo.
  91  * We will later calculate the clients new grant and return it.
  92  * Caller must hold osfs lock */
  93 static void filter_grant_incoming(struct obd_export *exp, struct obdo *oa)
  94 {
  95         struct filter_export_data *fed;
  96         struct obd_device *obd = exp->exp_obd;
  97         ENTRY;
  98
  99         LASSERT_SPIN_LOCKED(&obd->obd_osfs_lock);
 100
 101         if ((oa->o_valid & (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) !=
 102                                         (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) {
 103                 oa->o_valid &= ~OBD_MD_FLGRANT;
 104                 EXIT;
 105                 return;
 106         }
 107
 108         fed = &exp->exp_filter_data;
 109
 110         /* Add some margin, since there is a small race if other RPCs arrive
 111          * out-or-order and have already consumed some grant.  We want to
 112          * leave this here in case there is a large error in accounting. */
 113         CDEBUG(D_CACHE,
 114                "%s: cli %s/%p reports grant: "LPU64" dropped: %u, local: %lu\n",
 115                obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_grant,
 116                oa->o_dropped, fed->fed_grant);
 117
 118         /* Update our accounting now so that statfs takes it into account.
 119          * Note that fed_dirty is only approximate and can become incorrect
 120          * if RPCs arrive out-of-order.  No important calculations depend
 121          * on fed_dirty however, but we must check sanity to not assert. */
 122         if ((long long)oa->o_dirty < 0)
 123                 oa->o_dirty = 0;
 124         else if (oa->o_dirty > fed->fed_grant + 4 * FILTER_GRANT_CHUNK)
 125                 oa->o_dirty = fed->fed_grant + 4 * FILTER_GRANT_CHUNK;
 126         obd->u.filter.fo_tot_dirty += oa->o_dirty - fed->fed_dirty;
 127         if (fed->fed_grant < oa->o_dropped) {
 128                 CDEBUG(D_CACHE,"%s: cli %s/%p reports %u dropped > grant %lu\n",
 129                        obd->obd_name, exp->exp_client_uuid.uuid, exp,
 130                        oa->o_dropped, fed->fed_grant);
 131                 oa->o_dropped = 0;
 132         }
 133         if (obd->u.filter.fo_tot_granted < oa->o_dropped) {
 134                 CERROR("%s: cli %s/%p reports %u dropped > tot_grant "LPU64"\n",
 135                        obd->obd_name, exp->exp_client_uuid.uuid, exp,
 136                        oa->o_dropped, obd->u.filter.fo_tot_granted);
 137                 oa->o_dropped = 0;
 138         }
 139         obd->u.filter.fo_tot_granted -= oa->o_dropped;
 140         fed->fed_grant -= oa->o_dropped;
 141         fed->fed_dirty = oa->o_dirty;
 142         if (fed->fed_dirty < 0 || fed->fed_grant < 0 || fed->fed_pending < 0) {
 143                 CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
 144                        obd->obd_name, exp->exp_client_uuid.uuid, exp,
 145                        fed->fed_dirty, fed->fed_pending, fed->fed_grant);
 146                 spin_unlock(&obd->obd_osfs_lock);
 147                 LBUG();
 148         }
 149         EXIT;
 150 }
 151
 152 /* Figure out how much space is available between what we've granted
 153  * and what remains in the filesystem.  Compensate for ext3 indirect
 154  * block overhead when computing how much free space is left ungranted.
 155  *
 156  * Caller must hold obd_osfs_lock. */
 157 obd_size filter_grant_space_left(struct obd_export *exp)
 158 {
 159         struct obd_device *obd = exp->exp_obd;
 160         int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
 161         obd_size tot_granted = obd->u.filter.fo_tot_granted, avail, left = 0;
 162         int rc, statfs_done = 0;
 163
 164         LASSERT_SPIN_LOCKED(&obd->obd_osfs_lock);
 165
 166         if (cfs_time_before_64(obd->obd_osfs_age, cfs_time_current_64() - HZ)) {
 167 restat:
 168                 rc = fsfilt_statfs(obd, obd->u.obt.obt_sb,
 169                                    cfs_time_current_64() + HZ);
 170                 if (rc) /* N.B. statfs can't really fail */
 171                         RETURN(0);
 172                 statfs_done = 1;
 173         }
 174
 175         avail = obd->obd_osfs.os_bavail;
 176         left = avail - (avail >> (blockbits - 3)); /* (d)indirect */
 177         if (left > GRANT_FOR_LLOG(obd)) {
 178                 left = (left - GRANT_FOR_LLOG(obd)) << blockbits;
 179         } else {
 180                 left = 0 /* << blockbits */;
 181         }
 182
 183         if (!statfs_done && left < 32 * FILTER_GRANT_CHUNK + tot_granted) {
 184                 CDEBUG(D_CACHE, "fs has no space left and statfs too old\n");
 185                 goto restat;
 186         }
 187
 188         if (left >= tot_granted) {
 189                 left -= tot_granted;
 190         } else {
 191                 if (left < tot_granted - obd->u.filter.fo_tot_pending) {
 192                         CERROR("%s: cli %s/%p grant "LPU64" > available "
 193                                LPU64" and pending "LPU64"\n", obd->obd_name,
 194                                exp->exp_client_uuid.uuid, exp, tot_granted,
 195                                left, obd->u.filter.fo_tot_pending);
 196                 }
 197                 left = 0;
 198         }
 199
 200         CDEBUG(D_CACHE, "%s: cli %s/%p free: "LPU64" avail: "LPU64" grant "LPU64
 201                " left: "LPU64" pending: "LPU64"\n", obd->obd_name,
 202                exp->exp_client_uuid.uuid, exp,
 203                obd->obd_osfs.os_bfree << blockbits, avail << blockbits,
 204                tot_granted, left, obd->u.filter.fo_tot_pending);
 205
 206         return left;
 207 }
 208
 209 /* Calculate how much grant space to allocate to this client, based on how
 210  * much space is currently free and how much of that is already granted.
 211  *
 212  * Caller must hold obd_osfs_lock. */
 213 long filter_grant(struct obd_export *exp, obd_size current_grant,
 214                   obd_size want, obd_size fs_space_left)
 215 {
 216         struct obd_device *obd = exp->exp_obd;
 217         struct filter_export_data *fed = &exp->exp_filter_data;
 218         int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
 219         __u64 grant = 0;
 220
 221         LASSERT_SPIN_LOCKED(&obd->obd_osfs_lock);
 222
 223         /* Grant some fraction of the client's requested grant space so that
 224          * they are not always waiting for write credits (not all of it to
 225          * avoid overgranting in face of multiple RPCs in flight).  This
 226          * essentially will be able to control the OSC_MAX_RIF for a client.
 227          *
 228          * If we do have a large disparity between what the client thinks it
 229          * has and what we think it has, don't grant very much and let the
 230          * client consume its grant first.  Either it just has lots of RPCs
 231          * in flight, or it was evicted and its grants will soon be used up. */
 232         if (want > 0x7fffffff) {
 233                 CERROR("%s: client %s/%p requesting > 2GB grant "LPU64"\n",
 234                        obd->obd_name, exp->exp_client_uuid.uuid, exp, want);
 235         } else if (current_grant < want &&
 236                    current_grant < fed->fed_grant + FILTER_GRANT_CHUNK) {
 237                 grant = min((want >> blockbits),
 238                             (fs_space_left >> blockbits) / 8);
 239                 grant <<= blockbits;
 240
 241                 if (grant) {
 242                         /* Allow >FILTER_GRANT_CHUNK size when clients
 243                          * reconnect due to a server reboot.
 244                          */
 245                         if ((grant > FILTER_GRANT_CHUNK) &&
 246                             (!obd->obd_recovering))
 247                                 grant = FILTER_GRANT_CHUNK;
 248
 249                         obd->u.filter.fo_tot_granted += grant;
 250                         fed->fed_grant += grant;
 251                         if (fed->fed_grant < 0) {
 252                                 CERROR("%s: cli %s/%p grant %ld want "LPU64
 253                                        "current"LPU64"\n",
 254                                        obd->obd_name, exp->exp_client_uuid.uuid,
 255                                        exp, fed->fed_grant, want,current_grant);
 256                                 spin_unlock(&obd->obd_osfs_lock);
 257                                 LBUG();
 258                         }
 259                 }
 260         }
 261
 262         CDEBUG(D_CACHE,
 263                "%s: cli %s/%p wants: "LPU64" current grant "LPU64
 264                " granting: "LPU64"\n", obd->obd_name, exp->exp_client_uuid.uuid,
 265                exp, want, current_grant, grant);
 266         CDEBUG(D_CACHE,
 267                "%s: cli %s/%p tot cached:"LPU64" granted:"LPU64
 268                " num_exports: %d\n", obd->obd_name, exp->exp_client_uuid.uuid,
 269                exp, obd->u.filter.fo_tot_dirty,
 270                obd->u.filter.fo_tot_granted, obd->obd_num_exports);
 271
 272         return grant;
 273 }
 274
 275 static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
 276                               int objcount, struct obd_ioobj *obj,
 277                               int niocount, struct niobuf_remote *nb,
 278                               struct niobuf_local *res,
 279                               struct obd_trans_info *oti,
 280                               struct lustre_capa *capa)
 281 {
 282         struct obd_device *obd = exp->exp_obd;
 283         struct lvfs_run_ctxt saved;
 284         struct niobuf_remote *rnb;
 285         struct niobuf_local *lnb;
 286         struct dentry *dentry = NULL;
 287         struct inode *inode;
 288         void *iobuf = NULL;
 289         int rc = 0, i, tot_bytes = 0;
 290         unsigned long now = jiffies;
 291         ENTRY;
 292
 293         /* We are currently not supporting multi-obj BRW_READ RPCS at all.
 294          * When we do this function's dentry cleanup will need to be fixed.
 295          * These values are verified in ost_brw_write() from the wire. */
 296         LASSERTF(objcount == 1, "%d\n", objcount);
 297         LASSERTF(obj->ioo_bufcnt > 0, "%d\n", obj->ioo_bufcnt);
 298
 299         rc = filter_auth_capa(exp, NULL, obdo_mdsno(oa), capa,
 300                               CAPA_OPC_OSS_READ);
 301         if (rc)
 302                 RETURN(rc);
 303
 304         if (oa && oa->o_valid & OBD_MD_FLGRANT) {
 305                 spin_lock(&obd->obd_osfs_lock);
 306                 filter_grant_incoming(exp, oa);
 307
 308                 oa->o_grant = 0;
 309                 spin_unlock(&obd->obd_osfs_lock);
 310         }
 311
 312         iobuf = filter_iobuf_get(&obd->u.filter, oti);
 313         if (IS_ERR(iobuf))
 314                 RETURN(PTR_ERR(iobuf));
 315
 316         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 317         dentry = filter_oa2dentry(obd, oa);
 318         if (IS_ERR(dentry)) {
 319                 rc = PTR_ERR(dentry);
 320                 dentry = NULL;
 321                 GOTO(cleanup, rc);
 322         }
 323
 324         inode = dentry->d_inode;
 325
 326         obdo_to_inode(inode, oa, OBD_MD_FLATIME);
 327         fsfilt_check_slow(obd, now, "preprw_read setup");
 328
 329         for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt;
 330              i++, rnb++, lnb++) {
 331                 lnb->dentry = dentry;
 332                 lnb->offset = rnb->offset;
 333                 lnb->len    = rnb->len;
 334                 lnb->flags  = rnb->flags;
 335
 336                 /*
 337                  * ost_brw_write()->ost_nio_pages_get() already initialized
 338                  * lnb->page to point to the page from the per-thread page
 339                  * pool (bug 5137), initialize page.
 340                  */
 341                 LASSERT(lnb->page != NULL);
 342
 343                 if (i_size_read(inode) <= rnb->offset)
 344                         /* If there's no more data, abort early.  lnb->rc == 0,
 345                          * so it's easy to detect later. */
 346                         break;
 347                 else
 348                         filter_alloc_dio_page(obd, inode, lnb);
 349
 350                 if (i_size_read(inode) < lnb->offset + lnb->len - 1)
 351                         lnb->rc = i_size_read(inode) - lnb->offset;
 352                 else
 353                         lnb->rc = lnb->len;
 354
 355                 tot_bytes += lnb->rc;
 356
 357                 filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
 358         }
 359
 360         fsfilt_check_slow(obd, now, "start_page_read");
 361
 362         rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf,
 363                               exp, NULL, NULL, NULL);
 364         if (rc)
 365                 GOTO(cleanup, rc);
 366
 367         lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_READ_BYTES, tot_bytes);
 368
 369         if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats)
 370                 lprocfs_counter_add(exp->exp_nid_stats->nid_stats,
 371                                     LPROC_FILTER_READ_BYTES, tot_bytes);
 372
 373         EXIT;
 374
 375  cleanup:
 376         if (rc != 0) {
 377                 filter_free_dio_pages(objcount, obj, niocount, res);
 378
 379                 if (dentry != NULL)
 380                         f_dput(dentry);
 381         }
 382
 383         filter_iobuf_put(&obd->u.filter, iobuf, oti);
 384
 385         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 386         if (rc)
 387                 CERROR("io error %d\n", rc);
 388
 389         return rc;
 390 }
 391
 392 /* When clients have dirtied as much space as they've been granted they
 393  * fall through to sync writes.  These sync writes haven't been expressed
 394  * in grants and need to error with ENOSPC when there isn't room in the
 395  * filesystem for them after grants are taken into account.  However,
 396  * writeback of the dirty data that was already granted space can write
 397  * right on through.
 398  *
 399  * Caller must hold obd_osfs_lock. */
 400 static int filter_grant_check(struct obd_export *exp, struct obdo *oa,
 401                               int objcount, struct fsfilt_objinfo *fso,
 402                               int niocount, struct niobuf_remote *rnb,
 403                               struct niobuf_local *lnb, obd_size *left,
 404                               struct inode *inode)
 405 {
 406         struct filter_export_data *fed = &exp->exp_filter_data;
 407         int blocksize = exp->exp_obd->u.obt.obt_sb->s_blocksize;
 408         unsigned long used = 0, ungranted = 0, using;
 409         int i, rc = -ENOSPC, obj, n = 0;
 410
 411         LASSERT_SPIN_LOCKED(&exp->exp_obd->obd_osfs_lock);
 412
 413         for (obj = 0; obj < objcount; obj++) {
 414                 for (i = 0; i < fso[obj].fso_bufcnt; i++, n++) {
 415                         int tmp, bytes;
 416
 417                         /* should match the code in osc_exit_cache */
 418                         bytes = rnb[n].len;
 419                         bytes += rnb[n].offset & (blocksize - 1);
 420                         tmp = (rnb[n].offset + rnb[n].len) & (blocksize - 1);
 421                         if (tmp)
 422                                 bytes += blocksize - tmp;
 423
 424                         if ((rnb[n].flags & OBD_BRW_FROM_GRANT) &&
 425                             (oa->o_valid & OBD_MD_FLGRANT)) {
 426                                 if (fed->fed_grant < used + bytes) {
 427                                         CDEBUG(D_CACHE,
 428                                                "%s: cli %s/%p claims %ld+%d "
 429                                                "GRANT, real grant %lu idx %d\n",
 430                                                exp->exp_obd->obd_name,
 431                                                exp->exp_client_uuid.uuid, exp,
 432                                                used, bytes, fed->fed_grant, n);
 433                                 } else {
 434                                         used += bytes;
 435                                         rnb[n].flags |= OBD_BRW_GRANTED;
 436                                         lnb[n].lnb_grant_used = bytes;
 437                                         CDEBUG(0, "idx %d used=%lu\n", n, used);
 438                                         rc = 0;
 439                                         continue;
 440                                 }
 441                         }
 442                         if (*left > ungranted + bytes) {
 443                                 /* if enough space, pretend it was granted */
 444                                 ungranted += bytes;
 445                                 rnb[n].flags |= OBD_BRW_GRANTED;
 446                                 lnb[n].lnb_grant_used = bytes;
 447                                 CDEBUG(0, "idx %d ungranted=%lu\n",n,ungranted);
 448                                 rc = 0;
 449                                 continue;
 450                         }
 451
 452                         /* We can't check for already-mapped blocks here, as
 453                          * it requires dropping the osfs lock to do the bmap.
 454                          * Instead, we return ENOSPC and in that case we need
 455                          * to go through and verify if all of the blocks not
 456                          * marked BRW_GRANTED are already mapped and we can
 457                          * ignore this error. */
 458                         lnb[n].rc = -ENOSPC;
 459                         rnb[n].flags &= ~OBD_BRW_GRANTED;
 460                         CDEBUG(D_CACHE,"%s: cli %s/%p idx %d no space for %d\n",
 461                                exp->exp_obd->obd_name,
 462                                exp->exp_client_uuid.uuid, exp, n, bytes);
 463                 }
 464         }
 465
 466         /* Now substract what client have used already.  We don't subtract
 467          * this from the tot_granted yet, so that other client's can't grab
 468          * that space before we have actually allocated our blocks.  That
 469          * happens in filter_grant_commit() after the writes are done. */
 470         *left -= ungranted;
 471         fed->fed_grant -= used;
 472         fed->fed_pending += used + ungranted;
 473         exp->exp_obd->u.filter.fo_tot_granted += ungranted;
 474         exp->exp_obd->u.filter.fo_tot_pending += used + ungranted;
 475
 476         CDEBUG(D_CACHE,
 477                "%s: cli %s/%p used: %lu ungranted: %lu grant: %lu dirty: %lu\n",
 478                exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, used,
 479                ungranted, fed->fed_grant, fed->fed_dirty);
 480
 481         /* Rough calc in case we don't refresh cached statfs data */
 482         using = (used + ungranted + 1 ) >>
 483                 exp->exp_obd->u.obt.obt_sb->s_blocksize_bits;
 484         if (exp->exp_obd->obd_osfs.os_bavail > using)
 485                 exp->exp_obd->obd_osfs.os_bavail -= using;
 486         else
 487                 exp->exp_obd->obd_osfs.os_bavail = 0;
 488
 489         if (fed->fed_dirty < used) {
 490                 CERROR("%s: cli %s/%p claims used %lu > fed_dirty %lu\n",
 491                        exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 492                        used, fed->fed_dirty);
 493                 used = fed->fed_dirty;
 494         }
 495         exp->exp_obd->u.filter.fo_tot_dirty -= used;
 496         fed->fed_dirty -= used;
 497
 498         if (fed->fed_dirty < 0 || fed->fed_grant < 0 || fed->fed_pending < 0) {
 499                 CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
 500                        exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 501                        fed->fed_dirty, fed->fed_pending, fed->fed_grant);
 502                 spin_unlock(&exp->exp_obd->obd_osfs_lock);
 503                 LBUG();
 504         }
 505         return rc;
 506 }
 507
 508 /* If we ever start to support multi-object BRW RPCs, we will need to get locks
 509  * on mulitple inodes.  That isn't all, because there still exists the
 510  * possibility of a truncate starting a new transaction while holding the ext3
 511  * rwsem = write while some writes (which have started their transactions here)
 512  * blocking on the ext3 rwsem = read => lock inversion.
 513  *
 514  * The handling gets very ugly when dealing with locked pages.  It may be easier
 515  * to just get rid of the locked page code (which has problems of its own) and
 516  * either discover we do not need it anymore (i.e. it was a symptom of another
 517  * bug) or ensure we get the page locks in an appropriate order. */
 518 static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
 519                                int objcount, struct obd_ioobj *obj,
 520                                int niocount, struct niobuf_remote *nb,
 521                                struct niobuf_local *res,
 522                                struct obd_trans_info *oti,
 523                                struct lustre_capa *capa)
 524 {
 525         struct lvfs_run_ctxt saved;
 526         struct niobuf_remote *rnb;
 527         struct niobuf_local *lnb = res;
 528         struct fsfilt_objinfo fso;
 529         struct filter_mod_data *fmd;
 530         struct dentry *dentry = NULL;
 531         void *iobuf;
 532         obd_size left;
 533         unsigned long now = jiffies;
 534         int rc = 0, i, tot_bytes = 0, cleanup_phase = 0;
 535         ENTRY;
 536         LASSERT(objcount == 1);
 537         LASSERT(obj->ioo_bufcnt > 0);
 538
 539         rc = filter_auth_capa(exp, NULL, obdo_mdsno(oa), capa,
 540                               CAPA_OPC_OSS_WRITE);
 541         if (rc)
 542                 RETURN(rc);
 543
 544         push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
 545         iobuf = filter_iobuf_get(&exp->exp_obd->u.filter, oti);
 546         if (IS_ERR(iobuf))
 547                 GOTO(cleanup, rc = PTR_ERR(iobuf));
 548         cleanup_phase = 1;
 549
 550         dentry = filter_fid2dentry(exp->exp_obd, NULL, obj->ioo_gr,
 551                                    obj->ioo_id);
 552         if (IS_ERR(dentry))
 553                 GOTO(cleanup, rc = PTR_ERR(dentry));
 554         cleanup_phase = 2;
 555
 556         if (dentry->d_inode == NULL) {
 557                 CERROR("%s: trying to BRW to non-existent file "LPU64"\n",
 558                        exp->exp_obd->obd_name, obj->ioo_id);
 559                 GOTO(cleanup, rc = -ENOENT);
 560         }
 561
 562         fso.fso_dentry = dentry;
 563         fso.fso_bufcnt = obj->ioo_bufcnt;
 564
 565         fsfilt_check_slow(exp->exp_obd, now, "preprw_write setup");
 566
 567         /* Don't update inode timestamps if this write is older than a
 568          * setattr which modifies the timestamps. b=10150 */
 569         /* XXX when we start having persistent reservations this needs to
 570          * be changed to filter_fmd_get() to create the fmd if it doesn't
 571          * already exist so we can store the reservation handle there. */
 572         fmd = filter_fmd_find(exp, obj->ioo_id, obj->ioo_gr);
 573
 574         LASSERT(oa != NULL);
 575         spin_lock(&exp->exp_obd->obd_osfs_lock);
 576         filter_grant_incoming(exp, oa);
 577         if (fmd && fmd->fmd_mactime_xid > oti->oti_xid)
 578                 oa->o_valid &= ~(OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 579                                  OBD_MD_FLATIME);
 580         else
 581                 obdo_to_inode(dentry->d_inode, oa, OBD_MD_FLATIME |
 582                               OBD_MD_FLMTIME | OBD_MD_FLCTIME);
 583         cleanup_phase = 3;
 584
 585         left = filter_grant_space_left(exp);
 586
 587         rc = filter_grant_check(exp, oa, objcount, &fso, niocount, nb, res,
 588                                 &left, dentry->d_inode);
 589
 590         /* do not zero out oa->o_valid as it is used in filter_commitrw_write()
 591          * for setting UID/GID and fid EA in first write time. */
 592         if (oa->o_valid & OBD_MD_FLGRANT)
 593                 oa->o_grant = filter_grant(exp,oa->o_grant,oa->o_undirty,left);
 594
 595         spin_unlock(&exp->exp_obd->obd_osfs_lock);
 596         filter_fmd_put(exp, fmd);
 597
 598         if (rc)
 599                 GOTO(cleanup, rc);
 600
 601         for (i = 0, rnb = nb, lnb = res; i < obj->ioo_bufcnt;
 602              i++, lnb++, rnb++) {
 603                 /* We still set up for ungranted pages so that granted pages
 604                  * can be written to disk as they were promised, and portals
 605                  * needs to keep the pages all aligned properly. */
 606                 lnb->dentry = dentry;
 607                 lnb->offset = rnb->offset;
 608                 lnb->len    = rnb->len;
 609                 lnb->flags  = rnb->flags;
 610
 611                 /*
 612                  * ost_brw_write()->ost_nio_pages_get() already initialized
 613                  * lnb->page to point to the page from the per-thread page
 614                  * pool (bug 5137), initialize page.
 615                  */
 616                 LASSERT(lnb->page != NULL);
 617                 if (lnb->len != CFS_PAGE_SIZE) {
 618                         memset(kmap(lnb->page) + lnb->len,
 619                                0, CFS_PAGE_SIZE - lnb->len);
 620                         kunmap(lnb->page);
 621                 }
 622                 lnb->page->index = lnb->offset >> CFS_PAGE_SHIFT;
 623
 624                 cleanup_phase = 4;
 625
 626                 /* If the filter writes a partial page, then has the file
 627                  * extended, the client will read in the whole page.  the
 628                  * filter has to be careful to zero the rest of the partial
 629                  * page on disk.  we do it by hand for partial extending
 630                  * writes, send_bio() is responsible for zeroing pages when
 631                  * asked to read unmapped blocks -- brw_kiovec() does this. */
 632                 if (lnb->len != CFS_PAGE_SIZE) {
 633                         __s64 maxidx;
 634
 635                         maxidx = ((i_size_read(dentry->d_inode) +
 636                                    CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT) - 1;
 637                         if (maxidx >= lnb->page->index) {
 638                                 LL_CDEBUG_PAGE(D_PAGE, lnb->page, "write %u @ "
 639                                                LPU64" flg %x before EOF %llu\n",
 640                                                lnb->len, lnb->offset,lnb->flags,
 641                                                i_size_read(dentry->d_inode));
 642                                 filter_iobuf_add_page(exp->exp_obd, iobuf,
 643                                                       dentry->d_inode,
 644                                                       lnb->page);
 645                         } else {
 646                                 long off;
 647                                 char *p = kmap(lnb->page);
 648
 649                                 off = lnb->offset & ~CFS_PAGE_MASK;
 650                                 if (off)
 651                                         memset(p, 0, off);
 652                                 off = (lnb->offset + lnb->len) & ~CFS_PAGE_MASK;
 653                                 if (off)
 654                                         memset(p + off, 0, CFS_PAGE_SIZE - off);
 655                                 kunmap(lnb->page);
 656                         }
 657                 }
 658                 if (lnb->rc == 0)
 659                         tot_bytes += lnb->len;
 660         }
 661
 662         rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
 663                               NULL, NULL, NULL);
 664
 665         fsfilt_check_slow(exp->exp_obd, now, "start_page_write");
 666
 667         if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats)
 668                 lprocfs_counter_add(exp->exp_nid_stats->nid_stats,
 669                                     LPROC_FILTER_WRITE_BYTES, tot_bytes);
 670         EXIT;
 671 cleanup:
 672         switch(cleanup_phase) {
 673         case 4:
 674         case 3:
 675                 filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti);
 676         case 2:
 677                 pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
 678                 if (rc)
 679                         f_dput(dentry);
 680                 break;
 681         case 1:
 682                 filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti);
 683         case 0:
 684                 spin_lock(&exp->exp_obd->obd_osfs_lock);
 685                 if (oa)
 686                         filter_grant_incoming(exp, oa);
 687                 spin_unlock(&exp->exp_obd->obd_osfs_lock);
 688                 pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
 689                 break;
 690         default:;
 691         }
 692         return rc;
 693 }
 694
 695 int filter_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
 696                   int objcount, struct obd_ioobj *obj, int niocount,
 697                   struct niobuf_remote *nb, struct niobuf_local *res,
 698                   struct obd_trans_info *oti, struct lustre_capa *capa)
 699 {
 700         if (cmd == OBD_BRW_WRITE)
 701                 return filter_preprw_write(cmd, exp, oa, objcount, obj,
 702                                            niocount, nb, res, oti, capa);
 703         if (cmd == OBD_BRW_READ)
 704                 return filter_preprw_read(cmd, exp, oa, objcount, obj,
 705                                           niocount, nb, res, oti, capa);
 706         LBUG();
 707         return -EPROTO;
 708 }
 709
 710 void filter_release_read_page(struct filter_obd *filter, struct inode *inode,
 711                               struct page *page)
 712 {
 713         int drop = 0;
 714
 715         if (inode != NULL &&
 716             (i_size_read(inode) > filter->fo_readcache_max_filesize))
 717                 drop = 1;
 718
 719         /* drop from cache like truncate_list_pages() */
 720         if (drop && !TryLockPage(page)) {
 721                 if (page->mapping)
 722                         ll_truncate_complete_page(page);
 723                 unlock_page(page);
 724         }
 725         page_cache_release(page);
 726 }
 727
 728 static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa,
 729                                 int objcount, struct obd_ioobj *obj,
 730                                 int niocount, struct niobuf_local *res,
 731                                 struct obd_trans_info *oti, int rc)
 732 {
 733         struct inode *inode = NULL;
 734         struct ldlm_res_id res_id;
 735         struct ldlm_resource *resource = NULL;
 736         struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
 737         ENTRY;
 738
 739         osc_build_res_name(obj->ioo_id, obj->ioo_gr, &res_id);
 740         /* If oa != NULL then filter_preprw_read updated the inode atime
 741          * and we should update the lvb so that other glimpses will also
 742          * get the updated value. bug 5972 */
 743         if (oa && ns && ns->ns_lvbo && ns->ns_lvbo->lvbo_update) {
 744                 resource = ldlm_resource_get(ns, NULL, &res_id, LDLM_EXTENT, 0);
 745
 746                 if (resource != NULL) {
 747                         ns->ns_lvbo->lvbo_update(resource, NULL, 0, 1);
 748                         ldlm_resource_putref(resource);
 749                 }
 750         }
 751
 752         if (res->dentry != NULL)
 753                 inode = res->dentry->d_inode;
 754
 755         filter_free_dio_pages(objcount, obj, niocount, res);
 756
 757         if (res->dentry != NULL)
 758                 f_dput(res->dentry);
 759         RETURN(rc);
 760 }
 761
 762 void flip_into_page_cache(struct inode *inode, struct page *new_page)
 763 {
 764         struct page *old_page;
 765         int rc;
 766
 767         do {
 768                 /* the dlm is protecting us from read/write concurrency, so we
 769                  * expect this find_lock_page to return quickly.  even if we
 770                  * race with another writer it won't be doing much work with
 771                  * the page locked.  we do this 'cause t_c_p expects a
 772                  * locked page, and it wants to grab the pagecache lock
 773                  * as well. */
 774                 old_page = find_lock_page(inode->i_mapping, new_page->index);
 775                 if (old_page) {
 776                         ll_truncate_complete_page(old_page);
 777                         unlock_page(old_page);
 778                         page_cache_release(old_page);
 779                 }
 780
 781 #if 0 /* this should be a /proc tunable someday */
 782                 /* racing o_directs (no locking ioctl) could race adding
 783                  * their pages, so we repeat the page invalidation unless
 784                  * we successfully added our new page */
 785                 rc = add_to_page_cache_unique(new_page, inode->i_mapping,
 786                                               new_page->index,
 787                                               page_hash(inode->i_mapping,
 788                                                         new_page->index));
 789                 if (rc == 0) {
 790                         /* add_to_page_cache clears uptodate|dirty and locks
 791                          * the page */
 792                         SetPageUptodate(new_page);
 793                         unlock_page(new_page);
 794                 }
 795 #else
 796                 rc = 0;
 797 #endif
 798         } while (rc != 0);
 799 }
 800
 801 void filter_grant_commit(struct obd_export *exp, int niocount,
 802                          struct niobuf_local *res)
 803 {
 804         struct filter_obd *filter = &exp->exp_obd->u.filter;
 805         struct niobuf_local *lnb = res;
 806         unsigned long pending = 0;
 807         int i;
 808
 809         spin_lock(&exp->exp_obd->obd_osfs_lock);
 810         for (i = 0, lnb = res; i < niocount; i++, lnb++)
 811                 pending += lnb->lnb_grant_used;
 812
 813         LASSERTF(exp->exp_filter_data.fed_pending >= pending,
 814                  "%s: cli %s/%p fed_pending: %lu grant_used: %lu\n",
 815                  exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 816                  exp->exp_filter_data.fed_pending, pending);
 817         exp->exp_filter_data.fed_pending -= pending;
 818         LASSERTF(filter->fo_tot_granted >= pending,
 819                  "%s: cli %s/%p tot_granted: "LPU64" grant_used: %lu\n",
 820                  exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 821                  exp->exp_obd->u.filter.fo_tot_granted, pending);
 822         filter->fo_tot_granted -= pending;
 823         LASSERTF(filter->fo_tot_pending >= pending,
 824                  "%s: cli %s/%p tot_pending: "LPU64" grant_used: %lu\n",
 825                  exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 826                  filter->fo_tot_pending, pending);
 827         filter->fo_tot_pending -= pending;
 828
 829         spin_unlock(&exp->exp_obd->obd_osfs_lock);
 830 }
 831
 832 int filter_commitrw(int cmd, struct obd_export *exp, struct obdo *oa,
 833                     int objcount, struct obd_ioobj *obj, int niocount,
 834                     struct niobuf_local *res, struct obd_trans_info *oti,
 835                     int rc)
 836 {
 837         if (cmd == OBD_BRW_WRITE)
 838                 return filter_commitrw_write(exp, oa, objcount, obj, niocount,
 839                                              res, oti, rc);
 840         if (cmd == OBD_BRW_READ)
 841                 return filter_commitrw_read(exp, oa, objcount, obj, niocount,
 842                                             res, oti, rc);
 843         LBUG();
 844         return -EPROTO;
 845 }
 846
 847 int filter_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
 848                obd_count oa_bufs, struct brw_page *pga,
 849                struct obd_trans_info *oti)
 850 {
 851         struct obd_ioobj ioo;
 852         struct niobuf_local *lnb;
 853         struct niobuf_remote *rnb;
 854         obd_count i;
 855         int ret = 0;
 856         ENTRY;
 857
 858         OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local));
 859         OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote));
 860
 861         if (lnb == NULL || rnb == NULL)
 862                 GOTO(out, ret = -ENOMEM);
 863
 864         for (i = 0; i < oa_bufs; i++) {
 865                 lnb[i].page = pga[i].pg;
 866                 rnb[i].offset = pga[i].off;
 867                 rnb[i].len = pga[i].count;
 868         }
 869
 870         obdo_to_ioobj(oinfo->oi_oa, &ioo);
 871         ioo.ioo_bufcnt = oa_bufs;
 872
 873         ret = filter_preprw(cmd, exp, oinfo->oi_oa, 1, &ioo,
 874                             oa_bufs, rnb, lnb, oti, oinfo_capa(oinfo));
 875         if (ret != 0)
 876                 GOTO(out, ret);
 877
 878         ret = filter_commitrw(cmd, exp, oinfo->oi_oa, 1, &ioo,
 879                               oa_bufs, lnb, oti, ret);
 880
 881 out:
 882         if (lnb)
 883                 OBD_FREE(lnb, oa_bufs * sizeof(struct niobuf_local));
 884         if (rnb)
 885                 OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote));
 886         RETURN(ret);
 887 }