lustre/obdfilter/filter_io.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  *  linux/fs/obdfilter/filter_io.c
   5  *
   6  *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
   7  *   Author: Peter Braam <braam@clusterfs.com>
   8  *   Author: Andreas Dilger <adilger@clusterfs.com>
   9  *   Author: Phil Schwan <phil@clusterfs.com>
  10  *
  11  *   This file is part of the Lustre file system, http://www.lustre.org
  12  *   Lustre is a trademark of Cluster File Systems, Inc.
  13  *
  14  *   You may have signed or agreed to another license before downloading
  15  *   this software.  If so, you are bound by the terms and conditions
  16  *   of that agreement, and the following does not apply to you.  See the
  17  *   LICENSE file included with this distribution for more information.
  18  *
  19  *   If you did not agree to a different license, then this copy of Lustre
  20  *   is open source software; you can redistribute it and/or modify it
  21  *   under the terms of version 2 of the GNU General Public License as
  22  *   published by the Free Software Foundation.
  23  *
  24  *   In either case, Lustre is distributed in the hope that it will be
  25  *   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
  26  *   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  27  *   license text for more details.
  28  */
  29
  30 #define DEBUG_SUBSYSTEM S_FILTER
  31
  32 #ifndef AUTOCONF_INCLUDED
  33 #include <linux/config.h>
  34 #endif
  35 #include <linux/module.h>
  36 #include <linux/pagemap.h> // XXX kill me soon
  37 #include <linux/version.h>
  38
  39 #include <obd_class.h>
  40 #include <lustre_fsfilt.h>
  41 #include "filter_internal.h"
  42
  43 int *obdfilter_created_scratchpad;
  44
  45 static int filter_alloc_dio_page(struct obd_device *obd, struct inode *inode,
  46                                  struct niobuf_local *lnb)
  47 {
  48         struct page *page;
  49
  50         LASSERT(lnb->page != NULL);
  51
  52         page = lnb->page;
  53 #if 0
  54         POISON_PAGE(page, 0xf1);
  55         if (lnb->len != CFS_PAGE_SIZE) {
  56                 memset(kmap(page) + lnb->len, 0, CFS_PAGE_SIZE - lnb->len);
  57                 kunmap(page);
  58         }
  59 #endif
  60         page->index = lnb->offset >> CFS_PAGE_SHIFT;
  61
  62         RETURN(0);
  63 }
  64
  65 static void filter_free_dio_pages(int objcount, struct obd_ioobj *obj,
  66                            int niocount, struct niobuf_local *res)
  67 {
  68         int i, j;
  69
  70         for (i = 0; i < objcount; i++, obj++) {
  71                 for (j = 0 ; j < obj->ioo_bufcnt ; j++, res++)
  72                                 res->page = NULL;
  73         }
  74 }
  75
  76 /* Grab the dirty and seen grant announcements from the incoming obdo.
  77  * We will later calculate the clients new grant and return it.
  78  * Caller must hold osfs lock */
  79 static void filter_grant_incoming(struct obd_export *exp, struct obdo *oa)
  80 {
  81         struct filter_export_data *fed;
  82         struct obd_device *obd = exp->exp_obd;
  83         static unsigned long last_msg;
  84         static int last_count;
  85         int mask = D_CACHE;
  86         ENTRY;
  87
  88         LASSERT_SPIN_LOCKED(&obd->obd_osfs_lock);
  89
  90         if ((oa->o_valid & (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) !=
  91                                         (OBD_MD_FLBLOCKS|OBD_MD_FLGRANT)) {
  92                 oa->o_valid &= ~OBD_MD_FLGRANT;
  93                 EXIT;
  94                 return;
  95         }
  96
  97         fed = &exp->exp_filter_data;
  98
  99         /* Don't print this to the console the first time it happens, since
 100          * it can happen legitimately on occasion, but only rarely. */
 101         if (time_after(jiffies, last_msg + 60 * HZ)) {
 102                 last_count = 0;
 103                 last_msg = jiffies;
 104         }
 105         if ((last_count & (-last_count)) == last_count)
 106                 mask = D_HA /* until bug 3273 is fixed D_WARNING */;
 107         last_count++;
 108
 109         /* Add some margin, since there is a small race if other RPCs arrive
 110          * out-or-order and have already consumed some grant.  We want to
 111          * leave this here in case there is a large error in accounting. */
 112         CDEBUG(oa->o_grant > fed->fed_grant + FILTER_GRANT_CHUNK ? mask:D_CACHE,
 113                "%s: cli %s/%p reports grant: "LPU64" dropped: %u, local: %lu\n",
 114                obd->obd_name, exp->exp_client_uuid.uuid, exp, oa->o_grant,
 115                oa->o_dropped, fed->fed_grant);
 116
 117         /* Update our accounting now so that statfs takes it into account.
 118          * Note that fed_dirty is only approximate and can become incorrect
 119          * if RPCs arrive out-of-order.  No important calculations depend
 120          * on fed_dirty however, but we must check sanity to not assert. */
 121         if ((long long)oa->o_dirty < 0)
 122                 oa->o_dirty = 0;
 123         else if (oa->o_dirty > fed->fed_grant + 4 * FILTER_GRANT_CHUNK)
 124                 oa->o_dirty = fed->fed_grant + 4 * FILTER_GRANT_CHUNK;
 125         obd->u.filter.fo_tot_dirty += oa->o_dirty - fed->fed_dirty;
 126         if (fed->fed_grant < oa->o_dropped) {
 127                 CDEBUG(D_HA,"%s: cli %s/%p reports %u dropped > fedgrant %lu\n",
 128                        obd->obd_name, exp->exp_client_uuid.uuid, exp,
 129                        oa->o_dropped, fed->fed_grant);
 130                 oa->o_dropped = 0;
 131         }
 132         if (obd->u.filter.fo_tot_granted < oa->o_dropped) {
 133                 CERROR("%s: cli %s/%p reports %u dropped > tot_grant "LPU64"\n",
 134                        obd->obd_name, exp->exp_client_uuid.uuid, exp,
 135                        oa->o_dropped, obd->u.filter.fo_tot_granted);
 136                 oa->o_dropped = 0;
 137         }
 138         obd->u.filter.fo_tot_granted -= oa->o_dropped;
 139         fed->fed_grant -= oa->o_dropped;
 140         fed->fed_dirty = oa->o_dirty;
 141         if (fed->fed_dirty < 0 || fed->fed_grant < 0 || fed->fed_pending < 0) {
 142                 CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
 143                        obd->obd_name, exp->exp_client_uuid.uuid, exp,
 144                        fed->fed_dirty, fed->fed_pending, fed->fed_grant);
 145                 spin_unlock(&obd->obd_osfs_lock);
 146                 LBUG();
 147         }
 148         EXIT;
 149 }
 150
 151 /* Figure out how much space is available between what we've granted
 152  * and what remains in the filesystem.  Compensate for ext3 indirect
 153  * block overhead when computing how much free space is left ungranted.
 154  *
 155  * Caller must hold obd_osfs_lock. */
 156 obd_size filter_grant_space_left(struct obd_export *exp)
 157 {
 158         struct obd_device *obd = exp->exp_obd;
 159         int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
 160         obd_size tot_granted = obd->u.filter.fo_tot_granted, avail, left = 0;
 161         int rc, statfs_done = 0;
 162
 163         LASSERT_SPIN_LOCKED(&obd->obd_osfs_lock);
 164
 165         if (cfs_time_before_64(obd->obd_osfs_age, cfs_time_current_64() - HZ)) {
 166 restat:
 167                 rc = fsfilt_statfs(obd, obd->u.obt.obt_sb,
 168                                    cfs_time_current_64() + HZ);
 169                 if (rc) /* N.B. statfs can't really fail */
 170                         RETURN(0);
 171                 statfs_done = 1;
 172         }
 173
 174         avail = obd->obd_osfs.os_bavail;
 175         left = avail - (avail >> (blockbits - 3)); /* (d)indirect */
 176         if (left > GRANT_FOR_LLOG(obd)) {
 177                 left = (left - GRANT_FOR_LLOG(obd)) << blockbits;
 178         } else {
 179                 left = 0 /* << blockbits */;
 180         }
 181
 182         if (!statfs_done && left < 32 * FILTER_GRANT_CHUNK + tot_granted) {
 183                 CDEBUG(D_CACHE, "fs has no space left and statfs too old\n");
 184                 goto restat;
 185         }
 186
 187         if (left >= tot_granted) {
 188                 left -= tot_granted;
 189         } else {
 190                 if (left < tot_granted - obd->u.filter.fo_tot_pending) {
 191                         CERROR("%s: cli %s/%p grant "LPU64" > available "
 192                                LPU64" and pending "LPU64"\n", obd->obd_name,
 193                                exp->exp_client_uuid.uuid, exp, tot_granted,
 194                                left, obd->u.filter.fo_tot_pending);
 195                 }
 196                 left = 0;
 197         }
 198
 199         CDEBUG(D_CACHE, "%s: cli %s/%p free: "LPU64" avail: "LPU64" grant "LPU64
 200                " left: "LPU64" pending: "LPU64"\n", obd->obd_name,
 201                exp->exp_client_uuid.uuid, exp,
 202                obd->obd_osfs.os_bfree << blockbits, avail << blockbits,
 203                tot_granted, left, obd->u.filter.fo_tot_pending);
 204
 205         return left;
 206 }
 207
 208 /* Calculate how much grant space to allocate to this client, based on how
 209  * much space is currently free and how much of that is already granted.
 210  *
 211  * Caller must hold obd_osfs_lock. */
 212 long filter_grant(struct obd_export *exp, obd_size current_grant,
 213                   obd_size want, obd_size fs_space_left)
 214 {
 215         struct obd_device *obd = exp->exp_obd;
 216         struct filter_export_data *fed = &exp->exp_filter_data;
 217         int blockbits = obd->u.obt.obt_sb->s_blocksize_bits;
 218         __u64 grant = 0;
 219
 220         LASSERT_SPIN_LOCKED(&obd->obd_osfs_lock);
 221
 222         /* Grant some fraction of the client's requested grant space so that
 223          * they are not always waiting for write credits (not all of it to
 224          * avoid overgranting in face of multiple RPCs in flight).  This
 225          * essentially will be able to control the OSC_MAX_RIF for a client.
 226          *
 227          * If we do have a large disparity between what the client thinks it
 228          * has and what we think it has, don't grant very much and let the
 229          * client consume its grant first.  Either it just has lots of RPCs
 230          * in flight, or it was evicted and its grants will soon be used up. */
 231         if (want > 0x7fffffff) {
 232                 CERROR("%s: client %s/%p requesting > 2GB grant "LPU64"\n",
 233                        obd->obd_name, exp->exp_client_uuid.uuid, exp, want);
 234         } else if (current_grant < want &&
 235                    current_grant < fed->fed_grant + FILTER_GRANT_CHUNK) {
 236                 grant = min((want >> blockbits),
 237                             (fs_space_left >> blockbits) / 8);
 238                 grant <<= blockbits;
 239
 240                 if (grant) {
 241                         /* Allow >FILTER_GRANT_CHUNK size when clients
 242                          * reconnect due to a server reboot.
 243                          */
 244                         if ((grant > FILTER_GRANT_CHUNK) &&
 245                             (!obd->obd_recovering))
 246                                 grant = FILTER_GRANT_CHUNK;
 247
 248                         obd->u.filter.fo_tot_granted += grant;
 249                         fed->fed_grant += grant;
 250                         if (fed->fed_grant < 0) {
 251                                 CERROR("%s: cli %s/%p grant %ld want "LPU64
 252                                        "current"LPU64"\n",
 253                                        obd->obd_name, exp->exp_client_uuid.uuid,
 254                                        exp, fed->fed_grant, want,current_grant);
 255                                 spin_unlock(&obd->obd_osfs_lock);
 256                                 LBUG();
 257                         }
 258                 }
 259         }
 260
 261         CDEBUG(D_CACHE,
 262                "%s: cli %s/%p wants: "LPU64" current grant "LPU64
 263                " granting: "LPU64"\n", obd->obd_name, exp->exp_client_uuid.uuid,
 264                exp, want, current_grant, grant);
 265         CDEBUG(D_CACHE,
 266                "%s: cli %s/%p tot cached:"LPU64" granted:"LPU64
 267                " num_exports: %d\n", obd->obd_name, exp->exp_client_uuid.uuid,
 268                exp, obd->u.filter.fo_tot_dirty,
 269                obd->u.filter.fo_tot_granted, obd->obd_num_exports);
 270
 271         return grant;
 272 }
 273
 274 static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa,
 275                               int objcount, struct obd_ioobj *obj,
 276                               int niocount, struct niobuf_remote *nb,
 277                               struct niobuf_local *res,
 278                               struct obd_trans_info *oti,
 279                               struct lustre_capa *capa)
 280 {
 281         struct obd_device *obd = exp->exp_obd;
 282         struct lvfs_run_ctxt saved;
 283         struct niobuf_remote *rnb;
 284         struct niobuf_local *lnb;
 285         struct dentry *dentry = NULL;
 286         struct inode *inode;
 287         void *iobuf = NULL;
 288         int rc = 0, i, tot_bytes = 0;
 289         unsigned long now = jiffies;
 290         ENTRY;
 291
 292         /* We are currently not supporting multi-obj BRW_READ RPCS at all.
 293          * When we do this function's dentry cleanup will need to be fixed.
 294          * These values are verified in ost_brw_write() from the wire. */
 295         LASSERTF(objcount == 1, "%d\n", objcount);
 296         LASSERTF(obj->ioo_bufcnt > 0, "%d\n", obj->ioo_bufcnt);
 297
 298         rc = filter_auth_capa(exp, NULL, obdo_mdsno(oa), capa,
 299                               CAPA_OPC_OSS_READ);
 300         if (rc)
 301                 RETURN(rc);
 302
 303         if (oa && oa->o_valid & OBD_MD_FLGRANT) {
 304                 spin_lock(&obd->obd_osfs_lock);
 305                 filter_grant_incoming(exp, oa);
 306
 307                 oa->o_grant = 0;
 308                 spin_unlock(&obd->obd_osfs_lock);
 309         }
 310
 311         iobuf = filter_iobuf_get(&obd->u.filter, oti);
 312         if (IS_ERR(iobuf))
 313                 RETURN(PTR_ERR(iobuf));
 314
 315         push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 316         dentry = filter_oa2dentry(obd, oa);
 317         if (IS_ERR(dentry)) {
 318                 rc = PTR_ERR(dentry);
 319                 dentry = NULL;
 320                 GOTO(cleanup, rc);
 321         }
 322
 323         inode = dentry->d_inode;
 324
 325         obdo_to_inode(inode, oa, OBD_MD_FLATIME);
 326         fsfilt_check_slow(obd, now, obd_timeout, "preprw_read setup");
 327
 328         for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt;
 329              i++, rnb++, lnb++) {
 330                 lnb->dentry = dentry;
 331                 lnb->offset = rnb->offset;
 332                 lnb->len    = rnb->len;
 333                 lnb->flags  = rnb->flags;
 334
 335                 /*
 336                  * ost_brw_write()->ost_nio_pages_get() already initialized
 337                  * lnb->page to point to the page from the per-thread page
 338                  * pool (bug 5137), initialize page.
 339                  */
 340                 LASSERT(lnb->page != NULL);
 341
 342                 if (inode->i_size <= rnb->offset)
 343                         /* If there's no more data, abort early.  lnb->rc == 0,
 344                          * so it's easy to detect later. */
 345                         break;
 346                 else
 347                         filter_alloc_dio_page(obd, inode, lnb);
 348
 349                 if (inode->i_size < lnb->offset + lnb->len - 1)
 350                         lnb->rc = inode->i_size - lnb->offset;
 351                 else
 352                         lnb->rc = lnb->len;
 353
 354                 tot_bytes += lnb->rc;
 355
 356                 filter_iobuf_add_page(obd, iobuf, inode, lnb->page);
 357         }
 358
 359         fsfilt_check_slow(obd, now, obd_timeout, "start_page_read");
 360
 361         rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf,
 362                               exp, NULL, NULL, NULL);
 363         if (rc)
 364                 GOTO(cleanup, rc);
 365
 366         lprocfs_counter_add(obd->obd_stats, LPROC_FILTER_READ_BYTES, tot_bytes);
 367
 368         lprocfs_counter_add(exp->exp_ops_stats, LPROC_FILTER_READ_BYTES,
 369                             tot_bytes);
 370
 371         EXIT;
 372
 373  cleanup:
 374         if (rc != 0) {
 375                 filter_free_dio_pages(objcount, obj, niocount, res);
 376
 377                 if (dentry != NULL)
 378                         f_dput(dentry);
 379         }
 380
 381         filter_iobuf_put(&obd->u.filter, iobuf, oti);
 382
 383         pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
 384         if (rc)
 385                 CERROR("io error %d\n", rc);
 386
 387         return rc;
 388 }
 389
 390 /* When clients have dirtied as much space as they've been granted they
 391  * fall through to sync writes.  These sync writes haven't been expressed
 392  * in grants and need to error with ENOSPC when there isn't room in the
 393  * filesystem for them after grants are taken into account.  However,
 394  * writeback of the dirty data that was already granted space can write
 395  * right on through.
 396  *
 397  * Caller must hold obd_osfs_lock. */
 398 static int filter_grant_check(struct obd_export *exp, struct obdo *oa,
 399                               int objcount, struct fsfilt_objinfo *fso,
 400                               int niocount, struct niobuf_remote *rnb,
 401                               struct niobuf_local *lnb, obd_size *left,
 402                               struct inode *inode)
 403 {
 404         struct filter_export_data *fed = &exp->exp_filter_data;
 405         int blocksize = exp->exp_obd->u.obt.obt_sb->s_blocksize;
 406         unsigned long used = 0, ungranted = 0, using;
 407         int i, rc = -ENOSPC, obj, n = 0, mask = D_CACHE;
 408
 409         LASSERT_SPIN_LOCKED(&exp->exp_obd->obd_osfs_lock);
 410
 411         for (obj = 0; obj < objcount; obj++) {
 412                 for (i = 0; i < fso[obj].fso_bufcnt; i++, n++) {
 413                         int tmp, bytes;
 414
 415                         /* should match the code in osc_exit_cache */
 416                         bytes = rnb[n].len;
 417                         bytes += rnb[n].offset & (blocksize - 1);
 418                         tmp = (rnb[n].offset + rnb[n].len) & (blocksize - 1);
 419                         if (tmp)
 420                                 bytes += blocksize - tmp;
 421
 422                         if ((rnb[n].flags & OBD_BRW_FROM_GRANT) &&
 423                             (oa->o_valid & OBD_MD_FLGRANT)) {
 424                                 if (fed->fed_grant < used + bytes) {
 425                                         CDEBUG(D_CACHE,
 426                                                "%s: cli %s/%p claims %ld+%d "
 427                                                "GRANT, real grant %lu idx %d\n",
 428                                                exp->exp_obd->obd_name,
 429                                                exp->exp_client_uuid.uuid, exp,
 430                                                used, bytes, fed->fed_grant, n);
 431                                         mask = D_RPCTRACE;
 432                                 } else {
 433                                         used += bytes;
 434                                         rnb[n].flags |= OBD_BRW_GRANTED;
 435                                         lnb[n].lnb_grant_used = bytes;
 436                                         CDEBUG(0, "idx %d used=%lu\n", n, used);
 437                                         rc = 0;
 438                                         continue;
 439                                 }
 440                         }
 441                         if (*left > ungranted) {
 442                                 /* if enough space, pretend it was granted */
 443                                 ungranted += bytes;
 444                                 rnb[n].flags |= OBD_BRW_GRANTED;
 445                                 lnb[n].lnb_grant_used = bytes;
 446                                 CDEBUG(0, "idx %d ungranted=%lu\n",n,ungranted);
 447                                 rc = 0;
 448                                 continue;
 449                         }
 450
 451                         /* We can't check for already-mapped blocks here, as
 452                          * it requires dropping the osfs lock to do the bmap.
 453                          * Instead, we return ENOSPC and in that case we need
 454                          * to go through and verify if all of the blocks not
 455                          * marked BRW_GRANTED are already mapped and we can
 456                          * ignore this error. */
 457                         lnb[n].rc = -ENOSPC;
 458                         rnb[n].flags &= ~OBD_BRW_GRANTED;
 459                         CDEBUG(D_CACHE,"%s: cli %s/%p idx %d no space for %d\n",
 460                                exp->exp_obd->obd_name,
 461                                exp->exp_client_uuid.uuid, exp, n, bytes);
 462                 }
 463         }
 464
 465         /* Now substract what client have used already.  We don't subtract
 466          * this from the tot_granted yet, so that other client's can't grab
 467          * that space before we have actually allocated our blocks.  That
 468          * happens in filter_grant_commit() after the writes are done. */
 469         *left -= ungranted;
 470         fed->fed_grant -= used;
 471         fed->fed_pending += used + ungranted;
 472         exp->exp_obd->u.filter.fo_tot_granted += ungranted;
 473         exp->exp_obd->u.filter.fo_tot_pending += used + ungranted;
 474
 475         CDEBUG(mask,
 476                "%s: cli %s/%p used: %lu ungranted: %lu grant: %lu dirty: %lu\n",
 477                exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, used,
 478                ungranted, fed->fed_grant, fed->fed_dirty);
 479
 480         /* Rough calc in case we don't refresh cached statfs data */
 481         using = (used + ungranted + 1 ) >>
 482                 exp->exp_obd->u.obt.obt_sb->s_blocksize_bits;
 483         if (exp->exp_obd->obd_osfs.os_bavail > using)
 484                 exp->exp_obd->obd_osfs.os_bavail -= using;
 485         else
 486                 exp->exp_obd->obd_osfs.os_bavail = 0;
 487
 488         if (fed->fed_dirty < used) {
 489                 CERROR("%s: cli %s/%p claims used %lu > fed_dirty %lu\n",
 490                        exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 491                        used, fed->fed_dirty);
 492                 used = fed->fed_dirty;
 493         }
 494         exp->exp_obd->u.filter.fo_tot_dirty -= used;
 495         fed->fed_dirty -= used;
 496
 497         if (fed->fed_dirty < 0 || fed->fed_grant < 0 || fed->fed_pending < 0) {
 498                 CERROR("%s: cli %s/%p dirty %ld pend %ld grant %ld\n",
 499                        exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 500                        fed->fed_dirty, fed->fed_pending, fed->fed_grant);
 501                 spin_unlock(&exp->exp_obd->obd_osfs_lock);
 502                 LBUG();
 503         }
 504         return rc;
 505 }
 506
 507 /* If we ever start to support multi-object BRW RPCs, we will need to get locks
 508  * on mulitple inodes.  That isn't all, because there still exists the
 509  * possibility of a truncate starting a new transaction while holding the ext3
 510  * rwsem = write while some writes (which have started their transactions here)
 511  * blocking on the ext3 rwsem = read => lock inversion.
 512  *
 513  * The handling gets very ugly when dealing with locked pages.  It may be easier
 514  * to just get rid of the locked page code (which has problems of its own) and
 515  * either discover we do not need it anymore (i.e. it was a symptom of another
 516  * bug) or ensure we get the page locks in an appropriate order. */
 517 static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa,
 518                                int objcount, struct obd_ioobj *obj,
 519                                int niocount, struct niobuf_remote *nb,
 520                                struct niobuf_local *res,
 521                                struct obd_trans_info *oti,
 522                                struct lustre_capa *capa)
 523 {
 524         struct lvfs_run_ctxt saved;
 525         struct niobuf_remote *rnb;
 526         struct niobuf_local *lnb = res;
 527         struct fsfilt_objinfo fso;
 528         struct filter_mod_data *fmd;
 529         struct dentry *dentry = NULL;
 530         void *iobuf;
 531         obd_size left;
 532         unsigned long now = jiffies;
 533         int rc = 0, i, tot_bytes = 0, cleanup_phase = 0;
 534         ENTRY;
 535         LASSERT(objcount == 1);
 536         LASSERT(obj->ioo_bufcnt > 0);
 537
 538         rc = filter_auth_capa(exp, NULL, obdo_mdsno(oa), capa,
 539                               CAPA_OPC_OSS_WRITE);
 540         if (rc)
 541                 RETURN(rc);
 542
 543         push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
 544         iobuf = filter_iobuf_get(&exp->exp_obd->u.filter, oti);
 545         if (IS_ERR(iobuf))
 546                 GOTO(cleanup, rc = PTR_ERR(iobuf));
 547         cleanup_phase = 1;
 548
 549         dentry = filter_fid2dentry(exp->exp_obd, NULL, obj->ioo_gr,
 550                                    obj->ioo_id);
 551         if (IS_ERR(dentry))
 552                 GOTO(cleanup, rc = PTR_ERR(dentry));
 553         cleanup_phase = 2;
 554
 555         if (dentry->d_inode == NULL) {
 556                 CERROR("%s: trying to BRW to non-existent file "LPU64"\n",
 557                        exp->exp_obd->obd_name, obj->ioo_id);
 558                 GOTO(cleanup, rc = -ENOENT);
 559         }
 560
 561         fso.fso_dentry = dentry;
 562         fso.fso_bufcnt = obj->ioo_bufcnt;
 563
 564         fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup");
 565
 566         /* Don't update inode timestamps if this write is older than a
 567          * setattr which modifies the timestamps. b=10150 */
 568         /* XXX when we start having persistent reservations this needs to
 569          * be changed to filter_fmd_get() to create the fmd if it doesn't
 570          * already exist so we can store the reservation handle there. */
 571         fmd = filter_fmd_find(exp, obj->ioo_id, obj->ioo_gr);
 572
 573         LASSERT(oa != NULL);
 574         spin_lock(&exp->exp_obd->obd_osfs_lock);
 575         filter_grant_incoming(exp, oa);
 576         if (fmd && fmd->fmd_mactime_xid > oti->oti_xid)
 577                 oa->o_valid &= ~(OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 578                                  OBD_MD_FLATIME);
 579         else
 580                 obdo_to_inode(dentry->d_inode, oa, OBD_MD_FLATIME |
 581                               OBD_MD_FLMTIME | OBD_MD_FLCTIME);
 582         cleanup_phase = 3;
 583
 584         left = filter_grant_space_left(exp);
 585
 586         rc = filter_grant_check(exp, oa, objcount, &fso, niocount, nb, res,
 587                                 &left, dentry->d_inode);
 588
 589         /* do not zero out oa->o_valid as it is used in filter_commitrw_write()
 590          * for setting UID/GID and fid EA in first write time. */
 591         if (oa->o_valid & OBD_MD_FLGRANT) {
 592                 oa->o_grant = filter_grant(exp,oa->o_grant,oa->o_undirty,left);
 593                 oa->o_valid |= OBD_MD_FLGRANT;
 594         }
 595
 596         spin_unlock(&exp->exp_obd->obd_osfs_lock);
 597         filter_fmd_put(exp, fmd);
 598
 599         if (rc)
 600                 GOTO(cleanup, rc);
 601
 602         for (i = 0, rnb = nb, lnb = res; i < obj->ioo_bufcnt;
 603              i++, lnb++, rnb++) {
 604                 /* We still set up for ungranted pages so that granted pages
 605                  * can be written to disk as they were promised, and portals
 606                  * needs to keep the pages all aligned properly. */
 607                 lnb->dentry = dentry;
 608                 lnb->offset = rnb->offset;
 609                 lnb->len    = rnb->len;
 610                 lnb->flags  = rnb->flags;
 611
 612                 /*
 613                  * ost_brw_write()->ost_nio_pages_get() already initialized
 614                  * lnb->page to point to the page from the per-thread page
 615                  * pool (bug 5137), initialize page.
 616                  */
 617                 LASSERT(lnb->page != NULL);
 618                 if (lnb->len != CFS_PAGE_SIZE) {
 619                         memset(kmap(lnb->page) + lnb->len,
 620                                0, CFS_PAGE_SIZE - lnb->len);
 621                         kunmap(lnb->page);
 622                 }
 623                 lnb->page->index = lnb->offset >> CFS_PAGE_SHIFT;
 624
 625                 cleanup_phase = 4;
 626
 627                 /* If the filter writes a partial page, then has the file
 628                  * extended, the client will read in the whole page.  the
 629                  * filter has to be careful to zero the rest of the partial
 630                  * page on disk.  we do it by hand for partial extending
 631                  * writes, send_bio() is responsible for zeroing pages when
 632                  * asked to read unmapped blocks -- brw_kiovec() does this. */
 633                 if (lnb->len != CFS_PAGE_SIZE) {
 634                         __s64 maxidx;
 635
 636                         maxidx = ((dentry->d_inode->i_size + CFS_PAGE_SIZE - 1) >>
 637                                  CFS_PAGE_SHIFT) - 1;
 638                         if (maxidx >= lnb->page->index) {
 639                                 LL_CDEBUG_PAGE(D_PAGE, lnb->page, "write %u @ "
 640                                                LPU64" flg %x before EOF %llu\n",
 641                                                lnb->len, lnb->offset,lnb->flags,
 642                                                dentry->d_inode->i_size);
 643                                 filter_iobuf_add_page(exp->exp_obd, iobuf,
 644                                                       dentry->d_inode,
 645                                                       lnb->page);
 646                         } else {
 647                                 long off;
 648                                 char *p = kmap(lnb->page);
 649
 650                                 off = lnb->offset & ~CFS_PAGE_MASK;
 651                                 if (off)
 652                                         memset(p, 0, off);
 653                                 off = (lnb->offset + lnb->len) & ~CFS_PAGE_MASK;
 654                                 if (off)
 655                                         memset(p + off, 0, CFS_PAGE_SIZE - off);
 656                                 kunmap(lnb->page);
 657                         }
 658                 }
 659                 if (lnb->rc == 0)
 660                         tot_bytes += lnb->len;
 661         }
 662
 663         rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp,
 664                               NULL, NULL, NULL);
 665
 666         fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write");
 667
 668         lprocfs_counter_add(exp->exp_ops_stats, LPROC_FILTER_WRITE_BYTES,
 669                             tot_bytes);
 670         EXIT;
 671 cleanup:
 672         switch(cleanup_phase) {
 673         case 4:
 674         case 3:
 675                 filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti);
 676         case 2:
 677                 pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
 678                 if (rc)
 679                         f_dput(dentry);
 680                 break;
 681         case 1:
 682                 filter_iobuf_put(&exp->exp_obd->u.filter, iobuf, oti);
 683         case 0:
 684                 spin_lock(&exp->exp_obd->obd_osfs_lock);
 685                 if (oa)
 686                         filter_grant_incoming(exp, oa);
 687                 spin_unlock(&exp->exp_obd->obd_osfs_lock);
 688                 pop_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL);
 689                 break;
 690         default:;
 691         }
 692         return rc;
 693 }
 694
 695 int filter_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
 696                   int objcount, struct obd_ioobj *obj, int niocount,
 697                   struct niobuf_remote *nb, struct niobuf_local *res,
 698                   struct obd_trans_info *oti, struct lustre_capa *capa)
 699 {
 700         if (cmd == OBD_BRW_WRITE)
 701                 return filter_preprw_write(cmd, exp, oa, objcount, obj,
 702                                            niocount, nb, res, oti, capa);
 703         if (cmd == OBD_BRW_READ)
 704                 return filter_preprw_read(cmd, exp, oa, objcount, obj,
 705                                           niocount, nb, res, oti, capa);
 706         LBUG();
 707         return -EPROTO;
 708 }
 709
 710 void filter_release_read_page(struct filter_obd *filter, struct inode *inode,
 711                               struct page *page)
 712 {
 713         int drop = 0;
 714
 715         if (inode != NULL &&
 716             (inode->i_size > filter->fo_readcache_max_filesize))
 717                 drop = 1;
 718
 719         /* drop from cache like truncate_list_pages() */
 720         if (drop && !TryLockPage(page)) {
 721                 if (page->mapping)
 722                         ll_truncate_complete_page(page);
 723                 unlock_page(page);
 724         }
 725         page_cache_release(page);
 726 }
 727
 728 static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa,
 729                                 int objcount, struct obd_ioobj *obj,
 730                                 int niocount, struct niobuf_local *res,
 731                                 struct obd_trans_info *oti, int rc)
 732 {
 733         struct inode *inode = NULL;
 734         struct ldlm_res_id res_id = { .name = { obj->ioo_id, 0,
 735                                                 obj->ioo_gr, 0} };
 736         struct ldlm_resource *resource = NULL;
 737         struct ldlm_namespace *ns = exp->exp_obd->obd_namespace;
 738         ENTRY;
 739
 740         /* If oa != NULL then filter_preprw_read updated the inode atime
 741          * and we should update the lvb so that other glimpses will also
 742          * get the updated value. bug 5972 */
 743         if (oa && ns && ns->ns_lvbo && ns->ns_lvbo->lvbo_update) {
 744                 resource = ldlm_resource_get(ns, NULL, &res_id, LDLM_EXTENT, 0);
 745
 746                 if (resource != NULL) {
 747                         ns->ns_lvbo->lvbo_update(resource, NULL, 0, 1);
 748                         ldlm_resource_putref(resource);
 749                 }
 750         }
 751
 752         if (res->dentry != NULL)
 753                 inode = res->dentry->d_inode;
 754
 755         filter_free_dio_pages(objcount, obj, niocount, res);
 756
 757         if (res->dentry != NULL)
 758                 f_dput(res->dentry);
 759         RETURN(rc);
 760 }
 761
 762 void flip_into_page_cache(struct inode *inode, struct page *new_page)
 763 {
 764         struct page *old_page;
 765         int rc;
 766
 767         do {
 768                 /* the dlm is protecting us from read/write concurrency, so we
 769                  * expect this find_lock_page to return quickly.  even if we
 770                  * race with another writer it won't be doing much work with
 771                  * the page locked.  we do this 'cause t_c_p expects a
 772                  * locked page, and it wants to grab the pagecache lock
 773                  * as well. */
 774                 old_page = find_lock_page(inode->i_mapping, new_page->index);
 775                 if (old_page) {
 776                         ll_truncate_complete_page(old_page);
 777                         unlock_page(old_page);
 778                         page_cache_release(old_page);
 779                 }
 780
 781 #if 0 /* this should be a /proc tunable someday */
 782                 /* racing o_directs (no locking ioctl) could race adding
 783                  * their pages, so we repeat the page invalidation unless
 784                  * we successfully added our new page */
 785                 rc = add_to_page_cache_unique(new_page, inode->i_mapping,
 786                                               new_page->index,
 787                                               page_hash(inode->i_mapping,
 788                                                         new_page->index));
 789                 if (rc == 0) {
 790                         /* add_to_page_cache clears uptodate|dirty and locks
 791                          * the page */
 792                         SetPageUptodate(new_page);
 793                         unlock_page(new_page);
 794                 }
 795 #else
 796                 rc = 0;
 797 #endif
 798         } while (rc != 0);
 799 }
 800
 801 void filter_grant_commit(struct obd_export *exp, int niocount,
 802                          struct niobuf_local *res)
 803 {
 804         struct filter_obd *filter = &exp->exp_obd->u.filter;
 805         struct niobuf_local *lnb = res;
 806         unsigned long pending = 0;
 807         int i;
 808
 809         spin_lock(&exp->exp_obd->obd_osfs_lock);
 810         for (i = 0, lnb = res; i < niocount; i++, lnb++)
 811                 pending += lnb->lnb_grant_used;
 812
 813         LASSERTF(exp->exp_filter_data.fed_pending >= pending,
 814                  "%s: cli %s/%p fed_pending: %lu grant_used: %lu\n",
 815                  exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 816                  exp->exp_filter_data.fed_pending, pending);
 817         exp->exp_filter_data.fed_pending -= pending;
 818         LASSERTF(filter->fo_tot_granted >= pending,
 819                  "%s: cli %s/%p tot_granted: "LPU64" grant_used: %lu\n",
 820                  exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 821                  exp->exp_obd->u.filter.fo_tot_granted, pending);
 822         filter->fo_tot_granted -= pending;
 823         LASSERTF(filter->fo_tot_pending >= pending,
 824                  "%s: cli %s/%p tot_pending: "LPU64" grant_used: %lu\n",
 825                  exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp,
 826                  filter->fo_tot_pending, pending);
 827         filter->fo_tot_pending -= pending;
 828
 829         spin_unlock(&exp->exp_obd->obd_osfs_lock);
 830 }
 831
 832 int filter_commitrw(int cmd, struct obd_export *exp, struct obdo *oa,
 833                     int objcount, struct obd_ioobj *obj, int niocount,
 834                     struct niobuf_local *res, struct obd_trans_info *oti,
 835                     int rc)
 836 {
 837         if (cmd == OBD_BRW_WRITE)
 838                 return filter_commitrw_write(exp, oa, objcount, obj, niocount,
 839                                              res, oti, rc);
 840         if (cmd == OBD_BRW_READ)
 841                 return filter_commitrw_read(exp, oa, objcount, obj, niocount,
 842                                             res, oti, rc);
 843         LBUG();
 844         return -EPROTO;
 845 }
 846
 847 int filter_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
 848                obd_count oa_bufs, struct brw_page *pga,
 849                struct obd_trans_info *oti)
 850 {
 851         struct obd_ioobj ioo;
 852         struct niobuf_local *lnb;
 853         struct niobuf_remote *rnb;
 854         obd_count i;
 855         int ret = 0;
 856         ENTRY;
 857
 858         OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local));
 859         OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote));
 860
 861         if (lnb == NULL || rnb == NULL)
 862                 GOTO(out, ret = -ENOMEM);
 863
 864         for (i = 0; i < oa_bufs; i++) {
 865                 lnb[i].page = pga[i].pg;
 866                 rnb[i].offset = pga[i].off;
 867                 rnb[i].len = pga[i].count;
 868         }
 869
 870         obdo_to_ioobj(oinfo->oi_oa, &ioo);
 871         ioo.ioo_bufcnt = oa_bufs;
 872
 873         ret = filter_preprw(cmd, exp, oinfo->oi_oa, 1, &ioo,
 874                             oa_bufs, rnb, lnb, oti, oinfo_capa(oinfo));
 875         if (ret != 0)
 876                 GOTO(out, ret);
 877
 878         ret = filter_commitrw(cmd, exp, oinfo->oi_oa, 1, &ioo,
 879                               oa_bufs, lnb, oti, ret);
 880
 881 out:
 882         if (lnb)
 883                 OBD_FREE(lnb, oa_bufs * sizeof(struct niobuf_local));
 884         if (rnb)
 885                 OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote));
 886         RETURN(ret);
 887 }