1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <linux/lustre_dlm.h>
27 #include <linux/lustre_lite.h>
28 #include <linux/obd_lov.h> /* for lov_mds_md_size() in lov_setstripe() */
29 #include <linux/random.h>
30 #include <linux/pagemap.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
35 int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc);
36 extern int ll_setattr(struct dentry *de, struct iattr *attr);
/*
 * ll_mdc_close(): send the MDS close RPC for this file handle and finish
 * the open-replay bookkeeping: the retained open request stops being
 * "held for replay" and either joins the import replay list (if the open
 * created the file, i.e. has a transno) or is simply released.
 * NOTE(review): this extract is missing lines (e.g. the `file` parameter,
 * `rc`/`flags` declarations, braces) — do not treat it as compilable.
 */
38 static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
41         struct ll_file_data *fd = file->private_data;
42         struct ptlrpc_request *req = NULL;
44         struct obd_import *imp;
48         /* Complete the open request and remove it from replay list */
49         rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
50                        inode->i_mode, &fd->fd_mds_och.och_fh, &req);
52                 CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
54         imp = fd->fd_mds_och.och_req->rq_import;
/* imp_lock guards the import replay list manipulated below */
56         spin_lock_irqsave(&imp->imp_lock, flags);
58         DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p",
59                   fd->fd_mds_och.och_req);
61         /* We held on to the request for replay until we saw a close for that
62          * file.  Now that we've closed it, it gets replayed on the basis of
63          * its transno only. */
64         spin_lock (&fd->fd_mds_och.och_req->rq_lock);
65         fd->fd_mds_och.och_req->rq_replay = 0;
66         spin_unlock (&fd->fd_mds_och.och_req->rq_lock);
68         if (fd->fd_mds_och.och_req->rq_transno) {
69                 /* This open created a file, so it needs replay as a
70                  * normal transaction now.  Our reference to it now
71                  * effectively owned by the imp_replay_list, and it'll
72                  * be committed just like other transno-having
73                  * requests from here on out. */
75                 /* We now retain this close request, so that it is
76                  * replayed if the open is replayed.  We duplicate the
77                  * transno, so that we get freed at the right time,
78                  * and rely on the difference in xid to keep
79                  * everything ordered correctly.
81                  * But! If this close was already given a transno
82                  * (because it caused real unlinking of an
83                  * open-unlinked file, f.e.), then we'll be ordered on
84                  * the basis of that and we don't need to do anything
86                 if (!req->rq_transno) {
87                         req->rq_transno = fd->fd_mds_och.och_req->rq_transno;
88                         ptlrpc_retain_replayable_request(req, imp);
90                 spin_unlock_irqrestore(&imp->imp_lock, flags);
92                 /* Should we free_committed now? we always free before
93                  * replay, so it's probably a wash.  We could check to
94                  * see if the fd_req should already be committed, in
95                  * which case we can avoid the whole retain_replayable
98                 /* No transno means that we can just drop our ref. */
99                 spin_unlock_irqrestore(&imp->imp_lock, flags);
101                 ptlrpc_req_finished(fd->fd_mds_och.och_req);
103         /* Do this after the fd_req->rq_transno check, because we don't want
104          * to bounce off zero references. */
105         ptlrpc_req_finished(req);
/* Poison the handle and free the per-fd state; the file is closed. */
106         fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC;
107         file->private_data = NULL;
108         OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
113 /* While this returns an error code, fput() the caller does not, so we need
114  * to make every effort to clean up all of our state here.  Also, applications
115  * rarely check close errors and even if an error is returned they will not
116  * re-try the close call.
 *
 * ll_file_release(): VFS release hook.  Flushes the write cache for regular
 * files, closes the OST object(s) via obd_close(), then closes the MDS
 * handle via ll_mdc_close().
 * NOTE(review): extract is missing lines (e.g. `oa`/`rc`/`rc2` declarations,
 * RETURN paths, braces).
 */
118 int ll_file_release(struct inode *inode, struct file *file)
120         struct ll_file_data *fd;
122         struct ll_sb_info *sbi = ll_i2sbi(inode);
123         struct ll_inode_info *lli = ll_i2info(inode);
124         struct lov_stripe_md *lsm = lli->lli_smd;
128         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
129                inode->i_generation, inode);
131         /* don't do anything for / */
132         if (inode->i_sb->s_root == file->f_dentry)
135         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_RELEASE);
136         fd = (struct ll_file_data *)file->private_data;
137         if (!fd) /* no process opened the file after an mcreate */
140         /* we might not be able to get a valid handle on this file
141          * again so we really want to flush our write cache.. */
142         if (S_ISREG(inode->i_mode)) {
143                 filemap_fdatasync(inode->i_mapping);
144                 filemap_fdatawait(inode->i_mapping);
/* Build an obdo identifying the OST object(s) and close them. */
147                 memset(&oa, 0, sizeof(oa));
148                 oa.o_id = lsm->lsm_object_id;
150                 oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID;
152                         memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE);
153                         oa.o_valid |= OBD_MD_FLHANDLE;
155                 rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
157                         CERROR("inode %lu object close failed: rc = "
158                                "%d\n", inode->i_ino, rc);
/* Close the MDS side regardless of OST close outcome. */
162         rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
/*
 * ll_local_open(): allocate and initialize the per-file-descriptor state
 * (struct ll_file_data) from the MDS open reply carried in the lookup
 * intent, and attach it to file->private_data.
 * NOTE(review): extract is missing the allocation-failure branch the
 * comment below refers to, plus braces/returns.
 */
169 static int ll_local_open(struct file *file, struct lookup_intent *it)
171         struct ptlrpc_request *req = it->it_data;
172         struct ll_file_data *fd;
173         struct mds_body *body;
176         body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
177         LASSERT (body != NULL);                 /* reply already checked out */
178         LASSERT_REPSWABBED (req, 1);            /* and swabbed down */
180         LASSERT(!file->private_data);
182         OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
183         /* We can't handle this well without reorganizing ll_file_open and
184          * ll_mdc_close, so don't even try right now. */
187         memset(fd, 0, sizeof(*fd));
/* Copy the MDS file handle out of the reply and keep the open request
 * around (for replay — see ll_mdc_close). */
189         memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
190         fd->fd_mds_och.och_req = it->it_data;
191         file->private_data = fd;
/*
 * ll_osc_open(): open the OST object(s) described by `lsm` and store the
 * resulting handle in fd->fd_ost_och; on success clears O_LOV_DELAY_CREATE
 * and copies size/time attributes from the obdo back into the inode.
 * NOTE(review): extract is missing the `oa` allocation, `rc` declaration,
 * and error paths.
 */
196 static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
197                        struct file *file, struct lov_stripe_md *lsm)
199         struct ll_file_data *fd = file->private_data;
207         oa->o_id = lsm->lsm_object_id;
208         oa->o_mode = S_IFREG;
209         oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS |
210                        OBD_MD_FLMTIME | OBD_MD_FLCTIME);
211         rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
215         file->f_flags &= ~O_LOV_DELAY_CREATE;
216         obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
217                                  OBD_MD_FLMTIME | OBD_MD_FLCTIME);
225 /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and
226  * duplicate objects from being created.  We only install lsm to lli_smd if
227  * the mdc open was successful (hence stored stripe MD on MDS), otherwise
228  * other nodes could try to create different objects for the same file.
 *
 * ll_create_obj(): create the OST object(s) for a new file, pack the stripe
 * metadata and store it on the MDS via mdc_setattr(); on failure the
 * freshly created objects are destroyed (out_destroy path) to avoid leaks.
 * NOTE(review): extract is missing lines (e.g. `oa`/`iattr` declarations,
 * labels, braces, returns).
 */
230 static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
231                          struct file *file, struct lov_stripe_md *lsm)
233         struct ptlrpc_request *req = NULL;
234         struct ll_inode_info *lli = ll_i2info(inode);
235         struct lov_mds_md *lmm = NULL;
238         struct mdc_op_data op_data;
/* NOTE(review): stray double semicolon at end of next line */
239         int rc, err, lmm_size = 0;;
246         oa->o_mode = S_IFREG | 0600;
247         oa->o_id = inode->i_ino;
248         /* Keep these 0 for now, because chown/chgrp does not change the
249          * ownership on the OST, and we don't want to allow BA OST NFS
250          * users to access these objects by mistake. */
253         oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE |
254                       OBD_MD_FLUID | OBD_MD_FLGID;
256         rc = obd_create(conn, oa, &lsm, NULL);
258                 CERROR("error creating objects for inode %lu: rc = %d\n",
261                         CERROR("obd_create returned invalid rc %d\n", rc);
266         obdo_to_inode(inode, oa, OBD_MD_FLBLKSZ);
268         LASSERT(lsm && lsm->lsm_object_id);
/* Pack the in-memory stripe MD into on-disk form for the MDS. */
269         rc = obd_packmd(conn, &lmm, lsm);
271                 GOTO(out_destroy, rc);
275         /* Save the stripe MD with this file on the MDS */
276         memset(&iattr, 0, sizeof(iattr));
277         iattr.ia_valid = ATTR_FROM_OPEN;
279         ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
281         rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data,
282                          &iattr, lmm, lmm_size, &req);
283         ptlrpc_req_finished(req);
285         obd_free_diskmd (conn, &lmm);
287         /* If we couldn't complete mdc_open() and store the stripe MD on the
288          * MDS, we need to destroy the objects now or they will be leaked.
291                 CERROR("error: storing stripe MD for %lu: rc %d\n",
293                 GOTO(out_destroy, rc);
296         lli->lli_maxbytes = lsm->lsm_maxbytes;
/* out_destroy: undo the obd_create above so objects aren't leaked. */
304         obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
305         oa->o_id = lsm->lsm_object_id;
306         oa->o_valid |= OBD_MD_FLID;
307         err = obd_destroy(conn, oa, lsm, NULL);
308         obd_free_memmd(conn, &lsm);
310                 CERROR("error uncreating inode %lu objects: rc %d\n",
315 /* Open a file, and (for the very first open) create objects on the OSTs at
316  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
317  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
318  * lli_open_sem to ensure no other process will create objects, send the
319  * stripe MD to the MDS, or try to destroy the objects if that fails.
321  * If we already have the stripe MD locally then we don't request it in
322  * mdc_open(), by passing a lmm_size = 0.
324  * It is up to the application to ensure no other processes open this file
325  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
326  * used.  We might be able to avoid races of that sort by getting lli_open_sem
327  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
328  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
330 extern int ll_it_open_error(int phase, struct lookup_intent *it);
/*
 * ll_file_open(): VFS open hook — see the comment block above for the
 * overall protocol.  Error path closes the MDS handle via ll_mdc_close().
 * NOTE(review): extract is missing lines (`rc` declaration, branches,
 * RETURN paths, braces).
 */
332 int ll_file_open(struct inode *inode, struct file *file)
334         struct ll_sb_info *sbi = ll_i2sbi(inode);
335         struct ll_inode_info *lli = ll_i2info(inode);
336         struct lustre_handle *conn = ll_i2obdconn(inode);
337         struct lookup_intent *it;
338         struct lov_stripe_md *lsm;
342         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
343                inode->i_generation, inode);
345         /* don't do anything for / */
346         if (inode->i_sb->s_root == file->f_dentry)
349         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
350         LL_GET_INTENT(file->f_dentry, it);
351         rc = ll_it_open_error(IT_OPEN_OPEN, it);
355         rc = ll_local_open(file, it);
/* Register the open request so it is replayed on recovery. */
359         mdc_set_open_replay_data(&((struct ll_file_data *)
360                                    file->private_data)->fd_mds_och);
361         if (!S_ISREG(inode->i_mode))
366         if (file->f_flags & O_LOV_DELAY_CREATE) {
367                 CDEBUG(D_INODE, "delaying object creation\n");
370                 down(&lli->lli_open_sem);
372                         rc = ll_create_obj(conn, inode, file, NULL);
373                         up(&lli->lli_open_sem);
377                         CERROR("warning: stripe already set on ino %lu\n",
379                 up(&lli->lli_open_sem);
384         rc = ll_osc_open(conn, inode, file, lsm);
390         ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
395  * really does the getattr on the inode and updates its fields
 *
 * ll_inode_getattr(): issue an OST getattr for the object(s) behind the
 * inode and fold the returned attributes into it.  Samples the farthest
 * cached dirty offset before and after the RPC so a getattr that raced
 * with writeback cannot shrink i_size below locally cached writes.
 * NOTE(review): extract is missing lines (`oa`/`rc`/`bef`/`aft`/`ostdata`
 * declarations, the sync-vs-async selection, braces, returns).
 */
397 int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
400         struct ll_sb_info *sbi = ll_i2sbi(inode);
401         struct ll_inode_info *lli = ll_i2info(inode);
402         struct ptlrpc_request_set *set;
405         unsigned long before, after;
413         memset(&oa, 0, sizeof oa);
414         oa.o_id = lsm->lsm_object_id;
416         oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
417                 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
420         if (ostdata != NULL) {
421                 memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE);
422                 oa.o_valid |= OBD_MD_FLHANDLE;
425         /* getattr can race with writeback.  we don't want to trust a getattr
426          * that doesn't include the writeback of our farthest cached pages
427          * that it raced with. */
428         /* Now that the OSC knows the cached-page status, it can and should be
429          * adjusting its getattr results to include the maximum cached offset
430          * for its stripe(s). */
432         bef = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
435                 rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
437                 set = ptlrpc_prep_set ();
439                         CERROR ("ENOMEM allocing request set\n");
442                         rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set);
444                         rc = ptlrpc_set_wait (set);
445                 ptlrpc_set_destroy (set);
451         aft = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
453         CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after);
455                 (aft != 0 || after < before) &&
456                 oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT);
458         obdo_to_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
459                                    OBD_MD_FLMTIME | OBD_MD_FLCTIME));
460         if (inode->i_blksize < PAGE_CACHE_SIZE)
461                 inode->i_blksize = PAGE_CACHE_SIZE;
463         /* make sure getattr doesn't return a size that causes writeback
464          * to forget about cached writes */
465         if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) {
466                 CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead "
467                        "of oa "LPU64"\n", after, inode->i_size,
472                 obdo_to_inode(inode, &oa, OBD_MD_FLSIZE);
474         CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n",
475                lsm->lsm_object_id, inode->i_size, inode->i_size,
481  * some callers, notably truncate, really don't want i_size set based
482  * on the the size returned by the getattr, or lock acquisition in
 *
 * ll_extent_lock_no_validate(): enqueue a DLM extent lock without the
 * i_size-validation work that ll_extent_lock() adds.  A no-op when the fd
 * or superblock is flagged to ignore locking.
 * NOTE(review): `sizeof(extent)` below takes the size of the POINTER, not
 * of struct ldlm_extent — looks wrong (ll_extent_lock passes
 * sizeof(size_lock), an object size); confirm against obd_enqueue's
 * expectations before changing.
 * NOTE(review): extract is missing lines (`rc`/`flags` declarations,
 * early-return bodies, trailing arguments, braces).
 */
485 int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
486                                struct lov_stripe_md *lsm,
487                                int mode, struct ldlm_extent *extent,
488                                struct lustre_handle *lockh)
490         struct ll_sb_info *sbi = ll_i2sbi(inode);
494         LASSERT(lockh->cookie == 0);
496         /* XXX phil: can we do this? won't it screw the file size up? */
497         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
498             (sbi->ll_flags & LL_SBI_NOLCK))
501         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
502                inode->i_ino, extent->start, extent->end);
504         rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
505                          sizeof(extent), mode, &flags, ll_extent_lock_callback,
512  * this grabs a lock and manually implements behaviour that makes it look like
513  * the OST is returning the file size with each lock acquisition.
 *
 * ll_extent_lock(): take the extent lock, then (unless a size lock is
 * already cached — LLI_F_HAVE_SIZE_LOCK) do an OST getattr to refresh
 * i_size and try to obd_match() a PR lock covering [i_size, EOF] so the
 * size can be trusted until that lock is cancelled.
 * NOTE(review): extract is missing lines (early returns, the `matched`
 * branch structure, trailing obd_cancel arguments, braces).
 */
515 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
516                    struct lov_stripe_md *lsm, int mode,
517                    struct ldlm_extent *extent, struct lustre_handle *lockh)
519         struct ll_inode_info *lli = ll_i2info(inode);
520         struct ldlm_extent size_lock;
521         struct lustre_handle match_lockh = {0};
522         int flags, rc, matched;
525         rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
529         if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
532         rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
534                 ll_extent_unlock(fd, inode, lsm, mode, lockh);
538         size_lock.start = inode->i_size;
539         size_lock.end = OBD_OBJECT_EOF;
541         /* XXX I bet we should be checking the lock ignore flags.. */
542         flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA;
543         matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
544                             &size_lock, sizeof(size_lock), LCK_PR, &flags,
545                             inode, &match_lockh);
547         /* hey, alright, we hold a size lock that covers the size we
548          * just found, its not going to change for a while.. */
550         set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
551         obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
/*
 * ll_extent_unlock(): release an extent lock taken by ll_extent_lock*()
 * via obd_cancel(); a no-op under the same ignore-lock flags the lock
 * paths honor.
 * NOTE(review): extract is missing lines (`rc` declaration, early-return
 * body, braces, return).
 */
558 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
559                 struct lov_stripe_md *lsm, int mode,
560                 struct lustre_handle *lockh)
562         struct ll_sb_info *sbi = ll_i2sbi(inode);
566         /* XXX phil: can we do this? won't it screw the file size up? */
567         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
568             (sbi->ll_flags & LL_SBI_NOLCK))
571         rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
/*
 * ll_remove_suid(): strip the setuid bit (and setgid when group-execute is
 * set) from the inode mode on write by a caller without CAP_FSETID —
 * standard security behaviour for writes to suid files.
 * NOTE(review): extract is missing lines (`mode` declaration, the body
 * after the mode clear, braces).
 */
576 static inline void ll_remove_suid(struct inode *inode)
580         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
581         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
583         /* was any of the uid bits set? */
584         mode &= inode->i_mode;
585         if (mode && !capable(CAP_FSETID)) {
586                 inode->i_mode &= ~mode;
587                 // XXX careful here - we cannot change the size
/*
 * ll_update_atime(): push an access-time update through ll_inode_setattr()
 * unless atime is unchanged or the inode is read-only/noatime, then update
 * the in-core atime locally.
 * NOTE(review): extract is missing lines (`attr` declaration, the #if/#else
 * split the two code paths presumably sit under).
 */
592 static void ll_update_atime(struct inode *inode)
597         attr.ia_atime = LTIME_S(CURRENT_TIME);
598         attr.ia_valid = ATTR_ATIME;
600         if (inode->i_atime == attr.ia_atime) return;
601         if (IS_RDONLY(inode)) return;
602         if (IS_NOATIME(inode)) return;
604         /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */
605         ll_inode_setattr(inode, &attr, 0);
607         /* update atime, but don't explicitly write it out just this change */
608         inode->i_atime = CURRENT_TIME;
614  * flush the page cache for an extent as its canceled.  when we're on an
615  * lov we get a lock cancelation for each of the obd locks under the lov
616  * so we have to map the obd's region back onto the stripes in the file
619  * no one can dirty the extent until we've finished our work and they
620  * can enqueue another lock.
622  * XXX this could be asking the inode's dirty tree for info
 *
 * ll_pgcache_remove_extent(): on lock cancel, (1) write back any dirty
 * pages in the cancelled extent when the lock was granted PW, then
 * (2) drop the (now clean) pages from the page cache.  For striped files
 * the per-OST extent is first mapped back to file-page indices via the
 * "lock_to_stripe" obd_get_info call.
 * NOTE(review): extract is missing many lines (`page`/`rc`/`ret`/`stripe`
 * declarations, loop-body braces, #else/#endif lines, continue/break
 * statements) — stride arithmetic below should be read with that in mind.
 */
624 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
625                               struct ldlm_lock *lock)
627         struct ldlm_extent *extent = &lock->l_extent;
628         unsigned long start, end, count, skip, i, j;
633         CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
634                inode->i_ino, inode, extent->start, extent->end, inode->i_size);
/* Convert the byte extent to page indices (end rounded up). */
636         start = extent->start >> PAGE_CACHE_SHIFT;
639         end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
640         if ((end << PAGE_CACHE_SHIFT) < extent->end)
642         if (lsm->lsm_stripe_count > 1) {
645                         struct ldlm_lock *lock;
646                         struct lov_stripe_md *lsm;
647                 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
649                 __u32 vallen = sizeof(stripe);
652                 /* get our offset in the lov */
653                 rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
654                                   &key, &vallen, &stripe);
656                         CERROR("obd_get_info: rc = %d\n", rc);
659                 LASSERT(stripe < lsm->lsm_stripe_count);
/* Map stripe-local page indices back to file page indices. */
661                 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
662                 skip = (lsm->lsm_stripe_count - 1) * count;
663                 start += (start/count * skip) + (stripe * count);
665                 end += (end/count * skip) + (stripe * count);
668         i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
/* The cached size can no longer be trusted once a lock is cancelled. */
670                 clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
674         CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
675                start, start % count, count, skip, end);
677         /* start writeback on dirty pages in the extent when its PW */
678         for (i = start, j = start % count;
679              lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
684                 /* its unlikely, but give us a chance to bail when we're out */
685                 PGCACHE_WRLOCK(inode->i_mapping);
686                 if (list_empty(&inode->i_mapping->dirty_pages)) {
687                         CDEBUG(D_INODE, "dirty list empty\n");
688                         PGCACHE_WRUNLOCK(inode->i_mapping);
691                 PGCACHE_WRUNLOCK(inode->i_mapping);
696                 page = find_get_page(inode->i_mapping, i);
699                 if (!PageDirty(page) || TryLockPage(page)) {
700                         page_cache_release(page);
703                 if (PageDirty(page)) {
704                         CDEBUG(D_INODE, "writing page %p\n", page);
705                         PGCACHE_WRLOCK(inode->i_mapping);
706                         list_del(&page->list);
707                         list_add(&page->list, &inode->i_mapping->locked_pages);
708                         PGCACHE_WRUNLOCK(inode->i_mapping);
710                         /* this writepage might write out pages outside
711                          * this extent, but that's ok, the pages are only
712                          * still dirty because a lock still covers them */
713                         ClearPageDirty(page);
714 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
715                         ret = inode->i_mapping->a_ops->writepage(page);
717                         ret = inode->i_mapping->a_ops->writepage(page, NULL);
724                 page_cache_release(page);
728         /* our locks are page granular thanks to osc_enqueue, we invalidate the
730         LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
731         LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
/* Second pass: drop the now-clean pages from the page cache. */
732         for (i = start, j = start % count ; i < end ; j++, i++) {
737                 PGCACHE_WRLOCK(inode->i_mapping);
738                 if (list_empty(&inode->i_mapping->dirty_pages) &&
739                      list_empty(&inode->i_mapping->clean_pages) &&
740                      list_empty(&inode->i_mapping->locked_pages)) {
741                         CDEBUG(D_INODE, "nothing left\n");
742                         PGCACHE_WRUNLOCK(inode->i_mapping);
745                 PGCACHE_WRUNLOCK(inode->i_mapping);
748                 page = find_get_page(inode->i_mapping, i);
751                 CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
753                 if (page->mapping) /* might have raced */
754 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
755                         truncate_complete_page(page);
757                         truncate_complete_page(page->mapping, page);
760                 page_cache_release(page);
/*
 * ll_extent_lock_callback(): DLM blocking/cancel callback for extent locks.
 * LDLM_CB_BLOCKING: cancel our lock so the conflicting request can proceed.
 * LDLM_CB_CANCELING: flush/drop cached pages covered by the cancelled lock
 * via ll_pgcache_remove_extent().
 * NOTE(review): extract is missing lines (`rc` declaration, the `switch`
 * statement itself, breaks, default case, return).
 */
765 int ll_extent_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
766                             void *data, int flag)
768         struct inode *inode = data;
769         struct ll_inode_info *lli = ll_i2info(inode);
770         struct lustre_handle lockh = { 0 };
774         LASSERT(inode != NULL);
777         case LDLM_CB_BLOCKING:
778                 ldlm_lock2handle(lock, &lockh);
779                 rc = ldlm_cli_cancel(&lockh);
781                         CERROR("ldlm_cli_cancel failed: %d\n", rc);
783         case LDLM_CB_CANCELING:
784                 /* FIXME: we could be given 'canceling intents' so that we
785                  * could know to write-back or simply throw away the pages
786                  * based on if the cancel comes from a desire to, say,
787                  * read or truncate.. */
/* Sanity checks against obviously-bogus pointers before dereferencing. */
788                 LASSERT((unsigned long)inode > 0x1000);
789                 LASSERT((unsigned long)lli > 0x1000);
790                 LASSERT((unsigned long)lli->lli_smd > 0x1000);
791                 ll_pgcache_remove_extent(inode, lli->lli_smd, lock);
/*
 * ll_file_read(): VFS read — take a PR extent lock from *ppos to EOF,
 * register the read extent on the inode (so ll_readpage can see which
 * pages are covered), run generic_file_read(), then unregister and unlock.
 * NOTE(review): extract is missing lines (`ppos` parameter, `err`/`retval`
 * declarations, zero-count early return, braces).
 */
800 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
803         struct ll_file_data *fd = filp->private_data;
804         struct inode *inode = filp->f_dentry->d_inode;
805         struct ll_inode_info *lli = ll_i2info(inode);
806         struct lov_stripe_md *lsm = lli->lli_smd;
807         struct lustre_handle lockh = { 0 };
808         struct ll_read_extent rextent;
812         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
813                inode->i_ino, inode->i_generation, inode, count, *ppos);
815         /* "If nbyte is 0, read() will return 0 and have no other results."
816          *                      -- Single Unix Spec */
820         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
822         /* grab a -> eof extent to push extending writes out of node's caches
823          * so we can see them at the getattr after lock acquisition. this will
824          * turn into a seperate [*ppos + count, EOF] 'size intent' lock attempt
826         rextent.re_extent.start = *ppos;
827         rextent.re_extent.end = OBD_OBJECT_EOF;
829         err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh);
833         /* XXX tell ll_readpage what pages have a PR lock.. */
834         rextent.re_task = current;
835         spin_lock(&lli->lli_read_extent_lock);
836         list_add(&rextent.re_lli_item, &lli->lli_read_extents);
837         spin_unlock(&lli->lli_read_extent_lock);
839         CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
840                inode->i_ino, count, *ppos);
841         retval = generic_file_read(filp, buf, count, ppos);
/* rextent lives on this stack frame — must unlink before returning. */
843         spin_lock(&lli->lli_read_extent_lock);
844         list_del(&rextent.re_lli_item);
845         spin_unlock(&lli->lli_read_extent_lock);
848         ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
853  * Write to a file (through the page cache).
 *
 * ll_file_write(): take a PW extent lock over the write range (to EOF for
 * O_APPEND; skipping size validation for fully page-aligned writes),
 * enforce the per-file maxbytes limit (SIGXFSZ/-EFBIG per POSIX), then run
 * generic_file_write() and unlock.
 * NOTE(review): extract is missing lines (return type line, `err`/`retval`
 * declarations, else branches, out: label, braces).
 */
856 ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
858         struct ll_file_data *fd = file->private_data;
859         struct inode *inode = file->f_dentry->d_inode;
860         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
861         struct lustre_handle lockh = { 0 };
862         struct ldlm_extent extent;
863         loff_t maxbytes = ll_file_maxbytes(inode);
866         char should_validate = 1;
868         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
869                inode->i_ino, inode->i_generation, inode, count, *ppos);
872          * sleep doing some writeback work of this mount's dirty data
873          * if the VM thinks we're low on memory.. other dirtying code
874          * paths should think about doing this, too, but they should be
875          * careful not to hold locked pages while they do so.  like
876          * ll_prepare_write.  *cough*
878         LL_CHECK_DIRTY(inode->i_sb);
880         /* POSIX, but surprised the VFS doesn't check this already */
884         if (file->f_flags & O_APPEND) {
886                 extent.end = OBD_OBJECT_EOF;
888                 extent.start = *ppos;
889                 extent.end = *ppos + count - 1;
890                 /* we really don't care what i_size is if we're doing
891                  * fully page aligned writes */
892                 if ((*ppos & ~PAGE_CACHE_MASK) == 0 &&
893                     (count & ~PAGE_CACHE_MASK) == 0)
898                 err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
900                 err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
905         /* this is ok, g_f_w will overwrite this under i_sem if it races
906          * with a local truncate, it just makes our maxbyte checking easier */
907         if (file->f_flags & O_APPEND)
908                 *ppos = inode->i_size;
910         if (*ppos >= maxbytes) {
911                 if (count || *ppos > maxbytes) {
912                         send_sig(SIGXFSZ, current, 0);
913                         GOTO(out, retval = -EFBIG);
916         if (*ppos + count > maxbytes)
917                 count = maxbytes - *ppos;
919         CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
920                inode->i_ino, count, *ppos);
922         /* generic_file_write handles O_APPEND after getting i_sem */
923         retval = generic_file_write(file, buf, count, ppos);
927         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
929         ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
/*
 * ll_lov_setstripe(): LL_IOC_LOV_SETSTRIPE handler.  Under lli_open_sem:
 * if a stripe already exists, just do the delayed OSC open; otherwise
 * unpack the user-supplied striping via obd_iocontrol(), create the OST
 * objects, and open them.
 * NOTE(review): extract is missing lines (`arg` parameter, `rc`
 * declaration, the lsm-exists test, error paths, braces).
 */
933 static int ll_lov_setstripe(struct inode *inode, struct file *file,
936         struct ll_inode_info *lli = ll_i2info(inode);
937         struct lustre_handle *conn = ll_i2obdconn(inode);
938         struct lov_stripe_md *lsm;
942         down(&lli->lli_open_sem);
945                 up(&lli->lli_open_sem);
946                 CERROR("stripe already exists for ino %lu\n", inode->i_ino);
947                 /* If we haven't already done the open, do so now */
948                 if (file->f_flags & O_LOV_DELAY_CREATE) {
949                         int rc2 = ll_osc_open(conn, inode, file, lsm);
957         rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
959                 up(&lli->lli_open_sem);
962         rc = ll_create_obj(conn, inode, file, lsm);
963         up(&lli->lli_open_sem);
966                 obd_free_memmd(conn, &lsm);
969         rc = ll_osc_open(conn, inode, file, lli->lli_smd);
/*
 * ll_lov_getstripe(): LL_IOC_LOV_GETSTRIPE handler — copy the inode's
 * striping info out to user space via obd_iocontrol().
 * NOTE(review): extract is missing lines (likely a !lsm check and braces).
 */
973 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
975         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
976         struct lustre_handle *conn = ll_i2obdconn(inode);
981         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg);
/*
 * ll_file_ioctl(): VFS ioctl dispatch — handles the llite GET/SET/CLR
 * flags ioctls and LOV stripe get/set locally; everything else is passed
 * down through obd_iocontrol().
 * NOTE(review): extract is missing lines (`arg` parameter, `flags`
 * declaration, the `switch` statement itself, returns, braces).
 */
984 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
987         struct ll_file_data *fd = file->private_data;
988         struct lustre_handle *conn;
990         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
991                inode->i_generation, inode, cmd);
993         if (_IOC_TYPE(cmd) == 'T') /* tty ioctls */
996         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
998         case LL_IOC_GETFLAGS:
999                 /* Get the current value of the file flags */
1000                 return put_user(fd->fd_flags, (int *)arg);
1001         case LL_IOC_SETFLAGS:
1002         case LL_IOC_CLRFLAGS:
1003                 /* Set or clear specific file flags */
1004                 /* XXX This probably needs checks to ensure the flags are
1005                  *     not abused, and to handle any flag side effects.
1007                 if (get_user(flags, (int *) arg))
1010                 if (cmd == LL_IOC_SETFLAGS)
1011                         fd->fd_flags |= flags;
1013                         fd->fd_flags &= ~flags;
1015         case LL_IOC_LOV_SETSTRIPE:
1016                 return ll_lov_setstripe(inode, file, arg);
1017         case LL_IOC_LOV_GETSTRIPE:
1018                 return ll_lov_getstripe(inode, arg);
1020         /* We need to special case any other ioctls we want to handle,
1021          * to send them to the MDS/OST as appropriate and to properly
1022          * network encode the arg field.
1023         case EXT2_IOC_GETFLAGS:
1024         case EXT2_IOC_SETFLAGS:
1025         case EXT2_IOC_GETVERSION_OLD:
1026         case EXT2_IOC_GETVERSION_NEW:
1027         case EXT2_IOC_SETVERSION_OLD:
1028         case EXT2_IOC_SETVERSION_NEW:
/* Default case: forward unrecognized ioctls to the OBD layer. */
1031                 conn = ll_i2obdconn(inode);
1032                 return obd_iocontrol(cmd, conn, 0, NULL, (void *)arg);
/*
 * ll_file_seek(): VFS llseek.  SEEK_END needs an up-to-date i_size, so it
 * takes a whole-file PR extent lock first (which refreshes i_size via
 * ll_extent_lock) and drops it at the end.  Offsets are range-checked
 * against ll_file_maxbytes().
 * NOTE(review): extract is missing lines (`err`/`retval` declarations,
 * #else branch for f_version, return paths, braces).
 */
1036 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1038         struct inode *inode = file->f_dentry->d_inode;
1039         struct ll_file_data *fd = file->private_data;
1040         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1041         struct lustre_handle lockh = {0};
1044         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino,
1045                inode->i_generation, inode,
1046                offset + ((origin==2) ? inode->i_size : file->f_pos));
1048         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
1049         if (origin == 2) { /* SEEK_END */
1051                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1052                 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
1053                 if (err != ELDLM_OK)
1056                 offset += inode->i_size;
1057         } else if (origin == 1) { /* SEEK_CUR */
1058                 offset += file->f_pos;
1062         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1063                 if (offset != file->f_pos) {
1064                         file->f_pos = offset;
1065 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1067                         file->f_version = ++event;
/* Drop the PR lock taken for the SEEK_END case. */
1074                 ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
/*
 * ll_fsync(): VFS fsync — kick off writeback of the inode's mapping and
 * wait for it to complete.
 * NOTE(review): extract is missing lines (`ret` declaration, the check of
 * the fdatasync result before fdatawait, return).
 */
1078 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1081         struct inode *inode = dentry->d_inode;
1083         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1084                inode->i_generation, inode);
1086         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
1088          * filemap_fdata{sync,wait} are also called at PW lock cancelation so
1089          * we know that they can only find data to writeback here if we are
1090          * still holding the PW lock that covered the dirty pages.  XXX we
1091          * should probably get a reference on it, though, just to be clear.
1093         ret = filemap_fdatasync(dentry->d_inode->i_mapping);
1095                 ret = filemap_fdatawait(dentry->d_inode->i_mapping);
/*
 * ll_inode_revalidate(): refresh inode attributes.  If no MDS lock covers
 * the dentry, do an mdc_getattr() RPC (requesting EA/stripe data for
 * regular files), unpack any returned stripe MD, and fold the reply into
 * the inode; then, for striped regular files, take and drop a whole-file
 * PR extent lock so i_size is brought up to date for stat().
 * NOTE(review): extract is missing lines (`fid` declaration, early
 * returns, GOTO labels, braces) — control flow below is partial.
 */
1100 int ll_inode_revalidate(struct dentry *dentry)
1102         struct inode *inode = dentry->d_inode;
1103         struct lov_stripe_md *lsm = NULL;
1107                 CERROR("REPORT THIS LINE TO PETER\n");
1110         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
1111                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
1112 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
1113         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
1116         /* this is very tricky.  it is unsafe to call ll_have_md_lock
1117            when we have a referenced lock: because it may cause an RPC
1118            below when the lock is marked CB_PENDING.  That RPC may not
1119            go out because someone else may be in another RPC waiting for
1121         if (!(dentry->d_it && dentry->d_it->it_lock_mode) &&
1122             !ll_have_md_lock(dentry)) {
1123                 struct ptlrpc_request *req = NULL;
1124                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
1126                 struct mds_body *body;
1127                 struct lov_mds_md *lmm;
1128                 unsigned long valid = 0;
1129                 int eadatalen = 0, rc;
1131                 /* Why don't we update all valid MDS fields here, if we're
1132                  * doing an RPC anyways? -phil */
1133                 if (S_ISREG(inode->i_mode)) {
1134                         eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
1135                         valid |= OBD_MD_FLEASIZE;
1137                 ll_inode2fid(&fid, inode);
1138                 rc = mdc_getattr(&sbi->ll_mdc_conn, &fid,
1139                                  valid, eadatalen, &req);
1141                         CERROR("failure %d inode %lu\n", rc, inode->i_ino);
1145                 body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body));
1146                 LASSERT (body != NULL);         /* checked by mdc_getattr() */
1147                 LASSERT_REPSWABBED (req, 0);    /* swabbed by mdc_getattr() */
/* The OSTs, not the MDS, are authoritative for regular-file size. */
1149                 if (S_ISREG(inode->i_mode) &&
1150                     (body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))) {
1151                         CERROR("MDS sent back size for regular file\n");
1152                         body->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
1155                 /* XXX Too paranoid? */
1156                 if ((body->valid ^ valid) & OBD_MD_FLEASIZE)
1157                         CERROR("Asked for %s eadata but got %s\n",
1158                                (valid & OBD_MD_FLEASIZE) ? "some" : "no",
1159                                (body->valid & OBD_MD_FLEASIZE) ? "some":"none");
1161                 if (S_ISREG(inode->i_mode) &&
1162                     (body->valid & OBD_MD_FLEASIZE)) {
1163                         if (body->eadatasize == 0) { /* no EA data */
1164                                 CERROR("OBD_MD_FLEASIZE set but no data\n");
1167                         /* Only bother with this if inode's lsm not set? */
1168                         lmm = lustre_msg_buf(req->rq_repmsg,1,body->eadatasize);
1169                         LASSERT(lmm != NULL);        /* mdc_getattr() checked */
1170                         LASSERT_REPSWABBED(req, 1);  /* mdc_getattr() swabbed */
1172                         rc = obd_unpackmd (&sbi->ll_osc_conn,
1173                                            &lsm, lmm, body->eadatasize);
1175                                 CERROR("Error %d unpacking eadata\n", rc);
1176                                 ptlrpc_req_finished(req);
1179                         LASSERT(rc >= sizeof(*lsm));
1182                 ll_update_inode(inode, body, lsm);
/* Free the unpacked lsm if ll_update_inode didn't adopt it. */
1183                 if (lsm != NULL && ll_i2info(inode)->lli_smd != lsm)
1184                         obd_free_memmd(&sbi->ll_osc_conn, &lsm);
1186                 ptlrpc_req_finished(req);
1189         lsm = ll_i2info(inode)->lli_smd;
1190         if (!lsm)       /* object not yet allocated, don't validate size */
1194          * unfortunately stat comes in through revalidate and we don't
1195          * differentiate this use from initial instantiation.  we're
1196          * also being wildly conservative and flushing write caches
1197          * so that stat really returns the proper size.
1200                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1201                 struct lustre_handle lockh = {0};
1204                 err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh);
1205                 if (err != ELDLM_OK)
1208                 ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
1213 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/*
 * ll_getattr(): 2.5+ VFS getattr hook — revalidate the inode then copy
 * its fields into the kstat structure.
 * NOTE(review): extract is missing lines (the `stat` parameter, `res`
 * declaration, #else for the dev field, blocks/blksize fields, return).
 */
1214 static int ll_getattr(struct vfsmount *mnt, struct dentry *de,
1218         struct inode *inode = de->d_inode;
1220         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
1221         res = ll_inode_revalidate(de);
1224 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1225         stat->dev = inode->i_dev;
1227         stat->ino = inode->i_ino;
1228         stat->mode = inode->i_mode;
1229         stat->nlink = inode->i_nlink;
1230         stat->uid = inode->i_uid;
1231         stat->gid = inode->i_gid;
1232         stat->rdev = kdev_t_to_nr(inode->i_rdev);
1233         stat->atime = inode->i_atime;
1234         stat->mtime = inode->i_mtime;
1235         stat->ctime = inode->i_ctime;
1236         stat->size = inode->i_size;
/*
 * VFS operation tables wiring the llite methods above into the kernel
 * (GNU-style designated initializers, as used in 2.4 kernels).
 * NOTE(review): extract is missing lines — e.g. the `read` entry of
 * ll_file_operations and the closing `};` of each initializer.
 */
1241 struct file_operations ll_file_operations = {
1243         write:          ll_file_write,
1244         ioctl:          ll_file_ioctl,
1246         release:        ll_file_release,
1247         mmap:           generic_file_mmap,
1248         llseek:         ll_file_seek,
1252 struct inode_operations ll_file_inode_operations = {
1253         setattr_raw:    ll_setattr_raw,
1254         setattr:        ll_setattr,
1255         truncate:       ll_truncate,
1256 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1257         getattr:        ll_getattr,
1259         revalidate:     ll_inode_revalidate,
/* Table for special files (no data methods, metadata ops only). */
1263 struct inode_operations ll_special_inode_operations = {
1264         setattr_raw:    ll_setattr_raw,
1265         setattr:        ll_setattr,
1266 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1267         getattr:        ll_getattr,
1269         revalidate:     ll_inode_revalidate,