lustre/llite/file.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
   5  *   Author: Peter Braam <braam@clusterfs.com>
   6  *   Author: Phil Schwan <phil@clusterfs.com>
   7  *   Author: Andreas Dilger <adilger@clusterfs.com>
   8  *
   9  *   This file is part of Lustre, http://www.lustre.org.
  10  *
  11  *   Lustre is free software; you can redistribute it and/or
  12  *   modify it under the terms of version 2 of the GNU General Public
  13  *   License as published by the Free Software Foundation.
  14  *
  15  *   Lustre is distributed in the hope that it will be useful,
  16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  *   GNU General Public License for more details.
  19  *
  20  *   You should have received a copy of the GNU General Public License
  21  *   along with Lustre; if not, write to the Free Software
  22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25 #define DEBUG_SUBSYSTEM S_LLITE
  26 #include <linux/lustre_dlm.h>
  27 #include <linux/lustre_lite.h>
  28 #include <linux/obd_lov.h>      /* for lov_mds_md_size() in lov_setstripe() */
  29 #include <linux/random.h>
  30 #include <linux/pagemap.h>
  31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
  32 #include <linux/lustre_compat25.h>
  33 #endif
  34
  35 #include "llite_internal.h"
  36
  37 static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode,
  38                         struct file *file)
  39 {
  40         struct ll_file_data *fd = file->private_data;
  41         struct ptlrpc_request *req = NULL;
  42         unsigned long flags;
  43         struct obd_import *imp;
  44         int rc;
  45         ENTRY;
  46
  47         /* Complete the open request and remove it from replay list */
  48         rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino,
  49                        inode->i_mode, &fd->fd_mds_och.och_fh, &req);
  50         if (rc)
  51                 CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc);
  52
  53         imp = fd->fd_mds_och.och_req->rq_import;
  54         LASSERT(imp != NULL);
  55         spin_lock_irqsave(&imp->imp_lock, flags);
  56
  57         DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p",
  58                   fd->fd_mds_och.och_req);
  59
  60         /* We held on to the request for replay until we saw a close for that
  61          * file.  Now that we've closed it, it gets replayed on the basis of
  62          * its transno only. */
  63         spin_lock (&fd->fd_mds_och.och_req->rq_lock);
  64         fd->fd_mds_och.och_req->rq_replay = 0;
  65         spin_unlock (&fd->fd_mds_och.och_req->rq_lock);
  66
  67         if (fd->fd_mds_och.och_req->rq_transno) {
  68                 /* This open created a file, so it needs replay as a
  69                  * normal transaction now.  Our reference to it now
  70                  * effectively owned by the imp_replay_list, and it'll
  71                  * be committed just like other transno-having
  72                  * requests from here on out. */
  73
  74                 /* We now retain this close request, so that it is
  75                  * replayed if the open is replayed.  We duplicate the
  76                  * transno, so that we get freed at the right time,
  77                  * and rely on the difference in xid to keep
  78                  * everything ordered correctly.
  79                  *
  80                  * But! If this close was already given a transno
  81                  * (because it caused real unlinking of an
  82                  * open-unlinked file, f.e.), then we'll be ordered on
  83                  * the basis of that and we don't need to do anything
  84                  * magical here. */
  85                 if (!req->rq_transno) {
  86                         req->rq_transno = fd->fd_mds_och.och_req->rq_transno;
  87                         ptlrpc_retain_replayable_request(req, imp);
  88                 }
  89                 spin_unlock_irqrestore(&imp->imp_lock, flags);
  90
  91                 /* Should we free_committed now? we always free before
  92                  * replay, so it's probably a wash.  We could check to
  93                  * see if the fd_req should already be committed, in
  94                  * which case we can avoid the whole retain_replayable
  95                  * dance. */
  96         } else {
  97                 /* No transno means that we can just drop our ref. */
  98                 spin_unlock_irqrestore(&imp->imp_lock, flags);
  99         }
 100         ptlrpc_req_finished(fd->fd_mds_och.och_req);
 101
 102         /* Do this after the fd_req->rq_transno check, because we don't want
 103          * to bounce off zero references. */
 104         ptlrpc_req_finished(req);
 105         fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC;
 106         file->private_data = NULL;
 107         OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
 108
 109         RETURN(-abs(rc));
 110 }
 111
 112 /* While this returns an error code, fput() the caller does not, so we need
 113  * to make every effort to clean up all of our state here.  Also, applications
 114  * rarely check close errors and even if an error is returned they will not
 115  * re-try the close call.
 116  */
 117 int ll_file_release(struct inode *inode, struct file *file)
 118 {
 119         struct ll_file_data *fd;
 120         struct obdo oa;
 121         struct ll_sb_info *sbi = ll_i2sbi(inode);
 122         struct ll_inode_info *lli = ll_i2info(inode);
 123         struct lov_stripe_md *lsm = lli->lli_smd;
 124         int rc = 0, rc2;
 125
 126         ENTRY;
 127         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 128                inode->i_generation, inode);
 129
 130         /* don't do anything for / */
 131         if (inode->i_sb->s_root == file->f_dentry)
 132                 RETURN(0);
 133
 134         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_RELEASE);
 135         fd = (struct ll_file_data *)file->private_data;
 136         if (!fd) /* no process opened the file after an mcreate */
 137                 RETURN(0);
 138
 139         /* we might not be able to get a valid handle on this file
 140          * again so we really want to flush our write cache.. */
 141         if (S_ISREG(inode->i_mode) && lsm) {
 142                 write_inode_now(inode, 0);
 143                 obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
 144                                             OBD_MD_FLMTIME | OBD_MD_FLCTIME);
 145                 memcpy(obdo_handle(&oa), &fd->fd_ost_och, FD_OSTDATA_SIZE);
 146                 oa.o_valid |= OBD_MD_FLHANDLE;
 147
 148                 rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL);
 149                 if (rc)
 150                         CERROR("inode %lu object close failed: rc %d\n",
 151                                inode->i_ino, rc);
 152         }
 153
 154         rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
 155         if (rc2 && !rc)
 156                 rc = rc2;
 157
 158         RETURN(rc);
 159 }
 160
 161 static int ll_local_open(struct file *file, struct lookup_intent *it)
 162 {
 163         struct ptlrpc_request *req = it->it_data;
 164         struct ll_file_data *fd;
 165         struct mds_body *body;
 166         ENTRY;
 167
 168         body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body));
 169         LASSERT (body != NULL);                 /* reply already checked out */
 170         LASSERT_REPSWABBED (req, 1);            /* and swabbed down */
 171
 172         LASSERT(!file->private_data);
 173
 174         OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
 175         /* We can't handle this well without reorganizing ll_file_open and
 176          * ll_mdc_close, so don't even try right now. */
 177         LASSERT(fd != NULL);
 178
 179         memset(fd, 0, sizeof(*fd));
 180
 181         memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle));
 182         fd->fd_mds_och.och_req = it->it_data;
 183         file->private_data = fd;
 184
 185         RETURN(0);
 186 }
 187
 188 static int ll_osc_open(struct lustre_handle *conn, struct inode *inode,
 189                        struct file *file, struct lov_stripe_md *lsm)
 190 {
 191         struct ll_file_data *fd = file->private_data;
 192         struct obdo *oa;
 193         int rc;
 194         ENTRY;
 195
 196         oa = obdo_alloc();
 197         if (!oa)
 198                 RETURN(-ENOMEM);
 199         oa->o_id = lsm->lsm_object_id;
 200         oa->o_mode = S_IFREG;
 201         oa->o_valid = OBD_MD_FLID;
 202         obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
 203         rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och);
 204         if (rc)
 205                 GOTO(out, rc);
 206
 207         file->f_flags &= ~O_LOV_DELAY_CREATE;
 208         obdo_refresh_inode(inode, oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 209                                        OBD_MD_FLATIME | OBD_MD_FLMTIME |
 210                                        OBD_MD_FLCTIME));
 211         EXIT;
 212 out:
 213         obdo_free(oa);
 214         return rc;
 215 }
 216
 217 /* Caller must hold lli_open_sem to protect lli->lli_smd from changing and
 218  * duplicate objects from being created.  We only install lsm to lli_smd if
 219  * the mdc open was successful (hence stored stripe MD on MDS), otherwise
 220  * other nodes could try to create different objects for the same file.
 221  */
 222 static int ll_create_obj(struct lustre_handle *conn, struct inode *inode,
 223                          struct file *file, struct lov_stripe_md *lsm)
 224 {
 225         struct ptlrpc_request *req = NULL;
 226         struct ll_inode_info *lli = ll_i2info(inode);
 227         struct lov_mds_md *lmm = NULL;
 228         struct obdo *oa;
 229         struct iattr iattr;
 230         struct mdc_op_data op_data;
 231         struct obd_trans_info oti = { 0 };
 232         int rc, err, lmm_size = 0;
 233         ENTRY;
 234
 235         oa = obdo_alloc();
 236         if (!oa)
 237                 RETURN(-ENOMEM);
 238
 239         LASSERT(S_ISREG(inode->i_mode));
 240         oa->o_mode = S_IFREG | 0600;
 241         oa->o_id = inode->i_ino;
 242         oa->o_generation = inode->i_generation;
 243         /* Keep these 0 for now, because chown/chgrp does not change the
 244          * ownership on the OST, and we don't want to allow BA OST NFS
 245          * users to access these objects by mistake. */
 246         oa->o_uid = 0;
 247         oa->o_gid = 0;
 248         oa->o_valid = OBD_MD_FLID | OBD_MD_FLGENER | OBD_MD_FLTYPE |
 249                 OBD_MD_FLMODE | OBD_MD_FLUID | OBD_MD_FLGID;
 250 #ifdef ENABLE_ORPHANS
 251         oa->o_valid |= OBD_MD_FLCOOKIE;
 252 #endif
 253
 254         obdo_from_inode(oa, inode, OBD_MD_FLTYPE|OBD_MD_FLATIME|OBD_MD_FLMTIME|
 255                         OBD_MD_FLCTIME | (inode->i_size ? OBD_MD_FLSIZE : 0));
 256
 257         rc = obd_create(conn, oa, &lsm, &oti);
 258         if (rc) {
 259                 CERROR("error creating objects for inode %lu: rc = %d\n",
 260                        inode->i_ino, rc);
 261                 if (rc > 0) {
 262                         CERROR("obd_create returned invalid rc %d\n", rc);
 263                         rc = -EIO;
 264                 }
 265                 GOTO(out_oa, rc);
 266         }
 267         obdo_refresh_inode(inode, oa, OBD_MD_FLBLKSZ);
 268
 269         LASSERT(lsm && lsm->lsm_object_id);
 270         rc = obd_packmd(conn, &lmm, lsm);
 271         if (rc < 0)
 272                 GOTO(out_destroy, rc);
 273
 274         lmm_size = rc;
 275
 276         /* Save the stripe MD with this file on the MDS */
 277         memset(&iattr, 0, sizeof(iattr));
 278         iattr.ia_valid = ATTR_FROM_OPEN;
 279
 280         ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0);
 281
 282 #if 0
 283 #warning FIXME: next line is for debugging purposes only
 284         obd_log_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, oti.oti_numcookies,
 285                        oti.oti_logcookies, OBD_LLOG_FL_SENDNOW);
 286 #endif
 287
 288         rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data, &iattr,
 289                          lmm, lmm_size, oti.oti_logcookies,
 290                          oti.oti_numcookies * sizeof(oti.oti_onecookie), &req);
 291         ptlrpc_req_finished(req);
 292
 293         obd_free_diskmd(conn, &lmm);
 294
 295         /* If we couldn't complete mdc_open() and store the stripe MD on the
 296          * MDS, we need to destroy the objects now or they will be leaked.
 297          */
 298         if (rc) {
 299                 CERROR("error: storing stripe MD for %lu: rc %d\n",
 300                        inode->i_ino, rc);
 301                 GOTO(out_destroy, rc);
 302         }
 303         lli->lli_smd = lsm;
 304         lli->lli_maxbytes = lsm->lsm_maxbytes;
 305
 306         EXIT;
 307 out_oa:
 308         oti_free_cookies(&oti);
 309         obdo_free(oa);
 310         return rc;
 311
 312 out_destroy:
 313         oa->o_id = lsm->lsm_object_id;
 314         oa->o_valid = OBD_MD_FLID;
 315         obdo_from_inode(oa, inode, OBD_MD_FLTYPE);
 316 #if 0
 317         err = obd_log_cancel(conn, lsm, oti.oti_numcookies, oti.oti_logcookies,
 318                              OBD_LLOG_FL_SENDNOW);
 319         if (err)
 320                 CERROR("error cancelling inode %lu log cookies: rc %d\n",
 321                        inode->i_ino, err);
 322 #endif
 323         err = obd_destroy(conn, oa, lsm, NULL);
 324         obd_free_memmd(conn, &lsm);
 325         if (err)
 326                 CERROR("error uncreating inode %lu objects: rc %d\n",
 327                        inode->i_ino, err);
 328         goto out_oa;
 329 }
 330
 331 /* Open a file, and (for the very first open) create objects on the OSTs at
 332  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 333  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
 334  * lli_open_sem to ensure no other process will create objects, send the
 335  * stripe MD to the MDS, or try to destroy the objects if that fails.
 336  *
 337  * If we already have the stripe MD locally then we don't request it in
 338  * mdc_open(), by passing a lmm_size = 0.
 339  *
 340  * It is up to the application to ensure no other processes open this file
 341  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 342  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 343  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 344  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 345  */
 346 int ll_file_open(struct inode *inode, struct file *file)
 347 {
 348         struct ll_sb_info *sbi = ll_i2sbi(inode);
 349         struct ll_inode_info *lli = ll_i2info(inode);
 350         struct lustre_handle *conn = ll_i2obdconn(inode);
 351         struct lookup_intent *it;
 352         struct lov_stripe_md *lsm;
 353         int rc = 0;
 354         ENTRY;
 355
 356         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 357                inode->i_generation, inode);
 358
 359         /* don't do anything for / */
 360         if (inode->i_sb->s_root == file->f_dentry)
 361                 RETURN(0);
 362
 363         it = file->f_it;
 364         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
 365
 366         rc = ll_it_open_error(DISP_OPEN_OPEN, it);
 367         if (rc)
 368                 RETURN(rc);
 369
 370         rc = ll_local_open(file, it);
 371         if (rc)
 372                 LBUG();
 373
 374         mdc_set_open_replay_data(&((struct ll_file_data *)
 375                                    file->private_data)->fd_mds_och);
 376         if (!S_ISREG(inode->i_mode))
 377                 RETURN(0);
 378
 379         lsm = lli->lli_smd;
 380         if (lsm == NULL) {
 381                 if (file->f_flags & O_LOV_DELAY_CREATE ||
 382                     !(file->f_mode & FMODE_WRITE)) {
 383                         CDEBUG(D_INODE, "delaying object creation\n");
 384                         RETURN(0);
 385                 }
 386                 down(&lli->lli_open_sem);
 387                 if (!lli->lli_smd) {
 388                         rc = ll_create_obj(conn, inode, file, NULL);
 389                         up(&lli->lli_open_sem);
 390                         if (rc)
 391                                 GOTO(out_close, rc);
 392                 } else {
 393                         CERROR("warning: stripe already set on ino %lu\n",
 394                                inode->i_ino);
 395                         up(&lli->lli_open_sem);
 396                 }
 397                 lsm = lli->lli_smd;
 398         }
 399
 400         rc = ll_osc_open(conn, inode, file, lsm);
 401         if (rc)
 402                 GOTO(out_close, rc);
 403         RETURN(0);
 404
 405  out_close:
 406         ll_mdc_close(&sbi->ll_mdc_conn, inode, file);
 407         return rc;
 408 }
 409
 410 /*
 411  * really does the getattr on the inode and updates its fields
 412  */
 413 int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm,
 414                      void *ostdata)
 415 {
 416         struct ll_sb_info *sbi = ll_i2sbi(inode);
 417         struct ll_inode_info *lli = ll_i2info(inode);
 418         struct ptlrpc_request_set *set;
 419         struct obdo oa;
 420         int bef, aft;
 421         unsigned long before, after;
 422         int rc;
 423         ENTRY;
 424
 425         LASSERT(lsm);
 426         LASSERT(sbi);
 427         LASSERT(lli);
 428
 429         memset(&oa, 0, sizeof oa);
 430         oa.o_id = lsm->lsm_object_id;
 431         oa.o_mode = S_IFREG;
 432         oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
 433                 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
 434                 OBD_MD_FLCTIME;
 435
 436         if (ostdata != NULL) {
 437                 memcpy(obdo_handle(&oa), ostdata, FD_OSTDATA_SIZE);
 438                 oa.o_valid |= OBD_MD_FLHANDLE;
 439         }
 440
 441         /* getattr can race with writeback.  we don't want to trust a getattr
 442          * that doesn't include the writeback of our farthest cached pages
 443          * that it raced with. */
 444         /* Now that the OSC knows the cached-page status, it can and should be
 445          * adjusting its getattr results to include the maximum cached offset
 446          * for its stripe(s). */
 447         do {
 448                 bef = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
 449                                             &before);
 450 #if 0
 451                 rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm);
 452 #else
 453                 set = ptlrpc_prep_set ();
 454                 if (set == NULL) {
 455                         CERROR ("ENOMEM allocing request set\n");
 456                         rc = -ENOMEM;
 457                 } else {
 458                         rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set);
 459                         if (rc == 0)
 460                                 rc = ptlrpc_set_wait (set);
 461                         ptlrpc_set_destroy (set);
 462                 }
 463 #endif
 464                 if (rc)
 465                         RETURN(rc);
 466
 467                 aft = obd_last_dirty_offset(ll_i2obdconn(inode), lli->lli_smd,
 468                                             &after);
 469                 CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after);
 470         } while (bef == 0 &&
 471                  (aft != 0 || after < before) &&
 472                  oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT);
 473
 474         obdo_refresh_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 475                                         OBD_MD_FLMTIME | OBD_MD_FLCTIME));
 476         if (inode->i_blksize < PAGE_CACHE_SIZE)
 477                 inode->i_blksize = PAGE_CACHE_SIZE;
 478
 479         /* make sure getattr doesn't return a size that causes writeback
 480          * to forget about cached writes */
 481         if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) {
 482                 CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead "
 483                                 "of oa "LPU64"\n", after, inode->i_size,
 484                                 oa.o_size);
 485                 RETURN(0);
 486         }
 487
 488         obdo_to_inode(inode, &oa, OBD_MD_FLSIZE);
 489
 490         CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n",
 491                lsm->lsm_object_id, inode->i_size, inode->i_size,
 492                inode->i_blksize);
 493         RETURN(0);
 494 }
 495
 496 static inline void ll_remove_suid(struct inode *inode)
 497 {
 498         unsigned int mode;
 499
 500         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
 501         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
 502
 503         /* was any of the uid bits set? */
 504         mode &= inode->i_mode;
 505         if (mode && !capable(CAP_FSETID)) {
 506                 inode->i_mode &= ~mode;
 507                 // XXX careful here - we cannot change the size
 508         }
 509 }
 510
 511 #if 0
 512 static void ll_update_atime(struct inode *inode)
 513 {
 514         if (IS_RDONLY(inode)) return;
 515
 516         /* update atime, but don't explicitly write it out just this change */
 517         inode->i_atime = CURRENT_TIME;
 518 }
 519 #endif
 520
 521 /*
 522  * flush the page cache for an extent as its canceled.  when we're on an
 523  * lov we get a lock cancelation for each of the obd locks under the lov
 524  * so we have to map the obd's region back onto the stripes in the file
 525  * that it held.
 526  *
 527  * no one can dirty the extent until we've finished our work and they
 528  * can enqueue another lock.
 529  *
 530  * XXX this could be asking the inode's dirty tree for info
 531  */
 532 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
 533                               struct ldlm_lock *lock)
 534 {
 535         struct ldlm_extent *extent = &lock->l_extent;
 536         unsigned long start, end, count, skip, i, j;
 537         struct page *page;
 538         int ret;
 539         ENTRY;
 540
 541         CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n",
 542                inode->i_ino, inode, extent->start, extent->end, inode->i_size);
 543
 544         start = extent->start >> PAGE_CACHE_SHIFT;
 545         count = ~0;
 546         skip = 0;
 547         end = (extent->end >> PAGE_CACHE_SHIFT) + 1;
 548         if ((end << PAGE_CACHE_SHIFT) < extent->end)
 549                 end = ~0;
 550         if (lsm->lsm_stripe_count > 1) {
 551                 struct {
 552                         char name[16];
 553                         struct ldlm_lock *lock;
 554                         struct lov_stripe_md *lsm;
 555                 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
 556                 __u32 stripe;
 557                 __u32 vallen = sizeof(stripe);
 558                 int rc;
 559
 560                 /* get our offset in the lov */
 561                 rc = obd_get_info(ll_i2obdconn(inode), sizeof(key),
 562                                   &key, &vallen, &stripe);
 563                 if (rc != 0) {
 564                         CERROR("obd_get_info: rc = %d\n", rc);
 565                         LBUG();
 566                 }
 567                 LASSERT(stripe < lsm->lsm_stripe_count);
 568
 569                 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
 570                 skip = (lsm->lsm_stripe_count - 1) * count;
 571                 start += (start/count * skip) + (stripe * count);
 572                 if (end != ~0)
 573                         end += (end/count * skip) + (stripe * count);
 574         }
 575
 576         i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
 577         if (end >= i)
 578                 clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags));
 579         if (i < end)
 580                 end = i;
 581
 582         CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n",
 583                start, start % count, count, skip, end);
 584
 585         /* start writeback on dirty pages in the extent when its PW */
 586         for (i = start, j = start % count;
 587              lock->l_granted_mode == LCK_PW && i < end; j++, i++) {
 588                 if (j == count) {
 589                         i += skip;
 590                         j = 0;
 591                 }
 592                 /* its unlikely, but give us a chance to bail when we're out */
 593                 ll_pgcache_lock(inode->i_mapping);
 594                 if (list_empty(&inode->i_mapping->dirty_pages)) {
 595                         CDEBUG(D_INODE, "dirty list empty\n");
 596                         ll_pgcache_unlock(inode->i_mapping);
 597                         break;
 598                 }
 599                 ll_pgcache_unlock(inode->i_mapping);
 600
 601                 if (need_resched())
 602                         schedule();
 603
 604                 page = find_get_page(inode->i_mapping, i);
 605                 if (page == NULL)
 606                         continue;
 607                 if (!PageDirty(page) || TryLockPage(page)) {
 608                         page_cache_release(page);
 609                         continue;
 610                 }
 611                 if (PageDirty(page)) {
 612                         CDEBUG(D_INODE, "writing page %p\n", page);
 613                         ll_pgcache_lock(inode->i_mapping);
 614                         list_del(&page->list);
 615                         list_add(&page->list, &inode->i_mapping->locked_pages);
 616                         ll_pgcache_unlock(inode->i_mapping);
 617
 618                         /* this writepage might write out pages outside
 619                          * this extent, but that's ok, the pages are only
 620                          * still dirty because a lock still covers them */
 621                         ClearPageDirty(page);
 622 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 623                         ret = inode->i_mapping->a_ops->writepage(page);
 624 #else
 625                         ret = inode->i_mapping->a_ops->writepage(page, NULL);
 626 #endif
 627                         if (ret != 0)
 628                                 unlock_page(page);
 629                 } else {
 630                         unlock_page(page);
 631                 }
 632                 page_cache_release(page);
 633
 634         }
 635
 636         /* our locks are page granular thanks to osc_enqueue, we invalidate the
 637          * whole page. */
 638         LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0);
 639         LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0);
 640         for (i = start, j = start % count ; i < end ; j++, i++) {
 641                 if (j == count) {
 642                         i += skip;
 643                         j = 0;
 644                 }
 645                 ll_pgcache_lock(inode->i_mapping);
 646                 if (list_empty(&inode->i_mapping->dirty_pages) &&
 647                      list_empty(&inode->i_mapping->clean_pages) &&
 648                      list_empty(&inode->i_mapping->locked_pages)) {
 649                         CDEBUG(D_INODE, "nothing left\n");
 650                         ll_pgcache_unlock(inode->i_mapping);
 651                         break;
 652                 }
 653                 ll_pgcache_unlock(inode->i_mapping);
 654                 if (need_resched())
 655                         schedule();
 656                 page = find_get_page(inode->i_mapping, i);
 657                 if (page == NULL)
 658                         continue;
 659                 CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index);
 660                 lock_page(page);
 661                 if (page->mapping) /* might have raced */
 662 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 663                         truncate_complete_page(page);
 664 #else
 665                         truncate_complete_page(page->mapping, page);
 666 #endif
 667                 unlock_page(page);
 668                 page_cache_release(page);
 669         }
 670         EXIT;
 671 }
 672
 673 static int ll_extent_lock_callback(struct ldlm_lock *lock,
 674                                    struct ldlm_lock_desc *new, void *data,
 675                                    int flag)
 676 {
 677         struct inode *inode = data;
 678         struct ll_inode_info *lli = ll_i2info(inode);
 679         struct lustre_handle lockh = { 0 };
 680         int rc;
 681         ENTRY;
 682
 683         if ((unsigned long)inode < 0x1000) {
 684                 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
 685                 LBUG();
 686         }
 687
 688         switch (flag) {
 689         case LDLM_CB_BLOCKING:
 690                 ldlm_lock2handle(lock, &lockh);
 691                 rc = ldlm_cli_cancel(&lockh);
 692                 if (rc != ELDLM_OK)
 693                         CERROR("ldlm_cli_cancel failed: %d\n", rc);
 694                 break;
 695         case LDLM_CB_CANCELING:
 696                 /* FIXME: we could be given 'canceling intents' so that we
 697                  * could know to write-back or simply throw away the pages
 698                  * based on if the cancel comes from a desire to, say,
 699                  * read or truncate.. */
 700                 if ((unsigned long)lli->lli_smd < 0x1000) {
 701                         /* note that lli is part of the inode itself, so it
 702                          * is valid if as checked the inode pointer above. */
 703                         CERROR("inode %lu, sb %p, lli %p, lli_smd %p\n",
 704                                inode->i_ino, inode->i_sb, lli, lli->lli_smd);
 705                         LDLM_ERROR(lock, "cancel lock on bad inode %p", inode);
 706                         LBUG();
 707                 }
 708
 709                 ll_pgcache_remove_extent(inode, lli->lli_smd, lock);
 710                 break;
 711         default:
 712                 LBUG();
 713         }
 714
 715         RETURN(0);
 716 }
 717
 718 /*
 719  * some callers, notably truncate, really don't want i_size set based
 720  * on the the size returned by the getattr, or lock acquisition in
 721  * the future.
 722  */
 723 int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode,
 724                    struct lov_stripe_md *lsm,
 725                    int mode, struct ldlm_extent *extent,
 726                    struct lustre_handle *lockh)
 727 {
 728         struct ll_sb_info *sbi = ll_i2sbi(inode);
 729         int rc, flags = 0;
 730         ENTRY;
 731
 732         LASSERT(lockh->cookie == 0);
 733
 734         /* XXX phil: can we do this?  won't it screw the file size up? */
 735         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
 736             (sbi->ll_flags & LL_SBI_NOLCK))
 737                 RETURN(0);
 738
 739         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
 740                inode->i_ino, extent->start, extent->end);
 741
 742         rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent,
 743                          sizeof(extent), mode, &flags, ll_extent_lock_callback,
 744                          inode, lockh);
 745
 746         RETURN(rc);
 747 }
 748
 749 /*
 750  * this grabs a lock and manually implements behaviour that makes it look like
 751  * the OST is returning the file size with each lock acquisition.
 752  */
 753 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
 754                    struct lov_stripe_md *lsm, int mode,
 755                    struct ldlm_extent *extent, struct lustre_handle *lockh)
 756 {
 757         struct ll_inode_info *lli = ll_i2info(inode);
 758         struct ldlm_extent size_lock;
 759         struct lustre_handle match_lockh = {0};
 760         int flags, rc, matched;
 761         ENTRY;
 762
 763         rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh);
 764         if (rc != ELDLM_OK)
 765                 RETURN(rc);
 766
 767         if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags))
 768                 RETURN(0);
 769
 770         rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL);
 771         if (rc) {
 772                 ll_extent_unlock(fd, inode, lsm, mode, lockh);
 773                 RETURN(rc);
 774         }
 775
 776         size_lock.start = inode->i_size;
 777         size_lock.end = OBD_OBJECT_EOF;
 778
 779         /* XXX I bet we should be checking the lock ignore flags.. */
 780         flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED | LDLM_FL_MATCH_DATA;
 781         matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT,
 782                             &size_lock, sizeof(size_lock), LCK_PR, &flags,
 783                             inode, &match_lockh);
 784
 785         /* hey, alright, we hold a size lock that covers the size we
 786          * just found, its not going to change for a while.. */
 787         if (matched == 1) {
 788                 set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags);
 789                 obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR,
 790                            &match_lockh);
 791         }
 792
 793         RETURN(0);
 794 }
 795
 796 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
 797                 struct lov_stripe_md *lsm, int mode,
 798                 struct lustre_handle *lockh)
 799 {
 800         struct ll_sb_info *sbi = ll_i2sbi(inode);
 801         int rc;
 802         ENTRY;
 803
 804         /* XXX phil: can we do this?  won't it screw the file size up? */
 805         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
 806             (sbi->ll_flags & LL_SBI_NOLCK))
 807                 RETURN(0);
 808
 809         rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh);
 810
 811         RETURN(rc);
 812 }
 813
 814 static ssize_t ll_file_read(struct file *filp, char *buf, size_t count,
 815                             loff_t *ppos)
 816 {
 817         struct ll_file_data *fd = filp->private_data;
 818         struct inode *inode = filp->f_dentry->d_inode;
 819         struct ll_inode_info *lli = ll_i2info(inode);
 820         struct lov_stripe_md *lsm = lli->lli_smd;
 821         struct lustre_handle lockh = { 0 };
 822         struct ll_read_extent rextent;
 823         ldlm_error_t err;
 824         ssize_t retval;
 825         ENTRY;
 826         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
 827                inode->i_ino, inode->i_generation, inode, count, *ppos);
 828
 829         /* "If nbyte is 0, read() will return 0 and have no other results."
 830          *                      -- Single Unix Spec */
 831         if (count == 0)
 832                 RETURN(0);
 833
 834         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
 835                             count);
 836
 837         if (!lsm)
 838                 RETURN(0);
 839
 840         /* grab a -> eof extent to push extending writes out of node's caches
 841          * so we can see them at the getattr after lock acquisition.  this will
 842          * turn into a seperate [*ppos + count, EOF] 'size intent' lock attempt
 843          * in the future. */
 844         rextent.re_extent.start = *ppos;
 845         rextent.re_extent.end = OBD_OBJECT_EOF;
 846
 847         err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh);
 848         if (err != ELDLM_OK)
 849                 RETURN(-ENOLCK);
 850
 851         /* XXX tell ll_readpage what pages have a PR lock.. */
 852         rextent.re_task = current;
 853         spin_lock(&lli->lli_read_extent_lock);
 854         list_add(&rextent.re_lli_item, &lli->lli_read_extents);
 855         spin_unlock(&lli->lli_read_extent_lock);
 856
 857         CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n",
 858                inode->i_ino, count, *ppos);
 859         retval = generic_file_read(filp, buf, count, ppos);
 860
 861         spin_lock(&lli->lli_read_extent_lock);
 862         list_del(&rextent.re_lli_item);
 863         spin_unlock(&lli->lli_read_extent_lock);
 864
 865         /* XXX errors? */
 866         ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
 867         RETURN(retval);
 868 }
 869
 870 /*
 871  * Write to a file (through the page cache).
 872  */
 873 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
 874                              loff_t *ppos)
 875 {
 876         struct ll_file_data *fd = file->private_data;
 877         struct inode *inode = file->f_dentry->d_inode;
 878         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 879         struct lustre_handle lockh = { 0 };
 880         struct ldlm_extent extent;
 881         loff_t maxbytes = ll_file_maxbytes(inode);
 882         ldlm_error_t err;
 883         ssize_t retval;
 884         char should_validate = 1;
 885         ENTRY;
 886         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
 887                inode->i_ino, inode->i_generation, inode, count, *ppos);
 888
 889         SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
 890         /*
 891          * sleep doing some writeback work of this mount's dirty data
 892          * if the VM thinks we're low on memory.. other dirtying code
 893          * paths should think about doing this, too, but they should be
 894          * careful not to hold locked pages while they do so.  like
 895          * ll_prepare_write.  *cough*
 896          */
 897         ll_check_dirty(inode->i_sb);
 898
 899         /* POSIX, but surprised the VFS doesn't check this already */
 900         if (count == 0)
 901                 RETURN(0);
 902
 903         LASSERT(lsm);
 904
 905         if (file->f_flags & O_APPEND) {
 906                 extent.start = 0;
 907                 extent.end = OBD_OBJECT_EOF;
 908         } else  {
 909                 extent.start = *ppos;
 910                 extent.end = *ppos + count - 1;
 911                 /* we really don't care what i_size is if we're doing
 912                  * fully page aligned writes */
 913                 if ((*ppos & ~PAGE_CACHE_MASK) == 0 &&
 914                     (count & ~PAGE_CACHE_MASK) == 0)
 915                         should_validate = 0;
 916         }
 917
 918         if (should_validate)
 919                 err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh);
 920         else
 921                 err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW,
 922                                                  &extent, &lockh);
 923         if (err != ELDLM_OK)
 924                 RETURN(-ENOLCK);
 925
 926         /* this is ok, g_f_w will overwrite this under i_sem if it races
 927          * with a local truncate, it just makes our maxbyte checking easier */
 928         if (file->f_flags & O_APPEND)
 929                 *ppos = inode->i_size;
 930
 931         if (*ppos >= maxbytes) {
 932                 if (count || *ppos > maxbytes) {
 933                         send_sig(SIGXFSZ, current, 0);
 934                         GOTO(out, retval = -EFBIG);
 935                 }
 936         }
 937         if (*ppos + count > maxbytes)
 938                 count = maxbytes - *ppos;
 939
 940         CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
 941                inode->i_ino, count, *ppos);
 942
 943         /* generic_file_write handles O_APPEND after getting i_sem */
 944         retval = generic_file_write(file, buf, count, ppos);
 945
 946 out:
 947         /* XXX errors? */
 948         lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
 949                             retval);
 950         ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh);
 951         RETURN(retval);
 952 }
 953
 954 static int ll_lov_setstripe(struct inode *inode, struct file *file,
 955                             unsigned long arg)
 956 {
 957         struct ll_inode_info *lli = ll_i2info(inode);
 958         struct lustre_handle *conn = ll_i2obdconn(inode);
 959         struct lov_stripe_md *lsm;
 960         int rc;
 961         ENTRY;
 962
 963         down(&lli->lli_open_sem);
 964         lsm = lli->lli_smd;
 965         if (lsm) {
 966                 up(&lli->lli_open_sem);
 967                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
 968                        inode->i_ino);
 969                 /* If we haven't already done the open, do so now */
 970                 if (file->f_flags & O_LOV_DELAY_CREATE) {
 971                         int rc2 = ll_osc_open(conn, inode, file, lsm);
 972                         if (rc2)
 973                                 RETURN(rc2);
 974                 }
 975
 976                 RETURN(-EEXIST);
 977         }
 978
 979         rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg);
 980         if (rc) {
 981                 up(&lli->lli_open_sem);
 982                 RETURN(rc);
 983         }
 984         rc = ll_create_obj(conn, inode, file, lsm);
 985         up(&lli->lli_open_sem);
 986
 987         if (rc) {
 988                 obd_free_memmd(conn, &lsm);
 989                 RETURN(rc);
 990         }
 991         rc = ll_osc_open(conn, inode, file, lli->lli_smd);
 992         RETURN(rc);
 993 }
 994
 995 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
 996 {
 997         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 998         struct lustre_handle *conn = ll_i2obdconn(inode);
 999
1000         if (!lsm)
1001                 RETURN(-ENODATA);
1002
1003         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg);
1004 }
1005
1006 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1007                   unsigned long arg)
1008 {
1009         struct ll_file_data *fd = file->private_data;
1010         struct lustre_handle *conn;
1011         int flags;
1012
1013         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino,
1014                inode->i_generation, inode, cmd);
1015
1016         if (_IOC_TYPE(cmd) == 'T') /* tty ioctls */
1017                 return -ENOTTY;
1018
1019         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
1020         switch(cmd) {
1021         case LL_IOC_GETFLAGS:
1022                 /* Get the current value of the file flags */
1023                 return put_user(fd->fd_flags, (int *)arg);
1024         case LL_IOC_SETFLAGS:
1025         case LL_IOC_CLRFLAGS:
1026                 /* Set or clear specific file flags */
1027                 /* XXX This probably needs checks to ensure the flags are
1028                  *     not abused, and to handle any flag side effects.
1029                  */
1030                 if (get_user(flags, (int *) arg))
1031                         return -EFAULT;
1032
1033                 if (cmd == LL_IOC_SETFLAGS)
1034                         fd->fd_flags |= flags;
1035                 else
1036                         fd->fd_flags &= ~flags;
1037                 return 0;
1038         case LL_IOC_LOV_SETSTRIPE:
1039                 return ll_lov_setstripe(inode, file, arg);
1040         case LL_IOC_LOV_GETSTRIPE:
1041                 return ll_lov_getstripe(inode, arg);
1042
1043         /* We need to special case any other ioctls we want to handle,
1044          * to send them to the MDS/OST as appropriate and to properly
1045          * network encode the arg field.
1046         case EXT2_IOC_GETFLAGS:
1047         case EXT2_IOC_SETFLAGS:
1048         case EXT2_IOC_GETVERSION_OLD:
1049         case EXT2_IOC_GETVERSION_NEW:
1050         case EXT2_IOC_SETVERSION_OLD:
1051         case EXT2_IOC_SETVERSION_NEW:
1052         */
1053         default:
1054                 conn = ll_i2obdconn(inode);
1055                 return obd_iocontrol(cmd, conn, 0, NULL, (void *)arg);
1056         }
1057 }
1058
1059 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1060 {
1061         struct inode *inode = file->f_dentry->d_inode;
1062         struct ll_file_data *fd = file->private_data;
1063         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1064         struct lustre_handle lockh = {0};
1065         loff_t retval;
1066         ENTRY;
1067         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino,
1068                inode->i_generation, inode,
1069                offset + ((origin==2) ? inode->i_size : file->f_pos));
1070
1071         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
1072         if (origin == 2) { /* SEEK_END */
1073                 ldlm_error_t err;
1074                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1075                 err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh);
1076                 if (err != ELDLM_OK)
1077                         RETURN(-ENOLCK);
1078
1079                 offset += inode->i_size;
1080         } else if (origin == 1) { /* SEEK_CUR */
1081                 offset += file->f_pos;
1082         }
1083
1084         retval = -EINVAL;
1085         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1086                 if (offset != file->f_pos) {
1087                         file->f_pos = offset;
1088 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1089                         file->f_reada = 0;
1090                         file->f_version = ++event;
1091 #endif
1092                 }
1093                 retval = offset;
1094         }
1095
1096         if (origin == 2)
1097                 ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh);
1098         RETURN(retval);
1099 }
1100
1101 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1102 {
1103         struct inode *inode = dentry->d_inode;
1104         int rc;
1105         ENTRY;
1106         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1107                inode->i_generation, inode);
1108
1109         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
1110         /*
1111          * filemap_fdata{sync,wait} are also called at PW lock cancelation so
1112          * we know that they can only find data to writeback here if we are
1113          * still holding the PW lock that covered the dirty pages.  XXX we
1114          * should probably get a reference on it, though, just to be clear.
1115          */
1116         rc = filemap_fdatasync(inode->i_mapping);
1117         if (rc == 0)
1118                 rc = filemap_fdatawait(inode->i_mapping);
1119
1120         RETURN(rc);
1121 }
1122
1123 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
1124 {
1125         struct inode *inode = dentry->d_inode;
1126         struct lov_stripe_md *lsm;
1127         ENTRY;
1128
1129         if (!inode) {
1130                 CERROR("REPORT THIS LINE TO PETER\n");
1131                 RETURN(0);
1132         }
1133         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
1134                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
1135 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
1136         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
1137 #endif
1138
1139         /* this is very tricky.  it is unsafe to call ll_have_md_lock
1140            when we have a referenced lock: because it may cause an RPC
1141            below when the lock is marked CB_PENDING.  That RPC may not
1142            go out because someone else may be in another RPC waiting for
1143            that lock*/
1144         if (!(it && it->it_lock_mode) && !ll_have_md_lock(dentry)) {
1145                 struct lustre_md md;
1146                 struct ptlrpc_request *req = NULL;
1147                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
1148                 struct ll_fid fid;
1149                 unsigned long valid = 0;
1150                 int rc;
1151                 int ealen = 0;
1152
1153                 if (S_ISREG(inode->i_mode)) {
1154                         ealen = obd_size_diskmd(&sbi->ll_osc_conn, NULL);
1155                         valid |= OBD_MD_FLEASIZE;
1156                 }
1157                 ll_inode2fid(&fid, inode);
1158                 rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, valid, ealen, &req);
1159                 if (rc) {
1160                         CERROR("failure %d inode %lu\n", rc, inode->i_ino);
1161                         RETURN(-abs(rc));
1162                 }
1163                 rc = mdc_req2lustre_md(req, 0, &sbi->ll_osc_conn, &md);
1164
1165                 /* XXX Too paranoid? */
1166                 if ((md.body->valid ^ valid) & OBD_MD_FLEASIZE)
1167                         CERROR("Asked for %s eadata but got %s\n",
1168                                (valid & OBD_MD_FLEASIZE) ? "some" : "no",
1169                                (md.body->valid & OBD_MD_FLEASIZE) ? "some":
1170                                "none");
1171                 if (rc) {
1172                         ptlrpc_req_finished(req);
1173                         RETURN(rc);
1174                 }
1175
1176                 ll_update_inode(inode, md.body, md.lsm);
1177                 if (md.lsm != NULL && ll_i2info(inode)->lli_smd != md.lsm)
1178                         obd_free_memmd(&sbi->ll_osc_conn, &md.lsm);
1179
1180                 ptlrpc_req_finished(req);
1181         }
1182
1183         lsm = ll_i2info(inode)->lli_smd;
1184         if (!lsm)       /* object not yet allocated, don't validate size */
1185                 RETURN(0);
1186
1187         /*
1188          * unfortunately stat comes in through revalidate and we don't
1189          * differentiate this use from initial instantiation.  we're
1190          * also being wildly conservative and flushing write caches
1191          * so that stat really returns the proper size.
1192          */
1193         {
1194                 struct ldlm_extent extent = {0, OBD_OBJECT_EOF};
1195                 struct lustre_handle lockh = {0};
1196                 ldlm_error_t err;
1197
1198                 err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh);
1199                 if (err != ELDLM_OK)
1200                         RETURN(err);
1201
1202                 ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh);
1203         }
1204         RETURN(0);
1205 }
1206
1207 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1208 int ll_getattr(struct vfsmount *mnt, struct dentry *de,
1209                       struct lookup_intent *it,
1210                       struct kstat *stat)
1211 {
1212         int res = 0;
1213         struct inode *inode = de->d_inode;
1214
1215         res = ll_inode_revalidate_it(de, it);
1216         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
1217
1218         if (res)
1219                 return res;
1220
1221         stat->dev = inode->i_sb->s_dev;
1222         stat->ino = inode->i_ino;
1223         stat->mode = inode->i_mode;
1224         stat->nlink = inode->i_nlink;
1225         stat->uid = inode->i_uid;
1226         stat->gid = inode->i_gid;
1227         stat->rdev = kdev_t_to_nr(inode->i_rdev);
1228         stat->atime = inode->i_atime;
1229         stat->mtime = inode->i_mtime;
1230         stat->ctime = inode->i_ctime;
1231         stat->size = inode->i_size;
1232         stat->blksize = inode->i_blksize;
1233         stat->blocks = inode->i_blocks;
1234         return 0;
1235 }
1236 #endif
1237
1238 struct file_operations ll_file_operations = {
1239         read:           ll_file_read,
1240         write:          ll_file_write,
1241         ioctl:          ll_file_ioctl,
1242         open:           ll_file_open,
1243         release:        ll_file_release,
1244         mmap:           generic_file_mmap,
1245         llseek:         ll_file_seek,
1246         fsync:          ll_fsync,
1247 };
1248
1249 struct inode_operations ll_file_inode_operations = {
1250         setattr_raw:    ll_setattr_raw,
1251         setattr:    ll_setattr,
1252         truncate:   ll_truncate,
1253 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1254         getattr_it: ll_getattr,
1255 #else
1256         revalidate_it: ll_inode_revalidate_it,
1257 #endif
1258 };
1259
1260 struct inode_operations ll_special_inode_operations = {
1261         setattr_raw:    ll_setattr_raw,
1262         setattr:    ll_setattr,
1263 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1264         getattr_it:    ll_getattr,
1265 #else
1266         revalidate_it: ll_inode_revalidate_it,
1267 #endif
1268 };