lustre/llite/file.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
   5  *   Author: Peter Braam <braam@clusterfs.com>
   6  *   Author: Phil Schwan <phil@clusterfs.com>
   7  *   Author: Andreas Dilger <adilger@clusterfs.com>
   8  *
   9  *   This file is part of Lustre, http://www.lustre.org.
  10  *
  11  *   Lustre is free software; you can redistribute it and/or
  12  *   modify it under the terms of version 2 of the GNU General Public
  13  *   License as published by the Free Software Foundation.
  14  *
  15  *   Lustre is distributed in the hope that it will be useful,
  16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  *   GNU General Public License for more details.
  19  *
  20  *   You should have received a copy of the GNU General Public License
  21  *   along with Lustre; if not, write to the Free Software
  22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25 #define DEBUG_SUBSYSTEM S_LLITE
  26 #include <lustre_dlm.h>
  27 #include <lustre_lite.h>
  28 #include <lustre_mdc.h>
  29 #include <linux/pagemap.h>
  30 #include <linux/file.h>
  31 #include "llite_internal.h"
  32
  33 /* also used by llite/special.c:ll_special_open() */
  34 struct ll_file_data *ll_file_data_get(void)
  35 {
  36         struct ll_file_data *fd;
  37
  38         OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
  39         return fd;
  40 }
  41
  42 static void ll_file_data_put(struct ll_file_data *fd)
  43 {
  44         if (fd != NULL)
  45                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  46 }
  47
  48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  49                           struct lustre_handle *fh)
  50 {
  51         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  52         op_data->op_attr.ia_mode = inode->i_mode;
  53         op_data->op_attr.ia_atime = inode->i_atime;
  54         op_data->op_attr.ia_mtime = inode->i_mtime;
  55         op_data->op_attr.ia_ctime = inode->i_ctime;
  56         op_data->op_attr.ia_size = i_size_read(inode);
  57         op_data->op_attr_blocks = inode->i_blocks;
  58         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
  59         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  60         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
  61         op_data->op_capa1 = ll_mdscapa_get(inode);
  62 }
  63
  64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  65                              struct obd_client_handle *och)
  66 {
  67         ENTRY;
  68
  69         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
  70                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
  71
  72         if (!(och->och_flags & FMODE_WRITE))
  73                 goto out;
  74
  75         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
  76             !S_ISREG(inode->i_mode))
  77                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
  78         else
  79                 ll_epoch_close(inode, op_data, &och, 0);
  80
  81 out:
  82         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
  83         EXIT;
  84 }
  85
  86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
  87                                      struct inode *inode,
  88                                      struct obd_client_handle *och)
  89 {
  90         struct obd_export *exp = ll_i2mdexp(inode);
  91         struct md_op_data *op_data;
  92         struct ptlrpc_request *req = NULL;
  93         struct obd_device *obd = class_exp2obd(exp);
  94         int epoch_close = 1;
  95         int seq_end = 0, rc;
  96         ENTRY;
  97
  98         if (obd == NULL) {
  99                 /*
 100                  * XXX: in case of LMV, is this correct to access
 101                  * ->exp_handle?
 102                  */
 103                 CERROR("Invalid MDC connection handle "LPX64"\n",
 104                        ll_i2mdexp(inode)->exp_handle.h_cookie);
 105                 GOTO(out, rc = 0);
 106         }
 107
 108         /*
 109          * here we check if this is forced umount. If so this is called on
 110          * canceling "open lock" and we do not call md_close() in this case, as
 111          * it will not be successful, as import is already deactivated.
 112          */
 113         if (obd->obd_force)
 114                 GOTO(out, rc = 0);
 115
 116         OBD_ALLOC_PTR(op_data);
 117         if (op_data == NULL)
 118                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
 119
 120         ll_prepare_close(inode, op_data, och);
 121         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 122         rc = md_close(md_exp, op_data, och->och_mod, &req);
 123         if (rc != -EAGAIN)
 124                 seq_end = 1;
 125
 126         if (rc == -EAGAIN) {
 127                 /* This close must have the epoch closed. */
 128                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
 129                 LASSERT(epoch_close);
 130                 /* MDS has instructed us to obtain Size-on-MDS attribute from
 131                  * OSTs and send setattr to back to MDS. */
 132                 rc = ll_sizeonmds_update(inode, och->och_mod,
 133                                          &och->och_fh, op_data->op_ioepoch);
 134                 if (rc) {
 135                         CERROR("inode %lu mdc Size-on-MDS update failed: "
 136                                "rc = %d\n", inode->i_ino, rc);
 137                         rc = 0;
 138                 }
 139         } else if (rc) {
 140                 CERROR("inode %lu mdc close failed: rc = %d\n",
 141                        inode->i_ino, rc);
 142         }
 143         ll_finish_md_op_data(op_data);
 144
 145         if (rc == 0) {
 146                 rc = ll_objects_destroy(req, inode);
 147                 if (rc)
 148                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
 149                                inode->i_ino, rc);
 150         }
 151
 152         EXIT;
 153 out:
 154
 155         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
 156             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 157                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 158         } else {
 159                 if (seq_end)
 160                         ptlrpc_close_replay_seq(req);
 161                 md_clear_open_replay_data(md_exp, och);
 162                 /* Free @och if it is not waiting for DONE_WRITING. */
 163                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 164                 OBD_FREE_PTR(och);
 165         }
 166         if (req) /* This is close request */
 167                 ptlrpc_req_finished(req);
 168         return rc;
 169 }
 170
 171 int ll_md_real_close(struct inode *inode, int flags)
 172 {
 173         struct ll_inode_info *lli = ll_i2info(inode);
 174         struct obd_client_handle **och_p;
 175         struct obd_client_handle *och;
 176         __u64 *och_usecount;
 177         int rc = 0;
 178         ENTRY;
 179
 180         if (flags & FMODE_WRITE) {
 181                 och_p = &lli->lli_mds_write_och;
 182                 och_usecount = &lli->lli_open_fd_write_count;
 183         } else if (flags & FMODE_EXEC) {
 184                 och_p = &lli->lli_mds_exec_och;
 185                 och_usecount = &lli->lli_open_fd_exec_count;
 186         } else {
 187                 LASSERT(flags & FMODE_READ);
 188                 och_p = &lli->lli_mds_read_och;
 189                 och_usecount = &lli->lli_open_fd_read_count;
 190         }
 191
 192         down(&lli->lli_och_sem);
 193         if (*och_usecount) { /* There are still users of this handle, so
 194                                 skip freeing it. */
 195                 up(&lli->lli_och_sem);
 196                 RETURN(0);
 197         }
 198         och=*och_p;
 199         *och_p = NULL;
 200         up(&lli->lli_och_sem);
 201
 202         if (och) { /* There might be a race and somebody have freed this och
 203                       already */
 204                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 205                                                inode, och);
 206         }
 207
 208         RETURN(rc);
 209 }
 210
 211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 212                 struct file *file)
 213 {
 214         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 215         struct ll_inode_info *lli = ll_i2info(inode);
 216         int rc = 0;
 217         ENTRY;
 218
 219         /* clear group lock, if present */
 220         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
 221                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 222                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
 223                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
 224                                       &fd->fd_cwlockh);
 225         }
 226
 227         /* Let's see if we have good enough OPEN lock on the file and if
 228            we can skip talking to MDS */
 229         if (file->f_dentry->d_inode) { /* Can this ever be false? */
 230                 int lockmode;
 231                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 232                 struct lustre_handle lockh;
 233                 struct inode *inode = file->f_dentry->d_inode;
 234                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 235
 236                 down(&lli->lli_och_sem);
 237                 if (fd->fd_omode & FMODE_WRITE) {
 238                         lockmode = LCK_CW;
 239                         LASSERT(lli->lli_open_fd_write_count);
 240                         lli->lli_open_fd_write_count--;
 241                 } else if (fd->fd_omode & FMODE_EXEC) {
 242                         lockmode = LCK_PR;
 243                         LASSERT(lli->lli_open_fd_exec_count);
 244                         lli->lli_open_fd_exec_count--;
 245                 } else {
 246                         lockmode = LCK_CR;
 247                         LASSERT(lli->lli_open_fd_read_count);
 248                         lli->lli_open_fd_read_count--;
 249                 }
 250                 up(&lli->lli_och_sem);
 251
 252                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 253                                    LDLM_IBITS, &policy, lockmode,
 254                                    &lockh)) {
 255                         rc = ll_md_real_close(file->f_dentry->d_inode,
 256                                               fd->fd_omode);
 257                 }
 258         } else {
 259                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
 260                        file, file->f_dentry, file->f_dentry->d_name.name);
 261         }
 262
 263         LUSTRE_FPRIVATE(file) = NULL;
 264         ll_file_data_put(fd);
 265         ll_capa_close(inode);
 266
 267         RETURN(rc);
 268 }
 269
 270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
 271
 272 /* While this returns an error code, fput() the caller does not, so we need
 273  * to make every effort to clean up all of our state here.  Also, applications
 274  * rarely check close errors and even if an error is returned they will not
 275  * re-try the close call.
 276  */
 277 int ll_file_release(struct inode *inode, struct file *file)
 278 {
 279         struct ll_file_data *fd;
 280         struct ll_sb_info *sbi = ll_i2sbi(inode);
 281         struct ll_inode_info *lli = ll_i2info(inode);
 282         struct lov_stripe_md *lsm = lli->lli_smd;
 283         int rc;
 284
 285         ENTRY;
 286         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 287                inode->i_generation, inode);
 288
 289 #ifdef CONFIG_FS_POSIX_ACL
 290         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
 291             inode == inode->i_sb->s_root->d_inode) {
 292                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 293
 294                 LASSERT(fd != NULL);
 295                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 296                         fd->fd_flags &= ~LL_FILE_RMTACL;
 297                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
 298                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
 299                 }
 300         }
 301 #endif
 302
 303         if (inode->i_sb->s_root != file->f_dentry)
 304                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 305         fd = LUSTRE_FPRIVATE(file);
 306         LASSERT(fd != NULL);
 307
 308         /* The last ref on @file, maybe not the the owner pid of statahead.
 309          * Different processes can open the same dir, "ll_opendir_key" means:
 310          * it is me that should stop the statahead thread. */
 311         if (lli->lli_opendir_key == fd)
 312                 ll_stop_statahead(inode, fd);
 313
 314         if (inode->i_sb->s_root == file->f_dentry) {
 315                 LUSTRE_FPRIVATE(file) = NULL;
 316                 ll_file_data_put(fd);
 317                 RETURN(0);
 318         }
 319
 320         if (lsm)
 321                 lov_test_and_clear_async_rc(lsm);
 322         lli->lli_async_rc = 0;
 323
 324         rc = ll_md_close(sbi->ll_md_exp, inode, file);
 325         RETURN(rc);
 326 }
 327
 328 static int ll_intent_file_open(struct file *file, void *lmm,
 329                                int lmmsize, struct lookup_intent *itp)
 330 {
 331         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 332         struct dentry *parent = file->f_dentry->d_parent;
 333         const char *name = file->f_dentry->d_name.name;
 334         const int len = file->f_dentry->d_name.len;
 335         struct md_op_data *op_data;
 336         struct ptlrpc_request *req;
 337         int rc;
 338         ENTRY;
 339
 340         if (!parent)
 341                 RETURN(-ENOENT);
 342
 343         /* Usually we come here only for NFSD, and we want open lock.
 344            But we can also get here with pre 2.6.15 patchless kernels, and in
 345            that case that lock is also ok */
 346         /* We can also get here if there was cached open handle in revalidate_it
 347          * but it disappeared while we were getting from there to ll_file_open.
 348          * But this means this file was closed and immediatelly opened which
 349          * makes a good candidate for using OPEN lock */
 350         /* If lmmsize & lmm are not 0, we are just setting stripe info
 351          * parameters. No need for the open lock */
 352         if (!lmm && !lmmsize)
 353                 itp->it_flags |= MDS_OPEN_LOCK;
 354
 355         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 356                                       file->f_dentry->d_inode, name, len,
 357                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
 358         if (IS_ERR(op_data))
 359                 RETURN(PTR_ERR(op_data));
 360
 361         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 362                             0 /*unused */, &req, ll_md_blocking_ast, 0);
 363         ll_finish_md_op_data(op_data);
 364         if (rc == -ESTALE) {
 365                 /* reason for keep own exit path - don`t flood log
 366                 * with messages with -ESTALE errors.
 367                 */
 368                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 369                      it_open_error(DISP_OPEN_OPEN, itp))
 370                         GOTO(out, rc);
 371                 ll_release_openhandle(file->f_dentry, itp);
 372                 GOTO(out_stale, rc);
 373         }
 374
 375         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 376                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 377                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 378                 GOTO(out, rc);
 379         }
 380
 381         if (itp->d.lustre.it_lock_mode)
 382                 md_set_lock_data(sbi->ll_md_exp,
 383                                  &itp->d.lustre.it_lock_handle,
 384                                  file->f_dentry->d_inode);
 385
 386         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
 387 out:
 388         ptlrpc_req_finished(itp->d.lustre.it_data);
 389
 390 out_stale:
 391         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 392         ll_intent_drop_lock(itp);
 393
 394         RETURN(rc);
 395 }
 396
 397 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
 398                        struct lookup_intent *it, struct obd_client_handle *och)
 399 {
 400         struct ptlrpc_request *req = it->d.lustre.it_data;
 401         struct mdt_body *body;
 402
 403         LASSERT(och);
 404
 405         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 406         LASSERT(body != NULL);                      /* reply already checked out */
 407
 408         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
 409         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 410         och->och_fid = lli->lli_fid;
 411         och->och_flags = it->it_flags;
 412         lli->lli_ioepoch = body->ioepoch;
 413
 414         return md_set_open_replay_data(md_exp, och, req);
 415 }
 416
 417 int ll_local_open(struct file *file, struct lookup_intent *it,
 418                   struct ll_file_data *fd, struct obd_client_handle *och)
 419 {
 420         struct inode *inode = file->f_dentry->d_inode;
 421         struct ll_inode_info *lli = ll_i2info(inode);
 422         ENTRY;
 423
 424         LASSERT(!LUSTRE_FPRIVATE(file));
 425
 426         LASSERT(fd != NULL);
 427
 428         if (och) {
 429                 struct ptlrpc_request *req = it->d.lustre.it_data;
 430                 struct mdt_body *body;
 431                 int rc;
 432
 433                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
 434                 if (rc)
 435                         RETURN(rc);
 436
 437                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 438                 if ((it->it_flags & FMODE_WRITE) &&
 439                     (body->valid & OBD_MD_FLSIZE))
 440                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 441                                lli->lli_ioepoch, PFID(&lli->lli_fid));
 442         }
 443
 444         LUSTRE_FPRIVATE(file) = fd;
 445         ll_readahead_init(inode, &fd->fd_ras);
 446         fd->fd_omode = it->it_flags;
 447         RETURN(0);
 448 }
 449
 450 /* Open a file, and (for the very first open) create objects on the OSTs at
 451  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 452  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
 453  * lli_open_sem to ensure no other process will create objects, send the
 454  * stripe MD to the MDS, or try to destroy the objects if that fails.
 455  *
 456  * If we already have the stripe MD locally then we don't request it in
 457  * md_open(), by passing a lmm_size = 0.
 458  *
 459  * It is up to the application to ensure no other processes open this file
 460  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 461  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 462  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 463  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 464  */
 465 int ll_file_open(struct inode *inode, struct file *file)
 466 {
 467         struct ll_inode_info *lli = ll_i2info(inode);
 468         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 469                                           .it_flags = file->f_flags };
 470         struct lov_stripe_md *lsm;
 471         struct ptlrpc_request *req = NULL;
 472         struct obd_client_handle **och_p;
 473         __u64 *och_usecount;
 474         struct ll_file_data *fd;
 475         int rc = 0, opendir_set = 0;
 476         ENTRY;
 477
 478         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 479                inode->i_generation, inode, file->f_flags);
 480
 481 #ifdef HAVE_VFS_INTENT_PATCHES
 482         it = file->f_it;
 483 #else
 484         it = file->private_data; /* XXX: compat macro */
 485         file->private_data = NULL; /* prevent ll_local_open assertion */
 486 #endif
 487
 488         fd = ll_file_data_get();
 489         if (fd == NULL)
 490                 RETURN(-ENOMEM);
 491
 492         if (S_ISDIR(inode->i_mode)) {
 493                 spin_lock(&lli->lli_lock);
 494                 /* "lli->lli_opendir_pid != 0" means someone has set it.
 495                  * "lli->lli_sai != NULL" means the previous statahead has not
 496                  *                        been cleanup. */
 497                 if (lli->lli_opendir_pid == 0 && lli->lli_sai == NULL) {
 498                         opendir_set = 1;
 499                         lli->lli_opendir_pid = cfs_curproc_pid();
 500                         lli->lli_opendir_key = fd;
 501                 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid())) {
 502                         /* Two cases for this:
 503                          * (1) The same process open such directory many times.
 504                          * (2) The old process opened the directory, and exited
 505                          *     before its children processes. Then new process
 506                          *     with the same pid opens such directory before the
 507                          *     old process's children processes exit.
 508                          * Change the owner to the latest one. */
 509                         opendir_set = 2;
 510                         lli->lli_opendir_key = fd;
 511                 }
 512                 spin_unlock(&lli->lli_lock);
 513         }
 514
 515         if (inode->i_sb->s_root == file->f_dentry) {
 516                 LUSTRE_FPRIVATE(file) = fd;
 517                 RETURN(0);
 518         }
 519
 520         if (!it || !it->d.lustre.it_disposition) {
 521                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 522                  * because everything but O_ACCMODE mask was stripped from
 523                  * there */
 524                 if ((oit.it_flags + 1) & O_ACCMODE)
 525                         oit.it_flags++;
 526                 if (file->f_flags & O_TRUNC)
 527                         oit.it_flags |= FMODE_WRITE;
 528
 529                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 530                  * dentry_open after call to open_namei that checks permissions.
 531                  * Only nfsd_open call dentry_open directly without checking
 532                  * permissions and because of that this code below is safe. */
 533                 if (oit.it_flags & FMODE_WRITE)
 534                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 535
 536                 /* We do not want O_EXCL here, presumably we opened the file
 537                  * already? XXX - NFS implications? */
 538                 oit.it_flags &= ~O_EXCL;
 539
 540                 it = &oit;
 541         }
 542
 543 restart:
 544         /* Let's see if we have file open on MDS already. */
 545         if (it->it_flags & FMODE_WRITE) {
 546                 och_p = &lli->lli_mds_write_och;
 547                 och_usecount = &lli->lli_open_fd_write_count;
 548         } else if (it->it_flags & FMODE_EXEC) {
 549                 och_p = &lli->lli_mds_exec_och;
 550                 och_usecount = &lli->lli_open_fd_exec_count;
 551          } else {
 552                 och_p = &lli->lli_mds_read_och;
 553                 och_usecount = &lli->lli_open_fd_read_count;
 554         }
 555
 556         down(&lli->lli_och_sem);
 557         if (*och_p) { /* Open handle is present */
 558                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 559                         /* Well, there's extra open request that we do not need,
 560                            let's close it somehow. This will decref request. */
 561                         rc = it_open_error(DISP_OPEN_OPEN, it);
 562                         if (rc) {
 563                                 ll_file_data_put(fd);
 564                                 GOTO(out_och_free, rc);
 565                         }
 566                         ll_release_openhandle(file->f_dentry, it);
 567                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
 568                                              LPROC_LL_OPEN);
 569                 }
 570                 (*och_usecount)++;
 571
 572                 rc = ll_local_open(file, it, fd, NULL);
 573                 if (rc) {
 574                         up(&lli->lli_och_sem);
 575                         ll_file_data_put(fd);
 576                         RETURN(rc);
 577                 }
 578         } else {
 579                 LASSERT(*och_usecount == 0);
 580                 if (!it->d.lustre.it_disposition) {
 581                         /* We cannot just request lock handle now, new ELC code
 582                            means that one of other OPEN locks for this file
 583                            could be cancelled, and since blocking ast handler
 584                            would attempt to grab och_sem as well, that would
 585                            result in a deadlock */
 586                         up(&lli->lli_och_sem);
 587                         it->it_flags |= O_CHECK_STALE;
 588                         rc = ll_intent_file_open(file, NULL, 0, it);
 589                         it->it_flags &= ~O_CHECK_STALE;
 590                         if (rc) {
 591                                 ll_file_data_put(fd);
 592                                 GOTO(out_openerr, rc);
 593                         }
 594
 595                         /* Got some error? Release the request */
 596                         if (it->d.lustre.it_status < 0) {
 597                                 req = it->d.lustre.it_data;
 598                                 ptlrpc_req_finished(req);
 599                         }
 600                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
 601                                          &it->d.lustre.it_lock_handle,
 602                                          file->f_dentry->d_inode);
 603                         goto restart;
 604                 }
 605                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 606                 if (!*och_p) {
 607                         ll_file_data_put(fd);
 608                         GOTO(out_och_free, rc = -ENOMEM);
 609                 }
 610                 (*och_usecount)++;
 611                 req = it->d.lustre.it_data;
 612
 613                 /* md_intent_lock() didn't get a request ref if there was an
 614                  * open error, so don't do cleanup on the request here
 615                  * (bug 3430) */
 616                 /* XXX (green): Should not we bail out on any error here, not
 617                  * just open error? */
 618                 rc = it_open_error(DISP_OPEN_OPEN, it);
 619                 if (rc) {
 620                         ll_file_data_put(fd);
 621                         GOTO(out_och_free, rc);
 622                 }
 623
 624                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 625                 rc = ll_local_open(file, it, fd, *och_p);
 626                 if (rc) {
 627                         up(&lli->lli_och_sem);
 628                         ll_file_data_put(fd);
 629                         GOTO(out_och_free, rc);
 630                 }
 631         }
 632         up(&lli->lli_och_sem);
 633
 634         /* Must do this outside lli_och_sem lock to prevent deadlock where
 635            different kind of OPEN lock for this same inode gets cancelled
 636            by ldlm_cancel_lru */
 637         if (!S_ISREG(inode->i_mode))
 638                 GOTO(out, rc);
 639
 640         ll_capa_open(inode);
 641
 642         lsm = lli->lli_smd;
 643         if (lsm == NULL) {
 644                 if (file->f_flags & O_LOV_DELAY_CREATE ||
 645                     !(file->f_mode & FMODE_WRITE)) {
 646                         CDEBUG(D_INODE, "object creation was delayed\n");
 647                         GOTO(out, rc);
 648                 }
 649         }
 650         file->f_flags &= ~O_LOV_DELAY_CREATE;
 651         GOTO(out, rc);
 652 out:
 653         ptlrpc_req_finished(req);
 654         if (req)
 655                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 656 out_och_free:
 657         if (rc) {
 658                 if (*och_p) {
 659                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 660                         *och_p = NULL; /* OBD_FREE writes some magic there */
 661                         (*och_usecount)--;
 662                 }
 663                 up(&lli->lli_och_sem);
 664 out_openerr:
 665                 if (opendir_set == 1) {
 666                         lli->lli_opendir_key = NULL;
 667                         lli->lli_opendir_pid = 0;
 668                 } else if (unlikely(opendir_set == 2)) {
 669                         ll_stop_statahead(inode, fd);
 670                 }
 671         }
 672
 673         return rc;
 674 }
 675
 676 /* Fills the obdo with the attributes for the inode defined by lsm */
 677 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
 678 {
 679         struct ptlrpc_request_set *set;
 680         struct ll_inode_info *lli = ll_i2info(inode);
 681         struct lov_stripe_md *lsm = lli->lli_smd;
 682
 683         struct obd_info oinfo = { { { 0 } } };
 684         int rc;
 685         ENTRY;
 686
 687         LASSERT(lsm != NULL);
 688
 689         oinfo.oi_md = lsm;
 690         oinfo.oi_oa = obdo;
 691         oinfo.oi_oa->o_id = lsm->lsm_object_id;
 692         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
 693         oinfo.oi_oa->o_mode = S_IFREG;
 694         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 695                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 696                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 697                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 698                                OBD_MD_FLGROUP;
 699         oinfo.oi_capa = ll_mdscapa_get(inode);
 700
 701         set = ptlrpc_prep_set();
 702         if (set == NULL) {
 703                 CERROR("can't allocate ptlrpc set\n");
 704                 rc = -ENOMEM;
 705         } else {
 706                 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
 707                 if (rc == 0)
 708                         rc = ptlrpc_set_wait(set);
 709                 ptlrpc_set_destroy(set);
 710         }
 711         capa_put(oinfo.oi_capa);
 712         if (rc)
 713                 RETURN(rc);
 714
 715         oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 716                                  OBD_MD_FLATIME | OBD_MD_FLMTIME |
 717                                  OBD_MD_FLCTIME | OBD_MD_FLSIZE);
 718
 719         obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
 720         CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
 721                lli->lli_smd->lsm_object_id, i_size_read(inode),
 722                (unsigned long long)inode->i_blocks,
 723                (unsigned long)ll_inode_blksize(inode));
 724         RETURN(0);
 725 }
 726
 727 static inline void ll_remove_suid(struct inode *inode)
 728 {
 729         unsigned int mode;
 730
 731         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
 732         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
 733
 734         /* was any of the uid bits set? */
 735         mode &= inode->i_mode;
 736         if (mode && !capable(CAP_FSETID)) {
 737                 inode->i_mode &= ~mode;
 738                 // XXX careful here - we cannot change the size
 739         }
 740 }
 741
 742 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
 743 {
 744         struct ll_inode_info *lli = ll_i2info(inode);
 745         struct lov_stripe_md *lsm = lli->lli_smd;
 746         struct obd_export *exp = ll_i2dtexp(inode);
 747         struct {
 748                 char name[16];
 749                 struct ldlm_lock *lock;
 750                 struct lov_stripe_md *lsm;
 751         } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
 752         __u32 stripe, vallen = sizeof(stripe);
 753         int rc;
 754         ENTRY;
 755
 756         if (lsm->lsm_stripe_count == 1)
 757                 GOTO(check, stripe = 0);
 758
 759         /* get our offset in the lov */
 760         rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
 761         if (rc != 0) {
 762                 CERROR("obd_get_info: rc = %d\n", rc);
 763                 RETURN(rc);
 764         }
 765         LASSERT(stripe < lsm->lsm_stripe_count);
 766
 767 check:
 768         if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
 769             lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
 770                 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
 771                            lsm->lsm_oinfo[stripe]->loi_id,
 772                            lsm->lsm_oinfo[stripe]->loi_gr);
 773                 RETURN(-ELDLM_NO_LOCK_DATA);
 774         }
 775
 776         RETURN(stripe);
 777 }
 778
 779 /* Get extra page reference to ensure it is not going away */
 780 void ll_pin_extent_cb(void *data)
 781 {
 782         struct page *page = data;
 783
 784         page_cache_get(page);
 785
 786         return;
 787 }
 788
 789 /* Flush the page from page cache for an extent as its canceled.
 790  * Page to remove is delivered as @data.
 791  *
 792  * No one can dirty the extent until we've finished our work and they cannot
 793  * enqueue another lock.  The DLM protects us from ll_file_read/write here,
 794  * but other kernel actors could have pages locked.
 795  *
 796  * If @discard is set, there is no need to write the page if it is dirty.
 797  *
 798  * Called with the DLM lock held. */
 799 int ll_page_removal_cb(void *data, int discard)
 800 {
 801         int rc;
 802         struct page *page = data;
 803         struct address_space *mapping;
 804
 805         ENTRY;
 806
 807         /* We have page reference already from ll_pin_page */
 808         lock_page(page);
 809
 810         /* Already truncated by somebody */
 811         if (!page->mapping)
 812                 GOTO(out, rc = 0);
 813         mapping = page->mapping;
 814
 815         ll_teardown_mmaps(mapping,
 816                           (__u64)page->index << PAGE_CACHE_SHIFT,
 817                           ((__u64)page->index<<PAGE_CACHE_SHIFT)|
 818                                                               ~PAGE_CACHE_MASK);
 819         LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
 820
 821         if (!discard && clear_page_dirty_for_io(page)) {
 822                 LASSERT(page->mapping);
 823                 rc = ll_call_writepage(page->mapping->host, page);
 824                 /* either waiting for io to complete or reacquiring
 825                  * the lock that the failed writepage released */
 826                 lock_page(page);
 827                 wait_on_page_writeback(page);
 828                 if (rc != 0) {
 829                         CERROR("writepage inode %lu(%p) of page %p "
 830                                "failed: %d\n", mapping->host->i_ino,
 831                                mapping->host, page, rc);
 832                         if (rc == -ENOSPC)
 833                                 set_bit(AS_ENOSPC, &mapping->flags);
 834                         else
 835                                 set_bit(AS_EIO, &mapping->flags);
 836                 }
 837                 set_bit(AS_EIO, &mapping->flags);
 838         }
 839         if (page->mapping != NULL) {
 840                 struct ll_async_page *llap = llap_cast_private(page);
 841                 /* checking again to account for writeback's lock_page() */
 842                 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
 843                 if (llap)
 844                         ll_ra_accounting(llap, page->mapping);
 845                 ll_truncate_complete_page(page);
 846         }
 847         EXIT;
 848 out:
 849         LASSERT(!PageWriteback(page));
 850         unlock_page(page);
 851         page_cache_release(page);
 852
 853         return 0;
 854 }
 855
 856 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
 857                              void *data, int flag)
 858 {
 859         struct inode *inode;
 860         struct ll_inode_info *lli;
 861         struct lov_stripe_md *lsm;
 862         int stripe;
 863         __u64 kms;
 864
 865         ENTRY;
 866
 867         if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
 868                 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
 869                 LBUG();
 870         }
 871
 872         inode = ll_inode_from_lock(lock);
 873         if (inode == NULL)
 874                 RETURN(0);
 875         lli = ll_i2info(inode);
 876         if (lli == NULL)
 877                 GOTO(iput, 0);
 878         if (lli->lli_smd == NULL)
 879                 GOTO(iput, 0);
 880         lsm = lli->lli_smd;
 881
 882         stripe = ll_lock_to_stripe_offset(inode, lock);
 883         if (stripe < 0)
 884                 GOTO(iput, 0);
 885
 886         lov_stripe_lock(lsm);
 887         lock_res_and_lock(lock);
 888         kms = ldlm_extent_shift_kms(lock,
 889                                     lsm->lsm_oinfo[stripe]->loi_kms);
 890
 891         if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
 892                 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
 893                            lsm->lsm_oinfo[stripe]->loi_kms, kms);
 894         lsm->lsm_oinfo[stripe]->loi_kms = kms;
 895         unlock_res_and_lock(lock);
 896         lov_stripe_unlock(lsm);
 897         ll_queue_done_writing(inode, 0);
 898         EXIT;
 899 iput:
 900         iput(inode);
 901
 902         return 0;
 903 }
 904
/*
 * Disabled code: everything up to the matching #endif is compiled out.
 *
 * ll_async_completion_ast() is a completion AST for an asynchronously
 * enqueued extent lock.  For a granted lock carrying LVB data it stores the
 * LVB size as the stripe's rss, raises the stripe's kms, then wakes waiters
 * on the lock and drops an LCK_PR reference.
 *
 * NOTE(review): the body indexes lsm_oinfo[stripe] with '.', whereas the
 * live code in this file uses '->'; it would presumably need updating
 * before it could be re-enabled -- confirm against the current
 * lov_stripe_md definition.
 * NOTE(review): inode is used without the NULL check the live callbacks in
 * this file perform after ll_inode_from_lock() -- confirm before reuse.
 */
#if 0
int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
{
        /* XXX ALLOCATE - 160 bytes */
        struct inode *inode = ll_inode_from_lock(lock);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lustre_handle lockh = { 0 };
        struct ost_lvb *lvb;
        int stripe;
        ENTRY;

        /* blocked replies are not expected here: LBUG() comes first */
        if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
                     LDLM_FL_BLOCK_CONV)) {
                LBUG(); /* not expecting any blocked async locks yet */
                LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
                           "lock, returning");
                ldlm_lock_dump(D_OTHER, lock, 0);
                ldlm_reprocess_all(lock->l_resource);
                RETURN(0);
        }

        LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");

        /* find the stripe this lock covers; skip the size update on failure */
        stripe = ll_lock_to_stripe_offset(inode, lock);
        if (stripe < 0)
                goto iput;

        if (lock->l_lvb_len) {
                struct lov_stripe_md *lsm = lli->lli_smd;
                __u64 kms;
                lvb = lock->l_lvb_data;
                lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;

                /* kms becomes the larger of the current kms and the LVB
                 * size, passed through ldlm_extent_shift_kms() */
                lock_res_and_lock(lock);
                ll_inode_size_lock(inode, 1);
                kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
                kms = ldlm_extent_shift_kms(NULL, kms);
                if (lsm->lsm_oinfo[stripe].loi_kms != kms)
                        LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
                                   lsm->lsm_oinfo[stripe].loi_kms, kms);
                lsm->lsm_oinfo[stripe].loi_kms = kms;
                ll_inode_size_unlock(inode, 1);
                unlock_res_and_lock(lock);
        }

iput:
        iput(inode);
        wake_up(&lock->l_waitq);

        /* drop an LCK_PR reference on the lock via its handle */
        ldlm_lock2handle(lock, &lockh);
        ldlm_lock_decref(&lockh, LCK_PR);
        RETURN(0);
}
#endif
 959
 960 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
 961 {
 962         struct ptlrpc_request *req = reqp;
 963         struct inode *inode = ll_inode_from_lock(lock);
 964         struct ll_inode_info *lli;
 965         struct lov_stripe_md *lsm;
 966         struct ost_lvb *lvb;
 967         int rc, stripe;
 968         ENTRY;
 969
 970         if (inode == NULL)
 971                 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
 972         lli = ll_i2info(inode);
 973         if (lli == NULL)
 974                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
 975         lsm = lli->lli_smd;
 976         if (lsm == NULL)
 977                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
 978
 979         /* First, find out which stripe index this lock corresponds to. */
 980         stripe = ll_lock_to_stripe_offset(inode, lock);
 981         if (stripe < 0)
 982                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
 983
 984         req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
 985         req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
 986                              sizeof(*lvb));
 987         rc = req_capsule_server_pack(&req->rq_pill);
 988         if (rc) {
 989                 CERROR("lustre_pack_reply: %d\n", rc);
 990                 GOTO(iput, rc);
 991         }
 992
 993         lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
 994         lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
 995         lvb->lvb_mtime = LTIME_S(inode->i_mtime);
 996         lvb->lvb_atime = LTIME_S(inode->i_atime);
 997         lvb->lvb_ctime = LTIME_S(inode->i_ctime);
 998
 999         LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1000                    " atime "LPU64", mtime "LPU64", ctime "LPU64,
1001                    i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1002                    lvb->lvb_atime, lvb->lvb_ctime);
1003  iput:
1004         iput(inode);
1005
1006  out:
1007         /* These errors are normal races, so we don't want to fill the console
1008          * with messages by calling ptlrpc_error() */
1009         if (rc == -ELDLM_NO_LOCK_DATA)
1010                 lustre_pack_reply(req, 1, NULL, NULL);
1011
1012         req->rq_status = rc;
1013         return rc;
1014 }
1015
1016 static int ll_merge_lvb(struct inode *inode)
1017 {
1018         struct ll_inode_info *lli = ll_i2info(inode);
1019         struct ll_sb_info *sbi = ll_i2sbi(inode);
1020         struct ost_lvb lvb;
1021         int rc;
1022
1023         ENTRY;
1024
1025         ll_inode_size_lock(inode, 1);
1026         inode_init_lvb(inode, &lvb);
1027         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1028         i_size_write(inode, lvb.lvb_size);
1029         inode->i_blocks = lvb.lvb_blocks;
1030
1031         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1032         LTIME_S(inode->i_atime) = lvb.lvb_atime;
1033         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1034         ll_inode_size_unlock(inode, 1);
1035
1036         RETURN(rc);
1037 }
1038
1039 int ll_local_size(struct inode *inode)
1040 {
1041         ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1042         struct ll_inode_info *lli = ll_i2info(inode);
1043         struct ll_sb_info *sbi = ll_i2sbi(inode);
1044         struct lustre_handle lockh = { 0 };
1045         int flags = 0;
1046         int rc;
1047         ENTRY;
1048
1049         if (lli->lli_smd->lsm_stripe_count == 0)
1050                 RETURN(0);
1051
1052         rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1053                        &policy, LCK_PR, &flags, inode, &lockh);
1054         if (rc < 0)
1055                 RETURN(rc);
1056         else if (rc == 0)
1057                 RETURN(-ENODATA);
1058
1059         rc = ll_merge_lvb(inode);
1060         obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
1061         RETURN(rc);
1062 }
1063
1064 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1065                      lstat_t *st)
1066 {
1067         struct lustre_handle lockh = { 0 };
1068         struct ldlm_enqueue_info einfo = { 0 };
1069         struct obd_info oinfo = { { { 0 } } };
1070         struct ost_lvb lvb;
1071         int rc;
1072
1073         ENTRY;
1074
1075         einfo.ei_type = LDLM_EXTENT;
1076         einfo.ei_mode = LCK_PR;
1077         einfo.ei_cb_bl = osc_extent_blocking_cb;
1078         einfo.ei_cb_cp = ldlm_completion_ast;
1079         einfo.ei_cb_gl = ll_glimpse_callback;
1080         einfo.ei_cbdata = NULL;
1081
1082         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1083         oinfo.oi_lockh = &lockh;
1084         oinfo.oi_md = lsm;
1085         oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1086
1087         rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1088         if (rc == -ENOENT)
1089                 RETURN(rc);
1090         if (rc != 0) {
1091                 CERROR("obd_enqueue returned rc %d, "
1092                        "returning -EIO\n", rc);
1093                 RETURN(rc > 0 ? -EIO : rc);
1094         }
1095
1096         lov_stripe_lock(lsm);
1097         memset(&lvb, 0, sizeof(lvb));
1098         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1099         st->st_size = lvb.lvb_size;
1100         st->st_blocks = lvb.lvb_blocks;
1101         st->st_mtime = lvb.lvb_mtime;
1102         st->st_atime = lvb.lvb_atime;
1103         st->st_ctime = lvb.lvb_ctime;
1104         lov_stripe_unlock(lsm);
1105
1106         RETURN(rc);
1107 }
1108
1109 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1110  * file (because it prefers KMS over RSS when larger) */
1111 int ll_glimpse_size(struct inode *inode, int ast_flags)
1112 {
1113         struct ll_inode_info *lli = ll_i2info(inode);
1114         struct ll_sb_info *sbi = ll_i2sbi(inode);
1115         struct lustre_handle lockh = { 0 };
1116         struct ldlm_enqueue_info einfo = { 0 };
1117         struct obd_info oinfo = { { { 0 } } };
1118         int rc;
1119         ENTRY;
1120
1121         if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1122                 RETURN(0);
1123
1124         CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1125
1126         if (!lli->lli_smd) {
1127                 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1128                 RETURN(0);
1129         }
1130
1131         /* NOTE: this looks like DLM lock request, but it may not be one. Due
1132          *       to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1133          *       won't revoke any conflicting DLM locks held. Instead,
1134          *       ll_glimpse_callback() will be called on each client
1135          *       holding a DLM lock against this file, and resulting size
1136          *       will be returned for each stripe. DLM lock on [0, EOF] is
1137          *       acquired only if there were no conflicting locks. */
1138         einfo.ei_type = LDLM_EXTENT;
1139         einfo.ei_mode = LCK_PR;
1140         einfo.ei_cb_bl = osc_extent_blocking_cb;
1141         einfo.ei_cb_cp = ldlm_completion_ast;
1142         einfo.ei_cb_gl = ll_glimpse_callback;
1143         einfo.ei_cbdata = inode;
1144
1145         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1146         oinfo.oi_lockh = &lockh;
1147         oinfo.oi_md = lli->lli_smd;
1148         oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1149
1150         rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1151         if (rc == -ENOENT)
1152                 RETURN(rc);
1153         if (rc != 0) {
1154                 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1155                 RETURN(rc > 0 ? -EIO : rc);
1156         }
1157
1158         rc = ll_merge_lvb(inode);
1159
1160         CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1161                i_size_read(inode), (unsigned long long)inode->i_blocks);
1162
1163         RETURN(rc);
1164 }
1165
1166 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1167                    struct lov_stripe_md *lsm, int mode,
1168                    ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1169                    int ast_flags)
1170 {
1171         struct ll_sb_info *sbi = ll_i2sbi(inode);
1172         struct ost_lvb lvb;
1173         struct ldlm_enqueue_info einfo = { 0 };
1174         struct obd_info oinfo = { { { 0 } } };
1175         int rc;
1176         ENTRY;
1177
1178         LASSERT(!lustre_handle_is_used(lockh));
1179         LASSERT(lsm != NULL);
1180
1181         /* don't drop the mmapped file to LRU */
1182         if (mapping_mapped(inode->i_mapping))
1183                 ast_flags |= LDLM_FL_NO_LRU;
1184
1185         /* XXX phil: can we do this?  won't it screw the file size up? */
1186         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1187             (sbi->ll_flags & LL_SBI_NOLCK))
1188                 RETURN(0);
1189
1190         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1191                inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1192
1193         einfo.ei_type = LDLM_EXTENT;
1194         einfo.ei_mode = mode;
1195         einfo.ei_cb_bl = osc_extent_blocking_cb;
1196         einfo.ei_cb_cp = ldlm_completion_ast;
1197         einfo.ei_cb_gl = ll_glimpse_callback;
1198         einfo.ei_cbdata = inode;
1199
1200         oinfo.oi_policy = *policy;
1201         oinfo.oi_lockh = lockh;
1202         oinfo.oi_md = lsm;
1203         oinfo.oi_flags = ast_flags;
1204
1205         rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
1206         *policy = oinfo.oi_policy;
1207         if (rc > 0)
1208                 rc = -EIO;
1209
1210         ll_inode_size_lock(inode, 1);
1211         inode_init_lvb(inode, &lvb);
1212         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1213
1214         if (policy->l_extent.start == 0 &&
1215             policy->l_extent.end == OBD_OBJECT_EOF) {
1216                 /* vmtruncate()->ll_truncate() first sets the i_size and then
1217                  * the kms under both a DLM lock and the
1218                  * ll_inode_size_lock().  If we don't get the
1219                  * ll_inode_size_lock() here we can match the DLM lock and
1220                  * reset i_size from the kms before the truncating path has
1221                  * updated the kms.  generic_file_write can then trust the
1222                  * stale i_size when doing appending writes and effectively
1223                  * cancel the result of the truncate.  Getting the
1224                  * ll_inode_size_lock() after the enqueue maintains the DLM
1225                  * -> ll_inode_size_lock() acquiring order. */
1226                 i_size_write(inode, lvb.lvb_size);
1227                 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1228                        inode->i_ino, i_size_read(inode));
1229         }
1230
1231         if (rc == 0) {
1232                 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1233                 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1234                 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1235         }
1236         ll_inode_size_unlock(inode, 1);
1237
1238         RETURN(rc);
1239 }
1240
1241 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1242                      struct lov_stripe_md *lsm, int mode,
1243                      struct lustre_handle *lockh)
1244 {
1245         struct ll_sb_info *sbi = ll_i2sbi(inode);
1246         int rc;
1247         ENTRY;
1248
1249         /* XXX phil: can we do this?  won't it screw the file size up? */
1250         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1251             (sbi->ll_flags & LL_SBI_NOLCK))
1252                 RETURN(0);
1253
1254         rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
1255
1256         RETURN(rc);
1257 }
1258
1259 static void ll_set_file_contended(struct inode *inode)
1260 {
1261         struct ll_inode_info *lli = ll_i2info(inode);
1262         cfs_time_t now = cfs_time_current();
1263
1264         spin_lock(&lli->lli_lock);
1265         lli->lli_contention_time = now;
1266         lli->lli_flags |= LLIF_CONTENDED;
1267         spin_unlock(&lli->lli_lock);
1268 }
1269
1270 void ll_clear_file_contended(struct inode *inode)
1271 {
1272         struct ll_inode_info *lli = ll_i2info(inode);
1273
1274         spin_lock(&lli->lli_lock);
1275         lli->lli_flags &= ~LLIF_CONTENDED;
1276         spin_unlock(&lli->lli_lock);
1277 }
1278
1279 static int ll_is_file_contended(struct file *file)
1280 {
1281         struct inode *inode = file->f_dentry->d_inode;
1282         struct ll_inode_info *lli = ll_i2info(inode);
1283         struct ll_sb_info *sbi = ll_i2sbi(inode);
1284         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1285         ENTRY;
1286
1287         if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1288                 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1289                        " osc connect flags = 0x"LPX64"\n",
1290                        sbi->ll_lco.lco_flags);
1291                 RETURN(0);
1292         }
1293         if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1294                 RETURN(1);
1295         if (lli->lli_flags & LLIF_CONTENDED) {
1296                 cfs_time_t cur_time = cfs_time_current();
1297                 cfs_time_t retry_time;
1298
1299                 retry_time = cfs_time_add(
1300                         lli->lli_contention_time,
1301                         cfs_time_seconds(sbi->ll_contention_time));
1302                 if (cfs_time_after(cur_time, retry_time)) {
1303                         ll_clear_file_contended(inode);
1304                         RETURN(0);
1305                 }
1306                 RETURN(1);
1307         }
1308         RETURN(0);
1309 }
1310
1311 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1312                                  const char *buf, size_t count,
1313                                  loff_t start, loff_t end, int rw)
1314 {
1315         int append;
1316         int tree_locked = 0;
1317         int rc;
1318         struct inode * inode = file->f_dentry->d_inode;
1319         ENTRY;
1320
1321         append = (rw == WRITE) && (file->f_flags & O_APPEND);
1322
1323         if (append || !ll_is_file_contended(file)) {
1324                 struct ll_lock_tree_node *node;
1325                 int ast_flags;
1326
1327                 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1328                 if (file->f_flags & O_NONBLOCK)
1329                         ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1330                 node = ll_node_from_inode(inode, start, end,
1331                                           (rw == WRITE) ? LCK_PW : LCK_PR);
1332                 if (IS_ERR(node)) {
1333                         rc = PTR_ERR(node);
1334                         GOTO(out, rc);
1335                 }
1336                 tree->lt_fd = LUSTRE_FPRIVATE(file);
1337                 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
1338                 if (rc == 0)
1339                         tree_locked = 1;
1340                 else if (rc == -EUSERS)
1341                         ll_set_file_contended(inode);
1342                 else
1343                         GOTO(out, rc);
1344         }
1345         RETURN(tree_locked);
1346 out:
1347         return rc;
1348 }
1349
1350 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1351                             loff_t *ppos)
1352 {
1353         struct inode *inode = file->f_dentry->d_inode;
1354         struct ll_inode_info *lli = ll_i2info(inode);
1355         struct lov_stripe_md *lsm = lli->lli_smd;
1356         struct ll_sb_info *sbi = ll_i2sbi(inode);
1357         struct ll_lock_tree tree;
1358         struct ost_lvb lvb;
1359         struct ll_ra_read bead;
1360         int ra = 0;
1361         loff_t end;
1362         ssize_t retval, chunk, sum = 0;
1363         int tree_locked;
1364
1365         __u64 kms;
1366         ENTRY;
1367         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1368                inode->i_ino, inode->i_generation, inode, count, *ppos);
1369         /* "If nbyte is 0, read() will return 0 and have no other results."
1370          *                      -- Single Unix Spec */
1371         if (count == 0)
1372                 RETURN(0);
1373
1374         ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1375
1376         if (!lsm) {
1377                 /* Read on file with no objects should return zero-filled
1378                  * buffers up to file size (we can get non-zero sizes with
1379                  * mknod + truncate, then opening file for read. This is a
1380                  * common pattern in NFS case, it seems). Bug 6243 */
1381                 int notzeroed;
1382                 /* Since there are no objects on OSTs, we have nothing to get
1383                  * lock on and so we are forced to access inode->i_size
1384                  * unguarded */
1385
1386                 /* Read beyond end of file */
1387                 if (*ppos >= i_size_read(inode))
1388                         RETURN(0);
1389
1390                 if (count > i_size_read(inode) - *ppos)
1391                         count = i_size_read(inode) - *ppos;
1392                 /* Make sure to correctly adjust the file pos pointer for
1393                  * EFAULT case */
1394                 notzeroed = clear_user(buf, count);
1395                 count -= notzeroed;
1396                 *ppos += count;
1397                 if (!count)
1398                         RETURN(-EFAULT);
1399                 RETURN(count);
1400         }
1401 repeat:
1402         if (sbi->ll_max_rw_chunk != 0) {
1403                 /* first, let's know the end of the current stripe */
1404                 end = *ppos;
1405                 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1406                                 (obd_off *)&end);
1407
1408                 /* correct, the end is beyond the request */
1409                 if (end > *ppos + count - 1)
1410                         end = *ppos + count - 1;
1411
1412                 /* and chunk shouldn't be too large even if striping is wide */
1413                 if (end - *ppos > sbi->ll_max_rw_chunk)
1414                         end = *ppos + sbi->ll_max_rw_chunk - 1;
1415         } else {
1416                 end = *ppos + count - 1;
1417         }
1418
1419         tree_locked = ll_file_get_tree_lock(&tree, file, buf,
1420                                             count, *ppos, end, READ);
1421         if (tree_locked < 0)
1422                 GOTO(out, retval = tree_locked);
1423
1424         ll_inode_size_lock(inode, 1);
1425         /*
1426          * Consistency guarantees: following possibilities exist for the
1427          * relation between region being read and real file size at this
1428          * moment:
1429          *
1430          *  (A): the region is completely inside of the file;
1431          *
1432          *  (B-x): x bytes of region are inside of the file, the rest is
1433          *  outside;
1434          *
1435          *  (C): the region is completely outside of the file.
1436          *
1437          * This classification is stable under DLM lock acquired by
1438          * ll_tree_lock() above, because to change class, other client has to
1439          * take DLM lock conflicting with our lock. Also, any updates to
1440          * ->i_size by other threads on this client are serialized by
1441          * ll_inode_size_lock(). This guarantees that short reads are handled
1442          * correctly in the face of concurrent writes and truncates.
1443          */
1444         inode_init_lvb(inode, &lvb);
1445         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1446         kms = lvb.lvb_size;
1447         if (*ppos + count - 1 > kms) {
1448                 /* A glimpse is necessary to determine whether we return a
1449                  * short read (B) or some zeroes at the end of the buffer (C) */
1450                 ll_inode_size_unlock(inode, 1);
1451                 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1452                 if (retval) {
1453                         if (tree_locked)
1454                                 ll_tree_unlock(&tree);
1455                         goto out;
1456                 }
1457         } else {
1458                 /* region is within kms and, hence, within real file size (A).
1459                  * We need to increase i_size to cover the read region so that
1460                  * generic_file_read() will do its job, but that doesn't mean
1461                  * the kms size is _correct_, it is only the _minimum_ size.
1462                  * If someone does a stat they will get the correct size which
1463                  * will always be >= the kms value here.  b=11081 */
1464                 if (i_size_read(inode) < kms)
1465                         i_size_write(inode, kms);
1466                 ll_inode_size_unlock(inode, 1);
1467         }
1468
1469         chunk = end - *ppos + 1;
1470         CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1471                inode->i_ino, chunk, *ppos, i_size_read(inode));
1472
1473         if (tree_locked) {
1474                 /* turn off the kernel's read-ahead */
1475                 file->f_ra.ra_pages = 0;
1476
1477                 /* initialize read-ahead window once per syscall */
1478                 if (ra == 0) {
1479                         ra = 1;
1480                         bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1481                         bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1482                         ll_ra_read_in(file, &bead);
1483                 }
1484
1485                 /* BUG: 5972 */
1486                 file_accessed(file);
1487                 retval = generic_file_read(file, buf, chunk, ppos);
1488                 ll_tree_unlock(&tree);
1489         } else {
1490                 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1491         }
1492
1493         ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
1494
1495         if (retval > 0) {
1496                 buf += retval;
1497                 count -= retval;
1498                 sum += retval;
1499                 if (retval == chunk && count > 0)
1500                         goto repeat;
1501         }
1502
1503  out:
1504         if (ra != 0)
1505                 ll_ra_read_ex(file, &bead);
1506         retval = (sum > 0) ? sum : retval;
1507         RETURN(retval);
1508 }
1509
1510 /*
1511  * Write to a file (through the page cache).
1512  */
1513 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1514                              loff_t *ppos)
1515 {
1516         struct inode *inode = file->f_dentry->d_inode;
1517         struct ll_sb_info *sbi = ll_i2sbi(inode);
1518         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1519         struct ll_lock_tree tree;
1520         loff_t maxbytes = ll_file_maxbytes(inode);
1521         loff_t lock_start, lock_end, end;
1522         ssize_t retval, chunk, sum = 0;
1523         int tree_locked;
1524         ENTRY;
1525
1526         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1527                inode->i_ino, inode->i_generation, inode, count, *ppos);
1528
1529         SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1530
1531         /* POSIX, but surprised the VFS doesn't check this already */
1532         if (count == 0)
1533                 RETURN(0);
1534
1535         /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1536          * called on the file, don't fail the below assertion (bug 2388). */
1537         if (file->f_flags & O_LOV_DELAY_CREATE &&
1538             ll_i2info(inode)->lli_smd == NULL)
1539                 RETURN(-EBADF);
1540
1541         LASSERT(ll_i2info(inode)->lli_smd != NULL);
1542
1543         down(&ll_i2info(inode)->lli_write_sem);
1544
1545 repeat:
1546         chunk = 0; /* just to fix gcc's warning */
1547         end = *ppos + count - 1;
1548
1549         if (file->f_flags & O_APPEND) {
1550                 lock_start = 0;
1551                 lock_end = OBD_OBJECT_EOF;
1552         } else if (sbi->ll_max_rw_chunk != 0) {
1553                 /* first, let's know the end of the current stripe */
1554                 end = *ppos;
1555                 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1556                                 (obd_off *)&end);
1557
1558                 /* correct, the end is beyond the request */
1559                 if (end > *ppos + count - 1)
1560                         end = *ppos + count - 1;
1561
1562                 /* and chunk shouldn't be too large even if striping is wide */
1563                 if (end - *ppos > sbi->ll_max_rw_chunk)
1564                         end = *ppos + sbi->ll_max_rw_chunk - 1;
1565                 lock_start = *ppos;
1566                 lock_end = end;
1567         } else {
1568                 lock_start = *ppos;
1569                 lock_end = *ppos + count - 1;
1570         }
1571
1572         tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1573                                             lock_start, lock_end, WRITE);
1574         if (tree_locked < 0)
1575                 GOTO(out, retval = tree_locked);
1576
1577         /* This is ok, g_f_w will overwrite this under i_sem if it races
1578          * with a local truncate, it just makes our maxbyte checking easier.
1579          * The i_size value gets updated in ll_extent_lock() as a consequence
1580          * of the [0,EOF] extent lock we requested above. */
1581         if (file->f_flags & O_APPEND) {
1582                 *ppos = i_size_read(inode);
1583                 end = *ppos + count - 1;
1584         }
1585
1586         if (*ppos >= maxbytes) {
1587                 send_sig(SIGXFSZ, current, 0);
1588                 GOTO(out_unlock, retval = -EFBIG);
1589         }
1590         if (end > maxbytes - 1)
1591                 end = maxbytes - 1;
1592
1593         /* generic_file_write handles O_APPEND after getting i_mutex */
1594         chunk = end - *ppos + 1;
1595         CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1596                inode->i_ino, chunk, *ppos);
1597         if (tree_locked)
1598                 retval = generic_file_write(file, buf, chunk, ppos);
1599         else
1600                 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1601                                              ppos, WRITE);
1602         ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1603
1604 out_unlock:
1605         if (tree_locked)
1606                 ll_tree_unlock(&tree);
1607
1608 out:
1609         if (retval > 0) {
1610                 buf += retval;
1611                 count -= retval;
1612                 sum += retval;
1613                 if (retval == chunk && count > 0)
1614                         goto repeat;
1615         }
1616
1617         up(&ll_i2info(inode)->lli_write_sem);
1618
1619         retval = (sum > 0) ? sum : retval;
1620         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1621                            retval > 0 ? retval : 0);
1622         RETURN(retval);
1623 }
1624
1625 /*
1626  * Send file content (through pagecache) somewhere with helper
1627  */
1628 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1629                                 read_actor_t actor, void *target)
1630 {
1631         struct inode *inode = in_file->f_dentry->d_inode;
1632         struct ll_inode_info *lli = ll_i2info(inode);
1633         struct lov_stripe_md *lsm = lli->lli_smd;
1634         struct ll_lock_tree tree;
1635         struct ll_lock_tree_node *node;
1636         struct ost_lvb lvb;
1637         struct ll_ra_read bead;
1638         int rc;
1639         ssize_t retval;
1640         __u64 kms;
1641         ENTRY;
1642         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1643                inode->i_ino, inode->i_generation, inode, count, *ppos);
1644
1645         /* "If nbyte is 0, read() will return 0 and have no other results."
1646          *                      -- Single Unix Spec */
1647         if (count == 0)
1648                 RETURN(0);
1649
1650         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1651         /* turn off the kernel's read-ahead */
1652         in_file->f_ra.ra_pages = 0;
1653
1654         /* File with no objects, nothing to lock */
1655         if (!lsm)
1656                 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1657
1658         node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1659         if (IS_ERR(node))
1660                 RETURN(PTR_ERR(node));
1661
1662         tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1663         rc = ll_tree_lock(&tree, node, NULL, count,
1664                           in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1665         if (rc != 0)
1666                 RETURN(rc);
1667
1668         ll_clear_file_contended(inode);
1669         ll_inode_size_lock(inode, 1);
1670         /*
1671          * Consistency guarantees: following possibilities exist for the
1672          * relation between region being read and real file size at this
1673          * moment:
1674          *
1675          *  (A): the region is completely inside of the file;
1676          *
1677          *  (B-x): x bytes of region are inside of the file, the rest is
1678          *  outside;
1679          *
1680          *  (C): the region is completely outside of the file.
1681          *
1682          * This classification is stable under DLM lock acquired by
1683          * ll_tree_lock() above, because to change class, other client has to
1684          * take DLM lock conflicting with our lock. Also, any updates to
1685          * ->i_size by other threads on this client are serialized by
1686          * ll_inode_size_lock(). This guarantees that short reads are handled
1687          * correctly in the face of concurrent writes and truncates.
1688          */
1689         inode_init_lvb(inode, &lvb);
1690         obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1691         kms = lvb.lvb_size;
1692         if (*ppos + count - 1 > kms) {
1693                 /* A glimpse is necessary to determine whether we return a
1694                  * short read (B) or some zeroes at the end of the buffer (C) */
1695                 ll_inode_size_unlock(inode, 1);
1696                 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1697                 if (retval)
1698                         goto out;
1699         } else {
1700                 /* region is within kms and, hence, within real file size (A) */
1701                 i_size_write(inode, kms);
1702                 ll_inode_size_unlock(inode, 1);
1703         }
1704
1705         CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1706                inode->i_ino, count, *ppos, i_size_read(inode));
1707
1708         bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1709         bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1710         ll_ra_read_in(in_file, &bead);
1711         /* BUG: 5972 */
1712         file_accessed(in_file);
1713         retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1714         ll_ra_read_ex(in_file, &bead);
1715
1716  out:
1717         ll_tree_unlock(&tree);
1718         RETURN(retval);
1719 }
1720
1721 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1722                                unsigned long arg)
1723 {
1724         struct ll_inode_info *lli = ll_i2info(inode);
1725         struct obd_export *exp = ll_i2dtexp(inode);
1726         struct ll_recreate_obj ucreatp;
1727         struct obd_trans_info oti = { 0 };
1728         struct obdo *oa = NULL;
1729         int lsm_size;
1730         int rc = 0;
1731         struct lov_stripe_md *lsm, *lsm2;
1732         ENTRY;
1733
1734         if (!capable (CAP_SYS_ADMIN))
1735                 RETURN(-EPERM);
1736
1737         rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1738                             sizeof(struct ll_recreate_obj));
1739         if (rc) {
1740                 RETURN(-EFAULT);
1741         }
1742         OBDO_ALLOC(oa);
1743         if (oa == NULL)
1744                 RETURN(-ENOMEM);
1745
1746         down(&lli->lli_size_sem);
1747         lsm = lli->lli_smd;
1748         if (lsm == NULL)
1749                 GOTO(out, rc = -ENOENT);
1750         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1751                    (lsm->lsm_stripe_count));
1752
1753         OBD_ALLOC(lsm2, lsm_size);
1754         if (lsm2 == NULL)
1755                 GOTO(out, rc = -ENOMEM);
1756
1757         oa->o_id = ucreatp.lrc_id;
1758         oa->o_gr = ucreatp.lrc_group;
1759         oa->o_nlink = ucreatp.lrc_ost_idx;
1760         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1761         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1762         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1763                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1764
1765         memcpy(lsm2, lsm, lsm_size);
1766         rc = obd_create(exp, oa, &lsm2, &oti);
1767
1768         OBD_FREE(lsm2, lsm_size);
1769         GOTO(out, rc);
1770 out:
1771         up(&lli->lli_size_sem);
1772         OBDO_FREE(oa);
1773         return rc;
1774 }
1775
1776 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1777                              int flags, struct lov_user_md *lum, int lum_size)
1778 {
1779         struct ll_inode_info *lli = ll_i2info(inode);
1780         struct lov_stripe_md *lsm;
1781         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1782         int rc = 0;
1783         ENTRY;
1784
1785         down(&lli->lli_size_sem);
1786         lsm = lli->lli_smd;
1787         if (lsm) {
1788                 up(&lli->lli_size_sem);
1789                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1790                        inode->i_ino);
1791                 RETURN(-EEXIST);
1792         }
1793
1794         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1795         if (rc)
1796                 GOTO(out, rc);
1797         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1798                 GOTO(out_req_free, rc = -ENOENT);
1799         rc = oit.d.lustre.it_status;
1800         if (rc < 0)
1801                 GOTO(out_req_free, rc);
1802
1803         ll_release_openhandle(file->f_dentry, &oit);
1804
1805  out:
1806         up(&lli->lli_size_sem);
1807         ll_intent_release(&oit);
1808         RETURN(rc);
1809 out_req_free:
1810         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1811         goto out;
1812 }
1813
1814 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1815                              struct lov_mds_md **lmmp, int *lmm_size,
1816                              struct ptlrpc_request **request)
1817 {
1818         struct ll_sb_info *sbi = ll_i2sbi(inode);
1819         struct mdt_body  *body;
1820         struct lov_mds_md *lmm = NULL;
1821         struct ptlrpc_request *req = NULL;
1822         struct obd_capa *oc;
1823         int rc, lmmsize;
1824
1825         rc = ll_get_max_mdsize(sbi, &lmmsize);
1826         if (rc)
1827                 RETURN(rc);
1828
1829         oc = ll_mdscapa_get(inode);
1830         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1831                              oc, filename, strlen(filename) + 1,
1832                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1833                              ll_i2suppgid(inode), &req);
1834         capa_put(oc);
1835         if (rc < 0) {
1836                 CDEBUG(D_INFO, "md_getattr_name failed "
1837                        "on %s: rc %d\n", filename, rc);
1838                 GOTO(out, rc);
1839         }
1840
1841         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1842         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1843
1844         lmmsize = body->eadatasize;
1845
1846         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1847                         lmmsize == 0) {
1848                 GOTO(out, rc = -ENODATA);
1849         }
1850
1851         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1852         LASSERT(lmm != NULL);
1853
1854         /*
1855          * This is coming from the MDS, so is probably in
1856          * little endian.  We convert it to host endian before
1857          * passing it to userspace.
1858          */
1859         if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1860                 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1861                 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1862         } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1863                 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1864         }
1865
1866         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1867                 struct lov_stripe_md *lsm;
1868                 struct lov_user_md_join *lmj;
1869                 int lmj_size, i, aindex = 0;
1870
1871                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1872                 if (rc < 0)
1873                         GOTO(out, rc = -ENOMEM);
1874                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1875                 if (rc)
1876                         GOTO(out_free_memmd, rc);
1877
1878                 lmj_size = sizeof(struct lov_user_md_join) +
1879                            lsm->lsm_stripe_count *
1880                            sizeof(struct lov_user_ost_data_join);
1881                 OBD_ALLOC(lmj, lmj_size);
1882                 if (!lmj)
1883                         GOTO(out_free_memmd, rc = -ENOMEM);
1884
1885                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1886                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1887                         struct lov_extent *lex =
1888                                 &lsm->lsm_array->lai_ext_array[aindex];
1889
1890                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1891                                 aindex ++;
1892                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1893                                         LPU64" len %d\n", aindex, i,
1894                                         lex->le_start, (int)lex->le_len);
1895                         lmj->lmm_objects[i].l_extent_start =
1896                                 lex->le_start;
1897
1898                         if ((int)lex->le_len == -1)
1899                                 lmj->lmm_objects[i].l_extent_end = -1;
1900                         else
1901                                 lmj->lmm_objects[i].l_extent_end =
1902                                         lex->le_start + lex->le_len;
1903                         lmj->lmm_objects[i].l_object_id =
1904                                 lsm->lsm_oinfo[i]->loi_id;
1905                         lmj->lmm_objects[i].l_object_gr =
1906                                 lsm->lsm_oinfo[i]->loi_gr;
1907                         lmj->lmm_objects[i].l_ost_gen =
1908                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1909                         lmj->lmm_objects[i].l_ost_idx =
1910                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1911                 }
1912                 lmm = (struct lov_mds_md *)lmj;
1913                 lmmsize = lmj_size;
1914 out_free_memmd:
1915                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1916         }
1917 out:
1918         *lmmp = lmm;
1919         *lmm_size = lmmsize;
1920         *request = req;
1921         return rc;
1922 }
1923
1924 static int ll_lov_setea(struct inode *inode, struct file *file,
1925                             unsigned long arg)
1926 {
1927         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1928         struct lov_user_md  *lump;
1929         int lum_size = sizeof(struct lov_user_md) +
1930                        sizeof(struct lov_user_ost_data);
1931         int rc;
1932         ENTRY;
1933
1934         if (!capable (CAP_SYS_ADMIN))
1935                 RETURN(-EPERM);
1936
1937         OBD_ALLOC(lump, lum_size);
1938         if (lump == NULL) {
1939                 RETURN(-ENOMEM);
1940         }
1941         rc = copy_from_user(lump, (struct lov_user_md  *)arg, lum_size);
1942         if (rc) {
1943                 OBD_FREE(lump, lum_size);
1944                 RETURN(-EFAULT);
1945         }
1946
1947         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1948
1949         OBD_FREE(lump, lum_size);
1950         RETURN(rc);
1951 }
1952
1953 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1954                             unsigned long arg)
1955 {
1956         struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1957         int rc;
1958         int flags = FMODE_WRITE;
1959         ENTRY;
1960
1961         /* Bug 1152: copy properly when this is no longer true */
1962         LASSERT(sizeof(lum) == sizeof(*lump));
1963         LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1964         rc = copy_from_user(&lum, lump, sizeof(lum));
1965         if (rc)
1966                 RETURN(-EFAULT);
1967
1968         rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1969         if (rc == 0) {
1970                  put_user(0, &lump->lmm_stripe_count);
1971                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1972                                     0, ll_i2info(inode)->lli_smd, lump);
1973         }
1974         RETURN(rc);
1975 }
1976
1977 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1978 {
1979         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1980
1981         if (!lsm)
1982                 RETURN(-ENODATA);
1983
1984         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1985                             (void *)arg);
1986 }
1987
1988 static int ll_get_grouplock(struct inode *inode, struct file *file,
1989                             unsigned long arg)
1990 {
1991         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1992         ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1993                                                     .end = OBD_OBJECT_EOF}};
1994         struct lustre_handle lockh = { 0 };
1995         struct ll_inode_info *lli = ll_i2info(inode);
1996         struct lov_stripe_md *lsm = lli->lli_smd;
1997         int flags = 0, rc;
1998         ENTRY;
1999
2000         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2001                 RETURN(-EINVAL);
2002         }
2003
2004         policy.l_extent.gid = arg;
2005         if (file->f_flags & O_NONBLOCK)
2006                 flags = LDLM_FL_BLOCK_NOWAIT;
2007
2008         rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2009         if (rc)
2010                 RETURN(rc);
2011
2012         fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2013         fd->fd_gid = arg;
2014         memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
2015
2016         RETURN(0);
2017 }
2018
2019 static int ll_put_grouplock(struct inode *inode, struct file *file,
2020                             unsigned long arg)
2021 {
2022         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2023         struct ll_inode_info *lli = ll_i2info(inode);
2024         struct lov_stripe_md *lsm = lli->lli_smd;
2025         int rc;
2026         ENTRY;
2027
2028         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2029                 /* Ugh, it's already unlocked. */
2030                 RETURN(-EINVAL);
2031         }
2032
2033         if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2034                 RETURN(-EINVAL);
2035
2036         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2037
2038         rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2039         if (rc)
2040                 RETURN(rc);
2041
2042         fd->fd_gid = 0;
2043         memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2044
2045         RETURN(0);
2046 }
2047
2048 static int join_sanity_check(struct inode *head, struct inode *tail)
2049 {
2050         ENTRY;
2051         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2052                 CERROR("server do not support join \n");
2053                 RETURN(-EINVAL);
2054         }
2055         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2056                 CERROR("tail ino %lu and ino head %lu must be regular\n",
2057                        head->i_ino, tail->i_ino);
2058                 RETURN(-EINVAL);
2059         }
2060         if (head->i_ino == tail->i_ino) {
2061                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2062                 RETURN(-EINVAL);
2063         }
2064         if (i_size_read(head) % JOIN_FILE_ALIGN) {
2065                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
2066                 RETURN(-EINVAL);
2067         }
2068         RETURN(0);
2069 }
2070
2071 static int join_file(struct inode *head_inode, struct file *head_filp,
2072                      struct file *tail_filp)
2073 {
2074         struct dentry *tail_dentry = tail_filp->f_dentry;
2075         struct lookup_intent oit = {.it_op = IT_OPEN,
2076                                    .it_flags = head_filp->f_flags|O_JOIN_FILE};
2077         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2078                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2079
2080         struct lustre_handle lockh;
2081         struct md_op_data *op_data;
2082         int    rc;
2083         loff_t data;
2084         ENTRY;
2085
2086         tail_dentry = tail_filp->f_dentry;
2087
2088         data = i_size_read(head_inode);
2089         op_data = ll_prep_md_op_data(NULL, head_inode,
2090                                      tail_dentry->d_parent->d_inode,
2091                                      tail_dentry->d_name.name,
2092                                      tail_dentry->d_name.len, 0,
2093                                      LUSTRE_OPC_ANY, &data);
2094         if (IS_ERR(op_data))
2095                 RETURN(PTR_ERR(op_data));
2096
2097         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2098                          op_data, &lockh, NULL, 0, 0);
2099
2100         ll_finish_md_op_data(op_data);
2101         if (rc < 0)
2102                 GOTO(out, rc);
2103
2104         rc = oit.d.lustre.it_status;
2105
2106         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2107                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2108                 ptlrpc_req_finished((struct ptlrpc_request *)
2109                                     oit.d.lustre.it_data);
2110                 GOTO(out, rc);
2111         }
2112
2113         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2114                                            * away */
2115                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2116                 oit.d.lustre.it_lock_mode = 0;
2117         }
2118         ll_release_openhandle(head_filp->f_dentry, &oit);
2119 out:
2120         ll_intent_release(&oit);
2121         RETURN(rc);
2122 }
2123
2124 static int ll_file_join(struct inode *head, struct file *filp,
2125                         char *filename_tail)
2126 {
2127         struct inode *tail = NULL, *first = NULL, *second = NULL;
2128         struct dentry *tail_dentry;
2129         struct file *tail_filp, *first_filp, *second_filp;
2130         struct ll_lock_tree first_tree, second_tree;
2131         struct ll_lock_tree_node *first_node, *second_node;
2132         struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2133         int rc = 0, cleanup_phase = 0;
2134         ENTRY;
2135
2136         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2137                head->i_ino, head->i_generation, head, filename_tail);
2138
2139         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2140         if (IS_ERR(tail_filp)) {
2141                 CERROR("Can not open tail file %s", filename_tail);
2142                 rc = PTR_ERR(tail_filp);
2143                 GOTO(cleanup, rc);
2144         }
2145         tail = igrab(tail_filp->f_dentry->d_inode);
2146
2147         tlli = ll_i2info(tail);
2148         tail_dentry = tail_filp->f_dentry;
2149         LASSERT(tail_dentry);
2150         cleanup_phase = 1;
2151
2152         /*reorder the inode for lock sequence*/
2153         first = head->i_ino > tail->i_ino ? head : tail;
2154         second = head->i_ino > tail->i_ino ? tail : head;
2155         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2156         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2157
2158         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2159                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2160         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2161         if (IS_ERR(first_node)){
2162                 rc = PTR_ERR(first_node);
2163                 GOTO(cleanup, rc);
2164         }
2165         first_tree.lt_fd = first_filp->private_data;
2166         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2167         if (rc != 0)
2168                 GOTO(cleanup, rc);
2169         cleanup_phase = 2;
2170
2171         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2172         if (IS_ERR(second_node)){
2173                 rc = PTR_ERR(second_node);
2174                 GOTO(cleanup, rc);
2175         }
2176         second_tree.lt_fd = second_filp->private_data;
2177         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2178         if (rc != 0)
2179                 GOTO(cleanup, rc);
2180         cleanup_phase = 3;
2181
2182         rc = join_sanity_check(head, tail);
2183         if (rc)
2184                 GOTO(cleanup, rc);
2185
2186         rc = join_file(head, filp, tail_filp);
2187         if (rc)
2188                 GOTO(cleanup, rc);
2189 cleanup:
2190         switch (cleanup_phase) {
2191         case 3:
2192                 ll_tree_unlock(&second_tree);
2193                 obd_cancel_unused(ll_i2dtexp(second),
2194                                   ll_i2info(second)->lli_smd, 0, NULL);
2195         case 2:
2196                 ll_tree_unlock(&first_tree);
2197                 obd_cancel_unused(ll_i2dtexp(first),
2198                                   ll_i2info(first)->lli_smd, 0, NULL);
2199         case 1:
2200                 filp_close(tail_filp, 0);
2201                 if (tail)
2202                         iput(tail);
2203                 if (head && rc == 0) {
2204                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2205                                        &hlli->lli_smd);
2206                         hlli->lli_smd = NULL;
2207                 }
2208         case 0:
2209                 break;
2210         default:
2211                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2212                 LBUG();
2213         }
2214         RETURN(rc);
2215 }
2216
2217 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2218 {
2219         struct inode *inode = dentry->d_inode;
2220         struct obd_client_handle *och;
2221         int rc;
2222         ENTRY;
2223
2224         LASSERT(inode);
2225
2226         /* Root ? Do nothing. */
2227         if (dentry->d_inode->i_sb->s_root == dentry)
2228                 RETURN(0);
2229
2230         /* No open handle to close? Move away */
2231         if (!it_disposition(it, DISP_OPEN_OPEN))
2232                 RETURN(0);
2233
2234         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2235
2236         OBD_ALLOC(och, sizeof(*och));
2237         if (!och)
2238                 GOTO(out, rc = -ENOMEM);
2239
2240         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2241                     ll_i2info(inode), it, och);
2242
2243         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2244                                        inode, och);
2245  out:
2246         /* this one is in place of ll_file_open */
2247         ptlrpc_req_finished(it->d.lustre.it_data);
2248         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2249         RETURN(rc);
2250 }
2251
2252 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2253                   unsigned long arg)
2254 {
2255         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2256         int flags;
2257         ENTRY;
2258
2259         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2260                inode->i_generation, inode, cmd);
2261         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2262
2263         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2264         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2265                 RETURN(-ENOTTY);
2266
2267         switch(cmd) {
2268         case LL_IOC_GETFLAGS:
2269                 /* Get the current value of the file flags */
2270                 return put_user(fd->fd_flags, (int *)arg);
2271         case LL_IOC_SETFLAGS:
2272         case LL_IOC_CLRFLAGS:
2273                 /* Set or clear specific file flags */
2274                 /* XXX This probably needs checks to ensure the flags are
2275                  *     not abused, and to handle any flag side effects.
2276                  */
2277                 if (get_user(flags, (int *) arg))
2278                         RETURN(-EFAULT);
2279
2280                 if (cmd == LL_IOC_SETFLAGS) {
2281                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2282                             !(file->f_flags & O_DIRECT)) {
2283                                 CERROR("%s: unable to disable locking on "
2284                                        "non-O_DIRECT file\n", current->comm);
2285                                 RETURN(-EINVAL);
2286                         }
2287
2288                         fd->fd_flags |= flags;
2289                 } else {
2290                         fd->fd_flags &= ~flags;
2291                 }
2292                 RETURN(0);
2293         case LL_IOC_LOV_SETSTRIPE:
2294                 RETURN(ll_lov_setstripe(inode, file, arg));
2295         case LL_IOC_LOV_SETEA:
2296                 RETURN(ll_lov_setea(inode, file, arg));
2297         case LL_IOC_LOV_GETSTRIPE:
2298                 RETURN(ll_lov_getstripe(inode, arg));
2299         case LL_IOC_RECREATE_OBJ:
2300                 RETURN(ll_lov_recreate_obj(inode, file, arg));
2301         case EXT3_IOC_GETFLAGS:
2302         case EXT3_IOC_SETFLAGS:
2303                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2304         case EXT3_IOC_GETVERSION_OLD:
2305         case EXT3_IOC_GETVERSION:
2306                 RETURN(put_user(inode->i_generation, (int *)arg));
2307         case LL_IOC_JOIN: {
2308                 char *ftail;
2309                 int rc;
2310
2311                 ftail = getname((const char *)arg);
2312                 if (IS_ERR(ftail))
2313                         RETURN(PTR_ERR(ftail));
2314                 rc = ll_file_join(inode, file, ftail);
2315                 putname(ftail);
2316                 RETURN(rc);
2317         }
2318         case LL_IOC_GROUP_LOCK:
2319                 RETURN(ll_get_grouplock(inode, file, arg));
2320         case LL_IOC_GROUP_UNLOCK:
2321                 RETURN(ll_put_grouplock(inode, file, arg));
2322         case IOC_OBD_STATFS:
2323                 RETURN(ll_obd_statfs(inode, (void *)arg));
2324
2325         /* We need to special case any other ioctls we want to handle,
2326          * to send them to the MDS/OST as appropriate and to properly
2327          * network encode the arg field.
2328         case EXT3_IOC_SETVERSION_OLD:
2329         case EXT3_IOC_SETVERSION:
2330         */
2331         case LL_IOC_FLUSHCTX:
2332                 RETURN(ll_flush_ctx(inode));
2333         default: {
2334                 int err;
2335
2336                 if (LLIOC_STOP ==
2337                     ll_iocontrol_call(inode, file, cmd, arg, &err))
2338                         RETURN(err);
2339
2340                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2341                                      (void *)arg));
2342         }
2343         }
2344 }
2345
2346 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2347 {
2348         struct inode *inode = file->f_dentry->d_inode;
2349         struct ll_inode_info *lli = ll_i2info(inode);
2350         struct lov_stripe_md *lsm = lli->lli_smd;
2351         loff_t retval;
2352         ENTRY;
2353         retval = offset + ((origin == 2) ? i_size_read(inode) :
2354                            (origin == 1) ? file->f_pos : 0);
2355         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2356                inode->i_ino, inode->i_generation, inode, retval, retval,
2357                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2358         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2359
2360         if (origin == 2) { /* SEEK_END */
2361                 int nonblock = 0, rc;
2362
2363                 if (file->f_flags & O_NONBLOCK)
2364                         nonblock = LDLM_FL_BLOCK_NOWAIT;
2365
2366                 if (lsm != NULL) {
2367                         rc = ll_glimpse_size(inode, nonblock);
2368                         if (rc != 0)
2369                                 RETURN(rc);
2370                 }
2371
2372                 ll_inode_size_lock(inode, 0);
2373                 offset += i_size_read(inode);
2374                 ll_inode_size_unlock(inode, 0);
2375         } else if (origin == 1) { /* SEEK_CUR */
2376                 offset += file->f_pos;
2377         }
2378
2379         retval = -EINVAL;
2380         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2381                 if (offset != file->f_pos) {
2382                         file->f_pos = offset;
2383 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2384                         file->f_reada = 0;
2385                         file->f_version = ++event;
2386 #endif
2387                 }
2388                 retval = offset;
2389         }
2390
2391         RETURN(retval);
2392 }
2393
2394 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2395 {
2396         struct inode *inode = dentry->d_inode;
2397         struct ll_inode_info *lli = ll_i2info(inode);
2398         struct lov_stripe_md *lsm = lli->lli_smd;
2399         struct ptlrpc_request *req;
2400         struct obd_capa *oc;
2401         int rc, err;
2402         ENTRY;
2403         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2404                inode->i_generation, inode);
2405         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2406
2407         /* fsync's caller has already called _fdata{sync,write}, we want
2408          * that IO to finish before calling the osc and mdc sync methods */
2409         rc = filemap_fdatawait(inode->i_mapping);
2410
2411         /* catch async errors that were recorded back when async writeback
2412          * failed for pages in this mapping. */
2413         err = lli->lli_async_rc;
2414         lli->lli_async_rc = 0;
2415         if (rc == 0)
2416                 rc = err;
2417         if (lsm) {
2418                 err = lov_test_and_clear_async_rc(lsm);
2419                 if (rc == 0)
2420                         rc = err;
2421         }
2422
2423         oc = ll_mdscapa_get(inode);
2424         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2425                       &req);
2426         capa_put(oc);
2427         if (!rc)
2428                 rc = err;
2429         if (!err)
2430                 ptlrpc_req_finished(req);
2431
2432         if (data && lsm) {
2433                 struct obdo *oa;
2434
2435                 OBDO_ALLOC(oa);
2436                 if (!oa)
2437                         RETURN(rc ? rc : -ENOMEM);
2438
2439                 oa->o_id = lsm->lsm_object_id;
2440                 oa->o_gr = lsm->lsm_object_gr;
2441                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2442                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2443                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2444                                            OBD_MD_FLGROUP);
2445
2446                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2447                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2448                                0, OBD_OBJECT_EOF, oc);
2449                 capa_put(oc);
2450                 if (!rc)
2451                         rc = err;
2452                 OBDO_FREE(oa);
2453         }
2454
2455         RETURN(rc);
2456 }
2457
2458 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2459 {
2460         struct inode *inode = file->f_dentry->d_inode;
2461         struct ll_sb_info *sbi = ll_i2sbi(inode);
2462         struct ldlm_res_id res_id =
2463                 { .name = { fid_seq(ll_inode2fid(inode)),
2464                             fid_oid(ll_inode2fid(inode)),
2465                             fid_ver(ll_inode2fid(inode)),
2466                             LDLM_FLOCK} };
2467         struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2468                 ldlm_flock_completion_ast, NULL, file_lock };
2469         struct lustre_handle lockh = {0};
2470         ldlm_policy_data_t flock;
2471         int flags = 0;
2472         int rc;
2473         ENTRY;
2474
2475         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2476                inode->i_ino, file_lock);
2477
2478         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2479
2480         if (file_lock->fl_flags & FL_FLOCK) {
2481                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2482                 /* set missing params for flock() calls */
2483                 file_lock->fl_end = OFFSET_MAX;
2484                 file_lock->fl_pid = current->tgid;
2485         }
2486         flock.l_flock.pid = file_lock->fl_pid;
2487         flock.l_flock.start = file_lock->fl_start;
2488         flock.l_flock.end = file_lock->fl_end;
2489
2490         switch (file_lock->fl_type) {
2491         case F_RDLCK:
2492                 einfo.ei_mode = LCK_PR;
2493                 break;
2494         case F_UNLCK:
2495                 /* An unlock request may or may not have any relation to
2496                  * existing locks so we may not be able to pass a lock handle
2497                  * via a normal ldlm_lock_cancel() request. The request may even
2498                  * unlock a byte range in the middle of an existing lock. In
2499                  * order to process an unlock request we need all of the same
2500                  * information that is given with a normal read or write record
2501                  * lock request. To avoid creating another ldlm unlock (cancel)
2502                  * message we'll treat a LCK_NL flock request as an unlock. */
2503                 einfo.ei_mode = LCK_NL;
2504                 break;
2505         case F_WRLCK:
2506                 einfo.ei_mode = LCK_PW;
2507                 break;
2508         default:
2509                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2510                 LBUG();
2511         }
2512
2513         switch (cmd) {
2514         case F_SETLKW:
2515 #ifdef F_SETLKW64
2516         case F_SETLKW64:
2517 #endif
2518                 flags = 0;
2519                 break;
2520         case F_SETLK:
2521 #ifdef F_SETLK64
2522         case F_SETLK64:
2523 #endif
2524                 flags = LDLM_FL_BLOCK_NOWAIT;
2525                 break;
2526         case F_GETLK:
2527 #ifdef F_GETLK64
2528         case F_GETLK64:
2529 #endif
2530                 flags = LDLM_FL_TEST_LOCK;
2531                 /* Save the old mode so that if the mode in the lock changes we
2532                  * can decrement the appropriate reader or writer refcount. */
2533                 file_lock->fl_type = einfo.ei_mode;
2534                 break;
2535         default:
2536                 CERROR("unknown fcntl lock command: %d\n", cmd);
2537                 LBUG();
2538         }
2539
2540         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2541                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2542                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2543
2544         rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2545                               &flock, &flags, NULL, 0, NULL, &lockh, 0);
2546         if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2547                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2548 #ifdef HAVE_F_OP_FLOCK
2549         if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2550             !(flags & LDLM_FL_TEST_LOCK))
2551                 posix_lock_file_wait(file, file_lock);
2552 #endif
2553
2554         RETURN(rc);
2555 }
2556
2557 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2558 {
2559         ENTRY;
2560
2561         RETURN(-ENOSYS);
2562 }
2563
2564 int ll_have_md_lock(struct inode *inode, __u64 bits)
2565 {
2566         struct lustre_handle lockh;
2567         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2568         struct lu_fid *fid;
2569         int flags;
2570         ENTRY;
2571
2572         if (!inode)
2573                RETURN(0);
2574
2575         fid = &ll_i2info(inode)->lli_fid;
2576         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2577
2578         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2579         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2580                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2581                 RETURN(1);
2582         }
2583         RETURN(0);
2584 }
2585
2586 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2587                             struct lustre_handle *lockh)
2588 {
2589         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2590         struct lu_fid *fid;
2591         ldlm_mode_t rc;
2592         int flags;
2593         ENTRY;
2594
2595         fid = &ll_i2info(inode)->lli_fid;
2596         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2597
2598         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2599         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2600                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2601         RETURN(rc);
2602 }
2603
2604 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2605         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2606                               * and return success */
2607                 inode->i_nlink = 0;
2608                 /* This path cannot be hit for regular files unless in
2609                  * case of obscure races, so no need to to validate
2610                  * size. */
2611                 if (!S_ISREG(inode->i_mode) &&
2612                     !S_ISDIR(inode->i_mode))
2613                         return 0;
2614         }
2615
2616         if (rc) {
2617                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2618                 return -abs(rc);
2619
2620         }
2621
2622         return 0;
2623 }
2624
2625 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2626 {
2627         struct inode *inode = dentry->d_inode;
2628         struct ptlrpc_request *req = NULL;
2629         struct ll_sb_info *sbi;
2630         struct obd_export *exp;
2631         int rc;
2632         ENTRY;
2633
2634         if (!inode) {
2635                 CERROR("REPORT THIS LINE TO PETER\n");
2636                 RETURN(0);
2637         }
2638         sbi = ll_i2sbi(inode);
2639
2640         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2641                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2642
2643         exp = ll_i2mdexp(inode);
2644
2645         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2646                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2647                 struct md_op_data *op_data;
2648
2649                 /* Call getattr by fid, so do not provide name at all. */
2650                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2651                                              dentry->d_inode, NULL, 0, 0,
2652                                              LUSTRE_OPC_ANY, NULL);
2653                 if (IS_ERR(op_data))
2654                         RETURN(PTR_ERR(op_data));
2655
2656                 oit.it_flags |= O_CHECK_STALE;
2657                 rc = md_intent_lock(exp, op_data, NULL, 0,
2658                                     /* we are not interested in name
2659                                        based lookup */
2660                                     &oit, 0, &req,
2661                                     ll_md_blocking_ast, 0);
2662                 ll_finish_md_op_data(op_data);
2663                 oit.it_flags &= ~O_CHECK_STALE;
2664                 if (rc < 0) {
2665                         rc = ll_inode_revalidate_fini(inode, rc);
2666                         GOTO (out, rc);
2667                 }
2668
2669                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2670                 if (rc != 0) {
2671                         ll_intent_release(&oit);
2672                         GOTO(out, rc);
2673                 }
2674
2675                 /* Unlinked? Unhash dentry, so it is not picked up later by
2676                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2677                    here to preserve get_cwd functionality on 2.6.
2678                    Bug 10503 */
2679                 if (!dentry->d_inode->i_nlink) {
2680                         spin_lock(&dcache_lock);
2681                         ll_drop_dentry(dentry);
2682                         spin_unlock(&dcache_lock);
2683                 }
2684
2685                 ll_lookup_finish_locks(&oit, dentry);
2686         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2687                                                      MDS_INODELOCK_LOOKUP)) {
2688                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2689                 obd_valid valid = OBD_MD_FLGETATTR;
2690                 struct obd_capa *oc;
2691                 int ealen = 0;
2692
2693                 if (S_ISREG(inode->i_mode)) {
2694                         rc = ll_get_max_mdsize(sbi, &ealen);
2695                         if (rc)
2696                                 RETURN(rc);
2697                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2698                 }
2699                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2700                  * capa for this inode. Because we only keep capas of dirs
2701                  * fresh. */
2702                 oc = ll_mdscapa_get(inode);
2703                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2704                                 ealen, &req);
2705                 capa_put(oc);
2706                 if (rc) {
2707                         rc = ll_inode_revalidate_fini(inode, rc);
2708                         RETURN(rc);
2709                 }
2710
2711                 rc = ll_prep_inode(&inode, req, NULL);
2712                 if (rc)
2713                         GOTO(out, rc);
2714         }
2715
2716         /* if object not yet allocated, don't validate size */
2717         if (ll_i2info(inode)->lli_smd == NULL)
2718                 GOTO(out, rc = 0);
2719
2720         /* ll_glimpse_size will prefer locally cached writes if they extend
2721          * the file */
2722         rc = ll_glimpse_size(inode, 0);
2723         EXIT;
2724 out:
2725         ptlrpc_req_finished(req);
2726         return rc;
2727 }
2728
2729 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2730                   struct lookup_intent *it, struct kstat *stat)
2731 {
2732         struct inode *inode = de->d_inode;
2733         int res = 0;
2734
2735         res = ll_inode_revalidate_it(de, it);
2736         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2737
2738         if (res)
2739                 return res;
2740
2741         stat->dev = inode->i_sb->s_dev;
2742         stat->ino = inode->i_ino;
2743         stat->mode = inode->i_mode;
2744         stat->nlink = inode->i_nlink;
2745         stat->uid = inode->i_uid;
2746         stat->gid = inode->i_gid;
2747         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2748         stat->atime = inode->i_atime;
2749         stat->mtime = inode->i_mtime;
2750         stat->ctime = inode->i_ctime;
2751 #ifdef HAVE_INODE_BLKSIZE
2752         stat->blksize = inode->i_blksize;
2753 #else
2754         stat->blksize = 1 << inode->i_blkbits;
2755 #endif
2756
2757         ll_inode_size_lock(inode, 0);
2758         stat->size = i_size_read(inode);
2759         stat->blocks = inode->i_blocks;
2760         ll_inode_size_unlock(inode, 0);
2761
2762         return 0;
2763 }
2764 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2765 {
2766         struct lookup_intent it = { .it_op = IT_GETATTR };
2767
2768         return ll_getattr_it(mnt, de, &it, stat);
2769 }
2770
2771 static
2772 int lustre_check_acl(struct inode *inode, int mask)
2773 {
2774 #ifdef CONFIG_FS_POSIX_ACL
2775         struct ll_inode_info *lli = ll_i2info(inode);
2776         struct posix_acl *acl;
2777         int rc;
2778         ENTRY;
2779
2780         spin_lock(&lli->lli_lock);
2781         acl = posix_acl_dup(lli->lli_posix_acl);
2782         spin_unlock(&lli->lli_lock);
2783
2784         if (!acl)
2785                 RETURN(-EAGAIN);
2786
2787         rc = posix_acl_permission(inode, acl, mask);
2788         posix_acl_release(acl);
2789
2790         RETURN(rc);
2791 #else
2792         return -EAGAIN;
2793 #endif
2794 }
2795
2796 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2797 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2798 {
2799         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2800                inode->i_ino, inode->i_generation, inode, mask);
2801         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2802                 return lustre_check_remote_perm(inode, mask);
2803
2804         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2805         return generic_permission(inode, mask, lustre_check_acl);
2806 }
2807 #else
2808 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2809 {
2810         int mode = inode->i_mode;
2811         int rc;
2812
2813         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2814                inode->i_ino, inode->i_generation, inode, mask);
2815
2816         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2817                 return lustre_check_remote_perm(inode, mask);
2818
2819         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2820
2821         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2822             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2823                 return -EROFS;
2824         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2825                 return -EACCES;
2826         if (current->fsuid == inode->i_uid) {
2827                 mode >>= 6;
2828         } else if (1) {
2829                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2830                         goto check_groups;
2831                 rc = lustre_check_acl(inode, mask);
2832                 if (rc == -EAGAIN)
2833                         goto check_groups;
2834                 if (rc == -EACCES)
2835                         goto check_capabilities;
2836                 return rc;
2837         } else {
2838 check_groups:
2839                 if (in_group_p(inode->i_gid))
2840                         mode >>= 3;
2841         }
2842         if ((mode & mask & S_IRWXO) == mask)
2843                 return 0;
2844
2845 check_capabilities:
2846         if (!(mask & MAY_EXEC) ||
2847             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2848                 if (capable(CAP_DAC_OVERRIDE))
2849                         return 0;
2850
2851         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2852             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2853                 return 0;
2854
2855         return -EACCES;
2856 }
2857 #endif
2858
2859 /* -o localflock - only provides locally consistent flock locks */
2860 struct file_operations ll_file_operations = {
2861         .read           = ll_file_read,
2862         .write          = ll_file_write,
2863         .ioctl          = ll_file_ioctl,
2864         .open           = ll_file_open,
2865         .release        = ll_file_release,
2866         .mmap           = ll_file_mmap,
2867         .llseek         = ll_file_seek,
2868         .sendfile       = ll_file_sendfile,
2869         .fsync          = ll_fsync,
2870 };
2871
2872 struct file_operations ll_file_operations_flock = {
2873         .read           = ll_file_read,
2874         .write          = ll_file_write,
2875         .ioctl          = ll_file_ioctl,
2876         .open           = ll_file_open,
2877         .release        = ll_file_release,
2878         .mmap           = ll_file_mmap,
2879         .llseek         = ll_file_seek,
2880         .sendfile       = ll_file_sendfile,
2881         .fsync          = ll_fsync,
2882 #ifdef HAVE_F_OP_FLOCK
2883         .flock          = ll_file_flock,
2884 #endif
2885         .lock           = ll_file_flock
2886 };
2887
2888 /* These are for -o noflock - to return ENOSYS on flock calls */
2889 struct file_operations ll_file_operations_noflock = {
2890         .read           = ll_file_read,
2891         .write          = ll_file_write,
2892         .ioctl          = ll_file_ioctl,
2893         .open           = ll_file_open,
2894         .release        = ll_file_release,
2895         .mmap           = ll_file_mmap,
2896         .llseek         = ll_file_seek,
2897         .sendfile       = ll_file_sendfile,
2898         .fsync          = ll_fsync,
2899 #ifdef HAVE_F_OP_FLOCK
2900         .flock          = ll_file_noflock,
2901 #endif
2902         .lock           = ll_file_noflock
2903 };
2904
2905 struct inode_operations ll_file_inode_operations = {
2906 #ifdef HAVE_VFS_INTENT_PATCHES
2907         .setattr_raw    = ll_setattr_raw,
2908 #endif
2909         .setattr        = ll_setattr,
2910         .truncate       = ll_truncate,
2911         .getattr        = ll_getattr,
2912         .permission     = ll_inode_permission,
2913         .setxattr       = ll_setxattr,
2914         .getxattr       = ll_getxattr,
2915         .listxattr      = ll_listxattr,
2916         .removexattr    = ll_removexattr,
2917 };
2918
2919 /* dynamic ioctl number support routins */
2920 static struct llioc_ctl_data {
2921         struct rw_semaphore ioc_sem;
2922         struct list_head    ioc_head;
2923 } llioc = {
2924         __RWSEM_INITIALIZER(llioc.ioc_sem),
2925         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2926 };
2927
2928
2929 struct llioc_data {        /* one registered dynamic ioctl handler */
2930         struct list_head        iocd_list;     /* linkage on llioc.ioc_head */
2931         unsigned int            iocd_size;     /* allocation size in bytes, needed to free the entry */
2932         llioc_callback_t        iocd_cb;       /* handler called for a matching command */
2933         unsigned int            iocd_count;    /* number of valid entries in iocd_cmd */
2934         unsigned int            iocd_cmd[0];   /* command numbers handled by iocd_cb; storage is allocated past the struct */
2935 };
2936
2937 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2938 {
2939         unsigned int size;
2940         struct llioc_data *in_data = NULL;
2941         ENTRY;
2942
2943         if (cb == NULL || cmd == NULL ||
2944             count > LLIOC_MAX_CMD || count < 0)
2945                 RETURN(NULL);
2946
2947         size = sizeof(*in_data) + count * sizeof(unsigned int);
2948         OBD_ALLOC(in_data, size);
2949         if (in_data == NULL)
2950                 RETURN(NULL);
2951
2952         memset(in_data, 0, sizeof(*in_data));
2953         in_data->iocd_size = size;
2954         in_data->iocd_cb = cb;
2955         in_data->iocd_count = count;
2956         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2957
2958         down_write(&llioc.ioc_sem);
2959         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2960         up_write(&llioc.ioc_sem);
2961
2962         RETURN(in_data);
2963 }
2964
2965 void ll_iocontrol_unregister(void *magic)
2966 {
2967         struct llioc_data *tmp;
2968
2969         if (magic == NULL)
2970                 return;
2971
2972         down_write(&llioc.ioc_sem);
2973         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2974                 if (tmp == magic) {
2975                         unsigned int size = tmp->iocd_size;
2976
2977                         list_del(&tmp->iocd_list);
2978                         up_write(&llioc.ioc_sem);
2979
2980                         OBD_FREE(tmp, size);
2981                         return;
2982                 }
2983         }
2984         up_write(&llioc.ioc_sem);
2985
2986         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2987 }
2988
2989 EXPORT_SYMBOL(ll_iocontrol_register);
2990 EXPORT_SYMBOL(ll_iocontrol_unregister);
2991
2992 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2993                         unsigned int cmd, unsigned long arg, int *rcp)
2994 {
2995         enum llioc_iter ret = LLIOC_CONT;
2996         struct llioc_data *data;
2997         int rc = -EINVAL, i;
2998
2999         down_read(&llioc.ioc_sem);
3000         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3001                 for (i = 0; i < data->iocd_count; i++) {
3002                         if (cmd != data->iocd_cmd[i])
3003                                 continue;
3004
3005                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3006                         break;
3007                 }
3008
3009                 if (ret == LLIOC_STOP)
3010                         break;
3011         }
3012         up_read(&llioc.ioc_sem);
3013
3014         if (rcp)
3015                 *rcp = rc;
3016         return ret;
3017 }