1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
34 #include "llite_internal.h"
36 /* also used by llite/special.c:ll_special_open() */
/* Allocate per-open-file private data (struct ll_file_data) from the
 * ll_file_data_slab cache.  Returned pointer is stored in the file's
 * private data by the open paths and freed via ll_file_data_put().
 * NOTE(review): this numbered listing omits some lines (NULL check,
 * RETURN, braces); verify against the full source. */
37 struct ll_file_data *ll_file_data_get(void)
39 struct ll_file_data *fd;
41 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release per-open-file private data back to the slab cache.
 * Counterpart of ll_file_data_get(). */
45 static void ll_file_data_put(struct ll_file_data *fd)
48 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode's current attributes, fid, IO epoch, the open file
 * handle @fh, and the MDS capability into @op_data for an MDS request
 * (used when preparing a close). */
51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
52 struct lustre_handle *fh)
54 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
55 op_data->op_attr.ia_mode = inode->i_mode;
56 op_data->op_attr.ia_atime = inode->i_atime;
57 op_data->op_attr.ia_mtime = inode->i_mtime;
58 op_data->op_attr.ia_ctime = inode->i_ctime;
59 op_data->op_attr.ia_size = inode->i_size;
60 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the lustre-extended iattr; cast is intentional. */
61 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
62 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
63 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* Takes a reference on the MDS capability; caller owns op_capa1. */
64 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Fill @op_data for an MDS close of open handle @och.  Size/blocks are
 * only sent to the MDS when Size-on-MDS (SOM) is not in effect for this
 * file (no OBD_CONNECT_SOM on the export, or not a regular file).
 * NOTE(review): listing has gaps; ll_epoch_close() is passed &och and the
 * surrounding (unshown) lines appear to guard the final pack — confirm
 * against the full source. */
67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
68 struct obd_client_handle *och)
72 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
73 ATTR_MTIME_SET | ATTR_CTIME_SET;
75 if (!(och->och_flags & FMODE_WRITE))
/* Non-SOM or non-regular file: client-known size/blocks are authoritative. */
78 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
79 !S_ISREG(inode->i_mode))
80 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
82 ll_epoch_close(inode, op_data, &och, 0);
85 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send an MDS close for open handle @och on @inode and clean up the
 * handle's replay data.  On a SOM-epoch close the MDS may instruct us to
 * fetch Size-on-MDS attributes from the OSTs and push them back via
 * setattr (ll_sizeonmds_update).  If the epoch could not be closed yet,
 * the inode is queued for a later DONE_WRITING.
 * NOTE(review): listing omits several lines (entry checks, rc handling,
 * labels); confirm exact control flow against the full source. */
89 static int ll_close_inode_openhandle(struct obd_export *md_exp,
91 struct obd_client_handle *och)
93 struct obd_export *exp = ll_i2mdexp(inode);
94 struct md_op_data *op_data;
95 struct ptlrpc_request *req = NULL;
96 struct obd_device *obd = class_exp2obd(exp);
103 * XXX: in case of LMV, is this correct to access
106 CERROR("Invalid MDC connection handle "LPX64"\n",
107 ll_i2mdexp(inode)->exp_handle.h_cookie);
112 * here we check if this is forced umount. If so this is called on
113 * canceling "open lock" and we do not call md_close() in this case, as
114 * it will not be successful, as import is already deactivated.
119 OBD_ALLOC_PTR(op_data);
121 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
123 ll_prepare_close(inode, op_data, och);
124 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
125 rc = md_close(md_exp, op_data, och, &req);
128 /* This close must have the epoch closed. */
129 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
130 LASSERT(epoch_close);
131 /* MDS has instructed us to obtain Size-on-MDS attribute from
132 * OSTs and send setattr to back to MDS. */
133 rc = ll_sizeonmds_update(inode, &och->och_fh,
134 op_data->op_ioepoch);
136 CERROR("inode %lu mdc Size-on-MDS update failed: "
137 "rc = %d\n", inode->i_ino, rc);
141 CERROR("inode %lu mdc close failed: rc = %d\n",
144 ll_finish_md_op_data(op_data);
/* Destroy OST objects if the close reply says the file was unlinked. */
147 rc = ll_objects_destroy(req, inode);
149 CERROR("inode %lu ll_objects destroy: rc = %d\n",
153 ptlrpc_req_finished(req); /* This is close request */
/* Epoch still open on a SOM file opened for write: defer DONE_WRITING. */
157 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
158 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
159 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
/* Close the shared per-inode MDS open handle for the given open mode
 * (write/exec/read) once its use count allows it.  The handle pointer and
 * use counter are selected from the ll_inode_info by @flags; the actual
 * MDS close goes through ll_close_inode_openhandle().
 * NOTE(review): listing omits lines around the usecount/och swap under
 * lli_och_sem; confirm the exact race handling against the full source. */
170 int ll_md_real_close(struct inode *inode, int flags)
172 struct ll_inode_info *lli = ll_i2info(inode);
173 struct obd_client_handle **och_p;
174 struct obd_client_handle *och;
/* Select which of the three cached MDS open handles this close targets. */
179 if (flags & FMODE_WRITE) {
180 och_p = &lli->lli_mds_write_och;
181 och_usecount = &lli->lli_open_fd_write_count;
182 } else if (flags & FMODE_EXEC) {
183 och_p = &lli->lli_mds_exec_och;
184 och_usecount = &lli->lli_open_fd_exec_count;
186 LASSERT(flags & FMODE_READ);
187 och_p = &lli->lli_mds_read_och;
188 och_usecount = &lli->lli_open_fd_read_count;
191 down(&lli->lli_och_sem);
192 if (*och_usecount) { /* There are still users of this handle, so
194 up(&lli->lli_och_sem);
199 up(&lli->lli_och_sem);
201 if (och) { /* There might be a race and somebody have freed this och
203 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open count under lli_och_sem, and only talk to the MDS (via
 * ll_md_real_close) when no cached OPEN DLM lock still covers the file.
 * Also frees the file's ll_file_data and closes its capability.
 * NOTE(review): listing has gaps (rc handling, else branches, lockmode
 * setup); confirm against the full source. */
210 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
213 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
214 struct ll_inode_info *lli = ll_i2info(inode);
218 /* clear group lock, if present */
219 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
220 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
221 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
222 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
226 /* Let's see if we have good enough OPEN lock on the file and if
227 we can skip talking to MDS */
228 if (file->f_dentry->d_inode) { /* Can this ever be false? */
230 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
231 struct lustre_handle lockh;
/* Shadows the function parameter 'inode' — same object via the dentry. */
232 struct inode *inode = file->f_dentry->d_inode;
233 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
235 down(&lli->lli_och_sem);
236 if (fd->fd_omode & FMODE_WRITE) {
238 LASSERT(lli->lli_open_fd_write_count);
239 lli->lli_open_fd_write_count--;
240 } else if (fd->fd_omode & FMODE_EXEC) {
242 LASSERT(lli->lli_open_fd_exec_count);
243 lli->lli_open_fd_exec_count--;
246 LASSERT(lli->lli_open_fd_read_count);
247 lli->lli_open_fd_read_count--;
249 up(&lli->lli_och_sem);
/* No matching cached OPEN ibits lock => must close on the MDS now. */
251 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
252 LDLM_IBITS, &policy, lockmode,
254 rc = ll_md_real_close(file->f_dentry->d_inode,
258 CERROR("Releasing a file %p with negative dentry %p. Name %s",
259 file, file->f_dentry, file->f_dentry->d_name.name);
262 LUSTRE_FPRIVATE(file) = NULL;
263 ll_file_data_put(fd);
264 ll_capa_close(inode);
269 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
271 /* While this returns an error code, fput() the caller does not, so we need
272 * to make every effort to clean up all of our state here. Also, applications
273 * rarely check close errors and even if an error is returned they will not
274 * re-try the close call.
/* VFS f_op->release entry point: tallies stats, clears pending async
 * write errors recorded against the stripes, and delegates the MDS close
 * to ll_md_close().  The root dentry ("/") is special-cased to skip all
 * of this.
 * NOTE(review): two "don't do anything for /" checks appear; the listing
 * omits lines between them (likely the fd == NULL handling) — confirm. */
276 int ll_file_release(struct inode *inode, struct file *file)
278 struct ll_file_data *fd;
279 struct ll_sb_info *sbi = ll_i2sbi(inode);
280 struct ll_inode_info *lli = ll_i2info(inode);
281 struct lov_stripe_md *lsm = lli->lli_smd;
285 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
286 inode->i_generation, inode);
288 /* don't do anything for / */
289 if (inode->i_sb->s_root == file->f_dentry)
292 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
293 fd = LUSTRE_FPRIVATE(file);
296 /* don't do anything for / */
297 if (inode->i_sb->s_root == file->f_dentry) {
298 LUSTRE_FPRIVATE(file) = NULL;
299 ll_file_data_put(fd);
/* Propagate any stripe-level async write error, then reset it. */
304 lov_test_and_clear_async_rc(lsm);
305 lli->lli_async_rc = 0;
307 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an intent-based open against the MDS for @file (used by NFSD
 * and pre-2.6.15 patchless-kernel paths, and by ll_lov_setstripe when
 * @lmm/@lmmsize carry stripe parameters).  Requests an OPEN DLM lock
 * unless only setting stripe info.  On success, binds the returned lock
 * to the inode and refreshes the inode from the reply.
 * NOTE(review): listing omits error labels and some rc checks; confirm
 * cleanup ordering against the full source. */
311 static int ll_intent_file_open(struct file *file, void *lmm,
312 int lmmsize, struct lookup_intent *itp)
314 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
315 struct dentry *parent = file->f_dentry->d_parent;
316 const char *name = file->f_dentry->d_name.name;
317 const int len = file->f_dentry->d_name.len;
318 struct md_op_data *op_data;
319 struct ptlrpc_request *req;
325 /* Usually we come here only for NFSD, and we want open lock.
326 But we can also get here with pre 2.6.15 patchless kernels, and in
327 that case that lock is also ok */
328 /* We can also get here if there was cached open handle in revalidate_it
329 * but it disappeared while we were getting from there to ll_file_open.
330 * But this means this file was closed and immediatelly opened which
331 * makes a good candidate for using OPEN lock */
332 /* If lmmsize & lmm are not 0, we are just setting stripe info
333 * parameters. No need for the open lock */
334 if (!lmm && !lmmsize)
335 itp->it_flags |= MDS_OPEN_LOCK;
337 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
338 file->f_dentry->d_inode, name, len,
339 O_RDWR, LUSTRE_OPC_ANY, NULL);
341 RETURN(PTR_ERR(op_data));
343 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
344 0 /*unused */, &req, ll_md_blocking_ast, 0);
345 ll_finish_md_op_data(op_data);
347 /* reason for keep own exit path - don`t flood log
348 * with messages with -ESTALE errors.
/* Open succeeded on the MDS but we are bailing out: release the handle. */
350 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
351 it_open_error(DISP_OPEN_OPEN, itp))
353 ll_release_openhandle(file->f_dentry, itp);
357 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
358 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
359 CERROR("lock enqueue: err: %d\n", rc);
/* Attach the granted intent lock (if any) to the inode for matching. */
363 if (itp->d.lustre.it_lock_mode)
364 md_set_lock_data(sbi->ll_md_exp,
365 &itp->d.lustre.it_lock_handle,
366 file->f_dentry->d_inode);
368 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
371 ptlrpc_req_finished(itp->d.lustre.it_data);
374 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
375 ll_intent_drop_lock(itp);
/* Populate an obd_client_handle from the open reply carried by @it:
 * copies the MDS file handle, fid, open flags and IO epoch, then
 * registers the handle for open replay.  Returns md_set_open_replay_data
 * result. */
380 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
381 struct lookup_intent *it, struct obd_client_handle *och)
383 struct ptlrpc_request *req = it->d.lustre.it_data;
384 struct mdt_body *body;
388 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
389 LASSERT(body != NULL); /* reply already checked out */
390 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
392 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
393 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
394 och->och_fid = lli->lli_fid;
395 och->och_flags = it->it_flags;
396 lli->lli_ioepoch = body->ioepoch;
398 return md_set_open_replay_data(md_exp, och, req);
/* Finish opening @file locally: optionally fill @och from the intent's
 * open reply (when a new MDS handle was created), record @fd as the
 * file's private data and initialize its readahead state.
 * NOTE(review): listing omits the och != NULL guard and rc checks around
 * ll_och_fill(); confirm against the full source. */
401 int ll_local_open(struct file *file, struct lookup_intent *it,
402 struct ll_file_data *fd, struct obd_client_handle *och)
404 struct inode *inode = file->f_dentry->d_inode;
405 struct ll_inode_info *lli = ll_i2info(inode);
408 LASSERT(!LUSTRE_FPRIVATE(file));
413 struct ptlrpc_request *req = it->d.lustre.it_data;
414 struct mdt_body *body;
417 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
421 body = lustre_msg_buf(req->rq_repmsg,
422 DLM_REPLY_REC_OFF, sizeof(*body));
/* MDS returned a valid size for a write open — SOM size is usable. */
424 if ((it->it_flags & FMODE_WRITE) &&
425 (body->valid & OBD_MD_FLSIZE))
427 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
428 lli->lli_ioepoch, PFID(&lli->lli_fid));
432 LUSTRE_FPRIVATE(file) = fd;
433 ll_readahead_init(inode, &fd->fd_ras);
434 fd->fd_omode = it->it_flags;
438 /* Open a file, and (for the very first open) create objects on the OSTs at
439 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
440 * creation or open until ll_lov_setstripe() ioctl is called. We grab
441 * lli_open_sem to ensure no other process will create objects, send the
442 * stripe MD to the MDS, or try to destroy the objects if that fails.
444 * If we already have the stripe MD locally then we don't request it in
445 * md_open(), by passing a lmm_size = 0.
447 * It is up to the application to ensure no other processes open this file
448 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
449 * used. We might be able to avoid races of that sort by getting lli_open_sem
450 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
451 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS f_op->open entry point.  Either reuses an intent prepared during
 * lookup (LUSTRE_KERNEL_VERSION path) or builds a fresh IT_OPEN intent
 * from f_flags, then reuses or creates the per-inode, per-mode MDS open
 * handle under lli_och_sem before finishing with ll_local_open().
 * NOTE(review): the numbered listing omits many lines (rc declarations,
 * braces, gotos, out labels); comments below describe only what is
 * visible — confirm control flow against the full source. */
453 int ll_file_open(struct inode *inode, struct file *file)
455 struct ll_inode_info *lli = ll_i2info(inode);
456 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
457 .it_flags = file->f_flags };
458 struct lov_stripe_md *lsm;
459 struct ptlrpc_request *req = NULL;
460 struct obd_client_handle **och_p;
462 struct ll_file_data *fd;
466 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
467 inode->i_generation, inode, file->f_flags);
469 /* don't do anything for / */
470 if (inode->i_sb->s_root == file->f_dentry)
473 #ifdef LUSTRE_KERNEL_VERSION
476 it = file->private_data; /* XXX: compat macro */
477 file->private_data = NULL; /* prevent ll_local_open assertion */
480 fd = ll_file_data_get();
484 /* don't do anything for / */
485 if (inode->i_sb->s_root == file->f_dentry) {
486 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own from f_flags. */
490 if (!it || !it->d.lustre.it_disposition) {
491 /* Convert f_flags into access mode. We cannot use file->f_mode,
492 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: +1 maps O_RDONLY/O_WRONLY/O_RDWR onto FMODE bits. */
494 if ((oit.it_flags + 1) & O_ACCMODE)
496 if (file->f_flags & O_TRUNC)
497 oit.it_flags |= FMODE_WRITE;
499 /* kernel only call f_op->open in dentry_open. filp_open calls
500 * dentry_open after call to open_namei that checks permissions.
501 * Only nfsd_open call dentry_open directly without checking
502 * permissions and because of that this code below is safe. */
503 if (oit.it_flags & FMODE_WRITE)
504 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
506 /* We do not want O_EXCL here, presumably we opened the file
507 * already? XXX - NFS implications? */
508 oit.it_flags &= ~O_EXCL;
513 /* Let's see if we have file open on MDS already. */
514 if (it->it_flags & FMODE_WRITE) {
515 och_p = &lli->lli_mds_write_och;
516 och_usecount = &lli->lli_open_fd_write_count;
517 } else if (it->it_flags & FMODE_EXEC) {
518 och_p = &lli->lli_mds_exec_och;
519 och_usecount = &lli->lli_open_fd_exec_count;
521 och_p = &lli->lli_mds_read_och;
522 och_usecount = &lli->lli_open_fd_read_count;
525 down(&lli->lli_och_sem);
526 if (*och_p) { /* Open handle is present */
527 if (it_disposition(it, DISP_OPEN_OPEN)) {
528 /* Well, there's extra open request that we do not need,
529 let's close it somehow. This will decref request. */
530 rc = it_open_error(DISP_OPEN_OPEN, it);
532 ll_file_data_put(fd);
533 GOTO(out_och_free, rc);
535 ll_release_openhandle(file->f_dentry, it);
536 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* Reuse the cached handle: och == NULL tells ll_local_open not to fill. */
541 rc = ll_local_open(file, it, fd, NULL);
543 up(&lli->lli_och_sem);
544 ll_file_data_put(fd);
548 LASSERT(*och_usecount == 0);
549 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
551 ll_file_data_put(fd);
552 GOTO(out_och_free, rc = -ENOMEM);
/* No open reply yet: do the MDS open ourselves (marking staleness ok). */
555 if (!it->d.lustre.it_disposition) {
556 it->it_flags |= O_CHECK_STALE;
557 rc = ll_intent_file_open(file, NULL, 0, it);
558 it->it_flags &= ~O_CHECK_STALE;
560 ll_file_data_put(fd);
561 GOTO(out_och_free, rc);
564 /* Got some error? Release the request */
565 if (it->d.lustre.it_status < 0) {
566 req = it->d.lustre.it_data;
567 ptlrpc_req_finished(req);
569 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
570 &it->d.lustre.it_lock_handle,
571 file->f_dentry->d_inode);
573 req = it->d.lustre.it_data;
575 /* md_intent_lock() didn't get a request ref if there was an
576 * open error, so don't do cleanup on the request here
578 /* XXX (green): Should not we bail out on any error here, not
579 * just open error? */
580 rc = it_open_error(DISP_OPEN_OPEN, it);
582 ll_file_data_put(fd);
583 GOTO(out_och_free, rc);
586 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
587 rc = ll_local_open(file, it, fd, *och_p);
589 up(&lli->lli_och_sem);
590 ll_file_data_put(fd);
591 GOTO(out_och_free, rc);
594 up(&lli->lli_och_sem);
596 /* Must do this outside lli_och_sem lock to prevent deadlock where
597 different kind of OPEN lock for this same inode gets cancelled
598 by ldlm_cancel_lru */
599 if (!S_ISREG(inode->i_mode))
606 if (file->f_flags & O_LOV_DELAY_CREATE ||
607 !(file->f_mode & FMODE_WRITE)) {
608 CDEBUG(D_INODE, "object creation was delayed\n");
612 file->f_flags &= ~O_LOV_DELAY_CREATE;
615 ptlrpc_req_finished(req);
617 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
621 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
622 *och_p = NULL; /* OBD_FREE writes some magic there */
625 up(&lli->lli_och_sem);
631 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Issue an async OST getattr (via a ptlrpc set) for the inode's stripe
 * objects and refresh the inode's size/blocks/times from the merged
 * reply.  Requires a stripe MD (lsm != NULL).
 * NOTE(review): listing omits oinfo.oi_oa assignment, set == NULL and
 * rc-check lines; confirm against the full source. */
632 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
634 struct ptlrpc_request_set *set;
635 struct ll_inode_info *lli = ll_i2info(inode);
636 struct lov_stripe_md *lsm = lli->lli_smd;
638 struct obd_info oinfo = { { { 0 } } };
642 LASSERT(lsm != NULL);
646 oinfo.oi_oa->o_id = lsm->lsm_object_id;
647 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
648 oinfo.oi_oa->o_mode = S_IFREG;
649 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
650 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
651 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
652 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
654 oinfo.oi_capa = ll_mdscapa_get(inode);
656 set = ptlrpc_prep_set();
658 CERROR("can't allocate ptlrpc set\n");
661 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
663 rc = ptlrpc_set_wait(set);
664 ptlrpc_set_destroy(set);
666 capa_put(oinfo.oi_capa);
/* Only trust the fields the OSTs actually returned as valid. */
670 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
671 OBD_MD_FLATIME | OBD_MD_FLMTIME |
672 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
674 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
675 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
676 lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks,
/* Clear setuid (and, for group-executable files, setgid) bits on write
 * by an unprivileged process, mirroring the kernel's remove_suid()
 * behaviour. */
681 static inline void ll_remove_suid(struct inode *inode)
685 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
686 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
688 /* was any of the uid bits set? */
689 mode &= inode->i_mode;
690 if (mode && !capable(CAP_FSETID)) {
691 inode->i_mode &= ~mode;
692 // XXX careful here - we cannot change the size
/* Map an extent DLM lock back to the stripe index it covers within the
 * inode's LOV stripe MD, via obd_get_info("lock_to_stripe").  Sanity-
 * checks that the lock's resource matches the stripe's object id/group.
 * Returns the stripe index, or -ELDLM_NO_LOCK_DATA on mismatch.
 * NOTE(review): listing omits the struct-key opening lines and RETURN
 * statements; confirm against the full source. */
696 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
698 struct ll_inode_info *lli = ll_i2info(inode);
699 struct lov_stripe_md *lsm = lli->lli_smd;
700 struct obd_export *exp = ll_i2dtexp(inode);
703 struct ldlm_lock *lock;
704 struct lov_stripe_md *lsm;
705 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
706 __u32 stripe, vallen = sizeof(stripe);
/* Single-stripe file: the only possible answer is stripe 0. */
710 if (lsm->lsm_stripe_count == 1)
711 GOTO(check, stripe = 0);
713 /* get our offset in the lov */
714 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
716 CERROR("obd_get_info: rc = %d\n", rc);
719 LASSERT(stripe < lsm->lsm_stripe_count);
/* Resource name[0]/name[2] carry object id/group — must match stripe. */
722 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
723 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
724 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
725 lsm->lsm_oinfo[stripe]->loi_id,
726 lsm->lsm_oinfo[stripe]->loi_gr);
727 RETURN(-ELDLM_NO_LOCK_DATA);
733 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
734 * we get a lock cancellation for each stripe, so we have to map the obd's
735 * region back onto the stripes in the file that it held.
737 * No one can dirty the extent until we've finished our work and they can
738 * enqueue another lock. The DLM protects us from ll_file_read/write here,
739 * but other kernel actors could have pages locked.
741 * Called with the DLM lock held. */
/* NOTE(review): the numbered listing omits many lines inside this
 * function (loop braces, continue/break, lvb/page locals); inline notes
 * below describe only what is visible. */
742 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
743 struct ldlm_lock *lock, __u32 stripe)
745 ldlm_policy_data_t tmpex;
746 unsigned long start, end, count, skip, i, j;
748 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
749 struct lustre_handle lockh;
752 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
753 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
754 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
757 /* our locks are page granular thanks to osc_enqueue, we invalidate the
759 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
760 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
761 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
763 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
764 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* Translate stripe-object offsets into file page indices. */
768 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
769 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
770 if (lsm->lsm_stripe_count > 1) {
771 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
772 skip = (lsm->lsm_stripe_count - 1) * count;
773 start += start/count * skip + stripe * count;
775 end += end/count * skip + stripe * count;
777 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
780 i = inode->i_size ? (__u64)(inode->i_size - 1) >> CFS_PAGE_SHIFT : 0;
784 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
785 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
786 count, skip, end, discard ? " (DISCARDING)" : "");
788 /* walk through the vmas on the inode and tear down mmaped pages that
789 * intersect with the lock. this stops immediately if there are no
790 * mmap()ed regions of the file. This is not efficient at all and
791 * should be short lived. We'll associate mmap()ed pages with the lock
792 * and will be able to find them directly */
793 for (i = start; i <= end; i += (j + skip)) {
794 j = min(count - (i % count), end - i + 1);
796 LASSERT(inode->i_mapping);
797 if (ll_teardown_mmaps(inode->i_mapping,
798 (__u64)i << CFS_PAGE_SHIFT,
799 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
803 /* this is the simplistic implementation of page eviction at
804 * cancelation. It is careful to get races with other page
805 * lockers handled correctly. fixes from bug 20 will make it
806 * more efficient by associating locks with pages and with
807 * batching writeback under the lock explicitly. */
808 for (i = start, j = start % count; i <= end;
809 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
811 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
817 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
818 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
819 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
822 if (!mapping_has_pages(inode->i_mapping)) {
823 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
829 page = find_get_page(inode->i_mapping, i);
832 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
833 i, tmpex.l_extent.start);
836 /* page->mapping to check with racing against teardown */
837 if (!discard && clear_page_dirty_for_io(page)) {
838 rc = ll_call_writepage(inode, page);
840 CERROR("writepage inode %lu(%p) of page %p "
841 "failed: %d\n", inode->i_ino, inode,
843 /* either waiting for io to complete or reacquiring
844 * the lock that the failed writepage released */
848 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
849 /* check to see if another DLM lock covers this page b=2765 */
850 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
851 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
853 &lock->l_resource->lr_name, LDLM_EXTENT,
854 &tmpex, LCK_PR | LCK_PW, &lockh);
/* No other lock protects the page: drop it from the cache. */
856 if (rc2 <= 0 && page->mapping != NULL) {
857 struct ll_async_page *llap = llap_cast_private(page);
858 /* checking again to account for writeback's
860 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
862 ll_ra_accounting(llap, inode->i_mapping);
863 ll_truncate_complete_page(page);
866 page_cache_release(page);
868 LASSERTF(tmpex.l_extent.start <=
869 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
870 lock->l_policy_data.l_extent.end + 1),
871 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
872 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel AST for client extent locks.  On LDLM_CB_BLOCKING
 * it cancels the lock; on LDLM_CB_CANCELING it flushes the covered page
 * cache for the owning stripe and shrinks the known-minimum-size (kms).
 * NOTE(review): listing omits switch braces, default case and iput()
 * cleanup; confirm against the full source. */
877 static int ll_extent_lock_callback(struct ldlm_lock *lock,
878 struct ldlm_lock_desc *new, void *data,
881 struct lustre_handle lockh = { 0 };
/* Guard against a small-integer value smuggled in as a pointer. */
885 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
886 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
891 case LDLM_CB_BLOCKING:
892 ldlm_lock2handle(lock, &lockh);
893 rc = ldlm_cli_cancel(&lockh);
895 CERROR("ldlm_cli_cancel failed: %d\n", rc);
897 case LDLM_CB_CANCELING: {
899 struct ll_inode_info *lli;
900 struct lov_stripe_md *lsm;
904 /* This lock wasn't granted, don't try to evict pages */
905 if (lock->l_req_mode != lock->l_granted_mode)
908 inode = ll_inode_from_lock(lock);
911 lli = ll_i2info(inode);
914 if (lli->lli_smd == NULL)
918 stripe = ll_lock_to_stripe_offset(inode, lock);
922 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* Recompute kms under both the LOV stripe lock and the resource lock. */
924 lov_stripe_lock(lsm);
925 lock_res_and_lock(lock);
926 kms = ldlm_extent_shift_kms(lock,
927 lsm->lsm_oinfo[stripe]->loi_kms);
929 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
930 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
931 lsm->lsm_oinfo[stripe]->loi_kms, kms);
932 lsm->lsm_oinfo[stripe]->loi_kms = kms;
933 unlock_res_and_lock(lock);
934 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent lock enqueues: when the
 * lock is granted (blocked states are unexpected here — LBUG), update
 * the stripe's rss/kms from the lock value block (LVB), wake waiters and
 * drop the PR reference taken at enqueue time.
 * NOTE(review): this function accesses lsm_oinfo[stripe] with '.' while
 * the rest of the file uses '->' — likely from a different tree version;
 * the listing also omits several lines.  Confirm against full source. */
947 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
949 /* XXX ALLOCATE - 160 bytes */
950 struct inode *inode = ll_inode_from_lock(lock);
951 struct ll_inode_info *lli = ll_i2info(inode);
952 struct lustre_handle lockh = { 0 };
957 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
958 LDLM_FL_BLOCK_CONV)) {
959 LBUG(); /* not expecting any blocked async locks yet */
960 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
962 ldlm_lock_dump(D_OTHER, lock, 0);
963 ldlm_reprocess_all(lock->l_resource);
967 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
969 stripe = ll_lock_to_stripe_offset(inode, lock);
973 if (lock->l_lvb_len) {
974 struct lov_stripe_md *lsm = lli->lli_smd;
976 lvb = lock->l_lvb_data;
977 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
979 lock_res_and_lock(lock);
980 ll_inode_size_lock(inode, 1);
/* kms may only grow to the LVB-reported size, page-aligned by shift_kms. */
981 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
982 kms = ldlm_extent_shift_kms(NULL, kms);
983 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
984 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
985 lsm->lsm_oinfo[stripe].loi_kms, kms);
986 lsm->lsm_oinfo[stripe].loi_kms = kms;
987 ll_inode_size_unlock(inode, 1);
988 unlock_res_and_lock(lock);
993 wake_up(&lock->l_waitq);
995 ldlm_lock2handle(lock, &lockh);
996 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client wants our view of this stripe's size.
 * Packs an ost_lvb (kms-derived size plus inode times) into the reply.
 * -ELDLM_NO_LOCK_DATA answers are normal races and are replied quietly.
 * NOTE(review): listing omits the iput/out label code and several guard
 * lines; confirm cleanup ordering against the full source. */
1001 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1003 struct ptlrpc_request *req = reqp;
1004 struct inode *inode = ll_inode_from_lock(lock);
1005 struct ll_inode_info *lli;
1006 struct lov_stripe_md *lsm;
1007 struct ost_lvb *lvb;
1009 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1013 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1014 lli = ll_i2info(inode);
1016 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1019 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1021 /* First, find out which stripe index this lock corresponds to. */
1022 stripe = ll_lock_to_stripe_offset(inode, lock);
1024 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1026 rc = lustre_pack_reply(req, 2, size, NULL);
1028 CERROR("lustre_pack_reply: %d\n", rc);
/* Report the stripe's known-minimum-size as the authoritative size. */
1032 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1033 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1034 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1035 lvb->lvb_atime = LTIME_S(inode->i_atime);
1036 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1038 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1039 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1040 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
1041 lvb->lvb_atime, lvb->lvb_ctime);
1046 /* These errors are normal races, so we don't want to fill the console
1047 * with messages by calling ptlrpc_error() */
1048 if (rc == -ELDLM_NO_LOCK_DATA)
1049 lustre_pack_reply(req, 1, NULL, NULL);
1051 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into a single LVB via
 * obd_merge_lvb() and apply the result (size, blocks, times) to the
 * inode under the inode size lock. */
1055 static void ll_merge_lvb(struct inode *inode)
1057 struct ll_inode_info *lli = ll_i2info(inode);
1058 struct ll_sb_info *sbi = ll_i2sbi(inode);
1062 ll_inode_size_lock(inode, 1);
1063 inode_init_lvb(inode, &lvb);
1064 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1065 inode->i_size = lvb.lvb_size;
1066 inode->i_blocks = lvb.lvb_blocks;
1067 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1068 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1069 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1070 ll_inode_size_unlock(inode, 1);
/* Try to satisfy a size query from locally cached extent locks: match a
 * [0, EOF] PR|PW lock without taking a new one, merge the LVBs into the
 * inode, then release the matched reference via obd_cancel().
 * NOTE(review): listing omits the flags declaration and rc checks;
 * confirm against the full source. */
1074 int ll_local_size(struct inode *inode)
1076 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1077 struct ll_inode_info *lli = ll_i2info(inode);
1078 struct ll_sb_info *sbi = ll_i2sbi(inode);
1079 struct lustre_handle lockh = { 0 };
/* Zero stripes => nothing to match on the OSTs. */
1084 if (lli->lli_smd->lsm_stripe_count == 0)
1087 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1088 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1094 ll_merge_lvb(inode);
1095 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
/* Glimpse a file's size for an ioctl caller: enqueue an intent-only
 * (LDLM_FL_HAS_INTENT) PR extent lock over [0, EOF] against @lsm, merge
 * the returned LVBs and fill the caller's stat buffer (st).
 * NOTE(review): listing omits oinfo.oi_md setup, the rc check after
 * enqueue, and the out label; confirm against the full source. */
1099 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1102 struct lustre_handle lockh = { 0 };
1103 struct obd_enqueue_info einfo = { 0 };
1104 struct obd_info oinfo = { { { 0 } } };
1110 einfo.ei_type = LDLM_EXTENT;
1111 einfo.ei_mode = LCK_PR;
/* HAS_INTENT makes this a glimpse: no conflicting locks are revoked. */
1112 einfo.ei_flags = LDLM_FL_HAS_INTENT;
1113 einfo.ei_cb_bl = ll_extent_lock_callback;
1114 einfo.ei_cb_cp = ldlm_completion_ast;
1115 einfo.ei_cb_gl = ll_glimpse_callback;
1116 einfo.ei_cbdata = NULL;
1118 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1119 oinfo.oi_lockh = &lockh;
1122 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1126 CERROR("obd_enqueue returned rc %d, "
1127 "returning -EIO\n", rc);
1128 RETURN(rc > 0 ? -EIO : rc);
1131 lov_stripe_lock(lsm);
1132 memset(&lvb, 0, sizeof(lvb));
1133 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1134 st->st_size = lvb.lvb_size;
1135 st->st_blocks = lvb.lvb_blocks;
1136 st->st_mtime = lvb.lvb_mtime;
1137 st->st_atime = lvb.lvb_atime;
1138 st->st_ctime = lvb.lvb_ctime;
1139 lov_stripe_unlock(lsm);
1144 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1145 * file (because it prefers KMS over RSS when larger) */
/* Refresh the inode's size via a glimpse enqueue over [0, EOF].
 * Skipped entirely when the MDS size lock (LLIF_MDS_SIZE_LOCK) already
 * guarantees the size, or when the inode has no stripe objects.
 * NOTE(review): listing omits RETURN statements and some braces; confirm
 * against the full source. */
1146 int ll_glimpse_size(struct inode *inode, int ast_flags)
1148 struct ll_inode_info *lli = ll_i2info(inode);
1149 struct ll_sb_info *sbi = ll_i2sbi(inode);
1150 struct lustre_handle lockh = { 0 };
1151 struct obd_enqueue_info einfo = { 0 };
1152 struct obd_info oinfo = { { { 0 } } };
1156 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1159 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1161 if (!lli->lli_smd) {
1162 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1166 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1167 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1168 * won't revoke any conflicting DLM locks held. Instead,
1169 * ll_glimpse_callback() will be called on each client
1170 * holding a DLM lock against this file, and resulting size
1171 * will be returned for each stripe. DLM lock on [0, EOF] is
1172 * acquired only if there were no conflicting locks. */
1173 einfo.ei_type = LDLM_EXTENT;
1174 einfo.ei_mode = LCK_PR;
1175 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1176 einfo.ei_cb_bl = ll_extent_lock_callback;
1177 einfo.ei_cb_cp = ldlm_completion_ast;
1178 einfo.ei_cb_gl = ll_glimpse_callback;
1179 einfo.ei_cbdata = inode;
1181 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1182 oinfo.oi_lockh = &lockh;
1183 oinfo.oi_md = lli->lli_smd;
1185 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1189 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1190 RETURN(rc > 0 ? -EIO : rc);
1193 ll_merge_lvb(inode);
1195 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1196 inode->i_size, inode->i_blocks);
/*
 * Acquire an extent DLM lock covering *policy in the given mode on the
 * file's stripe objects, then refresh inode size/times from the merged
 * LVB under ll_inode_size_lock().  Lockless mounts (LL_FILE_IGNORE_LOCK
 * on the fd, or LL_SBI_NOLCK on the superblock) bypass the enqueue —
 * the bypass return itself is elided from this listing.
 * i_size is only trusted/updated here for a full-file [0, EOF] lock;
 * see the long vmtruncate race comment below.
 */
1201 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1202 struct lov_stripe_md *lsm, int mode,
1203 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1206 struct ll_sb_info *sbi = ll_i2sbi(inode);
1208 struct obd_enqueue_info einfo = { 0 };
1209 struct obd_info oinfo = { { { 0 } } };
/* Caller must pass a fresh (unused) lock handle and a real stripe MD. */
1213 LASSERT(!lustre_handle_is_used(lockh));
1214 LASSERT(lsm != NULL);
1216 /* don't drop the mmapped file to LRU */
1217 if (mapping_mapped(inode->i_mapping))
1218 ast_flags |= LDLM_FL_NO_LRU;
1220 /* XXX phil: can we do this? won't it screw the file size up? */
1221 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1222 (sbi->ll_flags & LL_SBI_NOLCK))
1225 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1226 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1228 einfo.ei_type = LDLM_EXTENT;
1229 einfo.ei_mode = mode;
1230 einfo.ei_flags = ast_flags;
1231 einfo.ei_cb_bl = ll_extent_lock_callback;
1232 einfo.ei_cb_cp = ldlm_completion_ast;
1233 einfo.ei_cb_gl = ll_glimpse_callback;
1234 einfo.ei_cbdata = inode;
1236 oinfo.oi_policy = *policy;
1237 oinfo.oi_lockh = lockh;
1240 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo);
/* The server may have granted a different (larger) extent than asked. */
1241 *policy = oinfo.oi_policy;
1245 ll_inode_size_lock(inode, 1);
1246 inode_init_lvb(inode, &lvb);
1247 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1249 if (policy->l_extent.start == 0 &&
1250 policy->l_extent.end == OBD_OBJECT_EOF) {
1251 /* vmtruncate()->ll_truncate() first sets the i_size and then
1252 * the kms under both a DLM lock and the
1253 * ll_inode_size_lock(). If we don't get the
1254 * ll_inode_size_lock() here we can match the DLM lock and
1255 * reset i_size from the kms before the truncating path has
1256 * updated the kms. generic_file_write can then trust the
1257 * stale i_size when doing appending writes and effectively
1258 * cancel the result of the truncate. Getting the
1259 * ll_inode_size_lock() after the enqueue maintains the DLM
1260 * -> ll_inode_size_lock() acquiring order. */
1261 inode->i_size = lvb.lvb_size;
1262 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1263 inode->i_ino, inode->i_size);
/* Times are always safe to refresh from the merged LVB. */
1267 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1268 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1269 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1271 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock taken by ll_extent_lock() via obd_cancel().
 * Mirrors the lock path's bypass for LL_FILE_IGNORE_LOCK fds and
 * LL_SBI_NOLCK mounts (the bypass return is elided from this listing).
 */
1276 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1277 struct lov_stripe_md *lsm, int mode,
1278 struct lustre_handle *lockh)
1280 struct ll_sb_info *sbi = ll_i2sbi(inode);
1284 /* XXX phil: can we do this? won't it screw the file size up? */
1285 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1286 (sbi->ll_flags & LL_SBI_NOLCK))
1289 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * read() entry point for llite file_operations.
 * Object-less files (mknod + truncate) are served by zero-filling the
 * user buffer up to i_size.  Otherwise the read is covered by a PR
 * extent lock (ll_tree_lock), possibly split into chunks bounded by
 * sbi->ll_max_rw_chunk and the current stripe end, and each chunk is
 * delegated to generic_file_read().  i_size is validated against the
 * known-minimum size (kms) under ll_inode_size_lock(), glimpsing when
 * the read may extend beyond kms.
 * NOTE(review): this listing is elided (embedded line numbers jump);
 * the chunk loop head, some declarations and returns are not visible.
 */
1294 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1297 struct inode *inode = file->f_dentry->d_inode;
1298 struct ll_inode_info *lli = ll_i2info(inode);
1299 struct lov_stripe_md *lsm = lli->lli_smd;
1300 struct ll_sb_info *sbi = ll_i2sbi(inode);
1301 struct ll_lock_tree tree;
1302 struct ll_lock_tree_node *node;
1304 struct ll_ra_read bead;
1307 ssize_t retval, chunk, sum = 0;
1311 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1312 inode->i_ino, inode->i_generation, inode, count, *ppos);
1313 /* "If nbyte is 0, read() will return 0 and have no other results."
1314 * -- Single Unix Spec */
1318 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1321 /* Read on file with no objects should return zero-filled
1322 * buffers up to file size (we can get non-zero sizes with
1323 * mknod + truncate, then opening file for read. This is a
1324 * common pattern in NFS case, it seems). Bug 6243 */
1326 /* Since there are no objects on OSTs, we have nothing to get
1327 * lock on and so we are forced to access inode->i_size
1330 /* Read beyond end of file */
1331 if (*ppos >= inode->i_size)
1334 if (count > inode->i_size - *ppos)
1335 count = inode->i_size - *ppos;
1336 /* Make sure to correctly adjust the file pos pointer for
/* clear_user() returns the number of bytes it could NOT zero. */
1338 notzeroed = clear_user(buf, count);
1347 if (sbi->ll_max_rw_chunk != 0) {
1348 /* first, let's know the end of the current stripe */
1350 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1353 /* correct, the end is beyond the request */
1354 if (end > *ppos + count - 1)
1355 end = *ppos + count - 1;
1357 /* and chunk shouldn't be too large even if striping is wide */
1358 if (end - *ppos > sbi->ll_max_rw_chunk)
1359 end = *ppos + sbi->ll_max_rw_chunk - 1;
/* else branch: no chunking, lock the whole requested range */
1361 end = *ppos + count - 1;
1364 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1365 tree.lt_fd = LUSTRE_FPRIVATE(file);
1366 rc = ll_tree_lock(&tree, node, buf, count,
1367 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1369 GOTO(out, retval = rc);
1371 ll_inode_size_lock(inode, 1);
1373 * Consistency guarantees: following possibilities exist for the
1374 * relation between region being read and real file size at this
1377 * (A): the region is completely inside of the file;
1379 * (B-x): x bytes of region are inside of the file, the rest is
1382 * (C): the region is completely outside of the file.
1384 * This classification is stable under DLM lock acquired by
1385 * ll_tree_lock() above, because to change class, other client has to
1386 * take DLM lock conflicting with our lock. Also, any updates to
1387 * ->i_size by other threads on this client are serialized by
1388 * ll_inode_size_lock(). This guarantees that short reads are handled
1389 * correctly in the face of concurrent writes and truncates.
1391 inode_init_lvb(inode, &lvb);
1392 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1394 if (*ppos + count - 1 > kms) {
1395 /* A glimpse is necessary to determine whether we return a
1396 * short read (B) or some zeroes at the end of the buffer (C) */
1397 ll_inode_size_unlock(inode, 1);
1398 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1400 ll_tree_unlock(&tree);
1404 /* region is within kms and, hence, within real file size (A).
1405 * We need to increase i_size to cover the read region so that
1406 * generic_file_read() will do its job, but that doesn't mean
1407 * the kms size is _correct_, it is only the _minimum_ size.
1408 * If someone does a stat they will get the correct size which
1409 * will always be >= the kms value here. b=11081 */
1410 if (inode->i_size < kms)
1411 inode->i_size = kms;
1412 ll_inode_size_unlock(inode, 1);
1415 chunk = end - *ppos + 1;
1416 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1417 inode->i_ino, chunk, *ppos, inode->i_size);
1419 /* turn off the kernel's read-ahead */
1420 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1423 file->f_ra.ra_pages = 0;
1425 /* initialize read-ahead window once per syscall */
/* Lustre drives its own readahead; register the window for this call. */
1428 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1429 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1430 ll_ra_read_in(file, &bead);
1434 file_accessed(file);
1435 retval = generic_file_read(file, buf, chunk, ppos);
1436 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1438 ll_tree_unlock(&tree);
/* A full chunk with bytes remaining means loop for the next chunk
 * (loop construct itself is elided from this listing). */
1444 if (retval == chunk && count > 0)
1450 ll_ra_read_ex(file, &bead);
/* Report accumulated bytes if any chunk succeeded, else the error. */
1451 retval = (sum > 0) ? sum : retval;
1456 * Write to a file (through the page cache).
/*
 * write() entry point.  Serialized per-inode by lli_write_sem.  Takes a
 * PW extent lock over the write range ([0, EOF] for O_APPEND, otherwise
 * possibly chunked by sbi->ll_max_rw_chunk / stripe end), enforces the
 * per-file maxbytes limit (SIGXFSZ + -EFBIG past it), then delegates
 * each chunk to generic_file_write().
 * NOTE(review): elided listing — the chunk loop head, several returns
 * and some declarations are not visible here.
 */
1458 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1461 struct inode *inode = file->f_dentry->d_inode;
1462 struct ll_sb_info *sbi = ll_i2sbi(inode);
1463 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1464 struct ll_lock_tree tree;
1465 struct ll_lock_tree_node *node;
1466 loff_t maxbytes = ll_file_maxbytes(inode);
1467 loff_t lock_start, lock_end, end;
1468 ssize_t retval, chunk, sum = 0;
1472 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1473 inode->i_ino, inode->i_generation, inode, count, *ppos);
1475 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1477 /* POSIX, but surprised the VFS doesn't check this already */
1481 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1482 * called on the file, don't fail the below assertion (bug 2388). */
1483 if (file->f_flags & O_LOV_DELAY_CREATE &&
1484 ll_i2info(inode)->lli_smd == NULL)
1487 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1489 down(&ll_i2info(inode)->lli_write_sem);
1492 chunk = 0; /* just to fix gcc's warning */
1493 end = *ppos + count - 1;
1495 if (file->f_flags & O_APPEND) {
/* Append: must lock to EOF since the start offset is only known
 * after the lock is granted and i_size is refreshed. */
1497 lock_end = OBD_OBJECT_EOF;
1498 } else if (sbi->ll_max_rw_chunk != 0) {
1499 /* first, let's know the end of the current stripe */
1501 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1504 /* correct, the end is beyond the request */
1505 if (end > *ppos + count - 1)
1506 end = *ppos + count - 1;
1508 /* and chunk shouldn't be too large even if striping is wide */
1509 if (end - *ppos > sbi->ll_max_rw_chunk)
1510 end = *ppos + sbi->ll_max_rw_chunk - 1;
1515 lock_end = *ppos + count - 1;
1517 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1520 GOTO(out, retval = PTR_ERR(node));
1522 tree.lt_fd = LUSTRE_FPRIVATE(file);
1523 rc = ll_tree_lock(&tree, node, buf, count,
1524 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1526 GOTO(out, retval = rc);
1528 /* This is ok, g_f_w will overwrite this under i_sem if it races
1529 * with a local truncate, it just makes our maxbyte checking easier.
1530 * The i_size value gets updated in ll_extent_lock() as a consequence
1531 * of the [0,EOF] extent lock we requested above. */
1532 if (file->f_flags & O_APPEND) {
1533 *ppos = inode->i_size;
1534 end = *ppos + count - 1;
1537 if (*ppos >= maxbytes) {
1538 send_sig(SIGXFSZ, current, 0);
1539 GOTO(out, retval = -EFBIG);
1541 if (*ppos + count > maxbytes)
1542 count = maxbytes - *ppos;
1544 /* generic_file_write handles O_APPEND after getting i_mutex */
1545 chunk = end - *ppos + 1;
1546 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1547 inode->i_ino, chunk, *ppos);
1548 retval = generic_file_write(file, buf, chunk, ppos);
1549 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1552 ll_tree_unlock(&tree);
/* Full chunk written and more remains: loop for the next chunk
 * (loop construct itself is elided from this listing). */
1558 if (retval == chunk && count > 0)
1562 up(&ll_i2info(inode)->lli_write_sem);
1564 retval = (sum > 0) ? sum : retval;
1565 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1566 retval > 0 ? retval : 0);
1571 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile() entry point (2.6-only, see the version guard).  Same
 * locking/kms strategy as ll_file_read(): take a PR extent lock over
 * the range, validate against kms (glimpsing when the request extends
 * past it), then delegate to generic_file_sendfile().  Object-less
 * files skip locking entirely.  Kernel readahead is disabled and
 * Lustre's own readahead window (bead) is used instead.
 * NOTE(review): elided listing — some declarations, returns and brace
 * lines are not visible here.
 */
1573 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1574 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1575 read_actor_t actor, void *target)
1577 struct inode *inode = in_file->f_dentry->d_inode;
1578 struct ll_inode_info *lli = ll_i2info(inode);
1579 struct lov_stripe_md *lsm = lli->lli_smd;
1580 struct ll_lock_tree tree;
1581 struct ll_lock_tree_node *node;
1583 struct ll_ra_read bead;
1588 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1589 inode->i_ino, inode->i_generation, inode, count, *ppos);
1591 /* "If nbyte is 0, read() will return 0 and have no other results."
1592 * -- Single Unix Spec */
1596 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1597 /* turn off the kernel's read-ahead */
1598 in_file->f_ra.ra_pages = 0;
1600 /* File with no objects, nothing to lock */
1602 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1604 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1605 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1606 rc = ll_tree_lock(&tree, node, NULL, count,
1607 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1611 ll_inode_size_lock(inode, 1);
1613 * Consistency guarantees: following possibilities exist for the
1614 * relation between region being read and real file size at this
1617 * (A): the region is completely inside of the file;
1619 * (B-x): x bytes of region are inside of the file, the rest is
1622 * (C): the region is completely outside of the file.
1624 * This classification is stable under DLM lock acquired by
1625 * ll_tree_lock() above, because to change class, other client has to
1626 * take DLM lock conflicting with our lock. Also, any updates to
1627 * ->i_size by other threads on this client are serialized by
1628 * ll_inode_size_lock(). This guarantees that short reads are handled
1629 * correctly in the face of concurrent writes and truncates.
1631 inode_init_lvb(inode, &lvb);
1632 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1634 if (*ppos + count - 1 > kms) {
1635 /* A glimpse is necessary to determine whether we return a
1636 * short read (B) or some zeroes at the end of the buffer (C) */
1637 ll_inode_size_unlock(inode, 1);
1638 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1642 /* region is within kms and, hence, within real file size (A) */
1643 inode->i_size = kms;
1644 ll_inode_size_unlock(inode, 1);
1647 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1648 inode->i_ino, count, *ppos, inode->i_size);
/* Register Lustre's readahead window for this call. */
1650 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1651 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1652 ll_ra_read_in(in_file, &bead);
1654 file_accessed(in_file);
1655 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1656 ll_ra_read_ex(in_file, &bead);
1659 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: re-create a lost OST object for
 * this file.  Root-only (CAP_SYS_ADMIN).  Copies a ll_recreate_obj
 * request from userspace, clones the file's stripe MD, marks the obdo
 * with OBD_FL_RECREATE_OBJS and calls obd_create() under lli_size_sem.
 * NOTE(review): elided listing — obdo allocation and several returns
 * are not visible here.
 */
1664 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1667 struct ll_inode_info *lli = ll_i2info(inode);
1668 struct obd_export *exp = ll_i2dtexp(inode);
1669 struct ll_recreate_obj ucreatp;
1670 struct obd_trans_info oti = { 0 };
1671 struct obdo *oa = NULL;
1674 struct lov_stripe_md *lsm, *lsm2;
1677 if (!capable (CAP_SYS_ADMIN))
1680 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1681 sizeof(struct ll_recreate_obj));
1689 down(&lli->lli_size_sem);
1692 GOTO(out, rc = -ENOENT);
1693 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1694 (lsm->lsm_stripe_count));
1696 OBD_ALLOC(lsm2, lsm_size);
1698 GOTO(out, rc = -ENOMEM);
1700 oa->o_id = ucreatp.lrc_id;
1701 oa->o_gr = ucreatp.lrc_group;
/* o_nlink is reused here to carry the target OST index. */
1702 oa->o_nlink = ucreatp.lrc_ost_idx;
1703 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1704 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1705 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1706 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1708 oti.oti_objid = NULL;
/* Work on a copy of the stripe MD so the live one is untouched. */
1709 memcpy(lsm2, lsm, lsm_size);
1710 rc = obd_create(exp, oa, &lsm2, &oti);
1712 OBD_FREE(lsm2, lsm_size);
1715 up(&lli->lli_size_sem);
/*
 * Apply a user-supplied LOV striping EA (lum) to this file via an
 * IT_OPEN intent open against the MDS.  Fails if a stripe already
 * exists (checked under lli_size_sem).  On success the open handle
 * from the intent is released immediately (ll_release_openhandle).
 * NOTE(review): elided listing — some returns and the error path
 * structure are not fully visible here.
 */
1720 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1721 int flags, struct lov_user_md *lum, int lum_size)
1723 struct ll_inode_info *lli = ll_i2info(inode);
1724 struct lov_stripe_md *lsm;
1725 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1729 down(&lli->lli_size_sem);
1732 up(&lli->lli_size_sem);
1733 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1738 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1741 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1742 GOTO(out_req_free, rc = -ENOENT);
1743 rc = oit.d.lustre.it_status;
1745 GOTO(out_req_free, rc);
1747 ll_release_openhandle(file->f_dentry, &oit);
1750 up(&lli->lli_size_sem);
1751 ll_intent_release(&oit);
1754 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping descriptor) for `filename` from the MDS
 * via md_getattr_name(), byte-swap it to host endian if needed, and
 * return it through *lmmp / *lmm_size.  LOV_MAGIC_JOIN EAs are
 * unpacked and converted into a lov_user_md_join with explicit
 * per-stripe extent boundaries before being returned.  The request
 * is handed back via *request so the caller controls its lifetime.
 * NOTE(review): elided listing — several returns, the lmmsize==0
 * handling and some brace lines are not visible here.
 */
1758 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1759 struct lov_mds_md **lmmp, int *lmm_size,
1760 struct ptlrpc_request **request)
1762 struct ll_sb_info *sbi = ll_i2sbi(inode);
1763 struct mdt_body *body;
1764 struct lov_mds_md *lmm = NULL;
1765 struct ptlrpc_request *req = NULL;
1766 struct obd_capa *oc;
1769 rc = ll_get_max_mdsize(sbi, &lmmsize);
1773 oc = ll_mdscapa_get(inode);
1774 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1775 oc, filename, strlen(filename) + 1,
1776 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1779 CDEBUG(D_INFO, "md_getattr_name failed "
1780 "on %s: rc %d\n", filename, rc);
1784 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1785 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1786 /* swabbed by mdc_getattr_name */
1787 LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1789 lmmsize = body->eadatasize;
1791 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1793 GOTO(out, rc = -ENODATA);
1796 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1797 LASSERT(lmm != NULL);
1798 LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1801 * This is coming from the MDS, so is probably in
1802 * little endian. We convert it to host endian before
1803 * passing it to userspace.
/* Swabbed magic means the EA arrived in the other byte order. */
1805 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1806 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1807 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1808 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1809 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: rebuild a user-visible EA with explicit extents. */
1812 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1813 struct lov_stripe_md *lsm;
1814 struct lov_user_md_join *lmj;
1815 int lmj_size, i, aindex = 0;
1817 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1819 GOTO(out, rc = -ENOMEM);
1820 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1822 GOTO(out_free_memmd, rc);
1824 lmj_size = sizeof(struct lov_user_md_join) +
1825 lsm->lsm_stripe_count *
1826 sizeof(struct lov_user_ost_data_join);
1827 OBD_ALLOC(lmj, lmj_size);
1829 GOTO(out_free_memmd, rc = -ENOMEM);
1831 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1832 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1833 struct lov_extent *lex =
1834 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent that covers stripe i. */
1836 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1838 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1839 LPU64" len %d\n", aindex, i,
1840 lex->le_start, (int)lex->le_len);
1841 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 encodes an unbounded (to-EOF) extent. */
1844 if ((int)lex->le_len == -1)
1845 lmj->lmm_objects[i].l_extent_end = -1;
1847 lmj->lmm_objects[i].l_extent_end =
1848 lex->le_start + lex->le_len;
1849 lmj->lmm_objects[i].l_object_id =
1850 lsm->lsm_oinfo[i]->loi_id;
1851 lmj->lmm_objects[i].l_object_gr =
1852 lsm->lsm_oinfo[i]->loi_gr;
1853 lmj->lmm_objects[i].l_ost_gen =
1854 lsm->lsm_oinfo[i]->loi_ost_gen;
1855 lmj->lmm_objects[i].l_ost_idx =
1856 lsm->lsm_oinfo[i]->loi_ost_idx;
1858 lmm = (struct lov_mds_md *)lmj;
1861 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1865 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: root-only path that copies a full
 * lov_user_md (with one ost_data entry) from userspace and applies it
 * with MDS_OPEN_HAS_OBJS via ll_lov_setstripe_ea_info().
 */
1870 static int ll_lov_setea(struct inode *inode, struct file *file,
1873 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1874 struct lov_user_md *lump;
1875 int lum_size = sizeof(struct lov_user_md) +
1876 sizeof(struct lov_user_ost_data);
1880 if (!capable (CAP_SYS_ADMIN))
1883 OBD_ALLOC(lump, lum_size);
1887 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1889 OBD_FREE(lump, lum_size);
1893 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1895 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy a lov_user_md from userspace and
 * apply it; on success, echo the resulting striping back to the user's
 * buffer via the GETSTRIPE obd_iocontrol (with lmm_stripe_count first
 * zeroed in userspace as an output marker).
 */
1899 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1902 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1904 int flags = FMODE_WRITE;
1907 /* Bug 1152: copy properly when this is no longer true */
1908 LASSERT(sizeof(lum) == sizeof(*lump));
1909 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1910 rc = copy_from_user(&lum, lump, sizeof(lum));
1914 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1916 put_user(0, &lump->lmm_stripe_count);
1917 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1918 0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE handler: thin wrapper forwarding the file's
 * stripe MD to the data export's GETSTRIPE obd_iocontrol.
 */
1923 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1925 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1930 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK handler: take a GROUP-mode extent lock over the
 * whole file [0, EOF] with gid = arg, remember the handle in the fd
 * (fd_cwlockh) and mark the fd GROUP_LOCKED + IGNORE_LOCK so regular
 * extent locking is bypassed while the group lock is held.
 */
1934 static int ll_get_grouplock(struct inode *inode, struct file *file,
1937 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1938 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1939 .end = OBD_OBJECT_EOF}};
1940 struct lustre_handle lockh = { 0 };
1941 struct ll_inode_info *lli = ll_i2info(inode);
1942 struct lov_stripe_md *lsm = lli->lli_smd;
/* Already group-locked: error path (return elided in this listing). */
1946 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1950 policy.l_extent.gid = arg;
1951 if (file->f_flags & O_NONBLOCK)
1952 flags = LDLM_FL_BLOCK_NOWAIT;
1954 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1958 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1960 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK handler: validate that this fd holds a group
 * lock with the given gid, clear the GROUP_LOCKED/IGNORE_LOCK flags,
 * cancel the lock via ll_extent_unlock() and wipe the saved handle.
 */
1965 static int ll_put_grouplock(struct inode *inode, struct file *file,
1968 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1969 struct ll_inode_info *lli = ll_i2info(inode);
1970 struct lov_stripe_md *lsm = lli->lli_smd;
1974 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1975 /* Ugh, it's already unlocked. */
1979 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1982 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1984 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1989 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request: server must advertise LL_SBI_JOIN,
 * both inodes must be regular files, head != tail, and the head's
 * size must be a multiple of JOIN_FILE_ALIGN (64K).  Error returns
 * themselves are elided from this listing.
 */
1994 static int join_sanity_check(struct inode *head, struct inode *tail)
1997 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1998 CERROR("server do not support join \n");
2001 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2002 CERROR("tail ino %lu and ino head %lu must be regular\n",
2003 head->i_ino, tail->i_ino);
2006 if (head->i_ino == tail->i_ino) {
2007 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2010 if (head->i_size % JOIN_FILE_ALIGN) {
2011 CERROR("hsize %llu must be times of 64K\n", head->i_size);
/*
 * Perform the MDS side of a file join: enqueue an IT_OPEN intent with
 * O_JOIN_FILE (LCK_CW, IBITS) naming the tail via its parent + name,
 * carrying the head's size as intent data.  Any lock granted by the
 * intent is dropped immediately, and the resulting open handle is
 * closed via ll_release_openhandle().
 */
2017 static int join_file(struct inode *head_inode, struct file *head_filp,
2018 struct file *tail_filp)
2020 struct inode *tail_inode, *tail_parent;
2021 struct dentry *tail_dentry = tail_filp->f_dentry;
2022 struct lookup_intent oit = {.it_op = IT_OPEN,
2023 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2024 struct lustre_handle lockh;
2025 struct md_op_data *op_data;
2029 tail_dentry = tail_filp->f_dentry;
2030 tail_inode = tail_dentry->d_inode;
2031 tail_parent = tail_dentry->d_parent->d_inode;
2033 op_data = ll_prep_md_op_data(NULL, head_inode, tail_parent,
2034 tail_dentry->d_name.name,
2035 tail_dentry->d_name.len, 0,
2036 LUSTRE_OPC_ANY, &head_inode->i_size);
2037 if (IS_ERR(op_data))
2038 RETURN(PTR_ERR(op_data));
2040 rc = md_enqueue(ll_i2mdexp(head_inode), LDLM_IBITS, &oit, LCK_CW,
2041 op_data, &lockh, NULL, 0, ldlm_completion_ast,
2042 ll_md_blocking_ast, NULL, 0);
2044 ll_finish_md_op_data(op_data);
2048 rc = oit.d.lustre.it_status;
2050 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2051 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2052 ptlrpc_req_finished((struct ptlrpc_request *)
2053 oit.d.lustre.it_data);
2057 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2059 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2060 oit.d.lustre.it_lock_mode = 0;
2062 ll_release_openhandle(head_filp->f_dentry, &oit);
2064 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN handler: join the file named by filename_tail onto
 * `head`.  Opens the tail, EX-locks both files whole-range in inode-
 * number order (deadlock avoidance), runs join_sanity_check(), then
 * join_file().  Teardown is phased: the switch below falls through
 * from the highest cleanup_phase reached, cancelling unused locks,
 * closing the tail, and on success discarding the head's now-stale
 * stripe MD so it is re-fetched.  Several case labels, iput and
 * returns are elided from this listing.
 */
2068 static int ll_file_join(struct inode *head, struct file *filp,
2069 char *filename_tail)
2071 struct inode *tail = NULL, *first = NULL, *second = NULL;
2072 struct dentry *tail_dentry;
2073 struct file *tail_filp, *first_filp, *second_filp;
2074 struct ll_lock_tree first_tree, second_tree;
2075 struct ll_lock_tree_node *first_node, *second_node;
2076 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2077 int rc = 0, cleanup_phase = 0;
2080 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2081 head->i_ino, head->i_generation, head, filename_tail);
2083 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2084 if (IS_ERR(tail_filp)) {
2085 CERROR("Can not open tail file %s", filename_tail);
2086 rc = PTR_ERR(tail_filp);
2089 tail = igrab(tail_filp->f_dentry->d_inode);
2091 tlli = ll_i2info(tail);
2092 tail_dentry = tail_filp->f_dentry;
2093 LASSERT(tail_dentry);
2096 /*reorder the inode for lock sequence*/
2097 first = head->i_ino > tail->i_ino ? head : tail;
2098 second = head->i_ino > tail->i_ino ? tail : head;
2099 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2100 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2102 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2103 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2104 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2105 if (IS_ERR(first_node)){
2106 rc = PTR_ERR(first_node);
2109 first_tree.lt_fd = first_filp->private_data;
2110 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2115 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2116 if (IS_ERR(second_node)){
2117 rc = PTR_ERR(second_node);
2120 second_tree.lt_fd = second_filp->private_data;
2121 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2126 rc = join_sanity_check(head, tail);
2130 rc = join_file(head, filp, tail_filp);
/* Phased cleanup: each case falls through to undo earlier phases. */
2134 switch (cleanup_phase) {
2136 ll_tree_unlock(&second_tree);
2137 obd_cancel_unused(ll_i2dtexp(second),
2138 ll_i2info(second)->lli_smd, 0, NULL);
2140 ll_tree_unlock(&first_tree);
2141 obd_cancel_unused(ll_i2dtexp(first),
2142 ll_i2info(first)->lli_smd, 0, NULL);
2144 filp_close(tail_filp, 0);
2147 if (head && rc == 0) {
/* Join succeeded: the head's cached striping is now stale. */
2148 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2150 hlli->lli_smd = NULL;
2155 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close the MDS open handle acquired as a side effect of an intent
 * (e.g. setstripe or join opens).  No-ops for the root dentry and for
 * intents that carry no DISP_OPEN_OPEN disposition.  Fills a transient
 * obd_client_handle from the intent and closes it, then releases the
 * intent's request reference.
 */
2161 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2163 struct inode *inode = dentry->d_inode;
2164 struct obd_client_handle *och;
2170 /* Root ? Do nothing. */
2171 if (dentry->d_inode->i_sb->s_root == dentry)
2174 /* No open handle to close? Move away */
2175 if (!it_disposition(it, DISP_OPEN_OPEN))
2178 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2180 OBD_ALLOC(och, sizeof(*och));
2182 GOTO(out, rc = -ENOMEM);
2184 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2185 ll_i2info(inode), it, och);
2187 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2190 /* this one is in place of ll_file_open */
2191 ptlrpc_req_finished(it->d.lustre.it_data);
2192 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl dispatcher for regular files: per-fd flag get/set, LOV stripe
 * get/set ioctls, object recreation, ext3-compatible flag/version
 * ioctls, file join, group locks, statfs, security-context flush and
 * remote ACL get/set.  Unhandled commands fall through to the data
 * export's obd_iocontrol().  tty ioctls are rejected up front (return
 * elided, like several other returns/labels in this listing).
 */
2196 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2199 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2203 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2204 inode->i_generation, inode, cmd);
2205 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2207 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2208 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2212 case LL_IOC_GETFLAGS:
2213 /* Get the current value of the file flags */
2214 return put_user(fd->fd_flags, (int *)arg);
2215 case LL_IOC_SETFLAGS:
2216 case LL_IOC_CLRFLAGS:
2217 /* Set or clear specific file flags */
2218 /* XXX This probably needs checks to ensure the flags are
2219 * not abused, and to handle any flag side effects.
2221 if (get_user(flags, (int *) arg))
2224 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe with O_DIRECT I/O. */
2225 if ((flags & LL_FILE_IGNORE_LOCK) &&
2226 !(file->f_flags & O_DIRECT)) {
2227 CERROR("%s: unable to disable locking on "
2228 "non-O_DIRECT file\n", current->comm);
2232 fd->fd_flags |= flags;
2234 fd->fd_flags &= ~flags;
2237 case LL_IOC_LOV_SETSTRIPE:
2238 RETURN(ll_lov_setstripe(inode, file, arg));
2239 case LL_IOC_LOV_SETEA:
2240 RETURN(ll_lov_setea(inode, file, arg));
2241 case LL_IOC_LOV_GETSTRIPE:
2242 RETURN(ll_lov_getstripe(inode, arg));
2243 case LL_IOC_RECREATE_OBJ:
2244 RETURN(ll_lov_recreate_obj(inode, file, arg));
2245 case EXT3_IOC_GETFLAGS:
2246 case EXT3_IOC_SETFLAGS:
2247 RETURN(ll_iocontrol(inode, file, cmd, arg));
2248 case EXT3_IOC_GETVERSION_OLD:
2249 case EXT3_IOC_GETVERSION:
2250 RETURN(put_user(inode->i_generation, (int *)arg));
/* LL_IOC_JOIN (case label elided): arg is a userspace path string. */
2255 ftail = getname((const char *)arg);
2257 RETURN(PTR_ERR(ftail));
2258 rc = ll_file_join(inode, file, ftail);
2262 case LL_IOC_GROUP_LOCK:
2263 RETURN(ll_get_grouplock(inode, file, arg));
2264 case LL_IOC_GROUP_UNLOCK:
2265 RETURN(ll_put_grouplock(inode, file, arg));
2266 case IOC_OBD_STATFS:
2267 RETURN(ll_obd_statfs(inode, (void *)arg));
2269 /* We need to special case any other ioctls we want to handle,
2270 * to send them to the MDS/OST as appropriate and to properly
2271 * network encode the arg field.
2272 case EXT3_IOC_SETVERSION_OLD:
2273 case EXT3_IOC_SETVERSION:
2275 case LL_IOC_FLUSHCTX:
2276 RETURN(ll_flush_ctx(inode));
2277 case LL_IOC_GETFACL: {
2278 struct rmtacl_ioctl_data ioc;
2280 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2283 RETURN(ll_ioctl_getfacl(inode, &ioc));
2285 case LL_IOC_SETFACL: {
2286 struct rmtacl_ioctl_data ioc;
2288 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2291 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* default: forward anything unrecognized to the data stack. */
2294 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek entry point.  SEEK_END must produce an accurate size, so it
 * glimpses the OSTs first (non-blocking when O_NONBLOCK) and reads
 * i_size under ll_inode_size_lock().  The new offset is validated
 * against ll_file_maxbytes() before being stored in f_pos.
 * NOTE(review): elided listing — some returns and the 2.4-only
 * f_reada/f_version handling are only partially visible.
 */
2299 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2301 struct inode *inode = file->f_dentry->d_inode;
2302 struct ll_inode_info *lli = ll_i2info(inode);
2303 struct lov_stripe_md *lsm = lli->lli_smd;
2306 retval = offset + ((origin == 2) ? inode->i_size :
2307 (origin == 1) ? file->f_pos : 0);
2308 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2309 inode->i_ino, inode->i_generation, inode, retval, retval,
2310 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2311 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2313 if (origin == 2) { /* SEEK_END */
2314 int nonblock = 0, rc;
2316 if (file->f_flags & O_NONBLOCK)
2317 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before seeking relative to EOF. */
2320 rc = ll_glimpse_size(inode, nonblock);
2325 ll_inode_size_lock(inode, 0);
2326 offset += inode->i_size;
2327 ll_inode_size_unlock(inode, 0);
2328 } else if (origin == 1) { /* SEEK_CUR */
2329 offset += file->f_pos;
2333 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2334 if (offset != file->f_pos) {
2335 file->f_pos = offset;
2336 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2338 file->f_version = ++event;
/*
 * fsync entry point.  Waits for in-flight page writeback, folds in any
 * recorded async write errors (inode-level and per-stripe), syncs the
 * metadata via md_sync(), and — when the file has data objects — syncs
 * [0, EOF] on the OSTs via obd_sync() with an allocated obdo.
 * NOTE(review): elided listing — obdo allocation, capa puts and the
 * final return are not visible here.
 */
2347 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2349 struct inode *inode = dentry->d_inode;
2350 struct ll_inode_info *lli = ll_i2info(inode);
2351 struct lov_stripe_md *lsm = lli->lli_smd;
2352 struct ptlrpc_request *req;
2353 struct obd_capa *oc;
2356 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2357 inode->i_generation, inode);
2358 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2360 /* fsync's caller has already called _fdata{sync,write}, we want
2361 * that IO to finish before calling the osc and mdc sync methods */
2362 rc = filemap_fdatawait(inode->i_mapping);
2364 /* catch async errors that were recorded back when async writeback
2365 * failed for pages in this mapping. */
2366 err = lli->lli_async_rc;
2367 lli->lli_async_rc = 0;
2371 err = lov_test_and_clear_async_rc(lsm);
2376 oc = ll_mdscapa_get(inode);
2377 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2383 ptlrpc_req_finished(req);
/* Below: data-object sync path (obdo alloc failure handling). */
2390 RETURN(rc ? rc : -ENOMEM);
2392 oa->o_id = lsm->lsm_object_id;
2393 oa->o_gr = lsm->lsm_object_gr;
2394 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2395 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2396 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2399 oc = ll_osscapa_get(inode, 0, CAPA_OPC_OSS_WRITE);
2400 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2401 0, OBD_OBJECT_EOF, oc);
/*
 * flock()/fcntl() lock entry point: translate the kernel file_lock
 * and command into an LDLM_FLOCK enqueue against the MDS.  flock()
 * requests get whole-file range and the caller's tgid; fcntl types
 * map to LDLM modes (unlock is expressed as an LCK_NL enqueue, see
 * comment below); F_GETLK* becomes a TEST_LOCK enqueue.  On success
 * the lock is mirrored into the kernel's local lock lists.
 * NOTE(review): elided listing — the fl_type/cmd switch arms and
 * several returns are only partially visible here.
 */
2411 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2413 struct inode *inode = file->f_dentry->d_inode;
2414 struct ll_sb_info *sbi = ll_i2sbi(inode);
2415 struct ldlm_res_id res_id =
2416 { .name = { fid_seq(ll_inode2fid(inode)),
2417 fid_oid(ll_inode2fid(inode)),
2418 fid_ver(ll_inode2fid(inode)),
2420 struct lustre_handle lockh = {0};
2421 ldlm_policy_data_t flock;
2422 ldlm_mode_t mode = 0;
2427 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2428 inode->i_ino, file_lock);
2430 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2432 if (file_lock->fl_flags & FL_FLOCK) {
2433 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2434 /* set missing params for flock() calls */
2435 file_lock->fl_end = OFFSET_MAX;
2436 file_lock->fl_pid = current->tgid;
2438 flock.l_flock.pid = file_lock->fl_pid;
2439 flock.l_flock.start = file_lock->fl_start;
2440 flock.l_flock.end = file_lock->fl_end;
2442 switch (file_lock->fl_type) {
2447 /* An unlock request may or may not have any relation to
2448 * existing locks so we may not be able to pass a lock handle
2449 * via a normal ldlm_lock_cancel() request. The request may even
2450 * unlock a byte range in the middle of an existing lock. In
2451 * order to process an unlock request we need all of the same
2452 * information that is given with a normal read or write record
2453 * lock request. To avoid creating another ldlm unlock (cancel)
2454 * message we'll treat a LCK_NL flock request as an unlock. */
2461 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set requests. */
2476 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK*: probe only, never grant. */
2482 flags = LDLM_FL_TEST_LOCK;
2483 /* Save the old mode so that if the mode in the lock changes we
2484 * can decrement the appropriate reader or writer refcount. */
2485 file_lock->fl_type = mode;
2488 CERROR("unknown fcntl lock command: %d\n", cmd);
2492 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2493 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2494 flags, mode, flock.l_flock.start, flock.l_flock.end);
2496 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &res_id,
2497 LDLM_FLOCK, &flock, mode, &flags, NULL,
2498 ldlm_flock_completion_ast, NULL, file_lock,
2499 NULL, 0, NULL, &lockh, 0);
/* Mirror the granted lock into the kernel's local bookkeeping. */
2500 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2501 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2502 #ifdef HAVE_F_OP_FLOCK
2503 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2504 !(flags & LDLM_FL_TEST_LOCK))
2505 posix_lock_file_wait(file, file_lock);
/* Lock entry point used when the filesystem is mounted with -o noflock.
 * Only the signature is visible in this view; per the comment near
 * ll_file_operations_noflock below it presumably returns -ENOSYS for
 * all lock requests — TODO(review): confirm against the elided body. */
2511 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Test (without blocking and without taking a reference — note
 * LDLM_FL_TEST_LOCK) whether this client already holds a granted MDS
 * inodebits lock covering 'bits' on 'inode', in any of the CR/CW/PR
 * modes.  The return statements are elided in this view; presumably
 * returns non-zero iff md_lock_match() succeeds — verify. */
2518 int ll_have_md_lock(struct inode *inode, __u64 bits)
2520         struct lustre_handle lockh;
2521         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2529         fid = &ll_i2info(inode)->lli_fid;
2530         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* LDLM_FL_CBPENDING: also match locks already scheduled for
 * cancellation; LDLM_FL_BLOCK_GRANTED: match granted locks only. */
2532         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2533         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2534                           LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/* Common epilogue for revalidation RPCs: turn an -ENOENT from the MDS
 * (inode already unlinked on the server) into a benign local update,
 * and log any other failure.  Interior lines (the nlink update and the
 * returns) are elided in this view. */
2541 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2542         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2543                               * and return success */
2545                 /* This path cannot be hit for regular files unless in
2546                  * case of obscure races, so no need to to validate
2548                 if (!S_ISREG(inode->i_mode) &&
2549                     !S_ISDIR(inode->i_mode))
/* Any failure other than -ENOENT is unexpected here; make it loud. */
2554                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry/inode against the MDS before answering a lookup
 * or getattr intent.  Two strategies, chosen by server capability:
 *  1) OBD_CONNECT_ATTRFID: issue a getattr-by-FID intent lock (no name
 *     needed), then finish via ll_revalidate_it_finish();
 *  2) otherwise: if we do not already hold an UPDATE inodebits lock,
 *     fall back to a plain md_getattr() RPC and refresh the inode.
 * Finally, glimpse the file size from the OSTs when an object exists.
 * NOTE(review): many interior lines (returns, GOTO/out labels, error
 * branches) are elided in this view. */
2562 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2564         struct inode *inode = dentry->d_inode;
2565         struct ptlrpc_request *req = NULL;
2566         struct ll_sb_info *sbi;
2567         struct obd_export *exp;
/* Guard for a "can't happen" condition (the condition itself is on an
 * elided line); kept as a loud diagnostic. */
2572                 CERROR("REPORT THIS LINE TO PETER\n");
2575         sbi = ll_i2sbi(inode);
2577         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2578                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2579 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2580         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2583         exp = ll_i2mdexp(inode);
/* Strategy 1: server supports getattr by FID. */
2585         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2586                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2587                 struct md_op_data *op_data;
2589                 /* Call getattr by fid, so do not provide name at all. */
2590                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2591                                              dentry->d_inode, NULL, 0, 0,
2592                                              LUSTRE_OPC_ANY, NULL);
2593                 if (IS_ERR(op_data))
2594                         RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE makes the MDS verify the FID still refers to a live
 * object; cleared again after the RPC below. */
2596                 oit.it_flags |= O_CHECK_STALE;
2597                 rc = md_intent_lock(exp, op_data, NULL, 0,
2598                                     /* we are not interested in name
2601                                     ll_md_blocking_ast, 0);
2602                 ll_finish_md_op_data(op_data);
2603                 oit.it_flags &= ~O_CHECK_STALE;
2605                         rc = ll_inode_revalidate_fini(inode, rc);
2609                 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2611                         ll_intent_release(&oit);
2615                 /* Unlinked? Unhash dentry, so it is not picked up later by
2616                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2617                    here to preserve get_cwd functionality on 2.6.
2619                 if (!dentry->d_inode->i_nlink) {
2620                         spin_lock(&dcache_lock);
2621                         ll_drop_dentry(dentry);
2622                         spin_unlock(&dcache_lock);
2625                 ll_lookup_finish_locks(&oit, dentry);
/* Strategy 2: no ATTRFID support — only refresh when we don't already
 * hold a cached UPDATE lock on the inode's metadata. */
2626         } else if (!ll_have_md_lock(dentry->d_inode,
2627                                     MDS_INODELOCK_UPDATE)) {
2628                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2629                 obd_valid valid = OBD_MD_FLGETATTR;
2630                 struct obd_capa *oc;
/* Regular files: also fetch striping EA, sized to the MDS max. */
2633                 if (S_ISREG(inode->i_mode)) {
2634                         rc = ll_get_max_mdsize(sbi, &ealen);
2637                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2639                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2640                  * capa for this inode. Because we only keep capas of dirs
2642                 oc = ll_mdscapa_get(inode);
2643                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2647                         rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2651                 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2657         /* if object not yet allocated, don't validate size */
2658         if (ll_i2info(inode)->lli_smd == NULL)
2661         /* ll_glimpse_size will prefer locally cached writes if they extend
2663         rc = ll_glimpse_size(inode, 0);
2666         ptlrpc_req_finished(req);
2670 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* ->getattr() worker: revalidate the inode against the MDS with the
 * given intent, then copy the (now fresh) in-core inode attributes into
 * the kstat the VFS hands us.  The early-return on revalidation failure
 * is on an elided line. */
2671 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2672                   struct lookup_intent *it, struct kstat *stat)
2674         struct inode *inode = de->d_inode;
2677         res = ll_inode_revalidate_it(de, it);
2678         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2683         stat->dev = inode->i_sb->s_dev;
2684         stat->ino = inode->i_ino;
2685         stat->mode = inode->i_mode;
2686         stat->nlink = inode->i_nlink;
2687         stat->uid = inode->i_uid;
2688         stat->gid = inode->i_gid;
2689         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2690         stat->atime = inode->i_atime;
2691         stat->mtime = inode->i_mtime;
2692         stat->ctime = inode->i_ctime;
2693 #ifdef HAVE_INODE_BLKSIZE
2694         stat->blksize = inode->i_blksize;
/* Older/newer kernels without i_blksize: derive it from i_blkbits. */
2696         stat->blksize = 1 << inode->i_blkbits;
/* size/blocks are updated by concurrent IO; read them under the llite
 * inode size lock so the pair is consistent. */
2699         ll_inode_size_lock(inode, 0);
2700         stat->size = inode->i_size;
2701         stat->blocks = inode->i_blocks;
2702         ll_inode_size_unlock(inode, 0);
/* Plain ->getattr() entry point: delegate to ll_getattr_it() with a
 * default IT_GETATTR intent. */
2706 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2708         struct lookup_intent it = { .it_op = IT_GETATTR };
2710         return ll_getattr_it(mnt, de, &it, stat);
/* POSIX ACL permission check used as the acl callback for
 * generic_permission().  Takes a private reference to the cached ACL
 * under lli_lock so it can be evaluated after the spinlock is dropped.
 * The non-ACL fallback path (when CONFIG_FS_POSIX_ACL is off, or the
 * inode has no ACL) is on elided lines. */
2715 int lustre_check_acl(struct inode *inode, int mask)
2717 #ifdef CONFIG_FS_POSIX_ACL
2718         struct ll_inode_info *lli = ll_i2info(inode);
2719         struct posix_acl *acl;
/* posix_acl_dup() bumps the refcount; safe to use after unlock. */
2723         spin_lock(&lli->lli_lock);
2724         acl = posix_acl_dup(lli->lli_posix_acl);
2725         spin_unlock(&lli->lli_lock);
2730         rc = posix_acl_permission(inode, acl, mask);
/* Drop the reference taken by posix_acl_dup() above. */
2731         posix_acl_release(acl);
2739 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* ->permission() for kernels >= 2.6.10: remote-client mounts check
 * permission on the MDS; everyone else goes through the kernel's
 * generic_permission() with lustre_check_acl() as the ACL callback. */
2740 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2742         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2743                inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients can't trust local uid/gid; ask the server instead. */
2744         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2745                 return lustre_check_remote_perm(inode, mask);
2747         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2748         return generic_permission(inode, mask, lustre_check_acl);
2751 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
2752 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
/* Pre-2.6.10 kernels lack the acl-callback form of generic_permission,
 * so this variant open-codes the classic owner/group/other + ACL +
 * capability permission algorithm.  NOTE(review): several lines (mode
 * shifts, returns, the check_capabilities label) are elided in this
 * view — in particular the owner-branch shift of 'mode' presumably
 * happens on an elided line before the '(mode >> 3)' test below. */
2754 int ll_inode_permission(struct inode *inode, int mask)
2757         int mode = inode->i_mode;
2760         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2761                inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients defer the decision to the server. */
2763         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2764                 return lustre_check_remote_perm(inode, mask);
2766         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes to regular files/dirs/symlinks on a read-only fs are denied. */
2768         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2769             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
/* Immutable inodes can never be written. */
2771         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2773         if (current->fsuid == inode->i_uid) {
2776                 if (((mode >> 3) & mask & S_IRWXO) != mask)
/* Mode bits alone don't grant access — consult the POSIX ACL. */
2778                         rc = lustre_check_acl(inode, mask);
2782                 goto check_capabilities;
2786                 if (in_group_p(inode->i_gid))
/* "Other" permission bits grant access directly. */
2789         if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides: CAP_DAC_OVERRIDE for everything except exec of
 * a file with no x bit; CAP_DAC_READ_SEARCH for reads and dir search. */
2793         if (!(mask & MAY_EXEC) ||
2794             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2795                 if (capable(CAP_DAC_OVERRIDE))
2798         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2799             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2806 /* -o localflock - only provides locally consistent flock locks */
/* Default file operations table: note it has no .flock/.lock entries
 * visible, so locking falls back to the kernel's local implementation
 * (matching the -o localflock comment above).  The closing "};" and the
 * pre-2.5 branch of the #if are on elided lines. */
2807 struct file_operations ll_file_operations = {
2808         .read           = ll_file_read,
2809         .write          = ll_file_write,
2810         .ioctl          = ll_file_ioctl,
2811         .open           = ll_file_open,
2812         .release        = ll_file_release,
2813         .mmap           = ll_file_mmap,
2814         .llseek         = ll_file_seek,
2815 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2816         .sendfile       = ll_file_sendfile,
/* File operations for mounts with cluster-wide locking enabled: same
 * as ll_file_operations plus .flock (when the kernel has the f_op
 * member, HAVE_F_OP_FLOCK) or .lock, both routed to ll_file_flock().
 * Closing "};" is on an elided line. */
2821 struct file_operations ll_file_operations_flock = {
2822         .read           = ll_file_read,
2823         .write          = ll_file_write,
2824         .ioctl          = ll_file_ioctl,
2825         .open           = ll_file_open,
2826         .release        = ll_file_release,
2827         .mmap           = ll_file_mmap,
2828         .llseek         = ll_file_seek,
2829 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2830         .sendfile       = ll_file_sendfile,
2833 #ifdef HAVE_F_OP_FLOCK
2834         .flock          = ll_file_flock,
2836         .lock           = ll_file_flock
2839 /* These are for -o noflock - to return ENOSYS on flock calls */
/* Identical to ll_file_operations_flock except the lock entry points
 * route to ll_file_noflock() so lock requests fail outright rather
 * than being silently local-only.  Closing "};" is on an elided line. */
2840 struct file_operations ll_file_operations_noflock = {
2841         .read           = ll_file_read,
2842         .write          = ll_file_write,
2843         .ioctl          = ll_file_ioctl,
2844         .open           = ll_file_open,
2845         .release        = ll_file_release,
2846         .mmap           = ll_file_mmap,
2847         .llseek         = ll_file_seek,
2848 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2849         .sendfile       = ll_file_sendfile,
2852 #ifdef HAVE_F_OP_FLOCK
2853         .flock          = ll_file_noflock,
2855         .lock           = ll_file_noflock
2858 struct inode_operations ll_file_inode_operations = {
2859 #ifdef LUSTRE_KERNEL_VERSION
2860 .setattr_raw = ll_setattr_raw,
2862 .setattr = ll_setattr,
2863 .truncate = ll_truncate,
2864 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2865 .getattr = ll_getattr,
2867 .revalidate_it = ll_inode_revalidate_it,
2869 .permission = ll_inode_permission,
2870 .setxattr = ll_setxattr,
2871 .getxattr = ll_getxattr,
2872 .listxattr = ll_listxattr,
2873 .removexattr = ll_removexattr,