lustre/llite/file.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
   5  *   Author: Peter Braam <braam@clusterfs.com>
   6  *   Author: Phil Schwan <phil@clusterfs.com>
   7  *   Author: Andreas Dilger <adilger@clusterfs.com>
   8  *
   9  *   This file is part of Lustre, http://www.lustre.org.
  10  *
  11  *   Lustre is free software; you can redistribute it and/or
  12  *   modify it under the terms of version 2 of the GNU General Public
  13  *   License as published by the Free Software Foundation.
  14  *
  15  *   Lustre is distributed in the hope that it will be useful,
  16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  *   GNU General Public License for more details.
  19  *
  20  *   You should have received a copy of the GNU General Public License
  21  *   along with Lustre; if not, write to the Free Software
  22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25 #define DEBUG_SUBSYSTEM S_LLITE
  26 #include <lustre_dlm.h>
  27 #include <lustre_lite.h>
  28 #include <lustre_mdc.h>
  29 #include <linux/pagemap.h>
  30 #include <linux/file.h>
  31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
  32 #include <linux/lustre_compat25.h>
  33 #endif
  34 #include "llite_internal.h"
  35
  36 /* also used by llite/special.c:ll_special_open() */
  37 struct ll_file_data *ll_file_data_get(void)
  38 {
  39         struct ll_file_data *fd;
  40
  41         OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
  42         return fd;
  43 }
  44
  45 static void ll_file_data_put(struct ll_file_data *fd)
  46 {
  47         if (fd != NULL)
  48                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  49 }
  50
  51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  52                           struct lustre_handle *fh)
  53 {
  54         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  55         op_data->op_attr.ia_mode = inode->i_mode;
  56         op_data->op_attr.ia_atime = inode->i_atime;
  57         op_data->op_attr.ia_mtime = inode->i_mtime;
  58         op_data->op_attr.ia_ctime = inode->i_ctime;
  59         op_data->op_attr.ia_size = i_size_read(inode);
  60         op_data->op_attr_blocks = inode->i_blocks;
  61         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
  62         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  63         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
  64         op_data->op_capa1 = ll_mdscapa_get(inode);
  65 }
  66
  67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  68                              struct obd_client_handle *och)
  69 {
  70         ENTRY;
  71
  72         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
  73                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
  74
  75         if (!(och->och_flags & FMODE_WRITE))
  76                 goto out;
  77
  78         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
  79             !S_ISREG(inode->i_mode))
  80                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
  81         else
  82                 ll_epoch_close(inode, op_data, &och, 0);
  83
  84 out:
  85         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
  86         EXIT;
  87 }
  88
  89 static int ll_close_inode_openhandle(struct obd_export *md_exp,
  90                                      struct inode *inode,
  91                                      struct obd_client_handle *och)
  92 {
  93         struct obd_export *exp = ll_i2mdexp(inode);
  94         struct md_op_data *op_data;
  95         struct ptlrpc_request *req = NULL;
  96         struct obd_device *obd = class_exp2obd(exp);
  97         int epoch_close = 1;
  98         int rc;
  99         ENTRY;
 100
 101         if (obd == NULL) {
 102                 /*
 103                  * XXX: in case of LMV, is this correct to access
 104                  * ->exp_handle?
 105                  */
 106                 CERROR("Invalid MDC connection handle "LPX64"\n",
 107                        ll_i2mdexp(inode)->exp_handle.h_cookie);
 108                 GOTO(out, rc = 0);
 109         }
 110
 111         /*
 112          * here we check if this is forced umount. If so this is called on
 113          * canceling "open lock" and we do not call md_close() in this case, as
 114          * it will not be successful, as import is already deactivated.
 115          */
 116         if (obd->obd_force)
 117                 GOTO(out, rc = 0);
 118
 119         OBD_ALLOC_PTR(op_data);
 120         if (op_data == NULL)
 121                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
 122
 123         ll_prepare_close(inode, op_data, och);
 124         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 125         rc = md_close(md_exp, op_data, och, &req);
 126
 127         if (rc == -EAGAIN) {
 128                 /* This close must have the epoch closed. */
 129                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
 130                 LASSERT(epoch_close);
 131                 /* MDS has instructed us to obtain Size-on-MDS attribute from
 132                  * OSTs and send setattr to back to MDS. */
 133                 rc = ll_sizeonmds_update(inode, &och->och_fh,
 134                                          op_data->op_ioepoch);
 135                 if (rc) {
 136                         CERROR("inode %lu mdc Size-on-MDS update failed: "
 137                                "rc = %d\n", inode->i_ino, rc);
 138                         rc = 0;
 139                 }
 140         } else if (rc) {
 141                 CERROR("inode %lu mdc close failed: rc = %d\n",
 142                        inode->i_ino, rc);
 143         }
 144         ll_finish_md_op_data(op_data);
 145
 146         if (rc == 0) {
 147                 rc = ll_objects_destroy(req, inode);
 148                 if (rc)
 149                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
 150                                inode->i_ino, rc);
 151         }
 152
 153         ptlrpc_req_finished(req); /* This is close request */
 154         EXIT;
 155 out:
 156
 157         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
 158             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 159                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 160         } else {
 161                 md_clear_open_replay_data(md_exp, och);
 162                 /* Free @och if it is not waiting for DONE_WRITING. */
 163                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 164                 OBD_FREE_PTR(och);
 165         }
 166
 167         return rc;
 168 }
 169
 170 int ll_md_real_close(struct inode *inode, int flags)
 171 {
 172         struct ll_inode_info *lli = ll_i2info(inode);
 173         struct obd_client_handle **och_p;
 174         struct obd_client_handle *och;
 175         __u64 *och_usecount;
 176         int rc = 0;
 177         ENTRY;
 178
 179         if (flags & FMODE_WRITE) {
 180                 och_p = &lli->lli_mds_write_och;
 181                 och_usecount = &lli->lli_open_fd_write_count;
 182         } else if (flags & FMODE_EXEC) {
 183                 och_p = &lli->lli_mds_exec_och;
 184                 och_usecount = &lli->lli_open_fd_exec_count;
 185         } else {
 186                 LASSERT(flags & FMODE_READ);
 187                 och_p = &lli->lli_mds_read_och;
 188                 och_usecount = &lli->lli_open_fd_read_count;
 189         }
 190
 191         down(&lli->lli_och_sem);
 192         if (*och_usecount) { /* There are still users of this handle, so
 193                                 skip freeing it. */
 194                 up(&lli->lli_och_sem);
 195                 RETURN(0);
 196         }
 197         och=*och_p;
 198         *och_p = NULL;
 199         up(&lli->lli_och_sem);
 200
 201         if (och) { /* There might be a race and somebody have freed this och
 202                       already */
 203                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 204                                                inode, och);
 205         }
 206
 207         RETURN(rc);
 208 }
 209
 210 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 211                 struct file *file)
 212 {
 213         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 214         struct ll_inode_info *lli = ll_i2info(inode);
 215         int rc = 0;
 216         ENTRY;
 217
 218         /* clear group lock, if present */
 219         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
 220                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 221                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
 222                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
 223                                       &fd->fd_cwlockh);
 224         }
 225
 226         /* Let's see if we have good enough OPEN lock on the file and if
 227            we can skip talking to MDS */
 228         if (file->f_dentry->d_inode) { /* Can this ever be false? */
 229                 int lockmode;
 230                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 231                 struct lustre_handle lockh;
 232                 struct inode *inode = file->f_dentry->d_inode;
 233                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 234
 235                 down(&lli->lli_och_sem);
 236                 if (fd->fd_omode & FMODE_WRITE) {
 237                         lockmode = LCK_CW;
 238                         LASSERT(lli->lli_open_fd_write_count);
 239                         lli->lli_open_fd_write_count--;
 240                 } else if (fd->fd_omode & FMODE_EXEC) {
 241                         lockmode = LCK_PR;
 242                         LASSERT(lli->lli_open_fd_exec_count);
 243                         lli->lli_open_fd_exec_count--;
 244                 } else {
 245                         lockmode = LCK_CR;
 246                         LASSERT(lli->lli_open_fd_read_count);
 247                         lli->lli_open_fd_read_count--;
 248                 }
 249                 up(&lli->lli_och_sem);
 250
 251                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 252                                    LDLM_IBITS, &policy, lockmode,
 253                                    &lockh)) {
 254                         rc = ll_md_real_close(file->f_dentry->d_inode,
 255                                               fd->fd_omode);
 256                 }
 257         } else {
 258                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
 259                        file, file->f_dentry, file->f_dentry->d_name.name);
 260         }
 261
 262         LUSTRE_FPRIVATE(file) = NULL;
 263         ll_file_data_put(fd);
 264         ll_capa_close(inode);
 265
 266         RETURN(rc);
 267 }
 268
 269 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
 270
 271 /* While this returns an error code, fput() the caller does not, so we need
 272  * to make every effort to clean up all of our state here.  Also, applications
 273  * rarely check close errors and even if an error is returned they will not
 274  * re-try the close call.
 275  */
 276 int ll_file_release(struct inode *inode, struct file *file)
 277 {
 278         struct ll_file_data *fd;
 279         struct ll_sb_info *sbi = ll_i2sbi(inode);
 280         struct ll_inode_info *lli = ll_i2info(inode);
 281         struct lov_stripe_md *lsm = lli->lli_smd;
 282         int rc;
 283
 284         ENTRY;
 285         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 286                inode->i_generation, inode);
 287
 288         /* don't do anything for / */
 289         if (inode->i_sb->s_root == file->f_dentry)
 290                 RETURN(0);
 291
 292         ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 293         fd = LUSTRE_FPRIVATE(file);
 294         LASSERT(fd != NULL);
 295
 296         /* don't do anything for / */
 297         if (inode->i_sb->s_root == file->f_dentry) {
 298                 LUSTRE_FPRIVATE(file) = NULL;
 299                 ll_file_data_put(fd);
 300                 RETURN(0);
 301         }
 302
 303         if (lsm)
 304                 lov_test_and_clear_async_rc(lsm);
 305         lli->lli_async_rc = 0;
 306
 307         rc = ll_md_close(sbi->ll_md_exp, inode, file);
 308         RETURN(rc);
 309 }
 310
 311 static int ll_intent_file_open(struct file *file, void *lmm,
 312                                int lmmsize, struct lookup_intent *itp)
 313 {
 314         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 315         struct dentry *parent = file->f_dentry->d_parent;
 316         const char *name = file->f_dentry->d_name.name;
 317         const int len = file->f_dentry->d_name.len;
 318         struct md_op_data *op_data;
 319         struct ptlrpc_request *req;
 320         int rc;
 321
 322         if (!parent)
 323                 RETURN(-ENOENT);
 324
 325         /* Usually we come here only for NFSD, and we want open lock.
 326            But we can also get here with pre 2.6.15 patchless kernels, and in
 327            that case that lock is also ok */
 328         /* We can also get here if there was cached open handle in revalidate_it
 329          * but it disappeared while we were getting from there to ll_file_open.
 330          * But this means this file was closed and immediatelly opened which
 331          * makes a good candidate for using OPEN lock */
 332         /* If lmmsize & lmm are not 0, we are just setting stripe info
 333          * parameters. No need for the open lock */
 334         if (!lmm && !lmmsize)
 335                 itp->it_flags |= MDS_OPEN_LOCK;
 336
 337         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 338                                       file->f_dentry->d_inode, name, len,
 339                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
 340         if (IS_ERR(op_data))
 341                 RETURN(PTR_ERR(op_data));
 342
 343         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 344                             0 /*unused */, &req, ll_md_blocking_ast, 0);
 345         ll_finish_md_op_data(op_data);
 346         if (rc == -ESTALE) {
 347                 /* reason for keep own exit path - don`t flood log
 348                 * with messages with -ESTALE errors.
 349                 */
 350                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 351                      it_open_error(DISP_OPEN_OPEN, itp))
 352                         GOTO(out, rc);
 353                 ll_release_openhandle(file->f_dentry, itp);
 354                 GOTO(out_stale, rc);
 355         }
 356
 357         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 358                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 359                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 360                 GOTO(out, rc);
 361         }
 362
 363         if (itp->d.lustre.it_lock_mode)
 364                 md_set_lock_data(sbi->ll_md_exp,
 365                                  &itp->d.lustre.it_lock_handle,
 366                                  file->f_dentry->d_inode);
 367
 368         rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
 369                            NULL);
 370 out:
 371         ptlrpc_req_finished(itp->d.lustre.it_data);
 372
 373 out_stale:
 374         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 375         ll_intent_drop_lock(itp);
 376
 377         RETURN(rc);
 378 }
 379
 380 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
 381                        struct lookup_intent *it, struct obd_client_handle *och)
 382 {
 383         struct ptlrpc_request *req = it->d.lustre.it_data;
 384         struct mdt_body *body;
 385
 386         LASSERT(och);
 387
 388         body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
 389         LASSERT(body != NULL);                      /* reply already checked out */
 390         LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
 391
 392         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
 393         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 394         och->och_fid = lli->lli_fid;
 395         och->och_flags = it->it_flags;
 396         lli->lli_ioepoch = body->ioepoch;
 397
 398         return md_set_open_replay_data(md_exp, och, req);
 399 }
 400
 401 int ll_local_open(struct file *file, struct lookup_intent *it,
 402                   struct ll_file_data *fd, struct obd_client_handle *och)
 403 {
 404         struct inode *inode = file->f_dentry->d_inode;
 405         struct ll_inode_info *lli = ll_i2info(inode);
 406         ENTRY;
 407
 408         LASSERT(!LUSTRE_FPRIVATE(file));
 409
 410         LASSERT(fd != NULL);
 411
 412         if (och) {
 413                 struct ptlrpc_request *req = it->d.lustre.it_data;
 414                 struct mdt_body *body;
 415                 int rc;
 416
 417                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
 418                 if (rc)
 419                         RETURN(rc);
 420
 421                 body = lustre_msg_buf(req->rq_repmsg,
 422                                       DLM_REPLY_REC_OFF, sizeof(*body));
 423
 424                 if ((it->it_flags & FMODE_WRITE) &&
 425                     (body->valid & OBD_MD_FLSIZE))
 426                 {
 427                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 428                                lli->lli_ioepoch, PFID(&lli->lli_fid));
 429                 }
 430         }
 431
 432         LUSTRE_FPRIVATE(file) = fd;
 433         ll_readahead_init(inode, &fd->fd_ras);
 434         fd->fd_omode = it->it_flags;
 435         RETURN(0);
 436 }
 437
 438 /* Open a file, and (for the very first open) create objects on the OSTs at
 439  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 440  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
 441  * lli_open_sem to ensure no other process will create objects, send the
 442  * stripe MD to the MDS, or try to destroy the objects if that fails.
 443  *
 444  * If we already have the stripe MD locally then we don't request it in
 445  * md_open(), by passing a lmm_size = 0.
 446  *
 447  * It is up to the application to ensure no other processes open this file
 448  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 449  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 450  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 451  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 452  */
 453 int ll_file_open(struct inode *inode, struct file *file)
 454 {
 455         struct ll_inode_info *lli = ll_i2info(inode);
 456         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 457                                           .it_flags = file->f_flags };
 458         struct lov_stripe_md *lsm;
 459         struct ptlrpc_request *req = NULL;
 460         struct obd_client_handle **och_p;
 461         __u64 *och_usecount;
 462         struct ll_file_data *fd;
 463         int rc = 0;
 464         ENTRY;
 465
 466         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 467                inode->i_generation, inode, file->f_flags);
 468
 469         /* don't do anything for / */
 470         if (inode->i_sb->s_root == file->f_dentry)
 471                 RETURN(0);
 472
 473 #ifdef LUSTRE_KERNEL_VERSION
 474         it = file->f_it;
 475 #else
 476         it = file->private_data; /* XXX: compat macro */
 477         file->private_data = NULL; /* prevent ll_local_open assertion */
 478 #endif
 479
 480         fd = ll_file_data_get();
 481         if (fd == NULL)
 482                 RETURN(-ENOMEM);
 483
 484         /* don't do anything for / */
 485         if (inode->i_sb->s_root == file->f_dentry) {
 486                 LUSTRE_FPRIVATE(file) = fd;
 487                 RETURN(0);
 488         }
 489
 490         if (!it || !it->d.lustre.it_disposition) {
 491                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 492                  * because everything but O_ACCMODE mask was stripped from
 493                  * there */
 494                 if ((oit.it_flags + 1) & O_ACCMODE)
 495                         oit.it_flags++;
 496                 if (file->f_flags & O_TRUNC)
 497                         oit.it_flags |= FMODE_WRITE;
 498
 499                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 500                  * dentry_open after call to open_namei that checks permissions.
 501                  * Only nfsd_open call dentry_open directly without checking
 502                  * permissions and because of that this code below is safe. */
 503                 if (oit.it_flags & FMODE_WRITE)
 504                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 505
 506                 /* We do not want O_EXCL here, presumably we opened the file
 507                  * already? XXX - NFS implications? */
 508                 oit.it_flags &= ~O_EXCL;
 509
 510                 it = &oit;
 511         }
 512
 513         /* Let's see if we have file open on MDS already. */
 514         if (it->it_flags & FMODE_WRITE) {
 515                 och_p = &lli->lli_mds_write_och;
 516                 och_usecount = &lli->lli_open_fd_write_count;
 517         } else if (it->it_flags & FMODE_EXEC) {
 518                 och_p = &lli->lli_mds_exec_och;
 519                 och_usecount = &lli->lli_open_fd_exec_count;
 520          } else {
 521                 och_p = &lli->lli_mds_read_och;
 522                 och_usecount = &lli->lli_open_fd_read_count;
 523         }
 524
 525         down(&lli->lli_och_sem);
 526         if (*och_p) { /* Open handle is present */
 527                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 528                         /* Well, there's extra open request that we do not need,
 529                            let's close it somehow. This will decref request. */
 530                         rc = it_open_error(DISP_OPEN_OPEN, it);
 531                         if (rc) {
 532                                 ll_file_data_put(fd);
 533                                 GOTO(out_och_free, rc);
 534                         }
 535                         ll_release_openhandle(file->f_dentry, it);
 536                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
 537                                              LPROC_LL_OPEN);
 538                 }
 539                 (*och_usecount)++;
 540
 541                 rc = ll_local_open(file, it, fd, NULL);
 542                 if (rc) {
 543                         up(&lli->lli_och_sem);
 544                         ll_file_data_put(fd);
 545                         RETURN(rc);
 546                 }
 547         } else {
 548                 LASSERT(*och_usecount == 0);
 549                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 550                 if (!*och_p) {
 551                         ll_file_data_put(fd);
 552                         GOTO(out_och_free, rc = -ENOMEM);
 553                 }
 554                 (*och_usecount)++;
 555                 if (!it->d.lustre.it_disposition) {
 556                         it->it_flags |= O_CHECK_STALE;
 557                         rc = ll_intent_file_open(file, NULL, 0, it);
 558                         it->it_flags &= ~O_CHECK_STALE;
 559                         if (rc) {
 560                                 ll_file_data_put(fd);
 561                                 GOTO(out_och_free, rc);
 562                         }
 563
 564                         /* Got some error? Release the request */
 565                         if (it->d.lustre.it_status < 0) {
 566                                 req = it->d.lustre.it_data;
 567                                 ptlrpc_req_finished(req);
 568                         }
 569                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
 570                                          &it->d.lustre.it_lock_handle,
 571                                          file->f_dentry->d_inode);
 572                 }
 573                 req = it->d.lustre.it_data;
 574
 575                 /* md_intent_lock() didn't get a request ref if there was an
 576                  * open error, so don't do cleanup on the request here
 577                  * (bug 3430) */
 578                 /* XXX (green): Should not we bail out on any error here, not
 579                  * just open error? */
 580                 rc = it_open_error(DISP_OPEN_OPEN, it);
 581                 if (rc) {
 582                         ll_file_data_put(fd);
 583                         GOTO(out_och_free, rc);
 584                 }
 585
 586                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 587                 rc = ll_local_open(file, it, fd, *och_p);
 588                 if (rc) {
 589                         up(&lli->lli_och_sem);
 590                         ll_file_data_put(fd);
 591                         GOTO(out_och_free, rc);
 592                 }
 593         }
 594         up(&lli->lli_och_sem);
 595
 596         /* Must do this outside lli_och_sem lock to prevent deadlock where
 597            different kind of OPEN lock for this same inode gets cancelled
 598            by ldlm_cancel_lru */
 599         if (!S_ISREG(inode->i_mode))
 600                 GOTO(out, rc);
 601
 602         ll_capa_open(inode);
 603
 604         lsm = lli->lli_smd;
 605         if (lsm == NULL) {
 606                 if (file->f_flags & O_LOV_DELAY_CREATE ||
 607                     !(file->f_mode & FMODE_WRITE)) {
 608                         CDEBUG(D_INODE, "object creation was delayed\n");
 609                         GOTO(out, rc);
 610                 }
 611         }
 612         file->f_flags &= ~O_LOV_DELAY_CREATE;
 613         GOTO(out, rc);
 614 out:
 615         ptlrpc_req_finished(req);
 616         if (req)
 617                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 618 out_och_free:
 619         if (rc) {
 620                 if (*och_p) {
 621                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 622                         *och_p = NULL; /* OBD_FREE writes some magic there */
 623                         (*och_usecount)--;
 624                 }
 625                 up(&lli->lli_och_sem);
 626         }
 627
 628         return rc;
 629 }
 630
 631 /* Fills the obdo with the attributes for the inode defined by lsm */
 632 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
 633 {
 634         struct ptlrpc_request_set *set;
 635         struct ll_inode_info *lli = ll_i2info(inode);
 636         struct lov_stripe_md *lsm = lli->lli_smd;
 637
 638         struct obd_info oinfo = { { { 0 } } };
 639         int rc;
 640         ENTRY;
 641
 642         LASSERT(lsm != NULL);
 643
 644         oinfo.oi_md = lsm;
 645         oinfo.oi_oa = obdo;
 646         oinfo.oi_oa->o_id = lsm->lsm_object_id;
 647         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
 648         oinfo.oi_oa->o_mode = S_IFREG;
 649         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 650                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 651                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 652                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 653                                OBD_MD_FLGROUP;
 654         oinfo.oi_capa = ll_mdscapa_get(inode);
 655
 656         set = ptlrpc_prep_set();
 657         if (set == NULL) {
 658                 CERROR("can't allocate ptlrpc set\n");
 659                 rc = -ENOMEM;
 660         } else {
 661                 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
 662                 if (rc == 0)
 663                         rc = ptlrpc_set_wait(set);
 664                 ptlrpc_set_destroy(set);
 665         }
 666         capa_put(oinfo.oi_capa);
 667         if (rc)
 668                 RETURN(rc);
 669
 670         oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 671                                  OBD_MD_FLATIME | OBD_MD_FLMTIME |
 672                                  OBD_MD_FLCTIME | OBD_MD_FLSIZE);
 673
 674         obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
 675         CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
 676                lli->lli_smd->lsm_object_id, i_size_read(inode),
 677                inode->i_blocks, inode->i_blksize);
 678         RETURN(0);
 679 }
 680
 681 static inline void ll_remove_suid(struct inode *inode)
 682 {
 683         unsigned int mode;
 684
 685         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
 686         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
 687
 688         /* was any of the uid bits set? */
 689         mode &= inode->i_mode;
 690         if (mode && !capable(CAP_FSETID)) {
 691                 inode->i_mode &= ~mode;
 692                 // XXX careful here - we cannot change the size
 693         }
 694 }
 695
 696 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
 697 {
 698         struct ll_inode_info *lli = ll_i2info(inode);
 699         struct lov_stripe_md *lsm = lli->lli_smd;
 700         struct obd_export *exp = ll_i2dtexp(inode);
 701         struct {
 702                 char name[16];
 703                 struct ldlm_lock *lock;
 704                 struct lov_stripe_md *lsm;
 705         } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
 706         __u32 stripe, vallen = sizeof(stripe);
 707         int rc;
 708         ENTRY;
 709
 710         if (lsm->lsm_stripe_count == 1)
 711                 GOTO(check, stripe = 0);
 712
 713         /* get our offset in the lov */
 714         rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
 715         if (rc != 0) {
 716                 CERROR("obd_get_info: rc = %d\n", rc);
 717                 RETURN(rc);
 718         }
 719         LASSERT(stripe < lsm->lsm_stripe_count);
 720
 721 check:
 722         if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
 723             lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
 724                 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
 725                            lsm->lsm_oinfo[stripe]->loi_id,
 726                            lsm->lsm_oinfo[stripe]->loi_gr);
 727                 RETURN(-ELDLM_NO_LOCK_DATA);
 728         }
 729
 730         RETURN(stripe);
 731 }
 732
 733 /* Flush the page cache for an extent as its canceled.  When we're on an LOV,
 734  * we get a lock cancellation for each stripe, so we have to map the obd's
 735  * region back onto the stripes in the file that it held.
 736  *
 737  * No one can dirty the extent until we've finished our work and they can
 738  * enqueue another lock.  The DLM protects us from ll_file_read/write here,
 739  * but other kernel actors could have pages locked.
 740  *
 741  * Called with the DLM lock held. */
 742 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
 743                               struct ldlm_lock *lock, __u32 stripe)
 744 {
 745         ldlm_policy_data_t tmpex;
 746         unsigned long start, end, count, skip, i, j;
 747         struct page *page;
 748         int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
 749         struct lustre_handle lockh;
 750         ENTRY;
 751
 752         memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
 753         CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
 754                inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
 755                i_size_read(inode));
 756
 757         /* our locks are page granular thanks to osc_enqueue, we invalidate the
 758          * whole page. */
 759         if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
 760             ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
 761                 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
 762                            CFS_PAGE_SIZE);
 763         LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
 764         LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
 765
 766         count = ~0;
 767         skip = 0;
 768         start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
 769         end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
 770         if (lsm->lsm_stripe_count > 1) {
 771                 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
 772                 skip = (lsm->lsm_stripe_count - 1) * count;
 773                 start += start/count * skip + stripe * count;
 774                 if (end != ~0)
 775                         end += end/count * skip + stripe * count;
 776         }
 777         if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
 778                 end = ~0;
 779
 780         i = i_size_read(inode) ? (__u64)(i_size_read(inode) - 1) >>
 781             CFS_PAGE_SHIFT : 0;
 782         if (i < end)
 783                 end = i;
 784
 785         CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
 786                "count: %lu skip: %lu end: %lu%s\n", start, start % count,
 787                count, skip, end, discard ? " (DISCARDING)" : "");
 788
 789         /* walk through the vmas on the inode and tear down mmaped pages that
 790          * intersect with the lock.  this stops immediately if there are no
 791          * mmap()ed regions of the file.  This is not efficient at all and
 792          * should be short lived. We'll associate mmap()ed pages with the lock
 793          * and will be able to find them directly */
 794         for (i = start; i <= end; i += (j + skip)) {
 795                 j = min(count - (i % count), end - i + 1);
 796                 LASSERT(j > 0);
 797                 LASSERT(inode->i_mapping);
 798                 if (ll_teardown_mmaps(inode->i_mapping,
 799                                       (__u64)i << CFS_PAGE_SHIFT,
 800                                       ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
 801                         break;
 802         }
 803
 804         /* this is the simplistic implementation of page eviction at
 805          * cancelation.  It is careful to get races with other page
 806          * lockers handled correctly.  fixes from bug 20 will make it
 807          * more efficient by associating locks with pages and with
 808          * batching writeback under the lock explicitly. */
 809         for (i = start, j = start % count; i <= end;
 810              j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
 811                 if (j == count) {
 812                         CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
 813                         i += skip;
 814                         j = 0;
 815                         if (i > end)
 816                                 break;
 817                 }
 818                 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
 819                          LPU64" >= "LPU64" start %lu i %lu end %lu\n",
 820                          tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
 821                          start, i, end);
 822
 823                 if (!mapping_has_pages(inode->i_mapping)) {
 824                         CDEBUG(D_INODE|D_PAGE, "nothing left\n");
 825                         break;
 826                 }
 827
 828                 cond_resched();
 829
 830                 page = find_get_page(inode->i_mapping, i);
 831                 if (page == NULL)
 832                         continue;
 833                 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
 834                                i, tmpex.l_extent.start);
 835                 lock_page(page);
 836
 837                 /* page->mapping to check with racing against teardown */
 838                 if (!discard && clear_page_dirty_for_io(page)) {
 839                         rc = ll_call_writepage(inode, page);
 840                         if (rc != 0)
 841                                 CERROR("writepage inode %lu(%p) of page %p "
 842                                        "failed: %d\n", inode->i_ino, inode,
 843                                        page, rc);
 844                         /* either waiting for io to complete or reacquiring
 845                          * the lock that the failed writepage released */
 846                         lock_page(page);
 847                 }
 848
 849                 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
 850                 /* check to see if another DLM lock covers this page b=2765 */
 851                 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
 852                                       LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
 853                                       LDLM_FL_TEST_LOCK,
 854                                       &lock->l_resource->lr_name, LDLM_EXTENT,
 855                                       &tmpex, LCK_PR | LCK_PW, &lockh);
 856
 857                 if (rc2 <= 0 && page->mapping != NULL) {
 858                         struct ll_async_page *llap = llap_cast_private(page);
 859                         /* checking again to account for writeback's
 860                          * lock_page() */
 861                         LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
 862                         if (llap)
 863                                 ll_ra_accounting(llap, inode->i_mapping);
 864                         ll_truncate_complete_page(page);
 865                 }
 866                 unlock_page(page);
 867                 page_cache_release(page);
 868         }
 869         LASSERTF(tmpex.l_extent.start <=
 870                  (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
 871                   lock->l_policy_data.l_extent.end + 1),
 872                  "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
 873                  tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
 874                  start, i, end);
 875         EXIT;
 876 }
 877
 878 static int ll_extent_lock_callback(struct ldlm_lock *lock,
 879                                    struct ldlm_lock_desc *new, void *data,
 880                                    int flag)
 881 {
 882         struct lustre_handle lockh = { 0 };
 883         int rc;
 884         ENTRY;
 885
 886         if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
 887                 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
 888                 LBUG();
 889         }
 890
 891         switch (flag) {
 892         case LDLM_CB_BLOCKING:
 893                 ldlm_lock2handle(lock, &lockh);
 894                 rc = ldlm_cli_cancel(&lockh);
 895                 if (rc != ELDLM_OK)
 896                         CERROR("ldlm_cli_cancel failed: %d\n", rc);
 897                 break;
 898         case LDLM_CB_CANCELING: {
 899                 struct inode *inode;
 900                 struct ll_inode_info *lli;
 901                 struct lov_stripe_md *lsm;
 902                 int stripe;
 903                 __u64 kms;
 904
 905                 /* This lock wasn't granted, don't try to evict pages */
 906                 if (lock->l_req_mode != lock->l_granted_mode)
 907                         RETURN(0);
 908
 909                 inode = ll_inode_from_lock(lock);
 910                 if (inode == NULL)
 911                         RETURN(0);
 912                 lli = ll_i2info(inode);
 913                 if (lli == NULL)
 914                         goto iput;
 915                 if (lli->lli_smd == NULL)
 916                         goto iput;
 917                 lsm = lli->lli_smd;
 918
 919                 stripe = ll_lock_to_stripe_offset(inode, lock);
 920                 if (stripe < 0)
 921                         goto iput;
 922
 923                 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
 924
 925                 lov_stripe_lock(lsm);
 926                 lock_res_and_lock(lock);
 927                 kms = ldlm_extent_shift_kms(lock,
 928                                             lsm->lsm_oinfo[stripe]->loi_kms);
 929
 930                 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
 931                         LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
 932                                    lsm->lsm_oinfo[stripe]->loi_kms, kms);
 933                 lsm->lsm_oinfo[stripe]->loi_kms = kms;
 934                 unlock_res_and_lock(lock);
 935                 lov_stripe_unlock(lsm);
 936         iput:
 937                 iput(inode);
 938                 break;
 939         }
 940         default:
 941                 LBUG();
 942         }
 943
 944         RETURN(0);
 945 }
 946
 947 #if 0
 948 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
 949 {
 950         /* XXX ALLOCATE - 160 bytes */
 951         struct inode *inode = ll_inode_from_lock(lock);
 952         struct ll_inode_info *lli = ll_i2info(inode);
 953         struct lustre_handle lockh = { 0 };
 954         struct ost_lvb *lvb;
 955         int stripe;
 956         ENTRY;
 957
 958         if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
 959                      LDLM_FL_BLOCK_CONV)) {
 960                 LBUG(); /* not expecting any blocked async locks yet */
 961                 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
 962                            "lock, returning");
 963                 ldlm_lock_dump(D_OTHER, lock, 0);
 964                 ldlm_reprocess_all(lock->l_resource);
 965                 RETURN(0);
 966         }
 967
 968         LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
 969
 970         stripe = ll_lock_to_stripe_offset(inode, lock);
 971         if (stripe < 0)
 972                 goto iput;
 973
 974         if (lock->l_lvb_len) {
 975                 struct lov_stripe_md *lsm = lli->lli_smd;
 976                 __u64 kms;
 977                 lvb = lock->l_lvb_data;
 978                 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
 979
 980                 lock_res_and_lock(lock);
 981                 ll_inode_size_lock(inode, 1);
 982                 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
 983                 kms = ldlm_extent_shift_kms(NULL, kms);
 984                 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
 985                         LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
 986                                    lsm->lsm_oinfo[stripe].loi_kms, kms);
 987                 lsm->lsm_oinfo[stripe].loi_kms = kms;
 988                 ll_inode_size_unlock(inode, 1);
 989                 unlock_res_and_lock(lock);
 990         }
 991
 992 iput:
 993         iput(inode);
 994         wake_up(&lock->l_waitq);
 995
 996         ldlm_lock2handle(lock, &lockh);
 997         ldlm_lock_decref(&lockh, LCK_PR);
 998         RETURN(0);
 999 }
1000 #endif
1001
1002 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1003 {
1004         struct ptlrpc_request *req = reqp;
1005         struct inode *inode = ll_inode_from_lock(lock);
1006         struct ll_inode_info *lli;
1007         struct lov_stripe_md *lsm;
1008         struct ost_lvb *lvb;
1009         int rc, stripe;
1010         int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1011         ENTRY;
1012
1013         if (inode == NULL)
1014                 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1015         lli = ll_i2info(inode);
1016         if (lli == NULL)
1017                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1018         lsm = lli->lli_smd;
1019         if (lsm == NULL)
1020                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1021
1022         /* First, find out which stripe index this lock corresponds to. */
1023         stripe = ll_lock_to_stripe_offset(inode, lock);
1024         if (stripe < 0)
1025                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1026
1027         rc = lustre_pack_reply(req, 2, size, NULL);
1028         if (rc) {
1029                 CERROR("lustre_pack_reply: %d\n", rc);
1030                 GOTO(iput, rc);
1031         }
1032
1033         lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1034         lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1035         lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1036         lvb->lvb_atime = LTIME_S(inode->i_atime);
1037         lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1038
1039         LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1040                    " atime "LPU64", mtime "LPU64", ctime "LPU64,
1041                    i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1042                    lvb->lvb_atime, lvb->lvb_ctime);
1043  iput:
1044         iput(inode);
1045
1046  out:
1047         /* These errors are normal races, so we don't want to fill the console
1048          * with messages by calling ptlrpc_error() */
1049         if (rc == -ELDLM_NO_LOCK_DATA)
1050                 lustre_pack_reply(req, 1, NULL, NULL);
1051
1052         req->rq_status = rc;
1053         return rc;
1054 }
1055
1056 static void ll_merge_lvb(struct inode *inode)
1057 {
1058         struct ll_inode_info *lli = ll_i2info(inode);
1059         struct ll_sb_info *sbi = ll_i2sbi(inode);
1060         struct ost_lvb lvb;
1061         ENTRY;
1062
1063         ll_inode_size_lock(inode, 1);
1064         inode_init_lvb(inode, &lvb);
1065         obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1066         i_size_write(inode, lvb.lvb_size);
1067         inode->i_blocks = lvb.lvb_blocks;
1068         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1069         LTIME_S(inode->i_atime) = lvb.lvb_atime;
1070         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1071         ll_inode_size_unlock(inode, 1);
1072         EXIT;
1073 }
1074
1075 int ll_local_size(struct inode *inode)
1076 {
1077         ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1078         struct ll_inode_info *lli = ll_i2info(inode);
1079         struct ll_sb_info *sbi = ll_i2sbi(inode);
1080         struct lustre_handle lockh = { 0 };
1081         int flags = 0;
1082         int rc;
1083         ENTRY;
1084
1085         if (lli->lli_smd->lsm_stripe_count == 0)
1086                 RETURN(0);
1087
1088         rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1089                        &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1090         if (rc < 0)
1091                 RETURN(rc);
1092         else if (rc == 0)
1093                 RETURN(-ENODATA);
1094
1095         ll_merge_lvb(inode);
1096         obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
1097         RETURN(0);
1098 }
1099
1100 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1101                      lstat_t *st)
1102 {
1103         struct lustre_handle lockh = { 0 };
1104         struct ldlm_enqueue_info einfo = { 0 };
1105         struct obd_info oinfo = { { { 0 } } };
1106         struct ost_lvb lvb;
1107         int rc;
1108
1109         ENTRY;
1110
1111         einfo.ei_type = LDLM_EXTENT;
1112         einfo.ei_mode = LCK_PR;
1113         einfo.ei_cb_bl = ll_extent_lock_callback;
1114         einfo.ei_cb_cp = ldlm_completion_ast;
1115         einfo.ei_cb_gl = ll_glimpse_callback;
1116         einfo.ei_cbdata = NULL;
1117
1118         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1119         oinfo.oi_lockh = &lockh;
1120         oinfo.oi_md = lsm;
1121         oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1122
1123         rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1124         if (rc == -ENOENT)
1125                 RETURN(rc);
1126         if (rc != 0) {
1127                 CERROR("obd_enqueue returned rc %d, "
1128                        "returning -EIO\n", rc);
1129                 RETURN(rc > 0 ? -EIO : rc);
1130         }
1131
1132         lov_stripe_lock(lsm);
1133         memset(&lvb, 0, sizeof(lvb));
1134         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1135         st->st_size = lvb.lvb_size;
1136         st->st_blocks = lvb.lvb_blocks;
1137         st->st_mtime = lvb.lvb_mtime;
1138         st->st_atime = lvb.lvb_atime;
1139         st->st_ctime = lvb.lvb_ctime;
1140         lov_stripe_unlock(lsm);
1141
1142         RETURN(rc);
1143 }
1144
1145 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1146  * file (because it prefers KMS over RSS when larger) */
1147 int ll_glimpse_size(struct inode *inode, int ast_flags)
1148 {
1149         struct ll_inode_info *lli = ll_i2info(inode);
1150         struct ll_sb_info *sbi = ll_i2sbi(inode);
1151         struct lustre_handle lockh = { 0 };
1152         struct ldlm_enqueue_info einfo = { 0 };
1153         struct obd_info oinfo = { { { 0 } } };
1154         int rc;
1155         ENTRY;
1156
1157         if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1158                 RETURN(0);
1159
1160         CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1161
1162         if (!lli->lli_smd) {
1163                 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1164                 RETURN(0);
1165         }
1166
1167         /* NOTE: this looks like DLM lock request, but it may not be one. Due
1168          *       to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1169          *       won't revoke any conflicting DLM locks held. Instead,
1170          *       ll_glimpse_callback() will be called on each client
1171          *       holding a DLM lock against this file, and resulting size
1172          *       will be returned for each stripe. DLM lock on [0, EOF] is
1173          *       acquired only if there were no conflicting locks. */
1174         einfo.ei_type = LDLM_EXTENT;
1175         einfo.ei_mode = LCK_PR;
1176         einfo.ei_cb_bl = ll_extent_lock_callback;
1177         einfo.ei_cb_cp = ldlm_completion_ast;
1178         einfo.ei_cb_gl = ll_glimpse_callback;
1179         einfo.ei_cbdata = inode;
1180
1181         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1182         oinfo.oi_lockh = &lockh;
1183         oinfo.oi_md = lli->lli_smd;
1184         oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1185
1186         rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1187         if (rc == -ENOENT)
1188                 RETURN(rc);
1189         if (rc != 0) {
1190                 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1191                 RETURN(rc > 0 ? -EIO : rc);
1192         }
1193
1194         ll_merge_lvb(inode);
1195
1196         CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1197                i_size_read(inode), inode->i_blocks);
1198
1199         RETURN(rc);
1200 }
1201
1202 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1203                    struct lov_stripe_md *lsm, int mode,
1204                    ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1205                    int ast_flags)
1206 {
1207         struct ll_sb_info *sbi = ll_i2sbi(inode);
1208         struct ost_lvb lvb;
1209         struct ldlm_enqueue_info einfo = { 0 };
1210         struct obd_info oinfo = { { { 0 } } };
1211         int rc;
1212         ENTRY;
1213
1214         LASSERT(!lustre_handle_is_used(lockh));
1215         LASSERT(lsm != NULL);
1216
1217         /* don't drop the mmapped file to LRU */
1218         if (mapping_mapped(inode->i_mapping))
1219                 ast_flags |= LDLM_FL_NO_LRU;
1220
1221         /* XXX phil: can we do this?  won't it screw the file size up? */
1222         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1223             (sbi->ll_flags & LL_SBI_NOLCK))
1224                 RETURN(0);
1225
1226         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1227                inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1228
1229         einfo.ei_type = LDLM_EXTENT;
1230         einfo.ei_mode = mode;
1231         einfo.ei_cb_bl = ll_extent_lock_callback;
1232         einfo.ei_cb_cp = ldlm_completion_ast;
1233         einfo.ei_cb_gl = ll_glimpse_callback;
1234         einfo.ei_cbdata = inode;
1235
1236         oinfo.oi_policy = *policy;
1237         oinfo.oi_lockh = lockh;
1238         oinfo.oi_md = lsm;
1239         oinfo.oi_flags = ast_flags;
1240
1241         rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
1242         *policy = oinfo.oi_policy;
1243         if (rc > 0)
1244                 rc = -EIO;
1245
1246         ll_inode_size_lock(inode, 1);
1247         inode_init_lvb(inode, &lvb);
1248         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1249
1250         if (policy->l_extent.start == 0 &&
1251             policy->l_extent.end == OBD_OBJECT_EOF) {
1252                 /* vmtruncate()->ll_truncate() first sets the i_size and then
1253                  * the kms under both a DLM lock and the
1254                  * ll_inode_size_lock().  If we don't get the
1255                  * ll_inode_size_lock() here we can match the DLM lock and
1256                  * reset i_size from the kms before the truncating path has
1257                  * updated the kms.  generic_file_write can then trust the
1258                  * stale i_size when doing appending writes and effectively
1259                  * cancel the result of the truncate.  Getting the
1260                  * ll_inode_size_lock() after the enqueue maintains the DLM
1261                  * -> ll_inode_size_lock() acquiring order. */
1262                 i_size_write(inode, lvb.lvb_size);
1263                 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1264                        inode->i_ino, i_size_read(inode));
1265         }
1266
1267         if (rc == 0) {
1268                 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1269                 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1270                 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1271         }
1272         ll_inode_size_unlock(inode, 1);
1273
1274         RETURN(rc);
1275 }
1276
1277 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1278                      struct lov_stripe_md *lsm, int mode,
1279                      struct lustre_handle *lockh)
1280 {
1281         struct ll_sb_info *sbi = ll_i2sbi(inode);
1282         int rc;
1283         ENTRY;
1284
1285         /* XXX phil: can we do this?  won't it screw the file size up? */
1286         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1287             (sbi->ll_flags & LL_SBI_NOLCK))
1288                 RETURN(0);
1289
1290         rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
1291
1292         RETURN(rc);
1293 }
1294
1295 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1296                             loff_t *ppos)
1297 {
1298         struct inode *inode = file->f_dentry->d_inode;
1299         struct ll_inode_info *lli = ll_i2info(inode);
1300         struct lov_stripe_md *lsm = lli->lli_smd;
1301         struct ll_sb_info *sbi = ll_i2sbi(inode);
1302         struct ll_lock_tree tree;
1303         struct ll_lock_tree_node *node;
1304         struct ost_lvb lvb;
1305         struct ll_ra_read bead;
1306         int rc, ra = 0;
1307         loff_t end;
1308         ssize_t retval, chunk, sum = 0;
1309
1310         __u64 kms;
1311         ENTRY;
1312         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1313                inode->i_ino, inode->i_generation, inode, count, *ppos);
1314         /* "If nbyte is 0, read() will return 0 and have no other results."
1315          *                      -- Single Unix Spec */
1316         if (count == 0)
1317                 RETURN(0);
1318
1319         ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1320
1321         if (!lsm) {
1322                 /* Read on file with no objects should return zero-filled
1323                  * buffers up to file size (we can get non-zero sizes with
1324                  * mknod + truncate, then opening file for read. This is a
1325                  * common pattern in NFS case, it seems). Bug 6243 */
1326                 int notzeroed;
1327                 /* Since there are no objects on OSTs, we have nothing to get
1328                  * lock on and so we are forced to access inode->i_size
1329                  * unguarded */
1330
1331                 /* Read beyond end of file */
1332                 if (*ppos >= i_size_read(inode))
1333                         RETURN(0);
1334
1335                 if (count > i_size_read(inode) - *ppos)
1336                         count = i_size_read(inode) - *ppos;
1337                 /* Make sure to correctly adjust the file pos pointer for
1338                  * EFAULT case */
1339                 notzeroed = clear_user(buf, count);
1340                 count -= notzeroed;
1341                 *ppos += count;
1342                 if (!count)
1343                         RETURN(-EFAULT);
1344                 RETURN(count);
1345         }
1346
1347 repeat:
1348         if (sbi->ll_max_rw_chunk != 0) {
1349                 /* first, let's know the end of the current stripe */
1350                 end = *ppos;
1351                 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1352                                 (obd_off *)&end);
1353
1354                 /* correct, the end is beyond the request */
1355                 if (end > *ppos + count - 1)
1356                         end = *ppos + count - 1;
1357
1358                 /* and chunk shouldn't be too large even if striping is wide */
1359                 if (end - *ppos > sbi->ll_max_rw_chunk)
1360                         end = *ppos + sbi->ll_max_rw_chunk - 1;
1361         } else {
1362                 end = *ppos + count - 1;
1363         }
1364
1365         node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1366         if (IS_ERR(node)){
1367                 GOTO(out, retval = PTR_ERR(node));
1368         }
1369
1370         tree.lt_fd = LUSTRE_FPRIVATE(file);
1371         rc = ll_tree_lock(&tree, node, buf, count,
1372                           file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1373         if (rc != 0)
1374                 GOTO(out, retval = rc);
1375
1376         ll_inode_size_lock(inode, 1);
1377         /*
1378          * Consistency guarantees: following possibilities exist for the
1379          * relation between region being read and real file size at this
1380          * moment:
1381          *
1382          *  (A): the region is completely inside of the file;
1383          *
1384          *  (B-x): x bytes of region are inside of the file, the rest is
1385          *  outside;
1386          *
1387          *  (C): the region is completely outside of the file.
1388          *
1389          * This classification is stable under DLM lock acquired by
1390          * ll_tree_lock() above, because to change class, other client has to
1391          * take DLM lock conflicting with our lock. Also, any updates to
1392          * ->i_size by other threads on this client are serialized by
1393          * ll_inode_size_lock(). This guarantees that short reads are handled
1394          * correctly in the face of concurrent writes and truncates.
1395          */
1396         inode_init_lvb(inode, &lvb);
1397         obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1398         kms = lvb.lvb_size;
1399         if (*ppos + count - 1 > kms) {
1400                 /* A glimpse is necessary to determine whether we return a
1401                  * short read (B) or some zeroes at the end of the buffer (C) */
1402                 ll_inode_size_unlock(inode, 1);
1403                 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1404                 if (retval) {
1405                         ll_tree_unlock(&tree);
1406                         goto out;
1407                 }
1408         } else {
1409                 /* region is within kms and, hence, within real file size (A).
1410                  * We need to increase i_size to cover the read region so that
1411                  * generic_file_read() will do its job, but that doesn't mean
1412                  * the kms size is _correct_, it is only the _minimum_ size.
1413                  * If someone does a stat they will get the correct size which
1414                  * will always be >= the kms value here.  b=11081 */
1415                 if (i_size_read(inode) < kms)
1416                         i_size_write(inode, kms);
1417                 ll_inode_size_unlock(inode, 1);
1418         }
1419
1420         chunk = end - *ppos + 1;
1421         CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1422                inode->i_ino, chunk, *ppos, i_size_read(inode));
1423
1424         /* turn off the kernel's read-ahead */
1425 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1426         file->f_ramax = 0;
1427 #else
1428         file->f_ra.ra_pages = 0;
1429 #endif
1430         /* initialize read-ahead window once per syscall */
1431         if (ra == 0) {
1432                 ra = 1;
1433                 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1434                 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1435                 ll_ra_read_in(file, &bead);
1436         }
1437
1438         /* BUG: 5972 */
1439         file_accessed(file);
1440         retval = generic_file_read(file, buf, chunk, ppos);
1441         ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1442
1443         ll_tree_unlock(&tree);
1444
1445         if (retval > 0) {
1446                 buf += retval;
1447                 count -= retval;
1448                 sum += retval;
1449                 if (retval == chunk && count > 0)
1450                         goto repeat;
1451         }
1452
1453  out:
1454         if (ra != 0)
1455                 ll_ra_read_ex(file, &bead);
1456         retval = (sum > 0) ? sum : retval;
1457         RETURN(retval);
1458 }
1459
1460 /*
1461  * Write to a file (through the page cache).
1462  */
1463 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1464                              loff_t *ppos)
1465 {
1466         struct inode *inode = file->f_dentry->d_inode;
1467         struct ll_sb_info *sbi = ll_i2sbi(inode);
1468         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1469         struct ll_lock_tree tree;
1470         struct ll_lock_tree_node *node;
1471         loff_t maxbytes = ll_file_maxbytes(inode);
1472         loff_t lock_start, lock_end, end;
1473         ssize_t retval, chunk, sum = 0;
1474         int rc;
1475         ENTRY;
1476
1477         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1478                inode->i_ino, inode->i_generation, inode, count, *ppos);
1479
1480         SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1481
1482         /* POSIX, but surprised the VFS doesn't check this already */
1483         if (count == 0)
1484                 RETURN(0);
1485
1486         /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1487          * called on the file, don't fail the below assertion (bug 2388). */
1488         if (file->f_flags & O_LOV_DELAY_CREATE &&
1489             ll_i2info(inode)->lli_smd == NULL)
1490                 RETURN(-EBADF);
1491
1492         LASSERT(ll_i2info(inode)->lli_smd != NULL);
1493
1494         down(&ll_i2info(inode)->lli_write_sem);
1495
1496 repeat:
1497         chunk = 0; /* just to fix gcc's warning */
1498         end = *ppos + count - 1;
1499
1500         if (file->f_flags & O_APPEND) {
1501                 lock_start = 0;
1502                 lock_end = OBD_OBJECT_EOF;
1503         } else if (sbi->ll_max_rw_chunk != 0) {
1504                 /* first, let's know the end of the current stripe */
1505                 end = *ppos;
1506                 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1507                                 (obd_off *)&end);
1508
1509                 /* correct, the end is beyond the request */
1510                 if (end > *ppos + count - 1)
1511                         end = *ppos + count - 1;
1512
1513                 /* and chunk shouldn't be too large even if striping is wide */
1514                 if (end - *ppos > sbi->ll_max_rw_chunk)
1515                         end = *ppos + sbi->ll_max_rw_chunk - 1;
1516                 lock_start = *ppos;
1517                 lock_end = end;
1518         } else {
1519                 lock_start = *ppos;
1520                 lock_end = *ppos + count - 1;
1521         }
1522         node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1523
1524         if (IS_ERR(node))
1525                 GOTO(out, retval = PTR_ERR(node));
1526
1527         tree.lt_fd = LUSTRE_FPRIVATE(file);
1528         rc = ll_tree_lock(&tree, node, buf, count,
1529                           file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1530         if (rc != 0)
1531                 GOTO(out, retval = rc);
1532
1533         /* This is ok, g_f_w will overwrite this under i_sem if it races
1534          * with a local truncate, it just makes our maxbyte checking easier.
1535          * The i_size value gets updated in ll_extent_lock() as a consequence
1536          * of the [0,EOF] extent lock we requested above. */
1537         if (file->f_flags & O_APPEND) {
1538                 *ppos = i_size_read(inode);
1539                 end = *ppos + count - 1;
1540         }
1541
1542         if (*ppos >= maxbytes) {
1543                 send_sig(SIGXFSZ, current, 0);
1544                 GOTO(out_unlock, retval = -EFBIG);
1545         }
1546         if (*ppos + count > maxbytes)
1547                 count = maxbytes - *ppos;
1548
1549         /* generic_file_write handles O_APPEND after getting i_mutex */
1550         chunk = end - *ppos + 1;
1551         CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1552                inode->i_ino, chunk, *ppos);
1553         retval = generic_file_write(file, buf, chunk, ppos);
1554         ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1555
1556 out_unlock:
1557         ll_tree_unlock(&tree);
1558
1559 out:
1560         if (retval > 0) {
1561                 buf += retval;
1562                 count -= retval;
1563                 sum += retval;
1564                 if (retval == chunk && count > 0)
1565                         goto repeat;
1566         }
1567
1568         up(&ll_i2info(inode)->lli_write_sem);
1569
1570         retval = (sum > 0) ? sum : retval;
1571         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1572                            retval > 0 ? retval : 0);
1573         RETURN(retval);
1574 }
1575
1576 /*
1577  * Send file content (through pagecache) somewhere with helper
1578  */
1579 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1580 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1581                                 read_actor_t actor, void *target)
1582 {
1583         struct inode *inode = in_file->f_dentry->d_inode;
1584         struct ll_inode_info *lli = ll_i2info(inode);
1585         struct lov_stripe_md *lsm = lli->lli_smd;
1586         struct ll_lock_tree tree;
1587         struct ll_lock_tree_node *node;
1588         struct ost_lvb lvb;
1589         struct ll_ra_read bead;
1590         int rc;
1591         ssize_t retval;
1592         __u64 kms;
1593         ENTRY;
1594         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1595                inode->i_ino, inode->i_generation, inode, count, *ppos);
1596
1597         /* "If nbyte is 0, read() will return 0 and have no other results."
1598          *                      -- Single Unix Spec */
1599         if (count == 0)
1600                 RETURN(0);
1601
1602         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1603         /* turn off the kernel's read-ahead */
1604         in_file->f_ra.ra_pages = 0;
1605
1606         /* File with no objects, nothing to lock */
1607         if (!lsm)
1608                 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1609
1610         node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1611         if (IS_ERR(node))
1612                 RETURN(PTR_ERR(node));
1613
1614         tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1615         rc = ll_tree_lock(&tree, node, NULL, count,
1616                           in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1617         if (rc != 0)
1618                 RETURN(rc);
1619
1620         ll_inode_size_lock(inode, 1);
1621         /*
1622          * Consistency guarantees: following possibilities exist for the
1623          * relation between region being read and real file size at this
1624          * moment:
1625          *
1626          *  (A): the region is completely inside of the file;
1627          *
1628          *  (B-x): x bytes of region are inside of the file, the rest is
1629          *  outside;
1630          *
1631          *  (C): the region is completely outside of the file.
1632          *
1633          * This classification is stable under DLM lock acquired by
1634          * ll_tree_lock() above, because to change class, other client has to
1635          * take DLM lock conflicting with our lock. Also, any updates to
1636          * ->i_size by other threads on this client are serialized by
1637          * ll_inode_size_lock(). This guarantees that short reads are handled
1638          * correctly in the face of concurrent writes and truncates.
1639          */
1640         inode_init_lvb(inode, &lvb);
1641         obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1642         kms = lvb.lvb_size;
1643         if (*ppos + count - 1 > kms) {
1644                 /* A glimpse is necessary to determine whether we return a
1645                  * short read (B) or some zeroes at the end of the buffer (C) */
1646                 ll_inode_size_unlock(inode, 1);
1647                 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1648                 if (retval)
1649                         goto out;
1650         } else {
1651                 /* region is within kms and, hence, within real file size (A) */
1652                 i_size_write(inode, kms);
1653                 ll_inode_size_unlock(inode, 1);
1654         }
1655
1656         CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1657                inode->i_ino, count, *ppos, i_size_read(inode));
1658
1659         bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1660         bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1661         ll_ra_read_in(in_file, &bead);
1662         /* BUG: 5972 */
1663         file_accessed(in_file);
1664         retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1665         ll_ra_read_ex(in_file, &bead);
1666
1667  out:
1668         ll_tree_unlock(&tree);
1669         RETURN(retval);
1670 }
1671 #endif
1672
1673 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1674                                unsigned long arg)
1675 {
1676         struct ll_inode_info *lli = ll_i2info(inode);
1677         struct obd_export *exp = ll_i2dtexp(inode);
1678         struct ll_recreate_obj ucreatp;
1679         struct obd_trans_info oti = { 0 };
1680         struct obdo *oa = NULL;
1681         int lsm_size;
1682         int rc = 0;
1683         struct lov_stripe_md *lsm, *lsm2;
1684         ENTRY;
1685
1686         if (!capable (CAP_SYS_ADMIN))
1687                 RETURN(-EPERM);
1688
1689         rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1690                             sizeof(struct ll_recreate_obj));
1691         if (rc) {
1692                 RETURN(-EFAULT);
1693         }
1694         OBDO_ALLOC(oa);
1695         if (oa == NULL)
1696                 RETURN(-ENOMEM);
1697
1698         down(&lli->lli_size_sem);
1699         lsm = lli->lli_smd;
1700         if (lsm == NULL)
1701                 GOTO(out, rc = -ENOENT);
1702         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1703                    (lsm->lsm_stripe_count));
1704
1705         OBD_ALLOC(lsm2, lsm_size);
1706         if (lsm2 == NULL)
1707                 GOTO(out, rc = -ENOMEM);
1708
1709         oa->o_id = ucreatp.lrc_id;
1710         oa->o_gr = ucreatp.lrc_group;
1711         oa->o_nlink = ucreatp.lrc_ost_idx;
1712         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1713         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1714         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1715                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1716
1717         oti.oti_objid = NULL;
1718         memcpy(lsm2, lsm, lsm_size);
1719         rc = obd_create(exp, oa, &lsm2, &oti);
1720
1721         OBD_FREE(lsm2, lsm_size);
1722         GOTO(out, rc);
1723 out:
1724         up(&lli->lli_size_sem);
1725         OBDO_FREE(oa);
1726         return rc;
1727 }
1728
1729 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1730                              int flags, struct lov_user_md *lum, int lum_size)
1731 {
1732         struct ll_inode_info *lli = ll_i2info(inode);
1733         struct lov_stripe_md *lsm;
1734         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1735         int rc = 0;
1736         ENTRY;
1737
1738         down(&lli->lli_size_sem);
1739         lsm = lli->lli_smd;
1740         if (lsm) {
1741                 up(&lli->lli_size_sem);
1742                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1743                        inode->i_ino);
1744                 RETURN(-EEXIST);
1745         }
1746
1747         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1748         if (rc)
1749                 GOTO(out, rc);
1750         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1751                 GOTO(out_req_free, rc = -ENOENT);
1752         rc = oit.d.lustre.it_status;
1753         if (rc < 0)
1754                 GOTO(out_req_free, rc);
1755
1756         ll_release_openhandle(file->f_dentry, &oit);
1757
1758  out:
1759         up(&lli->lli_size_sem);
1760         ll_intent_release(&oit);
1761         RETURN(rc);
1762 out_req_free:
1763         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1764         goto out;
1765 }
1766
1767 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1768                              struct lov_mds_md **lmmp, int *lmm_size,
1769                              struct ptlrpc_request **request)
1770 {
1771         struct ll_sb_info *sbi = ll_i2sbi(inode);
1772         struct mdt_body  *body;
1773         struct lov_mds_md *lmm = NULL;
1774         struct ptlrpc_request *req = NULL;
1775         struct obd_capa *oc;
1776         int rc, lmmsize;
1777
1778         rc = ll_get_max_mdsize(sbi, &lmmsize);
1779         if (rc)
1780                 RETURN(rc);
1781
1782         oc = ll_mdscapa_get(inode);
1783         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1784                              oc, filename, strlen(filename) + 1,
1785                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1786         capa_put(oc);
1787         if (rc < 0) {
1788                 CDEBUG(D_INFO, "md_getattr_name failed "
1789                        "on %s: rc %d\n", filename, rc);
1790                 GOTO(out, rc);
1791         }
1792
1793         body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1794         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1795         /* swabbed by mdc_getattr_name */
1796         LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1797
1798         lmmsize = body->eadatasize;
1799
1800         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1801                         lmmsize == 0) {
1802                 GOTO(out, rc = -ENODATA);
1803         }
1804
1805         lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1806         LASSERT(lmm != NULL);
1807         LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1808
1809         /*
1810          * This is coming from the MDS, so is probably in
1811          * little endian.  We convert it to host endian before
1812          * passing it to userspace.
1813          */
1814         if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1815                 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1816                 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1817         } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1818                 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1819         }
1820
1821         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1822                 struct lov_stripe_md *lsm;
1823                 struct lov_user_md_join *lmj;
1824                 int lmj_size, i, aindex = 0;
1825
1826                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1827                 if (rc < 0)
1828                         GOTO(out, rc = -ENOMEM);
1829                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1830                 if (rc)
1831                         GOTO(out_free_memmd, rc);
1832
1833                 lmj_size = sizeof(struct lov_user_md_join) +
1834                            lsm->lsm_stripe_count *
1835                            sizeof(struct lov_user_ost_data_join);
1836                 OBD_ALLOC(lmj, lmj_size);
1837                 if (!lmj)
1838                         GOTO(out_free_memmd, rc = -ENOMEM);
1839
1840                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1841                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1842                         struct lov_extent *lex =
1843                                 &lsm->lsm_array->lai_ext_array[aindex];
1844
1845                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1846                                 aindex ++;
1847                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1848                                         LPU64" len %d\n", aindex, i,
1849                                         lex->le_start, (int)lex->le_len);
1850                         lmj->lmm_objects[i].l_extent_start =
1851                                 lex->le_start;
1852
1853                         if ((int)lex->le_len == -1)
1854                                 lmj->lmm_objects[i].l_extent_end = -1;
1855                         else
1856                                 lmj->lmm_objects[i].l_extent_end =
1857                                         lex->le_start + lex->le_len;
1858                         lmj->lmm_objects[i].l_object_id =
1859                                 lsm->lsm_oinfo[i]->loi_id;
1860                         lmj->lmm_objects[i].l_object_gr =
1861                                 lsm->lsm_oinfo[i]->loi_gr;
1862                         lmj->lmm_objects[i].l_ost_gen =
1863                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1864                         lmj->lmm_objects[i].l_ost_idx =
1865                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1866                 }
1867                 lmm = (struct lov_mds_md *)lmj;
1868                 lmmsize = lmj_size;
1869 out_free_memmd:
1870                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1871         }
1872 out:
1873         *lmmp = lmm;
1874         *lmm_size = lmmsize;
1875         *request = req;
1876         return rc;
1877 }
1878
1879 static int ll_lov_setea(struct inode *inode, struct file *file,
1880                             unsigned long arg)
1881 {
1882         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1883         struct lov_user_md  *lump;
1884         int lum_size = sizeof(struct lov_user_md) +
1885                        sizeof(struct lov_user_ost_data);
1886         int rc;
1887         ENTRY;
1888
1889         if (!capable (CAP_SYS_ADMIN))
1890                 RETURN(-EPERM);
1891
1892         OBD_ALLOC(lump, lum_size);
1893         if (lump == NULL) {
1894                 RETURN(-ENOMEM);
1895         }
1896         rc = copy_from_user(lump, (struct lov_user_md  *)arg, lum_size);
1897         if (rc) {
1898                 OBD_FREE(lump, lum_size);
1899                 RETURN(-EFAULT);
1900         }
1901
1902         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1903
1904         OBD_FREE(lump, lum_size);
1905         RETURN(rc);
1906 }
1907
1908 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1909                             unsigned long arg)
1910 {
1911         struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1912         int rc;
1913         int flags = FMODE_WRITE;
1914         ENTRY;
1915
1916         /* Bug 1152: copy properly when this is no longer true */
1917         LASSERT(sizeof(lum) == sizeof(*lump));
1918         LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1919         rc = copy_from_user(&lum, lump, sizeof(lum));
1920         if (rc)
1921                 RETURN(-EFAULT);
1922
1923         rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1924         if (rc == 0) {
1925                  put_user(0, &lump->lmm_stripe_count);
1926                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1927                                     0, ll_i2info(inode)->lli_smd, lump);
1928         }
1929         RETURN(rc);
1930 }
1931
1932 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1933 {
1934         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1935
1936         if (!lsm)
1937                 RETURN(-ENODATA);
1938
1939         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1940                             (void *)arg);
1941 }
1942
1943 static int ll_get_grouplock(struct inode *inode, struct file *file,
1944                             unsigned long arg)
1945 {
1946         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1947         ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1948                                                     .end = OBD_OBJECT_EOF}};
1949         struct lustre_handle lockh = { 0 };
1950         struct ll_inode_info *lli = ll_i2info(inode);
1951         struct lov_stripe_md *lsm = lli->lli_smd;
1952         int flags = 0, rc;
1953         ENTRY;
1954
1955         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1956                 RETURN(-EINVAL);
1957         }
1958
1959         policy.l_extent.gid = arg;
1960         if (file->f_flags & O_NONBLOCK)
1961                 flags = LDLM_FL_BLOCK_NOWAIT;
1962
1963         rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1964         if (rc)
1965                 RETURN(rc);
1966
1967         fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1968         fd->fd_gid = arg;
1969         memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
1970
1971         RETURN(0);
1972 }
1973
1974 static int ll_put_grouplock(struct inode *inode, struct file *file,
1975                             unsigned long arg)
1976 {
1977         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1978         struct ll_inode_info *lli = ll_i2info(inode);
1979         struct lov_stripe_md *lsm = lli->lli_smd;
1980         int rc;
1981         ENTRY;
1982
1983         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1984                 /* Ugh, it's already unlocked. */
1985                 RETURN(-EINVAL);
1986         }
1987
1988         if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1989                 RETURN(-EINVAL);
1990
1991         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1992
1993         rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1994         if (rc)
1995                 RETURN(rc);
1996
1997         fd->fd_gid = 0;
1998         memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
1999
2000         RETURN(0);
2001 }
2002
2003 static int join_sanity_check(struct inode *head, struct inode *tail)
2004 {
2005         ENTRY;
2006         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2007                 CERROR("server do not support join \n");
2008                 RETURN(-EINVAL);
2009         }
2010         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2011                 CERROR("tail ino %lu and ino head %lu must be regular\n",
2012                        head->i_ino, tail->i_ino);
2013                 RETURN(-EINVAL);
2014         }
2015         if (head->i_ino == tail->i_ino) {
2016                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2017                 RETURN(-EINVAL);
2018         }
2019         if (i_size_read(head) % JOIN_FILE_ALIGN) {
2020                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
2021                 RETURN(-EINVAL);
2022         }
2023         RETURN(0);
2024 }
2025
2026 static int join_file(struct inode *head_inode, struct file *head_filp,
2027                      struct file *tail_filp)
2028 {
2029         struct dentry *tail_dentry = tail_filp->f_dentry;
2030         struct lookup_intent oit = {.it_op = IT_OPEN,
2031                                    .it_flags = head_filp->f_flags|O_JOIN_FILE};
2032         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2033                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2034
2035         struct lustre_handle lockh;
2036         struct md_op_data *op_data;
2037         int    rc;
2038         loff_t data;
2039         ENTRY;
2040
2041         tail_dentry = tail_filp->f_dentry;
2042
2043         data = i_size_read(head_inode);
2044         op_data = ll_prep_md_op_data(NULL, head_inode,
2045                                      tail_dentry->d_parent->d_inode,
2046                                      tail_dentry->d_name.name,
2047                                      tail_dentry->d_name.len, 0,
2048                                      LUSTRE_OPC_ANY, &data);
2049         if (IS_ERR(op_data))
2050                 RETURN(PTR_ERR(op_data));
2051
2052         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2053                          op_data, &lockh, NULL, 0, 0);
2054
2055         ll_finish_md_op_data(op_data);
2056         if (rc < 0)
2057                 GOTO(out, rc);
2058
2059         rc = oit.d.lustre.it_status;
2060
2061         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2062                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2063                 ptlrpc_req_finished((struct ptlrpc_request *)
2064                                     oit.d.lustre.it_data);
2065                 GOTO(out, rc);
2066         }
2067
2068         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2069                                            * away */
2070                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2071                 oit.d.lustre.it_lock_mode = 0;
2072         }
2073         ll_release_openhandle(head_filp->f_dentry, &oit);
2074 out:
2075         ll_intent_release(&oit);
2076         RETURN(rc);
2077 }
2078
2079 static int ll_file_join(struct inode *head, struct file *filp,
2080                         char *filename_tail)
2081 {
2082         struct inode *tail = NULL, *first = NULL, *second = NULL;
2083         struct dentry *tail_dentry;
2084         struct file *tail_filp, *first_filp, *second_filp;
2085         struct ll_lock_tree first_tree, second_tree;
2086         struct ll_lock_tree_node *first_node, *second_node;
2087         struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2088         int rc = 0, cleanup_phase = 0;
2089         ENTRY;
2090
2091         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2092                head->i_ino, head->i_generation, head, filename_tail);
2093
2094         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2095         if (IS_ERR(tail_filp)) {
2096                 CERROR("Can not open tail file %s", filename_tail);
2097                 rc = PTR_ERR(tail_filp);
2098                 GOTO(cleanup, rc);
2099         }
2100         tail = igrab(tail_filp->f_dentry->d_inode);
2101
2102         tlli = ll_i2info(tail);
2103         tail_dentry = tail_filp->f_dentry;
2104         LASSERT(tail_dentry);
2105         cleanup_phase = 1;
2106
2107         /*reorder the inode for lock sequence*/
2108         first = head->i_ino > tail->i_ino ? head : tail;
2109         second = head->i_ino > tail->i_ino ? tail : head;
2110         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2111         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2112
2113         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2114                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2115         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2116         if (IS_ERR(first_node)){
2117                 rc = PTR_ERR(first_node);
2118                 GOTO(cleanup, rc);
2119         }
2120         first_tree.lt_fd = first_filp->private_data;
2121         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2122         if (rc != 0)
2123                 GOTO(cleanup, rc);
2124         cleanup_phase = 2;
2125
2126         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2127         if (IS_ERR(second_node)){
2128                 rc = PTR_ERR(second_node);
2129                 GOTO(cleanup, rc);
2130         }
2131         second_tree.lt_fd = second_filp->private_data;
2132         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2133         if (rc != 0)
2134                 GOTO(cleanup, rc);
2135         cleanup_phase = 3;
2136
2137         rc = join_sanity_check(head, tail);
2138         if (rc)
2139                 GOTO(cleanup, rc);
2140
2141         rc = join_file(head, filp, tail_filp);
2142         if (rc)
2143                 GOTO(cleanup, rc);
2144 cleanup:
2145         switch (cleanup_phase) {
2146         case 3:
2147                 ll_tree_unlock(&second_tree);
2148                 obd_cancel_unused(ll_i2dtexp(second),
2149                                   ll_i2info(second)->lli_smd, 0, NULL);
2150         case 2:
2151                 ll_tree_unlock(&first_tree);
2152                 obd_cancel_unused(ll_i2dtexp(first),
2153                                   ll_i2info(first)->lli_smd, 0, NULL);
2154         case 1:
2155                 filp_close(tail_filp, 0);
2156                 if (tail)
2157                         iput(tail);
2158                 if (head && rc == 0) {
2159                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2160                                        &hlli->lli_smd);
2161                         hlli->lli_smd = NULL;
2162                 }
2163         case 0:
2164                 break;
2165         default:
2166                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2167                 LBUG();
2168         }
2169         RETURN(rc);
2170 }
2171
2172 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2173 {
2174         struct inode *inode = dentry->d_inode;
2175         struct obd_client_handle *och;
2176         int rc;
2177         ENTRY;
2178
2179         LASSERT(inode);
2180
2181         /* Root ? Do nothing. */
2182         if (dentry->d_inode->i_sb->s_root == dentry)
2183                 RETURN(0);
2184
2185         /* No open handle to close? Move away */
2186         if (!it_disposition(it, DISP_OPEN_OPEN))
2187                 RETURN(0);
2188
2189         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2190
2191         OBD_ALLOC(och, sizeof(*och));
2192         if (!och)
2193                 GOTO(out, rc = -ENOMEM);
2194
2195         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2196                     ll_i2info(inode), it, och);
2197
2198         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2199                                        inode, och);
2200  out:
2201         /* this one is in place of ll_file_open */
2202         ptlrpc_req_finished(it->d.lustre.it_data);
2203         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2204         RETURN(rc);
2205 }
2206
2207 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2208                   unsigned long arg)
2209 {
2210         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2211         int flags;
2212         ENTRY;
2213
2214         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2215                inode->i_generation, inode, cmd);
2216         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2217
2218         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2219         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2220                 RETURN(-ENOTTY);
2221
2222         switch(cmd) {
2223         case LL_IOC_GETFLAGS:
2224                 /* Get the current value of the file flags */
2225                 return put_user(fd->fd_flags, (int *)arg);
2226         case LL_IOC_SETFLAGS:
2227         case LL_IOC_CLRFLAGS:
2228                 /* Set or clear specific file flags */
2229                 /* XXX This probably needs checks to ensure the flags are
2230                  *     not abused, and to handle any flag side effects.
2231                  */
2232                 if (get_user(flags, (int *) arg))
2233                         RETURN(-EFAULT);
2234
2235                 if (cmd == LL_IOC_SETFLAGS) {
2236                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2237                             !(file->f_flags & O_DIRECT)) {
2238                                 CERROR("%s: unable to disable locking on "
2239                                        "non-O_DIRECT file\n", current->comm);
2240                                 RETURN(-EINVAL);
2241                         }
2242
2243                         fd->fd_flags |= flags;
2244                 } else {
2245                         fd->fd_flags &= ~flags;
2246                 }
2247                 RETURN(0);
2248         case LL_IOC_LOV_SETSTRIPE:
2249                 RETURN(ll_lov_setstripe(inode, file, arg));
2250         case LL_IOC_LOV_SETEA:
2251                 RETURN(ll_lov_setea(inode, file, arg));
2252         case LL_IOC_LOV_GETSTRIPE:
2253                 RETURN(ll_lov_getstripe(inode, arg));
2254         case LL_IOC_RECREATE_OBJ:
2255                 RETURN(ll_lov_recreate_obj(inode, file, arg));
2256         case EXT3_IOC_GETFLAGS:
2257         case EXT3_IOC_SETFLAGS:
2258                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2259         case EXT3_IOC_GETVERSION_OLD:
2260         case EXT3_IOC_GETVERSION:
2261                 RETURN(put_user(inode->i_generation, (int *)arg));
2262         case LL_IOC_JOIN: {
2263                 char *ftail;
2264                 int rc;
2265
2266                 ftail = getname((const char *)arg);
2267                 if (IS_ERR(ftail))
2268                         RETURN(PTR_ERR(ftail));
2269                 rc = ll_file_join(inode, file, ftail);
2270                 putname(ftail);
2271                 RETURN(rc);
2272         }
2273         case LL_IOC_GROUP_LOCK:
2274                 RETURN(ll_get_grouplock(inode, file, arg));
2275         case LL_IOC_GROUP_UNLOCK:
2276                 RETURN(ll_put_grouplock(inode, file, arg));
2277         case IOC_OBD_STATFS:
2278                 RETURN(ll_obd_statfs(inode, (void *)arg));
2279
2280         /* We need to special case any other ioctls we want to handle,
2281          * to send them to the MDS/OST as appropriate and to properly
2282          * network encode the arg field.
2283         case EXT3_IOC_SETVERSION_OLD:
2284         case EXT3_IOC_SETVERSION:
2285         */
2286         case LL_IOC_FLUSHCTX:
2287                 RETURN(ll_flush_ctx(inode));
2288         case LL_IOC_GETFACL: {
2289                 struct rmtacl_ioctl_data ioc;
2290
2291                 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2292                         RETURN(-EFAULT);
2293
2294                 RETURN(ll_ioctl_getfacl(inode, &ioc));
2295         }
2296         case LL_IOC_SETFACL: {
2297                 struct rmtacl_ioctl_data ioc;
2298
2299                 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2300                         RETURN(-EFAULT);
2301
2302                 RETURN(ll_ioctl_setfacl(inode, &ioc));
2303         }
2304         default:
2305                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2306                                      (void *)arg));
2307         }
2308 }
2309
2310 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2311 {
2312         struct inode *inode = file->f_dentry->d_inode;
2313         struct ll_inode_info *lli = ll_i2info(inode);
2314         struct lov_stripe_md *lsm = lli->lli_smd;
2315         loff_t retval;
2316         ENTRY;
2317         retval = offset + ((origin == 2) ? i_size_read(inode) :
2318                            (origin == 1) ? file->f_pos : 0);
2319         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2320                inode->i_ino, inode->i_generation, inode, retval, retval,
2321                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2322         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2323
2324         if (origin == 2) { /* SEEK_END */
2325                 int nonblock = 0, rc;
2326
2327                 if (file->f_flags & O_NONBLOCK)
2328                         nonblock = LDLM_FL_BLOCK_NOWAIT;
2329
2330                 if (lsm != NULL) {
2331                         rc = ll_glimpse_size(inode, nonblock);
2332                         if (rc != 0)
2333                                 RETURN(rc);
2334                 }
2335
2336                 ll_inode_size_lock(inode, 0);
2337                 offset += i_size_read(inode);
2338                 ll_inode_size_unlock(inode, 0);
2339         } else if (origin == 1) { /* SEEK_CUR */
2340                 offset += file->f_pos;
2341         }
2342
2343         retval = -EINVAL;
2344         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2345                 if (offset != file->f_pos) {
2346                         file->f_pos = offset;
2347 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2348                         file->f_reada = 0;
2349                         file->f_version = ++event;
2350 #endif
2351                 }
2352                 retval = offset;
2353         }
2354
2355         RETURN(retval);
2356 }
2357
2358 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2359 {
2360         struct inode *inode = dentry->d_inode;
2361         struct ll_inode_info *lli = ll_i2info(inode);
2362         struct lov_stripe_md *lsm = lli->lli_smd;
2363         struct ptlrpc_request *req;
2364         struct obd_capa *oc;
2365         int rc, err;
2366         ENTRY;
2367         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2368                inode->i_generation, inode);
2369         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2370
2371         /* fsync's caller has already called _fdata{sync,write}, we want
2372          * that IO to finish before calling the osc and mdc sync methods */
2373         rc = filemap_fdatawait(inode->i_mapping);
2374
2375         /* catch async errors that were recorded back when async writeback
2376          * failed for pages in this mapping. */
2377         err = lli->lli_async_rc;
2378         lli->lli_async_rc = 0;
2379         if (rc == 0)
2380                 rc = err;
2381         if (lsm) {
2382                 err = lov_test_and_clear_async_rc(lsm);
2383                 if (rc == 0)
2384                         rc = err;
2385         }
2386
2387         oc = ll_mdscapa_get(inode);
2388         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2389                       &req);
2390         capa_put(oc);
2391         if (!rc)
2392                 rc = err;
2393         if (!err)
2394                 ptlrpc_req_finished(req);
2395
2396         if (data && lsm) {
2397                 struct obdo *oa;
2398
2399                 OBDO_ALLOC(oa);
2400                 if (!oa)
2401                         RETURN(rc ? rc : -ENOMEM);
2402
2403                 oa->o_id = lsm->lsm_object_id;
2404                 oa->o_gr = lsm->lsm_object_gr;
2405                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2406                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2407                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2408                                            OBD_MD_FLGROUP);
2409
2410                 oc = ll_osscapa_get(inode, 0, CAPA_OPC_OSS_WRITE);
2411                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2412                                0, OBD_OBJECT_EOF, oc);
2413                 capa_put(oc);
2414                 if (!rc)
2415                         rc = err;
2416                 OBDO_FREE(oa);
2417         }
2418
2419         RETURN(rc);
2420 }
2421
2422 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2423 {
2424         struct inode *inode = file->f_dentry->d_inode;
2425         struct ll_sb_info *sbi = ll_i2sbi(inode);
2426         struct ldlm_res_id res_id =
2427                 { .name = { fid_seq(ll_inode2fid(inode)),
2428                             fid_oid(ll_inode2fid(inode)),
2429                             fid_ver(ll_inode2fid(inode)),
2430                             LDLM_FLOCK} };
2431         struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2432                 ldlm_flock_completion_ast, NULL, file_lock };
2433         struct lustre_handle lockh = {0};
2434         ldlm_policy_data_t flock;
2435         int flags = 0;
2436         int rc;
2437         ENTRY;
2438
2439         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2440                inode->i_ino, file_lock);
2441
2442         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2443
2444         if (file_lock->fl_flags & FL_FLOCK) {
2445                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2446                 /* set missing params for flock() calls */
2447                 file_lock->fl_end = OFFSET_MAX;
2448                 file_lock->fl_pid = current->tgid;
2449         }
2450         flock.l_flock.pid = file_lock->fl_pid;
2451         flock.l_flock.start = file_lock->fl_start;
2452         flock.l_flock.end = file_lock->fl_end;
2453
2454         switch (file_lock->fl_type) {
2455         case F_RDLCK:
2456                 einfo.ei_mode = LCK_PR;
2457                 break;
2458         case F_UNLCK:
2459                 /* An unlock request may or may not have any relation to
2460                  * existing locks so we may not be able to pass a lock handle
2461                  * via a normal ldlm_lock_cancel() request. The request may even
2462                  * unlock a byte range in the middle of an existing lock. In
2463                  * order to process an unlock request we need all of the same
2464                  * information that is given with a normal read or write record
2465                  * lock request. To avoid creating another ldlm unlock (cancel)
2466                  * message we'll treat a LCK_NL flock request as an unlock. */
2467                 einfo.ei_mode = LCK_NL;
2468                 break;
2469         case F_WRLCK:
2470                 einfo.ei_mode = LCK_PW;
2471                 break;
2472         default:
2473                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2474                 LBUG();
2475         }
2476
2477         switch (cmd) {
2478         case F_SETLKW:
2479 #ifdef F_SETLKW64
2480         case F_SETLKW64:
2481 #endif
2482                 flags = 0;
2483                 break;
2484         case F_SETLK:
2485 #ifdef F_SETLK64
2486         case F_SETLK64:
2487 #endif
2488                 flags = LDLM_FL_BLOCK_NOWAIT;
2489                 break;
2490         case F_GETLK:
2491 #ifdef F_GETLK64
2492         case F_GETLK64:
2493 #endif
2494                 flags = LDLM_FL_TEST_LOCK;
2495                 /* Save the old mode so that if the mode in the lock changes we
2496                  * can decrement the appropriate reader or writer refcount. */
2497                 file_lock->fl_type = einfo.ei_mode;
2498                 break;
2499         default:
2500                 CERROR("unknown fcntl lock command: %d\n", cmd);
2501                 LBUG();
2502         }
2503
2504         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2505                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2506                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2507
2508         rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2509                               &flock, &flags, NULL, 0, NULL, &lockh, 0);
2510         if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2511                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2512 #ifdef HAVE_F_OP_FLOCK
2513         if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2514             !(flags & LDLM_FL_TEST_LOCK))
2515                 posix_lock_file_wait(file, file_lock);
2516 #endif
2517
2518         RETURN(rc);
2519 }
2520
2521 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2522 {
2523         ENTRY;
2524
2525         RETURN(-ENOSYS);
2526 }
2527
2528 int ll_have_md_lock(struct inode *inode, __u64 bits)
2529 {
2530         struct lustre_handle lockh;
2531         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2532         struct lu_fid *fid;
2533         int flags;
2534         ENTRY;
2535
2536         if (!inode)
2537                RETURN(0);
2538
2539         fid = &ll_i2info(inode)->lli_fid;
2540         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2541
2542         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2543         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2544                           LCK_CR|LCK_CW|LCK_PR, &lockh)) {
2545                 RETURN(1);
2546         }
2547
2548         RETURN(0);
2549 }
2550
2551 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2552         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2553                               * and return success */
2554                 inode->i_nlink = 0;
2555                 /* This path cannot be hit for regular files unless in
2556                  * case of obscure races, so no need to to validate
2557                  * size. */
2558                 if (!S_ISREG(inode->i_mode) &&
2559                     !S_ISDIR(inode->i_mode))
2560                         return 0;
2561         }
2562
2563         if (rc) {
2564                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2565                 return -abs(rc);
2566
2567         }
2568
2569         return 0;
2570 }
2571
2572 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2573 {
2574         struct inode *inode = dentry->d_inode;
2575         struct ptlrpc_request *req = NULL;
2576         struct ll_sb_info *sbi;
2577         struct obd_export *exp;
2578         int rc;
2579         ENTRY;
2580
2581         if (!inode) {
2582                 CERROR("REPORT THIS LINE TO PETER\n");
2583                 RETURN(0);
2584         }
2585         sbi = ll_i2sbi(inode);
2586
2587         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2588                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2589 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2590         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2591 #endif
2592
2593         exp = ll_i2mdexp(inode);
2594
2595         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2596                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2597                 struct md_op_data *op_data;
2598
2599                 /* Call getattr by fid, so do not provide name at all. */
2600                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2601                                              dentry->d_inode, NULL, 0, 0,
2602                                              LUSTRE_OPC_ANY, NULL);
2603                 if (IS_ERR(op_data))
2604                         RETURN(PTR_ERR(op_data));
2605
2606                 oit.it_flags |= O_CHECK_STALE;
2607                 rc = md_intent_lock(exp, op_data, NULL, 0,
2608                                     /* we are not interested in name
2609                                        based lookup */
2610                                     &oit, 0, &req,
2611                                     ll_md_blocking_ast, 0);
2612                 ll_finish_md_op_data(op_data);
2613                 oit.it_flags &= ~O_CHECK_STALE;
2614                 if (rc < 0) {
2615                         rc = ll_inode_revalidate_fini(inode, rc);
2616                         GOTO (out, rc);
2617                 }
2618
2619                 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2620                 if (rc != 0) {
2621                         ll_intent_release(&oit);
2622                         GOTO(out, rc);
2623                 }
2624
2625                 /* Unlinked? Unhash dentry, so it is not picked up later by
2626                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2627                    here to preserve get_cwd functionality on 2.6.
2628                    Bug 10503 */
2629                 if (!dentry->d_inode->i_nlink) {
2630                         spin_lock(&dcache_lock);
2631                         ll_drop_dentry(dentry);
2632                         spin_unlock(&dcache_lock);
2633                 }
2634
2635                 ll_lookup_finish_locks(&oit, dentry);
2636         } else if (!ll_have_md_lock(dentry->d_inode,
2637                                     MDS_INODELOCK_UPDATE)) {
2638                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2639                 obd_valid valid = OBD_MD_FLGETATTR;
2640                 struct obd_capa *oc;
2641                 int ealen = 0;
2642
2643                 if (S_ISREG(inode->i_mode)) {
2644                         rc = ll_get_max_mdsize(sbi, &ealen);
2645                         if (rc)
2646                                 RETURN(rc);
2647                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2648                 }
2649                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2650                  * capa for this inode. Because we only keep capas of dirs
2651                  * fresh. */
2652                 oc = ll_mdscapa_get(inode);
2653                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2654                                 ealen, &req);
2655                 capa_put(oc);
2656                 if (rc) {
2657                         rc = ll_inode_revalidate_fini(inode, rc);
2658                         RETURN(rc);
2659                 }
2660
2661                 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2662                                    NULL);
2663                 if (rc)
2664                         GOTO(out, rc);
2665         }
2666
2667         /* if object not yet allocated, don't validate size */
2668         if (ll_i2info(inode)->lli_smd == NULL)
2669                 GOTO(out, rc = 0);
2670
2671         /* ll_glimpse_size will prefer locally cached writes if they extend
2672          * the file */
2673         rc = ll_glimpse_size(inode, 0);
2674         EXIT;
2675 out:
2676         ptlrpc_req_finished(req);
2677         return rc;
2678 }
2679
2680 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2681 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2682                   struct lookup_intent *it, struct kstat *stat)
2683 {
2684         struct inode *inode = de->d_inode;
2685         int res = 0;
2686
2687         res = ll_inode_revalidate_it(de, it);
2688         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2689
2690         if (res)
2691                 return res;
2692
2693         stat->dev = inode->i_sb->s_dev;
2694         stat->ino = inode->i_ino;
2695         stat->mode = inode->i_mode;
2696         stat->nlink = inode->i_nlink;
2697         stat->uid = inode->i_uid;
2698         stat->gid = inode->i_gid;
2699         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2700         stat->atime = inode->i_atime;
2701         stat->mtime = inode->i_mtime;
2702         stat->ctime = inode->i_ctime;
2703 #ifdef HAVE_INODE_BLKSIZE
2704         stat->blksize = inode->i_blksize;
2705 #else
2706         stat->blksize = 1 << inode->i_blkbits;
2707 #endif
2708
2709         ll_inode_size_lock(inode, 0);
2710         stat->size = i_size_read(inode);
2711         stat->blocks = inode->i_blocks;
2712         ll_inode_size_unlock(inode, 0);
2713
2714         return 0;
2715 }
2716 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2717 {
2718         struct lookup_intent it = { .it_op = IT_GETATTR };
2719
2720         return ll_getattr_it(mnt, de, &it, stat);
2721 }
2722 #endif
2723
2724 static
2725 int lustre_check_acl(struct inode *inode, int mask)
2726 {
2727 #ifdef CONFIG_FS_POSIX_ACL
2728         struct ll_inode_info *lli = ll_i2info(inode);
2729         struct posix_acl *acl;
2730         int rc;
2731         ENTRY;
2732
2733         spin_lock(&lli->lli_lock);
2734         acl = posix_acl_dup(lli->lli_posix_acl);
2735         spin_unlock(&lli->lli_lock);
2736
2737         if (!acl)
2738                 RETURN(-EAGAIN);
2739
2740         rc = posix_acl_permission(inode, acl, mask);
2741         posix_acl_release(acl);
2742
2743         RETURN(rc);
2744 #else
2745         return -EAGAIN;
2746 #endif
2747 }
2748
2749 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2750 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2751 {
2752         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2753                inode->i_ino, inode->i_generation, inode, mask);
2754         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2755                 return lustre_check_remote_perm(inode, mask);
2756
2757         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2758         return generic_permission(inode, mask, lustre_check_acl);
2759 }
2760 #else
2761 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
2762 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2763 #else
2764 int ll_inode_permission(struct inode *inode, int mask)
2765 #endif
2766 {
2767         int mode = inode->i_mode;
2768         int rc;
2769
2770         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2771                inode->i_ino, inode->i_generation, inode, mask);
2772
2773         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2774                 return lustre_check_remote_perm(inode, mask);
2775
2776         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2777
2778         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2779             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2780                 return -EROFS;
2781         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2782                 return -EACCES;
2783         if (current->fsuid == inode->i_uid) {
2784                 mode >>= 6;
2785         } else if (1) {
2786                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2787                         goto check_groups;
2788                 rc = lustre_check_acl(inode, mask);
2789                 if (rc == -EAGAIN)
2790                         goto check_groups;
2791                 if (rc == -EACCES)
2792                         goto check_capabilities;
2793                 return rc;
2794         } else {
2795 check_groups:
2796                 if (in_group_p(inode->i_gid))
2797                         mode >>= 3;
2798         }
2799         if ((mode & mask & S_IRWXO) == mask)
2800                 return 0;
2801
2802 check_capabilities:
2803         if (!(mask & MAY_EXEC) ||
2804             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2805                 if (capable(CAP_DAC_OVERRIDE))
2806                         return 0;
2807
2808         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2809             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2810                 return 0;
2811
2812         return -EACCES;
2813 }
2814 #endif
2815
2816 /* -o localflock - only provides locally consistent flock locks */
2817 struct file_operations ll_file_operations = {
2818         .read           = ll_file_read,
2819         .write          = ll_file_write,
2820         .ioctl          = ll_file_ioctl,
2821         .open           = ll_file_open,
2822         .release        = ll_file_release,
2823         .mmap           = ll_file_mmap,
2824         .llseek         = ll_file_seek,
2825 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2826         .sendfile       = ll_file_sendfile,
2827 #endif
2828         .fsync          = ll_fsync,
2829 };
2830
2831 struct file_operations ll_file_operations_flock = {
2832         .read           = ll_file_read,
2833         .write          = ll_file_write,
2834         .ioctl          = ll_file_ioctl,
2835         .open           = ll_file_open,
2836         .release        = ll_file_release,
2837         .mmap           = ll_file_mmap,
2838         .llseek         = ll_file_seek,
2839 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2840         .sendfile       = ll_file_sendfile,
2841 #endif
2842         .fsync          = ll_fsync,
2843 #ifdef HAVE_F_OP_FLOCK
2844         .flock          = ll_file_flock,
2845 #endif
2846         .lock           = ll_file_flock
2847 };
2848
2849 /* These are for -o noflock - to return ENOSYS on flock calls */
2850 struct file_operations ll_file_operations_noflock = {
2851         .read           = ll_file_read,
2852         .write          = ll_file_write,
2853         .ioctl          = ll_file_ioctl,
2854         .open           = ll_file_open,
2855         .release        = ll_file_release,
2856         .mmap           = ll_file_mmap,
2857         .llseek         = ll_file_seek,
2858 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2859         .sendfile       = ll_file_sendfile,
2860 #endif
2861         .fsync          = ll_fsync,
2862 #ifdef HAVE_F_OP_FLOCK
2863         .flock          = ll_file_noflock,
2864 #endif
2865         .lock           = ll_file_noflock
2866 };
2867
2868 struct inode_operations ll_file_inode_operations = {
2869 #ifdef LUSTRE_KERNEL_VERSION
2870         .setattr_raw    = ll_setattr_raw,
2871 #endif
2872         .setattr        = ll_setattr,
2873         .truncate       = ll_truncate,
2874 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2875         .getattr        = ll_getattr,
2876 #else
2877         .revalidate_it  = ll_inode_revalidate_it,
2878 #endif
2879         .permission     = ll_inode_permission,
2880         .setxattr       = ll_setxattr,
2881         .getxattr       = ll_getxattr,
2882         .listxattr      = ll_listxattr,
2883         .removexattr    = ll_removexattr,
2884 };
2885