1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
34 #include "llite_internal.h"
36 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the ll_file_data_slab cache.
 * Also used by llite/special.c:ll_special_open().
 * NOTE(review): this listing is elided — the opening brace, error check and
 * return of @fd are not visible here; do not assume the text is complete. */
37 struct ll_file_data *ll_file_data_get(void)
39 struct ll_file_data *fd;
41 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release a ll_file_data previously obtained from ll_file_data_get()
 * back to the slab cache. (Listing elided: braces/NULL-guard not visible.) */
45 static void ll_file_data_put(struct ll_file_data *fd)
48 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Copy the inode's current attributes (mode, times, size, blocks, flags),
 * I/O epoch and the given open file handle @fh into @op_data so they can be
 * sent to the MDS (used on close / DONE_WRITING paths).
 * NOTE(review): ll_mdscapa_get() takes a capability reference — presumably
 * released by the caller via capa_put(); confirm against callers. */
51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
52 struct lustre_handle *fh)
54 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
55 op_data->op_attr.ia_mode = inode->i_mode;
56 op_data->op_attr.ia_atime = inode->i_atime;
57 op_data->op_attr.ia_mtime = inode->i_mtime;
58 op_data->op_attr.ia_ctime = inode->i_ctime;
59 op_data->op_attr.ia_size = inode->i_size;
60 op_data->op_attr_blocks = inode->i_blocks;
/* cast needed because ia_attr_flags lives in the ll_iattr wrapper */
61 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
62 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
63 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
64 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close RPC on @och: select which attributes are
 * valid, decide whether size/blocks are sent (only when the server does not
 * do Size-on-MDS, or for non-regular files), close the I/O epoch, and pack
 * the inode attributes plus the open handle.
 * (Listing elided: braces and intermediate statements are missing.) */
67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
68 struct obd_client_handle *och)
72 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
73 ATTR_MTIME_SET | ATTR_CTIME_SET;
75 if (!(och->och_flags & FMODE_WRITE))
/* no SOM support (or not a regular file): client sends size/blocks itself */
78 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
79 !S_ISREG(inode->i_mode))
80 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* &och: ll_epoch_close() may need to clear the handle pointer */
82 ll_epoch_close(inode, op_data, &och, 0);
85 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send the MDS close RPC for open handle @och, handle the Size-on-MDS
 * update the MDS may request, destroy OST objects if the close reply says
 * the file was unlinked, and finally clear open replay data and poison the
 * handle cookie. Returns 0 or a negative errno from the close path.
 * (Listing elided: several declarations, branches and RETURNs missing.) */
89 static int ll_close_inode_openhandle(struct obd_export *md_exp,
91 struct obd_client_handle *och)
93 struct obd_export *exp = ll_i2mdexp(inode);
94 struct md_op_data *op_data;
95 struct ptlrpc_request *req = NULL;
96 struct obd_device *obd = class_exp2obd(exp);
103 * XXX: in case of LMV, is this correct to access
106 CERROR("Invalid MDC connection handle "LPX64"\n",
107 ll_i2mdexp(inode)->exp_handle.h_cookie);
112 * here we check if this is forced umount. If so this is called on
113 * canceling "open lock" and we do not call md_close() in this case, as
114 * it will not be successful, as import is already deactivated.
119 OBD_ALLOC_PTR(op_data);
121 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
123 ll_prepare_close(inode, op_data, och);
124 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
125 rc = md_close(md_exp, op_data, och, &req);
128 /* This close must have the epoch closed. */
129 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
130 LASSERT(epoch_close);
131 /* MDS has instructed us to obtain Size-on-MDS attribute from
132 * OSTs and send setattr to back to MDS. */
133 rc = ll_sizeonmds_update(inode, &och->och_fh,
134 op_data->op_ioepoch);
136 CERROR("inode %lu mdc Size-on-MDS update failed: "
137 "rc = %d\n", inode->i_ino, rc);
141 CERROR("inode %lu mdc close failed: rc = %d\n",
144 ll_finish_md_op_data(op_data);
/* close reply may carry a list of OST objects to destroy (unlink case) */
147 rc = ll_objects_destroy(req, inode);
149 CERROR("inode %lu ll_objects destroy: rc = %d\n",
153 ptlrpc_req_finished(req); /* This is close request */
/* SOM epoch still open on a written regular file: defer to DONE_WRITING */
157 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
158 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
159 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
/* Drop the MDS open handle of the given kind (write/exec/read, chosen from
 * @flags) for @inode if no other local opens of that kind remain; otherwise
 * do nothing. The handle pointer and its use count are selected under
 * lli_och_sem. (Listing elided: the handle-grab between the two semaphore
 * sections and the RETURNs are not visible.) */
170 int ll_md_real_close(struct inode *inode, int flags)
172 struct ll_inode_info *lli = ll_i2info(inode);
173 struct obd_client_handle **och_p;
174 struct obd_client_handle *och;
179 if (flags & FMODE_WRITE) {
180 och_p = &lli->lli_mds_write_och;
181 och_usecount = &lli->lli_open_fd_write_count;
182 } else if (flags & FMODE_EXEC) {
183 och_p = &lli->lli_mds_exec_och;
184 och_usecount = &lli->lli_open_fd_exec_count;
186 LASSERT(flags & FMODE_READ);
187 och_p = &lli->lli_mds_read_och;
188 och_usecount = &lli->lli_open_fd_read_count;
191 down(&lli->lli_och_sem);
192 if (*och_usecount) { /* There are still users of this handle, so
194 up(&lli->lli_och_sem);
199 up(&lli->lli_och_sem);
201 if (och) { /* There might be a race and somebody have freed this och
203 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the open count
 * for the fd's open mode under lli_och_sem, and — unless a cached OPEN DLM
 * lock lets us skip it — call ll_md_real_close() to close the MDS handle.
 * Finally detach and free the ll_file_data and close capabilities.
 * (Listing elided: lockmode selection and several braces are missing.) */
210 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
213 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
214 struct ll_inode_info *lli = ll_i2info(inode);
218 /* clear group lock, if present */
219 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
220 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
221 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
222 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
226 /* Let's see if we have good enough OPEN lock on the file and if
227 we can skip talking to MDS */
228 if (file->f_dentry->d_inode) { /* Can this ever be false? */
230 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
231 struct lustre_handle lockh;
232 struct inode *inode = file->f_dentry->d_inode;
233 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
235 down(&lli->lli_och_sem);
236 if (fd->fd_omode & FMODE_WRITE) {
238 LASSERT(lli->lli_open_fd_write_count);
239 lli->lli_open_fd_write_count--;
240 } else if (fd->fd_omode & FMODE_EXEC) {
242 LASSERT(lli->lli_open_fd_exec_count);
243 lli->lli_open_fd_exec_count--;
246 LASSERT(lli->lli_open_fd_read_count);
247 lli->lli_open_fd_read_count--;
249 up(&lli->lli_och_sem);
/* no matching cached OPEN lock: must really close the MDS handle */
251 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
252 LDLM_IBITS, &policy, lockmode,
254 rc = ll_md_real_close(file->f_dentry->d_inode,
258 CERROR("Releasing a file %p with negative dentry %p. Name %s",
259 file, file->f_dentry, file->f_dentry->d_name.name);
262 LUSTRE_FPRIVATE(file) = NULL;
263 ll_file_data_put(fd);
264 ll_capa_close(inode);
269 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
271 /* While this returns an error code, fput() the caller does not, so we need
272 * to make every effort to clean up all of our state here. Also, applications
273 * rarely check close errors and even if an error is returned they will not
274 * re-try the close call.
/* VFS ->release() handler: bump the release stat, clear any pending async
 * write error recorded on the stripe MD, and close via ll_md_close().
 * The root dentry ("/") is special-cased and skipped.
 * (Listing elided: NULL-check of @fd and RETURNs are not visible.) */
276 int ll_file_release(struct inode *inode, struct file *file)
278 struct ll_file_data *fd;
279 struct ll_sb_info *sbi = ll_i2sbi(inode);
280 struct ll_inode_info *lli = ll_i2info(inode);
281 struct lov_stripe_md *lsm = lli->lli_smd;
285 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
286 inode->i_generation, inode);
288 /* don't do anything for / */
289 if (inode->i_sb->s_root == file->f_dentry)
292 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
293 fd = LUSTRE_FPRIVATE(file);
296 /* don't do anything for / */
297 if (inode->i_sb->s_root == file->f_dentry) {
298 LUSTRE_FPRIVATE(file) = NULL;
299 ll_file_data_put(fd);
/* discard any stale async write error before this close reports status */
304 lov_test_and_clear_async_rc(lsm);
305 lli->lli_async_rc = 0;
307 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an MDS intent-open for @file (NFSD / patchless-client path, or
 * when setting stripe info with non-NULL @lmm). Builds op_data for the
 * parent+child, runs md_intent_lock(), attaches the returned DLM lock to the
 * inode, and refreshes the inode from the reply.
 * (Listing elided: error-path labels and RETURNs are not visible.) */
311 static int ll_intent_file_open(struct file *file, void *lmm,
312 int lmmsize, struct lookup_intent *itp)
314 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
315 struct dentry *parent = file->f_dentry->d_parent;
316 const char *name = file->f_dentry->d_name.name;
317 const int len = file->f_dentry->d_name.len;
318 struct md_op_data *op_data;
319 struct ptlrpc_request *req;
325 /* Usually we come here only for NFSD, and we want open lock.
326 But we can also get here with pre 2.6.15 patchless kernels, and in
327 that case that lock is also ok */
328 /* We can also get here if there was cached open handle in revalidate_it
329 * but it disappeared while we were getting from there to ll_file_open.
330 * But this means this file was closed and immediately opened which
331 * makes a good candidate for using OPEN lock */
332 /* If lmmsize & lmm are not 0, we are just setting stripe info
333 * parameters. No need for the open lock */
334 if (!lmm && !lmmsize)
335 itp->it_flags |= MDS_OPEN_LOCK;
337 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
338 file->f_dentry->d_inode, name, len,
339 O_RDWR, LUSTRE_OPC_ANY, NULL);
341 RETURN(PTR_ERR(op_data));
343 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
344 0 /*unused */, &req, ll_md_blocking_ast, 0);
345 ll_finish_md_op_data(op_data);
347 /* reason for keep own exit path - don`t flood log
348 * with messages with -ESTALE errors.
350 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
351 it_open_error(DISP_OPEN_OPEN, itp))
353 ll_release_openhandle(file->f_dentry, itp);
357 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
358 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
359 CERROR("lock enqueue: err: %d\n", rc);
/* bind the granted DLM lock to the inode so later matches find it */
363 if (itp->d.lustre.it_lock_mode)
364 md_set_lock_data(sbi->ll_md_exp,
365 &itp->d.lustre.it_lock_handle,
366 file->f_dentry->d_inode);
368 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
371 ptlrpc_req_finished(itp->d.lustre.it_data);
374 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
375 ll_intent_drop_lock(itp);
/* Fill @och from the intent-open reply carried in @it: copy the MDS file
 * handle, mark the handle valid, record fid/flags, pick up the I/O epoch,
 * and register the open for replay. Returns md_set_open_replay_data() rc. */
380 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
381 struct lookup_intent *it, struct obd_client_handle *och)
383 struct ptlrpc_request *req = it->d.lustre.it_data;
384 struct mdt_body *body;
388 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
389 LASSERT(body != NULL); /* reply already checked out */
390 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
392 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
393 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
394 och->och_fid = lli->lli_fid;
395 och->och_flags = it->it_flags;
396 lli->lli_ioepoch = body->ioepoch;
398 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-side part of an open: when @och is non-NULL, fill it
 * from the intent reply via ll_och_fill(); then attach @fd to the file,
 * initialize readahead state and record the open mode.
 * (Listing elided: some branches/RETURNs are not visible.) */
401 int ll_local_open(struct file *file, struct lookup_intent *it,
402 struct ll_file_data *fd, struct obd_client_handle *och)
404 struct inode *inode = file->f_dentry->d_inode;
405 struct ll_inode_info *lli = ll_i2info(inode);
408 LASSERT(!LUSTRE_FPRIVATE(file));
413 struct ptlrpc_request *req = it->d.lustre.it_data;
414 struct mdt_body *body;
417 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
421 body = lustre_msg_buf(req->rq_repmsg,
422 DLM_REPLY_REC_OFF, sizeof(*body));
424 if ((it->it_flags & FMODE_WRITE) &&
425 (body->valid & OBD_MD_FLSIZE))
427 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
428 lli->lli_ioepoch, PFID(&lli->lli_fid));
432 LUSTRE_FPRIVATE(file) = fd;
433 ll_readahead_init(inode, &fd->fd_ras);
434 fd->fd_omode = it->it_flags;
438 /* Open a file, and (for the very first open) create objects on the OSTs at
439 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
440 * creation or open until ll_lov_setstripe() ioctl is called. We grab
441 * lli_open_sem to ensure no other process will create objects, send the
442 * stripe MD to the MDS, or try to destroy the objects if that fails.
444 * If we already have the stripe MD locally then we don't request it in
445 * md_open(), by passing a lmm_size = 0.
447 * It is up to the application to ensure no other processes open this file
448 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
449 * used. We might be able to avoid races of that sort by getting lli_open_sem
450 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
451 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() handler. Reuses an existing MDS open handle of the matching
 * mode (write/exec/read) when one is cached on the inode, otherwise performs
 * an intent open via ll_intent_file_open() and records the new handle; all
 * handle bookkeeping is done under lli_och_sem. The root dentry is
 * special-cased. O_LOV_DELAY_CREATE postpones OST object creation until
 * ll_lov_setstripe(). (Listing elided: numerous braces, GOTO targets and
 * RETURNs are not visible — treat control flow shown here as partial.) */
453 int ll_file_open(struct inode *inode, struct file *file)
455 struct ll_inode_info *lli = ll_i2info(inode);
456 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
457 .it_flags = file->f_flags };
458 struct lov_stripe_md *lsm;
459 struct ptlrpc_request *req = NULL;
460 struct obd_client_handle **och_p;
462 struct ll_file_data *fd;
466 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
467 inode->i_generation, inode, file->f_flags);
469 /* don't do anything for / */
470 if (inode->i_sb->s_root == file->f_dentry)
473 #ifdef LUSTRE_KERNEL_VERSION
476 it = file->private_data; /* XXX: compat macro */
477 file->private_data = NULL; /* prevent ll_local_open assertion */
480 fd = ll_file_data_get();
484 /* don't do anything for / */
485 if (inode->i_sb->s_root == file->f_dentry) {
486 LUSTRE_FPRIVATE(file) = fd;
/* no intent from lookup: build our own from f_flags */
490 if (!it || !it->d.lustre.it_disposition) {
491 /* Convert f_flags into access mode. We cannot use file->f_mode,
492 * because everything but O_ACCMODE mask was stripped from
494 if ((oit.it_flags + 1) & O_ACCMODE)
496 if (file->f_flags & O_TRUNC)
497 oit.it_flags |= FMODE_WRITE;
499 /* kernel only call f_op->open in dentry_open. filp_open calls
500 * dentry_open after call to open_namei that checks permissions.
501 * Only nfsd_open call dentry_open directly without checking
502 * permissions and because of that this code below is safe. */
503 if (oit.it_flags & FMODE_WRITE)
504 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
506 /* We do not want O_EXCL here, presumably we opened the file
507 * already? XXX - NFS implications? */
508 oit.it_flags &= ~O_EXCL;
513 /* Let's see if we have file open on MDS already. */
514 if (it->it_flags & FMODE_WRITE) {
515 och_p = &lli->lli_mds_write_och;
516 och_usecount = &lli->lli_open_fd_write_count;
517 } else if (it->it_flags & FMODE_EXEC) {
518 och_p = &lli->lli_mds_exec_och;
519 och_usecount = &lli->lli_open_fd_exec_count;
521 och_p = &lli->lli_mds_read_och;
522 och_usecount = &lli->lli_open_fd_read_count;
525 down(&lli->lli_och_sem);
526 if (*och_p) { /* Open handle is present */
527 if (it_disposition(it, DISP_OPEN_OPEN)) {
528 /* Well, there's extra open request that we do not need,
529 let's close it somehow. This will decref request. */
530 rc = it_open_error(DISP_OPEN_OPEN, it);
532 ll_file_data_put(fd);
533 GOTO(out_och_free, rc);
535 ll_release_openhandle(file->f_dentry, it);
536 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* reuse cached handle: och == NULL tells ll_local_open not to fill one */
541 rc = ll_local_open(file, it, fd, NULL);
543 up(&lli->lli_och_sem);
544 ll_file_data_put(fd);
548 LASSERT(*och_usecount == 0);
549 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
551 ll_file_data_put(fd);
552 GOTO(out_och_free, rc = -ENOMEM);
555 if (!it->d.lustre.it_disposition) {
556 it->it_flags |= O_CHECK_STALE;
557 rc = ll_intent_file_open(file, NULL, 0, it);
558 it->it_flags &= ~O_CHECK_STALE;
560 ll_file_data_put(fd);
561 GOTO(out_och_free, rc);
564 /* Got some error? Release the request */
565 if (it->d.lustre.it_status < 0) {
566 req = it->d.lustre.it_data;
567 ptlrpc_req_finished(req);
569 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
570 &it->d.lustre.it_lock_handle,
571 file->f_dentry->d_inode);
573 req = it->d.lustre.it_data;
575 /* md_intent_lock() didn't get a request ref if there was an
576 * open error, so don't do cleanup on the request here
578 /* XXX (green): Should not we bail out on any error here, not
579 * just open error? */
580 rc = it_open_error(DISP_OPEN_OPEN, it);
582 ll_file_data_put(fd);
583 GOTO(out_och_free, rc);
586 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
587 rc = ll_local_open(file, it, fd, *och_p);
589 up(&lli->lli_och_sem);
590 ll_file_data_put(fd);
591 GOTO(out_och_free, rc);
594 up(&lli->lli_och_sem);
596 /* Must do this outside lli_och_sem lock to prevent deadlock where
597 different kind of OPEN lock for this same inode gets cancelled
598 by ldlm_cancel_lru */
599 if (!S_ISREG(inode->i_mode))
606 if (file->f_flags & O_LOV_DELAY_CREATE ||
607 !(file->f_mode & FMODE_WRITE)) {
608 CDEBUG(D_INODE, "object creation was delayed\n");
612 file->f_flags &= ~O_LOV_DELAY_CREATE;
615 ptlrpc_req_finished(req);
617 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
621 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
622 *och_p = NULL; /* OBD_FREE writes some magic there */
625 up(&lli->lli_och_sem);
631 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fetch OST attributes (size, blocks, times, ...) for the objects backing
 * @inode: build an obd_info keyed by the stripe MD's object id/group, issue
 * an async getattr through a ptlrpc request set, wait for it, then refresh
 * the inode from the merged obdo. (Listing elided: obdo setup of @obdo vs
 * oinfo.oi_oa and the error/return paths are not fully visible.) */
632 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
634 struct ptlrpc_request_set *set;
635 struct ll_inode_info *lli = ll_i2info(inode);
636 struct lov_stripe_md *lsm = lli->lli_smd;
638 struct obd_info oinfo = { { { 0 } } };
642 LASSERT(lsm != NULL);
646 oinfo.oi_oa->o_id = lsm->lsm_object_id;
647 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
648 oinfo.oi_oa->o_mode = S_IFREG;
649 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
650 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
651 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
652 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
654 oinfo.oi_capa = ll_mdscapa_get(inode);
656 set = ptlrpc_prep_set();
658 CERROR("can't allocate ptlrpc set\n");
661 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
663 rc = ptlrpc_set_wait(set);
664 ptlrpc_set_destroy(set);
666 capa_put(oinfo.oi_capa);
/* only trust the fields the OSTs actually returned */
670 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
671 OBD_MD_FLATIME | OBD_MD_FLMTIME |
672 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
674 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
675 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
676 lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks,
/* Clear setuid (and setgid when group-execute is set) from the inode mode
 * after a write by a non-CAP_FSETID process, mirroring the kernel's
 * remove_suid() semantics. (Listing elided: mode declaration and the
 * attribute-update tail are not visible.) */
681 static inline void ll_remove_suid(struct inode *inode)
685 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
686 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
688 /* was any of the uid bits set? */
689 mode &= inode->i_mode;
690 if (mode && !capable(CAP_FSETID)) {
691 inode->i_mode &= ~mode;
692 // XXX careful here - we cannot change the size
/* Map an extent DLM @lock back to the stripe index it covers within
 * @inode's LOV stripe MD, via obd_get_info("lock_to_stripe"). Sanity-checks
 * that the lock's resource name matches the stripe object id/group.
 * Returns the stripe index, or -ELDLM_NO_LOCK_DATA on mismatch.
 * (Listing elided: the anonymous key-struct opening and RETURN(stripe).) */
696 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
698 struct ll_inode_info *lli = ll_i2info(inode);
699 struct lov_stripe_md *lsm = lli->lli_smd;
700 struct obd_export *exp = ll_i2dtexp(inode);
703 struct ldlm_lock *lock;
704 struct lov_stripe_md *lsm;
705 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
706 __u32 stripe, vallen = sizeof(stripe);
/* single-striped file: the answer is trivially stripe 0 */
710 if (lsm->lsm_stripe_count == 1)
711 GOTO(check, stripe = 0);
713 /* get our offset in the lov */
714 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
716 CERROR("obd_get_info: rc = %d\n", rc);
719 LASSERT(stripe < lsm->lsm_stripe_count);
722 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
723 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
724 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
725 lsm->lsm_oinfo[stripe]->loi_id,
726 lsm->lsm_oinfo[stripe]->loi_gr);
727 RETURN(-ELDLM_NO_LOCK_DATA);
733 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
734 * we get a lock cancellation for each stripe, so we have to map the obd's
735 * region back onto the stripes in the file that it held.
737 * No one can dirty the extent until we've finished our work and they can
738 * enqueue another lock. The DLM protects us from ll_file_read/write here,
739 * but other kernel actors could have pages locked.
741 * Called with the DLM lock held. */
/* Evict (write back or discard) the page-cache pages of @inode covered by a
 * cancelled extent @lock on stripe @stripe: map the lock's [start,end]
 * extent onto file page indices (accounting for striping), tear down any
 * mmap()ed pages in range, then walk each page — writing dirty pages unless
 * LDLM_FL_DISCARD_DATA is set, and truncating pages no other DLM lock still
 * covers. Called with the DLM lock held.
 * (Listing elided: several loop/branch lines are not visible.) */
742 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
743 struct ldlm_lock *lock, __u32 stripe)
745 ldlm_policy_data_t tmpex;
746 unsigned long start, end, count, skip, i, j;
748 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
749 struct lustre_handle lockh;
752 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
753 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
754 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
757 /* our locks are page granular thanks to osc_enqueue, we invalidate the
759 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
760 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
761 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
763 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
764 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* translate the stripe-object extent into file page indices */
768 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
769 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
770 if (lsm->lsm_stripe_count > 1) {
771 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
772 skip = (lsm->lsm_stripe_count - 1) * count;
773 start += start/count * skip + stripe * count;
775 end += end/count * skip + stripe * count;
777 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
780 i = inode->i_size ? (__u64)(inode->i_size - 1) >> CFS_PAGE_SHIFT : 0;
784 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
785 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
786 count, skip, end, discard ? " (DISCARDING)" : "");
788 /* walk through the vmas on the inode and tear down mmaped pages that
789 * intersect with the lock. this stops immediately if there are no
790 * mmap()ed regions of the file. This is not efficient at all and
791 * should be short lived. We'll associate mmap()ed pages with the lock
792 * and will be able to find them directly */
793 for (i = start; i <= end; i += (j + skip)) {
794 j = min(count - (i % count), end - i + 1);
796 LASSERT(inode->i_mapping);
797 if (ll_teardown_mmaps(inode->i_mapping,
798 (__u64)i << CFS_PAGE_SHIFT,
799 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
803 /* this is the simplistic implementation of page eviction at
804 * cancelation. It is careful to get races with other page
805 * lockers handled correctly. fixes from bug 20 will make it
806 * more efficient by associating locks with pages and with
807 * batching writeback under the lock explicitly. */
808 for (i = start, j = start % count; i <= end;
809 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
811 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
817 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
818 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
819 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
822 if (!mapping_has_pages(inode->i_mapping)) {
823 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
829 page = find_get_page(inode->i_mapping, i);
832 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
833 i, tmpex.l_extent.start);
836 /* page->mapping to check with racing against teardown */
837 if (!discard && clear_page_dirty_for_io(page)) {
838 rc = ll_call_writepage(inode, page);
840 CERROR("writepage inode %lu(%p) of page %p "
841 "failed: %d\n", inode->i_ino, inode,
843 /* either waiting for io to complete or reacquiring
844 * the lock that the failed writepage released */
848 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
849 /* check to see if another DLM lock covers this page b=2765 */
850 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
851 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
853 &lock->l_resource->lr_name, LDLM_EXTENT,
854 &tmpex, LCK_PR | LCK_PW, &lockh);
856 if (rc2 <= 0 && page->mapping != NULL) {
857 struct ll_async_page *llap = llap_cast_private(page);
858 /* checking again to account for writeback's
860 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
862 ll_ra_accounting(llap, inode->i_mapping);
863 ll_truncate_complete_page(page);
866 page_cache_release(page);
868 LASSERTF(tmpex.l_extent.start <=
869 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
870 lock->l_policy_data.l_extent.end + 1),
871 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
872 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for extent locks on file data.
 * LDLM_CB_BLOCKING: cancel the lock. LDLM_CB_CANCELING: flush/discard the
 * covered page cache via ll_pgcache_remove_extent() and shrink the stripe's
 * known-minimum-size (kms) under both the lov stripe lock and the lock's
 * resource lock. (Listing elided: switch statement opening, iput and
 * RETURNs are not visible.) */
877 static int ll_extent_lock_callback(struct ldlm_lock *lock,
878 struct ldlm_lock_desc *new, void *data,
881 struct lustre_handle lockh = { 0 };
/* small non-NULL cbdata values indicate a corrupted pointer */
885 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
886 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
891 case LDLM_CB_BLOCKING:
892 ldlm_lock2handle(lock, &lockh);
893 rc = ldlm_cli_cancel(&lockh);
895 CERROR("ldlm_cli_cancel failed: %d\n", rc);
897 case LDLM_CB_CANCELING: {
899 struct ll_inode_info *lli;
900 struct lov_stripe_md *lsm;
904 /* This lock wasn't granted, don't try to evict pages */
905 if (lock->l_req_mode != lock->l_granted_mode)
908 inode = ll_inode_from_lock(lock);
911 lli = ll_i2info(inode);
914 if (lli->lli_smd == NULL)
918 stripe = ll_lock_to_stripe_offset(inode, lock);
922 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
924 lov_stripe_lock(lsm);
925 lock_res_and_lock(lock);
926 kms = ldlm_extent_shift_kms(lock,
927 lsm->lsm_oinfo[stripe]->loi_kms);
929 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
930 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
931 lsm->lsm_oinfo[stripe]->loi_kms, kms);
932 lsm->lsm_oinfo[stripe]->loi_kms = kms;
933 unlock_res_and_lock(lock);
934 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent lock enqueues. Blocked
 * states are not expected yet (LBUG). On grant/glimpse, map the lock to its
 * stripe, and if the server returned an LVB, update the stripe's rss and
 * known-minimum-size (kms) under the resource and inode-size locks; then
 * wake waiters and drop the PR reference taken at enqueue time.
 * FIX(review): lsm_oinfo is an array of pointers everywhere else in this
 * file (see ll_lock_to_stripe_offset, ll_extent_lock_callback,
 * ll_glimpse_callback, all using lsm_oinfo[stripe]->...); the '.' accesses
 * below were inconsistent and are changed to '->'.
 * (Listing elided: declarations of lvb/kms/stripe and RETURNs missing.) */
947 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
949 /* XXX ALLOCATE - 160 bytes */
950 struct inode *inode = ll_inode_from_lock(lock);
951 struct ll_inode_info *lli = ll_i2info(inode);
952 struct lustre_handle lockh = { 0 };
957 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
958 LDLM_FL_BLOCK_CONV)) {
959 LBUG(); /* not expecting any blocked async locks yet */
960 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
962 ldlm_lock_dump(D_OTHER, lock, 0);
963 ldlm_reprocess_all(lock->l_resource);
967 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
969 stripe = ll_lock_to_stripe_offset(inode, lock);
973 if (lock->l_lvb_len) {
974 struct lov_stripe_md *lsm = lli->lli_smd;
976 lvb = lock->l_lvb_data;
977 lsm->lsm_oinfo[stripe]->loi_rss = lvb->lvb_size;
979 lock_res_and_lock(lock);
980 ll_inode_size_lock(inode, 1);
981 kms = MAX(lsm->lsm_oinfo[stripe]->loi_kms, lvb->lvb_size);
982 kms = ldlm_extent_shift_kms(NULL, kms);
983 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
984 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
985 lsm->lsm_oinfo[stripe]->loi_kms, kms);
986 lsm->lsm_oinfo[stripe]->loi_kms = kms;
987 ll_inode_size_unlock(inode, 1);
988 unlock_res_and_lock(lock);
993 wake_up(&lock->l_waitq);
995 ldlm_lock2handle(lock, &lockh);
996 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: answer a server glimpse request for this client's view of
 * the file on the lock's stripe. Packs an ost_lvb reply holding the
 * stripe's kms and the inode's m/a/ctime. On the normal-race error
 * (-ELDLM_NO_LOCK_DATA) an empty reply is packed instead of logging via
 * ptlrpc_error(). (Listing elided: NULL checks feeding the GOTOs, the
 * stripe declaration, iput and RETURNs are not visible.) */
1001 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1003 struct ptlrpc_request *req = reqp;
1004 struct inode *inode = ll_inode_from_lock(lock);
1005 struct ll_inode_info *lli;
1006 struct lov_stripe_md *lsm;
1007 struct ost_lvb *lvb;
1009 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1013 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1014 lli = ll_i2info(inode);
1016 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1019 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1021 /* First, find out which stripe index this lock corresponds to. */
1022 stripe = ll_lock_to_stripe_offset(inode, lock);
1024 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1026 rc = lustre_pack_reply(req, 2, size, NULL);
1028 CERROR("lustre_pack_reply: %d\n", rc);
1032 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1033 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1034 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1035 lvb->lvb_atime = LTIME_S(inode->i_atime);
1036 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1038 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1039 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1040 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
1041 lvb->lvb_atime, lvb->lvb_ctime);
1046 /* These errors are normal races, so we don't want to fill the console
1047 * with messages by calling ptlrpc_error() */
1048 if (rc == -ELDLM_NO_LOCK_DATA)
1049 lustre_pack_reply(req, 1, NULL, NULL);
1051 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into the VFS inode: under the
 * inode size lock, seed an lvb from the inode, let obd_merge_lvb() combine
 * the stripes' size/blocks/times, and copy the result back into i_size,
 * i_blocks and the timestamps. */
1055 static void ll_merge_lvb(struct inode *inode)
1057 struct ll_inode_info *lli = ll_i2info(inode);
1058 struct ll_sb_info *sbi = ll_i2sbi(inode);
1062 ll_inode_size_lock(inode, 1);
1063 inode_init_lvb(inode, &lvb);
1064 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1065 inode->i_size = lvb.lvb_size;
1066 inode->i_blocks = lvb.lvb_blocks;
1067 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1068 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1069 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1070 ll_inode_size_unlock(inode, 1);
/* Try to satisfy a size request purely from locally cached DLM locks: match
 * an existing PR|PW extent lock over [0, EOF]; if found, merge the cached
 * lvbs into the inode and drop the match reference via obd_cancel().
 * Zero-striped files short-circuit. (Listing elided: flags declaration and
 * the no-match RETURN are not visible.) */
1074 int ll_local_size(struct inode *inode)
1076 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1077 struct ll_inode_info *lli = ll_i2info(inode);
1078 struct ll_sb_info *sbi = ll_i2sbi(inode);
1079 struct lustre_handle lockh = { 0 };
1084 if (lli->lli_smd->lsm_stripe_count == 0)
1087 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1088 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1094 ll_merge_lvb(inode);
/* drop the reference obd_match() took on the matched lock */
1095 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
/* Glimpse helper for ioctl paths: enqueue an intent (LDLM_FL_HAS_INTENT)
 * PR extent request over [0, EOF] against @lsm, then merge the resulting
 * lvbs under the stripe lock and fill the caller's stat fields
 * (size/blocks/times). Positive enqueue rc is mapped to -EIO.
 * (Listing elided: the @st parameter line and RETURNs are not visible.) */
1099 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1102 struct lustre_handle lockh = { 0 };
1103 struct obd_enqueue_info einfo = { 0 };
1104 struct obd_info oinfo = { { { 0 } } };
1110 einfo.ei_type = LDLM_EXTENT;
1111 einfo.ei_mode = LCK_PR;
1112 einfo.ei_flags = LDLM_FL_HAS_INTENT;
1113 einfo.ei_cb_bl = ll_extent_lock_callback;
1114 einfo.ei_cb_cp = ldlm_completion_ast;
1115 einfo.ei_cb_gl = ll_glimpse_callback;
1116 einfo.ei_cbdata = NULL;
1118 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1119 oinfo.oi_lockh = &lockh;
1122 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1126 CERROR("obd_enqueue returned rc %d, "
1127 "returning -EIO\n", rc);
1128 RETURN(rc > 0 ? -EIO : rc);
1131 lov_stripe_lock(lsm);
1132 memset(&lvb, 0, sizeof(lvb));
1133 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1134 st->st_size = lvb.lvb_size;
1135 st->st_blocks = lvb.lvb_blocks;
1136 st->st_mtime = lvb.lvb_mtime;
1137 st->st_atime = lvb.lvb_atime;
1138 st->st_ctime = lvb.lvb_ctime;
1139 lov_stripe_unlock(lsm);
1144 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1145 * file (because it prefers KMS over RSS when larger) */
/* Refresh @inode's size from the OSTs via a glimpse: skipped when the MDS
 * size lock already covers it or the file has no objects. Enqueues a PR
 * extent intent over [0, EOF] — due to LDLM_FL_HAS_INTENT this does not
 * revoke conflicting locks; instead ll_glimpse_callback() runs on each
 * holder and the merged lvb is applied with ll_merge_lvb(). Positive
 * enqueue rc maps to -EIO. (Listing elided: some RETURNs missing.) */
1146 int ll_glimpse_size(struct inode *inode, int ast_flags)
1148 struct ll_inode_info *lli = ll_i2info(inode);
1149 struct ll_sb_info *sbi = ll_i2sbi(inode);
1150 struct lustre_handle lockh = { 0 };
1151 struct obd_enqueue_info einfo = { 0 };
1152 struct obd_info oinfo = { { { 0 } } };
1156 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1159 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1161 if (!lli->lli_smd) {
1162 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1166 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1167 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1168 * won't revoke any conflicting DLM locks held. Instead,
1169 * ll_glimpse_callback() will be called on each client
1170 * holding a DLM lock against this file, and resulting size
1171 * will be returned for each stripe. DLM lock on [0, EOF] is
1172 * acquired only if there were no conflicting locks. */
1173 einfo.ei_type = LDLM_EXTENT;
1174 einfo.ei_mode = LCK_PR;
1175 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1176 einfo.ei_cb_bl = ll_extent_lock_callback;
1177 einfo.ei_cb_cp = ldlm_completion_ast;
1178 einfo.ei_cb_gl = ll_glimpse_callback;
1179 einfo.ei_cbdata = inode;
1181 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1182 oinfo.oi_lockh = &lockh;
1183 oinfo.oi_md = lli->lli_smd;
1185 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1189 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1190 RETURN(rc > 0 ? -EIO : rc);
1193 ll_merge_lvb(inode);
1195 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1196 inode->i_size, inode->i_blocks);
/*
 * Acquire a DLM extent lock of @mode over the range in @policy for the
 * file's stripe objects, returning the handle in @lockh.  Locking is
 * skipped entirely when the file descriptor carries LL_FILE_IGNORE_LOCK
 * or the superblock has LL_SBI_NOLCK.  After a successful enqueue the
 * inode's size and times are refreshed from the merged LVB under
 * ll_inode_size_lock(); i_size itself is only reset for a full-file
 * [0, EOF] lock (see the comment below for why).
 */
1201 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1202                    struct lov_stripe_md *lsm, int mode,
1203                    ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1206 struct ll_sb_info *sbi = ll_i2sbi(inode);
1208 struct obd_enqueue_info einfo = { 0 };
1209 struct obd_info oinfo = { { { 0 } } };
1213 LASSERT(!lustre_handle_is_used(lockh));
1214 LASSERT(lsm != NULL);
1216 /* don't drop the mmapped file to LRU */
1217 if (mapping_mapped(inode->i_mapping))
1218 ast_flags |= LDLM_FL_NO_LRU;
1220 /* XXX phil: can we do this? won't it screw the file size up? */
1221 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1222     (sbi->ll_flags & LL_SBI_NOLCK))
1225 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1226        inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1228 einfo.ei_type = LDLM_EXTENT;
1229 einfo.ei_mode = mode;
1230 einfo.ei_flags = ast_flags;
1231 einfo.ei_cb_bl = ll_extent_lock_callback;
1232 einfo.ei_cb_cp = ldlm_completion_ast;
1233 einfo.ei_cb_gl = ll_glimpse_callback;
1234 einfo.ei_cbdata = inode;
1236 oinfo.oi_policy = *policy;
1237 oinfo.oi_lockh = lockh;
1240 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo);
/* The enqueue may have widened the granted extent; report it back. */
1241 *policy = oinfo.oi_policy;
1245 ll_inode_size_lock(inode, 1);
1246 inode_init_lvb(inode, &lvb);
1247 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1249 if (policy->l_extent.start == 0 &&
1250     policy->l_extent.end == OBD_OBJECT_EOF) {
1251 /* vmtruncate()->ll_truncate() first sets the i_size and then
1252  * the kms under both a DLM lock and the
1253  * ll_inode_size_lock(). If we don't get the
1254  * ll_inode_size_lock() here we can match the DLM lock and
1255  * reset i_size from the kms before the truncating path has
1256  * updated the kms. generic_file_write can then trust the
1257  * stale i_size when doing appending writes and effectively
1258  * cancel the result of the truncate. Getting the
1259  * ll_inode_size_lock() after the enqueue maintains the DLM
1260  * -> ll_inode_size_lock() acquiring order. */
1261 inode->i_size = lvb.lvb_size;
1262 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1263        inode->i_ino, inode->i_size);
/* Times can always be refreshed from the merged LVB. */
1267 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1268 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1269 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1271 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken by ll_extent_lock().  Mirrors
 * the lock path: if locking was skipped (LL_FILE_IGNORE_LOCK on the fd
 * or LL_SBI_NOLCK on the superblock) the cancel is skipped as well.
 */
1276 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1277                      struct lov_stripe_md *lsm, int mode,
1278                      struct lustre_handle *lockh)
1280 struct ll_sb_info *sbi = ll_i2sbi(inode);
1284 /* XXX phil: can we do this? won't it screw the file size up? */
1285 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1286     (sbi->ll_flags & LL_SBI_NOLCK))
1289 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * read(2) entry point.  Takes a PR extent lock covering the request
 * (optionally chunked by sbi->ll_max_rw_chunk so very wide stripes do
 * not produce one huge lock), validates the request against the
 * known-minimum-size (kms), then hands the page-cache work to
 * generic_file_read().  Files with no stripe objects are served as
 * zero-filled data up to i_size.  Returns bytes read or negative errno.
 */
1294 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1297 struct inode *inode = file->f_dentry->d_inode;
1298 struct ll_inode_info *lli = ll_i2info(inode);
1299 struct lov_stripe_md *lsm = lli->lli_smd;
1300 struct ll_sb_info *sbi = ll_i2sbi(inode);
1301 struct ll_lock_tree tree;
1302 struct ll_lock_tree_node *node;
1304 struct ll_ra_read bead;
1307 ssize_t retval, chunk, sum = 0;
1311 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1312        inode->i_ino, inode->i_generation, inode, count, *ppos);
1313 /* "If nbyte is 0, read() will return 0 and have no other results."
1314  * -- Single Unix Spec */
1318 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1321 /* Read on file with no objects should return zero-filled
1322  * buffers up to file size (we can get non-zero sizes with
1323  * mknod + truncate, then opening file for read. This is a
1324  * common pattern in NFS case, it seems). Bug 6243 */
1326 /* Since there are no objects on OSTs, we have nothing to get
1327  * lock on and so we are forced to access inode->i_size
1330 /* Read beyond end of file */
1331 if (*ppos >= inode->i_size)
1334 if (count > inode->i_size - *ppos)
1335 count = inode->i_size - *ppos;
1336 /* Make sure to correctly adjust the file pos pointer for
1338 notzeroed = clear_user(buf, count);
/* Bound the locked region: end of request, end of current stripe, or
 * the configured per-syscall chunk limit, whichever is smallest. */
1347 if (sbi->ll_max_rw_chunk != 0) {
1348 /* first, let's know the end of the current stripe */
1350 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1353 /* correct, the end is beyond the request */
1354 if (end > *ppos + count - 1)
1355 end = *ppos + count - 1;
1357 /* and chunk shouldn't be too large even if striping is wide */
1358 if (end - *ppos > sbi->ll_max_rw_chunk)
1359 end = *ppos + sbi->ll_max_rw_chunk - 1;
1361 end = *ppos + count - 1;
1364 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1366 GOTO(out, retval = PTR_ERR(node));
1369 tree.lt_fd = LUSTRE_FPRIVATE(file);
1370 rc = ll_tree_lock(&tree, node, buf, count,
1371                   file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1373 GOTO(out, retval = rc);
1375 ll_inode_size_lock(inode, 1);
1377  * Consistency guarantees: following possibilities exist for the
1378  * relation between region being read and real file size at this
1381  *     (A): the region is completely inside of the file;
1383  *     (B-x): x bytes of region are inside of the file, the rest is
1386  *     (C): the region is completely outside of the file.
1388  * This classification is stable under DLM lock acquired by
1389  * ll_tree_lock() above, because to change class, other client has to
1390  * take DLM lock conflicting with our lock. Also, any updates to
1391  * ->i_size by other threads on this client are serialized by
1392  * ll_inode_size_lock(). This guarantees that short reads are handled
1393  * correctly in the face of concurrent writes and truncates.
1395 inode_init_lvb(inode, &lvb);
1396 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1398 if (*ppos + count - 1 > kms) {
1399 /* A glimpse is necessary to determine whether we return a
1400  * short read (B) or some zeroes at the end of the buffer (C) */
1401 ll_inode_size_unlock(inode, 1);
1402 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1404 ll_tree_unlock(&tree);
1408 /* region is within kms and, hence, within real file size (A).
1409  * We need to increase i_size to cover the read region so that
1410  * generic_file_read() will do its job, but that doesn't mean
1411  * the kms size is _correct_, it is only the _minimum_ size.
1412  * If someone does a stat they will get the correct size which
1413  * will always be >= the kms value here. b=11081 */
1414 if (inode->i_size < kms)
1415 inode->i_size = kms;
1416 ll_inode_size_unlock(inode, 1);
1419 chunk = end - *ppos + 1;
1420 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1421        inode->i_ino, chunk, *ppos, inode->i_size);
1423 /* turn off the kernel's read-ahead */
1424 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1427 file->f_ra.ra_pages = 0;
1429 /* initialize read-ahead window once per syscall */
1432 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1433 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1434 ll_ra_read_in(file, &bead);
1438 file_accessed(file);
1439 retval = generic_file_read(file, buf, chunk, ppos);
1440 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1442 ll_tree_unlock(&tree);
/* A full chunk with bytes remaining means the request was split by
 * ll_max_rw_chunk: loop back for the next chunk. */
1448 if (retval == chunk && count > 0)
1454 ll_ra_read_ex(file, &bead);
1455 retval = (sum > 0) ? sum : retval;
1460  * Write to a file (through the page cache).
/*
 * write(2) entry point.  Serializes writers on lli_write_sem, takes a
 * PW extent lock over the target range (O_APPEND locks [pos, EOF];
 * otherwise the range may be chunked by sbi->ll_max_rw_chunk), enforces
 * the per-file maxbytes limit (raising SIGXFSZ per POSIX), and defers
 * the page-cache work to generic_file_write().  Returns bytes written
 * or negative errno.
 */
1462 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1465 struct inode *inode = file->f_dentry->d_inode;
1466 struct ll_sb_info *sbi = ll_i2sbi(inode);
1467 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1468 struct ll_lock_tree tree;
1469 struct ll_lock_tree_node *node;
1470 loff_t maxbytes = ll_file_maxbytes(inode);
1471 loff_t lock_start, lock_end, end;
1472 ssize_t retval, chunk, sum = 0;
1476 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1477        inode->i_ino, inode->i_generation, inode, count, *ppos);
1479 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1481 /* POSIX, but surprised the VFS doesn't check this already */
1485 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1486  * called on the file, don't fail the below assertion (bug 2388). */
1487 if (file->f_flags & O_LOV_DELAY_CREATE &&
1488     ll_i2info(inode)->lli_smd == NULL)
1491 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1493 down(&ll_i2info(inode)->lli_write_sem);
1496 chunk = 0; /* just to fix gcc's warning */
1497 end = *ppos + count - 1;
1499 if (file->f_flags & O_APPEND) {
/* Appending: the final offset is unknown until the [pos, EOF]
 * lock is held, so lock all the way to EOF. */
1501 lock_end = OBD_OBJECT_EOF;
1502 } else if (sbi->ll_max_rw_chunk != 0) {
1503 /* first, let's know the end of the current stripe */
1505 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1508 /* correct, the end is beyond the request */
1509 if (end > *ppos + count - 1)
1510 end = *ppos + count - 1;
1512 /* and chunk shouldn't be too large even if striping is wide */
1513 if (end - *ppos > sbi->ll_max_rw_chunk)
1514 end = *ppos + sbi->ll_max_rw_chunk - 1;
1519 lock_end = *ppos + count - 1;
1521 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1524 GOTO(out, retval = PTR_ERR(node));
1526 tree.lt_fd = LUSTRE_FPRIVATE(file);
1527 rc = ll_tree_lock(&tree, node, buf, count,
1528                   file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1530 GOTO(out, retval = rc);
1532 /* This is ok, g_f_w will overwrite this under i_sem if it races
1533  * with a local truncate, it just makes our maxbyte checking easier.
1534  * The i_size value gets updated in ll_extent_lock() as a consequence
1535  * of the [0,EOF] extent lock we requested above. */
1536 if (file->f_flags & O_APPEND) {
1537 *ppos = inode->i_size;
1538 end = *ppos + count - 1;
1541 if (*ppos >= maxbytes) {
1542 send_sig(SIGXFSZ, current, 0);
1543 GOTO(out_unlock, retval = -EFBIG);
1545 if (*ppos + count > maxbytes)
1546 count = maxbytes - *ppos;
1548 /* generic_file_write handles O_APPEND after getting i_mutex */
1549 chunk = end - *ppos + 1;
1550 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1551        inode->i_ino, chunk, *ppos);
1552 retval = generic_file_write(file, buf, chunk, ppos);
1553 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1556 ll_tree_unlock(&tree);
/* Full chunk written with bytes left: continue with the next chunk. */
1563 if (retval == chunk && count > 0)
1567 up(&ll_i2info(inode)->lli_write_sem);
1569 retval = (sum > 0) ? sum : retval;
1570 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1571                    retval > 0 ? retval : 0);
1576  * Send file content (through pagecache) somewhere with helper
/*
 * sendfile(2) path (2.6 kernels only).  Same locking and kms/size
 * consistency logic as ll_file_read(), but without ll_max_rw_chunk
 * chunking, then delegates to generic_file_sendfile() with the caller's
 * actor.  Object-less files bypass locking entirely.
 */
1578 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1579 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1580                                 read_actor_t actor, void *target)
1582 struct inode *inode = in_file->f_dentry->d_inode;
1583 struct ll_inode_info *lli = ll_i2info(inode);
1584 struct lov_stripe_md *lsm = lli->lli_smd;
1585 struct ll_lock_tree tree;
1586 struct ll_lock_tree_node *node;
1588 struct ll_ra_read bead;
1593 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1594        inode->i_ino, inode->i_generation, inode, count, *ppos);
1596 /* "If nbyte is 0, read() will return 0 and have no other results."
1597  * -- Single Unix Spec */
1601 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1602 /* turn off the kernel's read-ahead */
1603 in_file->f_ra.ra_pages = 0;
1605 /* File with no objects, nothing to lock */
1607 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1609 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1611 RETURN(PTR_ERR(node));
1613 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1614 rc = ll_tree_lock(&tree, node, NULL, count,
1615                   in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1619 ll_inode_size_lock(inode, 1);
1621  * Consistency guarantees: following possibilities exist for the
1622  * relation between region being read and real file size at this
1625  *     (A): the region is completely inside of the file;
1627  *     (B-x): x bytes of region are inside of the file, the rest is
1630  *     (C): the region is completely outside of the file.
1632  * This classification is stable under DLM lock acquired by
1633  * ll_tree_lock() above, because to change class, other client has to
1634  * take DLM lock conflicting with our lock. Also, any updates to
1635  * ->i_size by other threads on this client are serialized by
1636  * ll_inode_size_lock(). This guarantees that short reads are handled
1637  * correctly in the face of concurrent writes and truncates.
1639 inode_init_lvb(inode, &lvb);
1640 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1642 if (*ppos + count - 1 > kms) {
1643 /* A glimpse is necessary to determine whether we return a
1644  * short read (B) or some zeroes at the end of the buffer (C) */
1645 ll_inode_size_unlock(inode, 1);
1646 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1650 /* region is within kms and, hence, within real file size (A) */
1651 inode->i_size = kms;
1652 ll_inode_size_unlock(inode, 1);
1655 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1656        inode->i_ino, count, *ppos, inode->i_size);
/* Set up the Lustre read-ahead window for this request. */
1658 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1659 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1660 ll_ra_read_in(in_file, &bead);
1662 file_accessed(in_file);
1663 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1664 ll_ra_read_ex(in_file, &bead);
1667 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: re-create a (lost) OST object for
 * this file using parameters copied in from userspace (object id,
 * group, target OST index).  Requires CAP_SYS_ADMIN.  A copy of the
 * stripe metadata is made and passed to obd_create() with
 * OBD_FL_RECREATE_OBJS set; lli_size_sem guards the lsm while in use.
 */
1672 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1675 struct ll_inode_info *lli = ll_i2info(inode);
1676 struct obd_export *exp = ll_i2dtexp(inode);
1677 struct ll_recreate_obj ucreatp;
1678 struct obd_trans_info oti = { 0 };
1679 struct obdo *oa = NULL;
1682 struct lov_stripe_md *lsm, *lsm2;
1685 if (!capable (CAP_SYS_ADMIN))
1688 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1689                     sizeof(struct ll_recreate_obj));
1697 down(&lli->lli_size_sem);
1700 GOTO(out, rc = -ENOENT);
/* Size of the lsm copy, including its per-stripe lov_oinfo array. */
1701 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1702                            (lsm->lsm_stripe_count));
1704 OBD_ALLOC(lsm2, lsm_size);
1706 GOTO(out, rc = -ENOMEM);
/* Fill the obdo with the user-requested identity of the object. */
1708 oa->o_id = ucreatp.lrc_id;
1709 oa->o_gr = ucreatp.lrc_group;
1710 oa->o_nlink = ucreatp.lrc_ost_idx;
1711 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1712 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1713 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1714                 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1716 oti.oti_objid = NULL;
1717 memcpy(lsm2, lsm, lsm_size);
1718 rc = obd_create(exp, oa, &lsm2, &oti);
1720 OBD_FREE(lsm2, lsm_size);
1723 up(&lli->lli_size_sem);
/*
 * Apply user-supplied striping (@lum) to @inode by replaying an open
 * with an IT_OPEN intent that carries the layout EA.  Fails if a stripe
 * layout already exists.  lli_size_sem serializes against concurrent
 * layout access; the open handle obtained as a side effect is released
 * via ll_release_openhandle().
 */
1728 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1729                              int flags, struct lov_user_md *lum, int lum_size)
1731 struct ll_inode_info *lli = ll_i2info(inode);
1732 struct lov_stripe_md *lsm;
1733 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1737 down(&lli->lli_size_sem);
/* A layout already exists: striping can only be set once. */
1740 up(&lli->lli_size_sem);
1741 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1746 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1749 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1750 GOTO(out_req_free, rc = -ENOENT);
1751 rc = oit.d.lustre.it_status;
1753 GOTO(out_req_free, rc);
1755 ll_release_openhandle(file->f_dentry, &oit);
1758 up(&lli->lli_size_sem);
1759 ll_intent_release(&oit);
1762 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (striping information) for @filename from the MDS
 * via md_getattr_name().  The raw lov_mds_md is byte-swapped to host
 * endianness if needed, and JOIN-file layouts are expanded into a
 * lov_user_md_join with explicit per-stripe extents.  On success *lmmp
 * points into (or replaces data from) the reply held in *request, and
 * *lmm_size is its length; the caller owns the request reference.
 */
1766 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1767                              struct lov_mds_md **lmmp, int *lmm_size,
1768                              struct ptlrpc_request **request)
1770 struct ll_sb_info *sbi = ll_i2sbi(inode);
1771 struct mdt_body *body;
1772 struct lov_mds_md *lmm = NULL;
1773 struct ptlrpc_request *req = NULL;
1774 struct obd_capa *oc;
1777 rc = ll_get_max_mdsize(sbi, &lmmsize);
1781 oc = ll_mdscapa_get(inode);
1782 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1783                      oc, filename, strlen(filename) + 1,
1784                      OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1787 CDEBUG(D_INFO, "md_getattr_name failed "
1788        "on %s: rc %d\n", filename, rc);
1792 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1793 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1794 /* swabbed by mdc_getattr_name */
1795 LASSERT_REPSWABBED(req, REPLY_REC_OFF);
1797 lmmsize = body->eadatasize;
1799 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1801 GOTO(out, rc = -ENODATA);
1804 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1805 LASSERT(lmm != NULL);
1806 LASSERT_REPSWABBED(req, REPLY_REC_OFF + 1);
1809  * This is coming from the MDS, so is probably in
1810  * little endian. We convert it to host endian before
1811  * passing it to userspace.
/* A swabbed magic value means the EA is in the other byte order. */
1813 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1814 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1815 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1816 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1817 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack the md and rebuild an explicit per-stripe
 * extent table for userspace. */
1820 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1821 struct lov_stripe_md *lsm;
1822 struct lov_user_md_join *lmj;
1823 int lmj_size, i, aindex = 0;
1825 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1827 GOTO(out, rc = -ENOMEM);
1828 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1830 GOTO(out_free_memmd, rc);
1832 lmj_size = sizeof(struct lov_user_md_join) +
1833            lsm->lsm_stripe_count *
1834            sizeof(struct lov_user_ost_data_join);
1835 OBD_ALLOC(lmj, lmj_size);
1837 GOTO(out_free_memmd, rc = -ENOMEM);
1839 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1840 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1841 struct lov_extent *lex =
1842         &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent covering stripe i. */
1844 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1846 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1847        LPU64" len %d\n", aindex, i,
1848        lex->le_start, (int)lex->le_len);
1849 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 denotes an extent that runs to EOF. */
1852 if ((int)lex->le_len == -1)
1853 lmj->lmm_objects[i].l_extent_end = -1;
1855 lmj->lmm_objects[i].l_extent_end =
1856         lex->le_start + lex->le_len;
1857 lmj->lmm_objects[i].l_object_id =
1858         lsm->lsm_oinfo[i]->loi_id;
1859 lmj->lmm_objects[i].l_object_gr =
1860         lsm->lsm_oinfo[i]->loi_gr;
1861 lmj->lmm_objects[i].l_ost_gen =
1862         lsm->lsm_oinfo[i]->loi_ost_gen;
1863 lmj->lmm_objects[i].l_ost_idx =
1864         lsm->lsm_oinfo[i]->loi_ost_idx;
1866 lmm = (struct lov_mds_md *)lmj;
1869 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1873 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA ioctl: set a raw striping EA supplied by a
 * privileged user (CAP_SYS_ADMIN required).  Copies the lov_user_md
 * (with one ost_data entry) from userspace into a kernel buffer and
 * hands it to ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS set,
 * i.e. the objects are declared to already exist.
 */
1878 static int ll_lov_setea(struct inode *inode, struct file *file,
1881 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1882 struct lov_user_md  *lump;
1883 int lum_size = sizeof(struct lov_user_md) +
1884                sizeof(struct lov_user_ost_data);
1888 if (!capable (CAP_SYS_ADMIN))
1891 OBD_ALLOC(lump, lum_size);
1895 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1897 OBD_FREE(lump, lum_size);
1901 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1903 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl: copy the user's lov_user_md onto the
 * stack, apply it via ll_lov_setstripe_ea_info(), then write the
 * resulting layout back to the user buffer through the
 * LL_IOC_LOV_GETSTRIPE obd_iocontrol path (after zeroing the user's
 * stripe count so the full layout is returned).
 */
1907 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1910 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1912 int flags = FMODE_WRITE;
1915 /* Bug 1152: copy properly when this is no longer true */
1916 LASSERT(sizeof(lum) == sizeof(*lump));
1917 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1918 rc = copy_from_user(&lum, lump, sizeof(lum));
1922 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1924 put_user(0, &lump->lmm_stripe_count);
1925 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1926                    0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl: return the file's striping information
 * to userspace via the LOV's obd_iocontrol handler.
 */
1931 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1933 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1938 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK ioctl: take a whole-file ([0, EOF]) LCK_GROUP
 * extent lock with group id @arg.  On success the fd is marked
 * LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK (subsequent extent locking
 * is bypassed) and the lock handle is stashed in fd_cwlockh for
 * ll_put_grouplock().  Fails if the fd already holds a group lock.
 */
1942 static int ll_get_grouplock(struct inode *inode, struct file *file,
1945 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1946 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1947                                             .end = OBD_OBJECT_EOF}};
1948 struct lustre_handle lockh = { 0 };
1949 struct ll_inode_info *lli = ll_i2info(inode);
1950 struct lov_stripe_md *lsm = lli->lli_smd;
/* Only one group lock per file descriptor. */
1954 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1958 policy.l_extent.gid = arg;
1959 if (file->f_flags & O_NONBLOCK)
1960 flags = LDLM_FL_BLOCK_NOWAIT;
1962 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1966 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1968 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK ioctl: drop the group lock taken by
 * ll_get_grouplock().  Rejects the request if no group lock is held or
 * if the caller's group id does not match the one used to lock.
 * Clears the group-lock flags and the cached handle on success.
 */
1973 static int ll_put_grouplock(struct inode *inode, struct file *file,
1976 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1977 struct ll_inode_info *lli = ll_i2info(inode);
1978 struct lov_stripe_md *lsm = lli->lli_smd;
1982 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1983 /* Ugh, it's already unlocked. */
1987 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1990 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1992 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1997 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request before attempting it: the server must
 * advertise LL_SBI_JOIN, both inodes must be regular files, a file may
 * not be joined to itself, and the head's size must be a multiple of
 * JOIN_FILE_ALIGN (64K).
 */
2002 static int join_sanity_check(struct inode *head, struct inode *tail)
2005 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2006 CERROR("server do not support join \n");
2009 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2010 CERROR("tail ino %lu and ino head %lu must be regular\n",
2011        head->i_ino, tail->i_ino);
2014 if (head->i_ino == tail->i_ino) {
2015 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2018 if (head->i_size % JOIN_FILE_ALIGN) {
2019 CERROR("hsize %llu must be times of 64K\n", head->i_size);
/*
 * Perform the MDS side of joining @tail_filp's file onto @head_inode:
 * enqueue an IT_OPEN intent with O_JOIN_FILE under LCK_CW, naming the
 * tail via its parent directory and dentry name.  Any lock granted with
 * the intent is dropped immediately; the open handle created as a side
 * effect is released through ll_release_openhandle().
 */
2025 static int join_file(struct inode *head_inode, struct file *head_filp,
2026                      struct file *tail_filp)
2028 struct inode *tail_inode, *tail_parent;
2029 struct dentry *tail_dentry = tail_filp->f_dentry;
2030 struct lookup_intent oit = {.it_op = IT_OPEN,
2031                             .it_flags = head_filp->f_flags|O_JOIN_FILE};
2032 struct lustre_handle lockh;
2033 struct md_op_data *op_data;
2037 tail_dentry = tail_filp->f_dentry;
2038 tail_inode = tail_dentry->d_inode;
2039 tail_parent = tail_dentry->d_parent->d_inode;
2041 op_data = ll_prep_md_op_data(NULL, head_inode, tail_parent,
2042                              tail_dentry->d_name.name,
2043                              tail_dentry->d_name.len, 0,
2044                              LUSTRE_OPC_ANY, &head_inode->i_size);
2045 if (IS_ERR(op_data))
2046 RETURN(PTR_ERR(op_data));
2048 rc = md_enqueue(ll_i2mdexp(head_inode), LDLM_IBITS, &oit, LCK_CW,
2049                 op_data, &lockh, NULL, 0, ldlm_completion_ast,
2050                 ll_md_blocking_ast, NULL, 0);
2052 ll_finish_md_op_data(op_data);
2056 rc = oit.d.lustre.it_status;
2058 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2059 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2060 ptlrpc_req_finished((struct ptlrpc_request *)
2061                     oit.d.lustre.it_data);
2065 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2067 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2068 oit.d.lustre.it_lock_mode = 0;
2070 ll_release_openhandle(head_filp->f_dentry, &oit);
2072 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN implementation: append the file named @filename_tail to
 * @head.  Opens the tail, orders the two inodes by inode number to take
 * the two whole-file LCK_EX tree locks in a deadlock-free order, runs
 * join_sanity_check(), then join_file().  Cleanup is phase-driven:
 * each acquired resource bumps cleanup_phase and the switch below
 * unwinds in reverse (unused locks on both files are also cancelled).
 * On success the head's cached stripe md is discarded so the joined
 * layout is re-fetched.
 */
2076 static int ll_file_join(struct inode *head, struct file *filp,
2077                         char *filename_tail)
2079 struct inode *tail = NULL, *first = NULL, *second = NULL;
2080 struct dentry *tail_dentry;
2081 struct file *tail_filp, *first_filp, *second_filp;
2082 struct ll_lock_tree first_tree, second_tree;
2083 struct ll_lock_tree_node *first_node, *second_node;
2084 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2085 int rc = 0, cleanup_phase = 0;
2088 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2089        head->i_ino, head->i_generation, head, filename_tail);
2091 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2092 if (IS_ERR(tail_filp)) {
2093 CERROR("Can not open tail file %s", filename_tail);
2094 rc = PTR_ERR(tail_filp);
2097 tail = igrab(tail_filp->f_dentry->d_inode);
2099 tlli = ll_i2info(tail);
2100 tail_dentry = tail_filp->f_dentry;
2101 LASSERT(tail_dentry);
2104 /*reorder the inode for lock sequence*/
2105 first = head->i_ino > tail->i_ino ? head : tail;
2106 second = head->i_ino > tail->i_ino ? tail : head;
2107 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2108 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2110 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2111        head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2112 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2113 if (IS_ERR(first_node)){
2114 rc = PTR_ERR(first_node);
2117 first_tree.lt_fd = first_filp->private_data;
2118 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2123 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2124 if (IS_ERR(second_node)){
2125 rc = PTR_ERR(second_node);
2128 second_tree.lt_fd = second_filp->private_data;
2129 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2134 rc = join_sanity_check(head, tail);
2138 rc = join_file(head, filp, tail_filp);
/* Unwind resources in reverse acquisition order; fall-through by
 * design between phases. */
2142 switch (cleanup_phase) {
2144 ll_tree_unlock(&second_tree);
2145 obd_cancel_unused(ll_i2dtexp(second),
2146                   ll_i2info(second)->lli_smd, 0, NULL);
2148 ll_tree_unlock(&first_tree);
2149 obd_cancel_unused(ll_i2dtexp(first),
2150                   ll_i2info(first)->lli_smd, 0, NULL);
2152 filp_close(tail_filp, 0);
/* On success drop the head's cached layout so it is re-read. */
2155 if (head && rc == 0) {
2156 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2158 hlli->lli_smd = NULL;
2163 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close the MDS open handle created as a side effect of an intent
 * (e.g. setstripe or join opens).  No-op for the filesystem root or
 * when the intent carries no DISP_OPEN_OPEN disposition.  Fills a
 * temporary obd_client_handle from the intent and closes it via
 * ll_close_inode_openhandle(); also drops the request reference that
 * stood in place of a real ll_file_open().
 */
2169 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2171 struct inode *inode = dentry->d_inode;
2172 struct obd_client_handle *och;
2178 /* Root ? Do nothing. */
2179 if (dentry->d_inode->i_sb->s_root == dentry)
2182 /* No open handle to close? Move away */
2183 if (!it_disposition(it, DISP_OPEN_OPEN))
2186 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2188 OBD_ALLOC(och, sizeof(*och));
2190 GOTO(out, rc = -ENOMEM);
2192 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2193             ll_i2info(inode), it, och);
2195 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2198 /* this one is in place of ll_file_open */
2199 ptlrpc_req_finished(it->d.lustre.it_data);
2200 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl(2) dispatcher for regular Lustre files.  Handles the LL_IOC_*
 * family (flags, striping, object recreation, join, group locks,
 * statfs, security context flush, remote ACLs), forwards a few EXT3
 * ioctls, and falls through to obd_iocontrol() for anything not
 * recognized here.  tty ioctls are explicitly rejected up front.
 */
2204 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2207 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2211 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2212        inode->i_generation, inode, cmd);
2213 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2215 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2216 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2220 case LL_IOC_GETFLAGS:
2221 /* Get the current value of the file flags */
2222 return put_user(fd->fd_flags, (int *)arg);
2223 case LL_IOC_SETFLAGS:
2224 case LL_IOC_CLRFLAGS:
2225 /* Set or clear specific file flags */
2226 /* XXX This probably needs checks to ensure the flags are
2227  * not abused, and to handle any flag side effects.
2229 if (get_user(flags, (int *) arg))
2232 if (cmd == LL_IOC_SETFLAGS) {
/* Disabling locking is only safe for O_DIRECT I/O. */
2233 if ((flags & LL_FILE_IGNORE_LOCK) &&
2234     !(file->f_flags & O_DIRECT)) {
2235 CERROR("%s: unable to disable locking on "
2236        "non-O_DIRECT file\n", current->comm);
2240 fd->fd_flags |= flags;
2242 fd->fd_flags &= ~flags;
2245 case LL_IOC_LOV_SETSTRIPE:
2246 RETURN(ll_lov_setstripe(inode, file, arg));
2247 case LL_IOC_LOV_SETEA:
2248 RETURN(ll_lov_setea(inode, file, arg));
2249 case LL_IOC_LOV_GETSTRIPE:
2250 RETURN(ll_lov_getstripe(inode, arg));
2251 case LL_IOC_RECREATE_OBJ:
2252 RETURN(ll_lov_recreate_obj(inode, file, arg));
2253 case EXT3_IOC_GETFLAGS:
2254 case EXT3_IOC_SETFLAGS:
2255 RETURN(ll_iocontrol(inode, file, cmd, arg));
2256 case EXT3_IOC_GETVERSION_OLD:
2257 case EXT3_IOC_GETVERSION:
2258 RETURN(put_user(inode->i_generation, (int *)arg));
/* Join: copy the tail pathname in from userspace. */
2263 ftail = getname((const char *)arg);
2265 RETURN(PTR_ERR(ftail));
2266 rc = ll_file_join(inode, file, ftail);
2270 case LL_IOC_GROUP_LOCK:
2271 RETURN(ll_get_grouplock(inode, file, arg));
2272 case LL_IOC_GROUP_UNLOCK:
2273 RETURN(ll_put_grouplock(inode, file, arg));
2274 case IOC_OBD_STATFS:
2275 RETURN(ll_obd_statfs(inode, (void *)arg));
2277 /* We need to special case any other ioctls we want to handle,
2278  * to send them to the MDS/OST as appropriate and to properly
2279  * network encode the arg field.
2280 case EXT3_IOC_SETVERSION_OLD:
2281 case EXT3_IOC_SETVERSION:
2283 case LL_IOC_FLUSHCTX:
2284 RETURN(ll_flush_ctx(inode));
2285 case LL_IOC_GETFACL: {
2286 struct rmtacl_ioctl_data ioc;
2288 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2291 RETURN(ll_ioctl_getfacl(inode, &ioc));
2293 case LL_IOC_SETFACL: {
2294 struct rmtacl_ioctl_data ioc;
2296 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2299 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* Default: let the data export's driver try to handle it. */
2302 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek(2) entry point.  SEEK_END (origin == 2) must first glimpse the
 * OSTs so i_size is current before it is added to the offset; the size
 * is then read under ll_inode_size_lock().  The result is validated
 * against [0, ll_file_maxbytes] before f_pos is updated.
 */
2307 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2309 struct inode *inode = file->f_dentry->d_inode;
2310 struct ll_inode_info *lli = ll_i2info(inode);
2311 struct lov_stripe_md *lsm = lli->lli_smd;
2314 retval = offset + ((origin == 2) ? inode->i_size :
2315                    (origin == 1) ? file->f_pos : 0);
2316 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2317        inode->i_ino, inode->i_generation, inode, retval, retval,
2318        origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2319 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2321 if (origin == 2) { /* SEEK_END */
2322 int nonblock = 0, rc;
2324 if (file->f_flags & O_NONBLOCK)
2325 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before seeking relative to EOF. */
2328 rc = ll_glimpse_size(inode, nonblock);
2333 ll_inode_size_lock(inode, 0);
2334 offset += inode->i_size;
2335 ll_inode_size_unlock(inode, 0);
2336 } else if (origin == 1) { /* SEEK_CUR */
2337 offset += file->f_pos;
2341 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2342 if (offset != file->f_pos) {
2343 file->f_pos = offset;
2344 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2346 file->f_version = ++event;
/*
 * fsync(2) entry point.  Waits for in-flight dirty pages
 * (filemap_fdatawait), surfaces any asynchronous writeback errors
 * recorded in lli_async_rc and the per-stripe async rc, syncs the MDS
 * inode via md_sync(), and finally syncs the full data range
 * [0, OBD_OBJECT_EOF] on the OSTs via obd_sync() with a write capa.
 */
2355 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2357 struct inode *inode = dentry->d_inode;
2358 struct ll_inode_info *lli = ll_i2info(inode);
2359 struct lov_stripe_md *lsm = lli->lli_smd;
2360 struct ptlrpc_request *req;
2361 struct obd_capa *oc;
2364 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2365        inode->i_generation, inode);
2366 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2368 /* fsync's caller has already called _fdata{sync,write}, we want
2369  * that IO to finish before calling the osc and mdc sync methods */
2370 rc = filemap_fdatawait(inode->i_mapping);
2372 /* catch async errors that were recorded back when async writeback
2373  * failed for pages in this mapping. */
2374 err = lli->lli_async_rc;
2375 lli->lli_async_rc = 0;
2379 err = lov_test_and_clear_async_rc(lsm);
2384 oc = ll_mdscapa_get(inode);
2385 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2391 ptlrpc_req_finished(req);
2398 RETURN(rc ? rc : -ENOMEM);
/* Identify the data objects to sync on the OSTs. */
2400 oa->o_id = lsm->lsm_object_id;
2401 oa->o_gr = lsm->lsm_object_gr;
2402 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2403 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2404                 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2407 oc = ll_osscapa_get(inode, 0, CAPA_OPC_OSS_WRITE);
2408 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2409                0, OBD_OBJECT_EOF, oc);
/*
 * fcntl/flock lock handler: translate a kernel file_lock into an
 * LDLM_FLOCK enqueue against the MDS, using the inode FID as resource.
 * flock() requests (FL_FLOCK) get the missing whole-file/pid fields
 * filled in; F_GETLK-style commands are sent with LDLM_FL_TEST_LOCK.
 * On success the lock is mirrored into the local lock lists so the VFS
 * bookkeeping stays consistent.
 */
2419 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2421 struct inode *inode = file->f_dentry->d_inode;
2422 struct ll_sb_info *sbi = ll_i2sbi(inode);
2423 struct ldlm_res_id res_id =
2424     { .name = { fid_seq(ll_inode2fid(inode)),
2425                 fid_oid(ll_inode2fid(inode)),
2426                 fid_ver(ll_inode2fid(inode)),
2428 struct lustre_handle lockh = {0};
2429 ldlm_policy_data_t flock;
2430 ldlm_mode_t mode = 0;
2435 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2436        inode->i_ino, file_lock);
2438 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2440 if (file_lock->fl_flags & FL_FLOCK) {
2441 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2442 /* set missing params for flock() calls */
2443 file_lock->fl_end = OFFSET_MAX;
2444 file_lock->fl_pid = current->tgid;
2446 flock.l_flock.pid = file_lock->fl_pid;
2447 flock.l_flock.start = file_lock->fl_start;
2448 flock.l_flock.end = file_lock->fl_end;
/* Map the POSIX lock type onto an LDLM lock mode. */
2450 switch (file_lock->fl_type) {
2455 /* An unlock request may or may not have any relation to
2456  * existing locks so we may not be able to pass a lock handle
2457  * via a normal ldlm_lock_cancel() request. The request may even
2458  * unlock a byte range in the middle of an existing lock. In
2459  * order to process an unlock request we need all of the same
2460  * information that is given with a normal read or write record
2461  * lock request. To avoid creating another ldlm unlock (cancel)
2462  * message we'll treat a LCK_NL flock request as an unlock. */
2469 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set: fail rather than wait for a conflict. */
2484 flags = LDLM_FL_BLOCK_NOWAIT;
/* Test-only (F_GETLK family): probe without granting. */
2490 flags = LDLM_FL_TEST_LOCK;
2491 /* Save the old mode so that if the mode in the lock changes we
2492  * can decrement the appropriate reader or writer refcount. */
2493 file_lock->fl_type = mode;
2496 CERROR("unknown fcntl lock command: %d\n", cmd);
2500 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2501        "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2502        flags, mode, flock.l_flock.start, flock.l_flock.end);
2504 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &res_id,
2505                       LDLM_FLOCK, &flock, mode, &flags, NULL,
2506                       ldlm_flock_completion_ast, NULL, file_lock,
2507                       NULL, 0, NULL, &lockh, 0);
2508 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2509 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2510 #ifdef HAVE_F_OP_FLOCK
2511 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2512     !(flags & LDLM_FL_TEST_LOCK))
2513 posix_lock_file_wait(file, file_lock);
/*
 * ll_file_noflock() - locking entry point for the "-o noflock" mount option.
 * Body not visible in this chunk; per the fops-table comment below it is
 * expected to return -ENOSYS for all lock requests.
 */
2519 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock() - check whether this client already holds a cached
 * MDS inodebits DLM lock covering @bits for @inode.
 *
 * Uses LDLM_FL_TEST_LOCK so the match only tests for the lock and does
 * not take a new reference on it.  Returns non-zero when a matching
 * CR/CW/PR lock is found (return statements not visible in this chunk).
 */
2526 int ll_have_md_lock(struct inode *inode, __u64 bits)
2528 struct lustre_handle lockh;
2529 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2537 fid = &ll_i2info(inode)->lli_fid;
2538 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* Only consider already-granted locks; CBPENDING allows matching locks
 * that are being called back, TEST_LOCK avoids pinning the match. */
2540 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2541 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2542 LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/*
 * ll_inode_revalidate_fini() - common epilogue for inode revalidation:
 * translate the MDS getattr return code.
 *
 * -ENOENT means the file was unlinked while we still hold the inode;
 * that is treated as success (nlink updated) rather than an error.
 * Any other non-zero rc is logged.  Return statements are not visible
 * in this extraction.
 */
2549 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2550 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2551 * and return success */
2553 /* This path cannot be hit for regular files unless in
2554 * case of obscure races, so no need to to validate
/* Only non-regular, non-directory inodes are expected here; the
 * consequent of this test is missing from the extraction. */
2556 if (!S_ISREG(inode->i_mode) &&
2557 !S_ISDIR(inode->i_mode))
2562 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * ll_inode_revalidate_it() - refresh a dentry's inode attributes from the
 * MDS before a getattr/revalidate.
 *
 * Two paths, chosen by server capability:
 *  - OBD_CONNECT_ATTRFID: getattr-by-FID via an IT_GETATTR intent lock,
 *    which also revalidates the dentry (and unhashes it if unlinked);
 *  - otherwise: a plain md_getattr, but only when no cached UPDATE
 *    inodebits lock already guarantees our attributes are current.
 * Finally the OST file size is refreshed via a glimpse.
 *
 * NOTE(review): interior lines are missing from this extraction (error
 * gotos, some arguments); comments describe only what is visible.
 */
2570 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2572 struct inode *inode = dentry->d_inode;
2573 struct ptlrpc_request *req = NULL;
2574 struct ll_sb_info *sbi;
2575 struct obd_export *exp;
/* A NULL inode here is unexpected; leftover debugging aid. */
2580 CERROR("REPORT THIS LINE TO PETER\n");
2583 sbi = ll_i2sbi(inode);
2585 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2586 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2587 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2588 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REVALIDATE, 1);
2591 exp = ll_i2mdexp(inode);
/* Path 1: server supports getattr by FID via an intent lock. */
2593 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2594 struct lookup_intent oit = { .it_op = IT_GETATTR };
2595 struct md_op_data *op_data;
2597 /* Call getattr by fid, so do not provide name at all. */
2598 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2599 dentry->d_inode, NULL, 0, 0,
2600 LUSTRE_OPC_ANY, NULL);
2601 if (IS_ERR(op_data))
2602 RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE makes the MDS verify the FID still names this
 * object rather than a recycled one. */
2604 oit.it_flags |= O_CHECK_STALE;
2605 rc = md_intent_lock(exp, op_data, NULL, 0,
2606 /* we are not interested in name
2609 ll_md_blocking_ast, 0);
2610 ll_finish_md_op_data(op_data);
2611 oit.it_flags &= ~O_CHECK_STALE;
2613 rc = ll_inode_revalidate_fini(inode, rc);
2617 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2619 ll_intent_release(&oit);
2623 /* Unlinked? Unhash dentry, so it is not picked up later by
2624 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2625 here to preserve get_cwd functionality on 2.6.
2627 if (!dentry->d_inode->i_nlink) {
2628 spin_lock(&dcache_lock);
2629 ll_drop_dentry(dentry);
2630 spin_unlock(&dcache_lock);
2633 ll_lookup_finish_locks(&oit, dentry);
/* Path 2: no ATTRFID support — do a plain getattr RPC, but only if
 * we do not already hold an UPDATE lock keeping attrs valid. */
2634 } else if (!ll_have_md_lock(dentry->d_inode,
2635 MDS_INODELOCK_UPDATE)) {
2636 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2637 obd_valid valid = OBD_MD_FLGETATTR;
2638 struct obd_capa *oc;
/* Regular files also need striping EA; size the reply buffer for
 * the largest EA the MDS may return. */
2641 if (S_ISREG(inode->i_mode)) {
2642 rc = ll_get_max_mdsize(sbi, &ealen);
2645 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2647 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2648 * capa for this inode. Because we only keep capas of dirs
2650 oc = ll_mdscapa_get(inode);
2651 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2655 rc = ll_inode_revalidate_fini(inode, rc);
/* Refresh the in-core inode from the getattr reply. */
2659 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2665 /* if object not yet allocated, don't validate size */
2666 if (ll_i2info(inode)->lli_smd == NULL)
2669 /* ll_glimpse_size will prefer locally cached writes if they extend
/* Ask the OSTs for the current file size (glimpse lock). */
2671 rc = ll_glimpse_size(inode, 0);
2674 ptlrpc_req_finished(req);
2678 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/*
 * ll_getattr_it() - ->getattr implementation (2.6 kernels): revalidate the
 * inode from the MDS with the given intent, then copy the now-current
 * in-core inode attributes into the kstat the VFS handed us.
 *
 * NOTE(review): the early-return on revalidation failure is among the
 * lines missing from this extraction.
 */
2679 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2680 struct lookup_intent *it, struct kstat *stat)
2682 struct inode *inode = de->d_inode;
/* Refresh attributes (and size, via glimpse) before reporting them. */
2685 res = ll_inode_revalidate_it(de, it);
2686 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2691 stat->dev = inode->i_sb->s_dev;
2692 stat->ino = inode->i_ino;
2693 stat->mode = inode->i_mode;
2694 stat->nlink = inode->i_nlink;
2695 stat->uid = inode->i_uid;
2696 stat->gid = inode->i_gid;
2697 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2698 stat->atime = inode->i_atime;
2699 stat->mtime = inode->i_mtime;
2700 stat->ctime = inode->i_ctime;
2701 #ifdef HAVE_INODE_BLKSIZE
2702 stat->blksize = inode->i_blksize;
2704 stat->blksize = 1 << inode->i_blkbits;
/* size/blocks are read under the inode size lock so we never report a
 * torn size while concurrent i/o is updating it. */
2707 ll_inode_size_lock(inode, 0);
2708 stat->size = inode->i_size;
2709 stat->blocks = inode->i_blocks;
2710 ll_inode_size_unlock(inode, 0);
/*
 * ll_getattr() - thin VFS ->getattr wrapper: delegates to ll_getattr_it()
 * with a default IT_GETATTR intent.
 */
2714 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2716 struct lookup_intent it = { .it_op = IT_GETATTR };
2718 return ll_getattr_it(mnt, de, &it, stat);
/*
 * lustre_check_acl() - POSIX ACL permission check callback (passed to
 * generic_permission by ll_inode_permission).
 *
 * Takes a reference on the cached ACL under lli_lock so the check can run
 * without holding the spinlock, then releases it.  When the kernel lacks
 * CONFIG_FS_POSIX_ACL the whole body compiles out (return value for that
 * case is not visible in this extraction).
 */
2723 int lustre_check_acl(struct inode *inode, int mask)
2725 #ifdef CONFIG_FS_POSIX_ACL
2726 struct ll_inode_info *lli = ll_i2info(inode);
2727 struct posix_acl *acl;
/* dup under the lock: the cached ACL may be replaced concurrently. */
2731 spin_lock(&lli->lli_lock);
2732 acl = posix_acl_dup(lli->lli_posix_acl);
2733 spin_unlock(&lli->lli_lock);
2738 rc = posix_acl_permission(inode, acl, mask);
2739 posix_acl_release(acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/*
 * ll_inode_permission() - ->permission for kernels >= 2.6.10, where
 * generic_permission() accepts an ACL-check callback.  Remote-client
 * mounts bypass local checks and ask the MDS instead.
 */
2748 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2750 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2751 inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients cannot trust local uid/gid mapping; defer to server. */
2752 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2753 return lustre_check_remote_perm(inode, mask);
2755 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2756 return generic_permission(inode, mask, lustre_check_acl);
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
/*
 * ll_inode_permission() - ->permission fallback for kernels older than
 * 2.6.10: an open-coded version of generic_permission() (owner/group/
 * other mode bits, ACL hook, then capability overrides), since those
 * kernels' generic_permission() has no ACL-callback parameter.
 *
 * NOTE(review): several lines (returns, gotos, some else-branches) are
 * missing from this extraction; comments describe only what is visible.
 */
2760 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2762 int ll_inode_permission(struct inode *inode, int mask)
2765 int mode = inode->i_mode;
2768 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2769 inode->i_ino, inode->i_generation, inode, mask);
/* Remote clients defer all permission decisions to the MDS. */
2771 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2772 return lustre_check_remote_perm(inode, mask);
2774 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* No writes to regular files/dirs/symlinks on a read-only fs ... */
2776 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2777 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
/* ... nor to immutable inodes. */
2779 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* Owner check: use the "user" mode-bit triplet. */
2781 if (current->fsuid == inode->i_uid) {
/* Hedge: this shifted comparison presumably belongs to the group
 * triplet ((mode >> 3) & S_IRWXO) — TODO confirm, surrounding
 * branch lines are missing from the extraction. */
2784 if (((mode >> 3) & mask & S_IRWXO) != mask)
/* Consult POSIX ACLs before falling back to group/other bits. */
2786 rc = lustre_check_acl(inode, mask);
2790 goto check_capabilities;
2794 if (in_group_p(inode->i_gid))
/* Other: the low mode-bit triplet must cover the full mask. */
2797 if ((mode & mask & S_IRWXO) == mask)
/* CAP_DAC_OVERRIDE bypasses mode bits, except exec of a file with no
 * exec bit at all (matches generic_permission semantics). */
2801 if (!(mask & MAY_EXEC) ||
2802 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2803 if (capable(CAP_DAC_OVERRIDE))
/* CAP_DAC_READ_SEARCH allows reads, and dir lookup short of write. */
2806 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2807 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2814 /* -o localflock - only provides locally consistent flock locks */
/*
 * Default file operations: no .flock/.lock entries, so the kernel's own
 * locally-consistent locking applies (see -o localflock comment above).
 * Closing brace of the initializer is not visible in this extraction.
 */
2815 struct file_operations ll_file_operations = {
2816 .read = ll_file_read,
2817 .write = ll_file_write,
2818 .ioctl = ll_file_ioctl,
2819 .open = ll_file_open,
2820 .release = ll_file_release,
2821 .mmap = ll_file_mmap,
2822 .llseek = ll_file_seek,
2823 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2824 .sendfile = ll_file_sendfile,
/*
 * File operations for -o flock mounts: identical to the default table but
 * wires .flock/.lock to ll_file_flock for cluster-wide advisory locking.
 * Closing brace of the initializer is not visible in this extraction.
 */
2829 struct file_operations ll_file_operations_flock = {
2830 .read = ll_file_read,
2831 .write = ll_file_write,
2832 .ioctl = ll_file_ioctl,
2833 .open = ll_file_open,
2834 .release = ll_file_release,
2835 .mmap = ll_file_mmap,
2836 .llseek = ll_file_seek,
2837 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2838 .sendfile = ll_file_sendfile,
2841 #ifdef HAVE_F_OP_FLOCK
2842 .flock = ll_file_flock,
2844 .lock = ll_file_flock
2847 /* These are for -o noflock - to return ENOSYS on flock calls */
/*
 * File operations for -o noflock mounts: lock entry points route to
 * ll_file_noflock so flock/fcntl locking reports ENOSYS.
 * Closing brace of the initializer is not visible in this extraction.
 */
2848 struct file_operations ll_file_operations_noflock = {
2849 .read = ll_file_read,
2850 .write = ll_file_write,
2851 .ioctl = ll_file_ioctl,
2852 .open = ll_file_open,
2853 .release = ll_file_release,
2854 .mmap = ll_file_mmap,
2855 .llseek = ll_file_seek,
2856 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2857 .sendfile = ll_file_sendfile,
2860 #ifdef HAVE_F_OP_FLOCK
2861 .flock = ll_file_noflock,
2863 .lock = ll_file_noflock
2866 struct inode_operations ll_file_inode_operations = {
2867 #ifdef LUSTRE_KERNEL_VERSION
2868 .setattr_raw = ll_setattr_raw,
2870 .setattr = ll_setattr,
2871 .truncate = ll_truncate,
2872 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2873 .getattr = ll_getattr,
2875 .revalidate_it = ll_inode_revalidate_it,
2877 .permission = ll_inode_permission,
2878 .setxattr = ll_setxattr,
2879 .getxattr = ll_getxattr,
2880 .listxattr = ll_listxattr,
2881 .removexattr = ll_removexattr,