lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  19  *
  20  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  21  * CA 95054 USA or visit www.sun.com if you need additional information or
  22  * have any questions.
  23  *
  24  * GPL HEADER END
  25  */
  26 /*
  27  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  28  * Use is subject to license terms.
  29  *
  30  * Copyright (c) 2011, 2013, Intel Corporation.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lustre/llite/file.c
  37  *
  38  * Author: Peter Braam <braam@clusterfs.com>
  39  * Author: Phil Schwan <phil@clusterfs.com>
  40  * Author: Andreas Dilger <adilger@clusterfs.com>
  41  */
  42
  43 #define DEBUG_SUBSYSTEM S_LLITE
  44 #include <lustre_dlm.h>
  45 #include <lustre_lite.h>
  46 #include <linux/pagemap.h>
  47 #include <linux/file.h>
  48 #include "llite_internal.h"
  49 #include <lustre/ll_fiemap.h>
  50
  51 #include "cl_object.h"
  52
  53 struct ll_file_data *ll_file_data_get(void)
  54 {
  55         struct ll_file_data *fd;
  56
  57         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, __GFP_IO);
  58         if (fd == NULL)
  59                 return NULL;
  60
  61         fd->fd_write_failed = false;
  62
  63         return fd;
  64 }
  65
  66 static void ll_file_data_put(struct ll_file_data *fd)
  67 {
  68         if (fd != NULL)
  69                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  70 }
  71
  72 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
  73                           struct lustre_handle *fh)
  74 {
  75         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
  76         op_data->op_attr.ia_mode = inode->i_mode;
  77         op_data->op_attr.ia_atime = inode->i_atime;
  78         op_data->op_attr.ia_mtime = inode->i_mtime;
  79         op_data->op_attr.ia_ctime = inode->i_ctime;
  80         op_data->op_attr.ia_size = i_size_read(inode);
  81         op_data->op_attr_blocks = inode->i_blocks;
  82         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags =
  83                                         ll_inode_to_ext_flags(inode->i_flags);
  84         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
  85         if (fh)
  86                 op_data->op_handle = *fh;
  87         op_data->op_capa1 = ll_mdscapa_get(inode);
  88
  89         if (LLIF_DATA_MODIFIED & ll_i2info(inode)->lli_flags)
  90                 op_data->op_bias |= MDS_DATA_MODIFIED;
  91 }
  92
  93 /**
  94  * Closes the IO epoch and packs all the attributes into @op_data for
  95  * the CLOSE rpc.
  96  */
  97 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  98                              struct obd_client_handle *och)
  99 {
 100         ENTRY;
 101
 102         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                         ATTR_MTIME | ATTR_MTIME_SET |
 104                                         ATTR_CTIME | ATTR_CTIME_SET;
 105
 106         if (!(och->och_flags & FMODE_WRITE))
 107                 goto out;
 108
 109         if (!exp_connect_som(ll_i2mdexp(inode)) || !S_ISREG(inode->i_mode))
 110                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 111         else
 112                 ll_ioepoch_close(inode, op_data, &och, 0);
 113
 114 out:
 115         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
 116         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 117                            0, 0, LUSTRE_OPC_ANY, NULL);
 118         EXIT;
 119 }
 120
 121 static int ll_close_inode_openhandle(struct obd_export *md_exp,
 122                                      struct inode *inode,
 123                                      struct obd_client_handle *och)
 124 {
 125         struct obd_export *exp = ll_i2mdexp(inode);
 126         struct md_op_data *op_data;
 127         struct ptlrpc_request *req = NULL;
 128         struct obd_device *obd = class_exp2obd(exp);
 129         int epoch_close = 1;
 130         int rc;
 131         ENTRY;
 132
 133         if (obd == NULL) {
 134                 /*
 135                  * XXX: in case of LMV, is this correct to access
 136                  * ->exp_handle?
 137                  */
 138                 CERROR("Invalid MDC connection handle "LPX64"\n",
 139                        ll_i2mdexp(inode)->exp_handle.h_cookie);
 140                 GOTO(out, rc = 0);
 141         }
 142
 143         OBD_ALLOC_PTR(op_data);
 144         if (op_data == NULL)
 145                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
 146
 147         ll_prepare_close(inode, op_data, och);
 148         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
 149         rc = md_close(md_exp, op_data, och->och_mod, &req);
 150         if (rc == -EAGAIN) {
 151                 /* This close must have the epoch closed. */
 152                 LASSERT(epoch_close);
 153                 /* MDS has instructed us to obtain Size-on-MDS attribute from
 154                  * OSTs and send setattr to back to MDS. */
 155                 rc = ll_som_update(inode, op_data);
 156                 if (rc) {
 157                         CERROR("inode %lu mdc Size-on-MDS update failed: "
 158                                "rc = %d\n", inode->i_ino, rc);
 159                         rc = 0;
 160                 }
 161         } else if (rc) {
 162                 CERROR("inode %lu mdc close failed: rc = %d\n",
 163                        inode->i_ino, rc);
 164         }
 165
 166         /* DATA_MODIFIED flag was successfully sent on close, cancel data
 167          * modification flag. */
 168         if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) {
 169                 struct ll_inode_info *lli = ll_i2info(inode);
 170
 171                 spin_lock(&lli->lli_lock);
 172                 lli->lli_flags &= ~LLIF_DATA_MODIFIED;
 173                 spin_unlock(&lli->lli_lock);
 174         }
 175
 176         ll_finish_md_op_data(op_data);
 177
 178         if (rc == 0) {
 179                 rc = ll_objects_destroy(req, inode);
 180                 if (rc)
 181                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
 182                                inode->i_ino, rc);
 183         }
 184
 185         EXIT;
 186 out:
 187
 188         if (exp_connect_som(exp) && !epoch_close &&
 189             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
 190                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
 191         } else {
 192                 md_clear_open_replay_data(md_exp, och);
 193                 /* Free @och if it is not waiting for DONE_WRITING. */
 194                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 195                 OBD_FREE_PTR(och);
 196         }
 197         if (req) /* This is close request */
 198                 ptlrpc_req_finished(req);
 199         return rc;
 200 }
 201
 202 int ll_md_real_close(struct inode *inode, int flags)
 203 {
 204         struct ll_inode_info *lli = ll_i2info(inode);
 205         struct obd_client_handle **och_p;
 206         struct obd_client_handle *och;
 207         __u64 *och_usecount;
 208         int rc = 0;
 209         ENTRY;
 210
 211         if (flags & FMODE_WRITE) {
 212                 och_p = &lli->lli_mds_write_och;
 213                 och_usecount = &lli->lli_open_fd_write_count;
 214         } else if (flags & FMODE_EXEC) {
 215                 och_p = &lli->lli_mds_exec_och;
 216                 och_usecount = &lli->lli_open_fd_exec_count;
 217         } else {
 218                 LASSERT(flags & FMODE_READ);
 219                 och_p = &lli->lli_mds_read_och;
 220                 och_usecount = &lli->lli_open_fd_read_count;
 221         }
 222
 223         mutex_lock(&lli->lli_och_mutex);
 224         if (*och_usecount) { /* There are still users of this handle, so
 225                                 skip freeing it. */
 226                 mutex_unlock(&lli->lli_och_mutex);
 227                 RETURN(0);
 228         }
 229         och=*och_p;
 230         *och_p = NULL;
 231         mutex_unlock(&lli->lli_och_mutex);
 232
 233         if (och) { /* There might be a race and somebody have freed this och
 234                       already */
 235                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
 236                                                inode, och);
 237         }
 238
 239         RETURN(rc);
 240 }
 241
 242 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
 243                 struct file *file)
 244 {
 245         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 246         struct ll_inode_info *lli = ll_i2info(inode);
 247         int rc = 0;
 248         ENTRY;
 249
 250         /* clear group lock, if present */
 251         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 252                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
 253
 254         if (fd->fd_lease_och != NULL) {
 255                 bool lease_broken;
 256
 257                 /* Usually the lease is not released when the
 258                  * application crashed, we need to release here. */
 259                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 260                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 261                         PFID(&lli->lli_fid), rc, lease_broken);
 262
 263                 fd->fd_lease_och = NULL;
 264         }
 265
 266         if (fd->fd_och != NULL) {
 267                 rc = ll_close_inode_openhandle(md_exp, inode, fd->fd_och);
 268                 fd->fd_och = NULL;
 269                 GOTO(out, rc);
 270         }
 271
 272         /* Let's see if we have good enough OPEN lock on the file and if
 273            we can skip talking to MDS */
 274         if (file->f_dentry->d_inode) { /* Can this ever be false? */
 275                 int lockmode;
 276                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 277                 struct lustre_handle lockh;
 278                 struct inode *inode = file->f_dentry->d_inode;
 279                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 280
 281                 mutex_lock(&lli->lli_och_mutex);
 282                 if (fd->fd_omode & FMODE_WRITE) {
 283                         lockmode = LCK_CW;
 284                         LASSERT(lli->lli_open_fd_write_count);
 285                         lli->lli_open_fd_write_count--;
 286                 } else if (fd->fd_omode & FMODE_EXEC) {
 287                         lockmode = LCK_PR;
 288                         LASSERT(lli->lli_open_fd_exec_count);
 289                         lli->lli_open_fd_exec_count--;
 290                 } else {
 291                         lockmode = LCK_CR;
 292                         LASSERT(lli->lli_open_fd_read_count);
 293                         lli->lli_open_fd_read_count--;
 294                 }
 295                 mutex_unlock(&lli->lli_och_mutex);
 296
 297                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
 298                                    LDLM_IBITS, &policy, lockmode,
 299                                    &lockh)) {
 300                         rc = ll_md_real_close(file->f_dentry->d_inode,
 301                                               fd->fd_omode);
 302                 }
 303         } else {
 304                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
 305                        file, file->f_dentry, file->f_dentry->d_name.name);
 306         }
 307
 308 out:
 309         LUSTRE_FPRIVATE(file) = NULL;
 310         ll_file_data_put(fd);
 311         ll_capa_close(inode);
 312
 313         RETURN(rc);
 314 }
 315
 316 /* While this returns an error code, fput() the caller does not, so we need
 317  * to make every effort to clean up all of our state here.  Also, applications
 318  * rarely check close errors and even if an error is returned they will not
 319  * re-try the close call.
 320  */
 321 int ll_file_release(struct inode *inode, struct file *file)
 322 {
 323         struct ll_file_data *fd;
 324         struct ll_sb_info *sbi = ll_i2sbi(inode);
 325         struct ll_inode_info *lli = ll_i2info(inode);
 326         int rc;
 327         ENTRY;
 328
 329         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 330                inode->i_generation, inode);
 331
 332 #ifdef CONFIG_FS_POSIX_ACL
 333         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
 334             inode == inode->i_sb->s_root->d_inode) {
 335                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 336
 337                 LASSERT(fd != NULL);
 338                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
 339                         fd->fd_flags &= ~LL_FILE_RMTACL;
 340                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
 341                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
 342                 }
 343         }
 344 #endif
 345
 346         if (inode->i_sb->s_root != file->f_dentry)
 347                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 348         fd = LUSTRE_FPRIVATE(file);
 349         LASSERT(fd != NULL);
 350
 351         /* The last ref on @file, maybe not the the owner pid of statahead.
 352          * Different processes can open the same dir, "ll_opendir_key" means:
 353          * it is me that should stop the statahead thread. */
 354         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd &&
 355             lli->lli_opendir_pid != 0)
 356                 ll_stop_statahead(inode, lli->lli_opendir_key);
 357
 358         if (inode->i_sb->s_root == file->f_dentry) {
 359                 LUSTRE_FPRIVATE(file) = NULL;
 360                 ll_file_data_put(fd);
 361                 RETURN(0);
 362         }
 363
 364         if (!S_ISDIR(inode->i_mode)) {
 365                 lov_read_and_clear_async_rc(lli->lli_clob);
 366                 lli->lli_async_rc = 0;
 367         }
 368
 369         rc = ll_md_close(sbi->ll_md_exp, inode, file);
 370
 371         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 372                 libcfs_debug_dumplog();
 373
 374         RETURN(rc);
 375 }
 376
 377 static int ll_intent_file_open(struct file *file, void *lmm,
 378                                int lmmsize, struct lookup_intent *itp)
 379 {
 380         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 381         struct dentry *parent = file->f_dentry->d_parent;
 382         struct md_op_data *op_data;
 383         struct ptlrpc_request *req;
 384         __u32 opc = LUSTRE_OPC_ANY;
 385         int rc;
 386         ENTRY;
 387
 388         if (!parent)
 389                 RETURN(-ENOENT);
 390
 391         /* Usually we come here only for NFSD, and we want open lock.
 392            But we can also get here with pre 2.6.15 patchless kernels, and in
 393            that case that lock is also ok */
 394         /* We can also get here if there was cached open handle in revalidate_it
 395          * but it disappeared while we were getting from there to ll_file_open.
 396          * But this means this file was closed and immediatelly opened which
 397          * makes a good candidate for using OPEN lock */
 398         /* If lmmsize & lmm are not 0, we are just setting stripe info
 399          * parameters. No need for the open lock */
 400         if (lmm == NULL && lmmsize == 0) {
 401                 itp->it_flags |= MDS_OPEN_LOCK;
 402                 if (itp->it_flags & FMODE_WRITE)
 403                         opc = LUSTRE_OPC_CREATE;
 404         }
 405
 406         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
 407                                       file->f_dentry->d_inode, NULL, 0,
 408                                       O_RDWR, opc, NULL);
 409
 410         if (IS_ERR(op_data))
 411                 RETURN(PTR_ERR(op_data));
 412
 413         itp->it_flags |= MDS_OPEN_BY_FID;
 414         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
 415                             0 /*unused */, &req, ll_md_blocking_ast, 0);
 416         ll_finish_md_op_data(op_data);
 417         if (rc == -ESTALE) {
 418                 /* reason for keep own exit path - don`t flood log
 419                 * with messages with -ESTALE errors.
 420                 */
 421                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 422                      it_open_error(DISP_OPEN_OPEN, itp))
 423                         GOTO(out, rc);
 424                 ll_release_openhandle(file->f_dentry, itp);
 425                 GOTO(out, rc);
 426         }
 427
 428         if (it_disposition(itp, DISP_LOOKUP_NEG))
 429                 GOTO(out, rc = -ENOENT);
 430
 431         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 432                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 433                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 434                 GOTO(out, rc);
 435         }
 436
 437         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL, itp);
 438         if (!rc && itp->d.lustre.it_lock_mode)
 439                 ll_set_lock_data(sbi->ll_md_exp, file->f_dentry->d_inode,
 440                                  itp, NULL);
 441
 442 out:
 443         ptlrpc_req_finished(itp->d.lustre.it_data);
 444         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 445         ll_intent_drop_lock(itp);
 446
 447         RETURN(rc);
 448 }
 449
 450 /**
 451  * Assign an obtained @ioepoch to client's inode. No lock is needed, MDS does
 452  * not believe attributes if a few ioepoch holders exist. Attributes for
 453  * previous ioepoch if new one is opened are also skipped by MDS.
 454  */
 455 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
 456 {
 457         if (ioepoch && lli->lli_ioepoch != ioepoch) {
 458                 lli->lli_ioepoch = ioepoch;
 459                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
 460                        ioepoch, PFID(&lli->lli_fid));
 461         }
 462 }
 463
 464 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 465                        struct obd_client_handle *och)
 466 {
 467         struct ptlrpc_request *req = it->d.lustre.it_data;
 468         struct mdt_body *body;
 469
 470         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 471         och->och_fh = body->handle;
 472         och->och_fid = body->fid1;
 473         och->och_lease_handle.cookie = it->d.lustre.it_lock_handle;
 474         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 475         och->och_flags = it->it_flags;
 476
 477         return md_set_open_replay_data(md_exp, och, req);
 478 }
 479
 480 int ll_local_open(struct file *file, struct lookup_intent *it,
 481                   struct ll_file_data *fd, struct obd_client_handle *och)
 482 {
 483         struct inode *inode = file->f_dentry->d_inode;
 484         struct ll_inode_info *lli = ll_i2info(inode);
 485         ENTRY;
 486
 487         LASSERT(!LUSTRE_FPRIVATE(file));
 488
 489         LASSERT(fd != NULL);
 490
 491         if (och) {
 492                 struct ptlrpc_request *req = it->d.lustre.it_data;
 493                 struct mdt_body *body;
 494                 int rc;
 495
 496                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 497                 if (rc != 0)
 498                         RETURN(rc);
 499
 500                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 501                 ll_ioepoch_open(lli, body->ioepoch);
 502         }
 503
 504         LUSTRE_FPRIVATE(file) = fd;
 505         ll_readahead_init(inode, &fd->fd_ras);
 506         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 507
 508         RETURN(0);
 509 }
 510
  511 /* Open a file, and (for the very first open) create objects on the OSTs at
  512  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
  513  * creation or open until ll_lov_setstripe() ioctl is called.
  514  *
  515  * If we already have the stripe MD locally then we don't request it in
  516  * md_open(), by passing a lmm_size = 0.
  517  *
  518  * It is up to the application to ensure no other processes open this file
  519  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
  520  * used.  We might be able to avoid races of that sort by getting lli_open_sem
  521  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
  522  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
       *
       * Outline of what the code below does:
       *  - any lookup intent found in file->private_data is taken over and the
       *    field is cleared;
       *  - a struct ll_file_data is allocated for this open;
       *  - for the filesystem root dentry the ll_file_data is attached to the
       *    file and 0 is returned right away;
       *  - if no intent with a disposition is available, a local IT_OPEN intent
       *    is built from file->f_flags;
       *  - under lli_och_mutex, an MDS open handle already cached on the inode
       *    for the requested mode (write / exec / read) is reused and its use
       *    count is incremented; otherwise ll_intent_file_open() is called with
       *    the mutex dropped and the lookup is restarted, after which a new
       *    obd_client_handle is allocated and filled by ll_local_open().
       *
       * Returns 0 on success.  On failure the error code (e.g. -ENOMEM) is
       * returned, a still-unattached ll_file_data is freed, and statahead
       * ownership taken by this open is released via ll_stop_statahead().
  523  */
  524 int ll_file_open(struct inode *inode, struct file *file)
  525 {
  526         struct ll_inode_info *lli = ll_i2info(inode);
  527         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
  528                                           .it_flags = file->f_flags };
  529         struct obd_client_handle **och_p = NULL;
  530         __u64 *och_usecount = NULL;
  531         struct ll_file_data *fd;
  532         int rc = 0, opendir_set = 0;
  533         ENTRY;
  534
  535         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
  536                inode->i_generation, inode, file->f_flags);
  537
  538         it = file->private_data; /* XXX: compat macro */
  539         file->private_data = NULL; /* prevent ll_local_open assertion */
  540
  541         fd = ll_file_data_get();
  542         if (fd == NULL)
  543                 GOTO(out_openerr, rc = -ENOMEM);
  544
  545         fd->fd_file = file;
              /* For a directory with no opendir key, statahead info or pid
               * recorded yet, register this fd and the current pid;
               * opendir_set remembers that so the error path can undo it. */
  546         if (S_ISDIR(inode->i_mode)) {
  547                 spin_lock(&lli->lli_sa_lock);
  548                 if (lli->lli_opendir_key == NULL && lli->lli_sai == NULL &&
  549                     lli->lli_opendir_pid == 0) {
  550                         lli->lli_opendir_key = fd;
  551                         lli->lli_opendir_pid = cfs_curproc_pid();
  552                         opendir_set = 1;
  553                 }
  554                 spin_unlock(&lli->lli_sa_lock);
  555         }
  556
              /* Root dentry: attach fd and return without any MDS open. */
  557         if (inode->i_sb->s_root == file->f_dentry) {
  558                 LUSTRE_FPRIVATE(file) = fd;
  559                 RETURN(0);
  560         }
  561
  562         if (!it || !it->d.lustre.it_disposition) {
  563                 /* Convert f_flags into access mode. We cannot use file->f_mode,
  564                  * because everything but O_ACCMODE mask was stripped from
  565                  * there */
                      /* NOTE(review): this relies on O_RDONLY/O_WRONLY/O_RDWR
                       * being 0/1/2 so that adding one yields the FMODE_READ /
                       * FMODE_WRITE bits - confirm against the target kernel. */
  566                 if ((oit.it_flags + 1) & O_ACCMODE)
  567                         oit.it_flags++;
  568                 if (file->f_flags & O_TRUNC)
  569                         oit.it_flags |= FMODE_WRITE;
  570
  571                 /* kernel only call f_op->open in dentry_open.  filp_open calls
  572                  * dentry_open after call to open_namei that checks permissions.
  573                  * Only nfsd_open call dentry_open directly without checking
  574                  * permissions and because of that this code below is safe. */
  575                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
  576                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
  577
  578                 /* We do not want O_EXCL here, presumably we opened the file
  579                  * already? XXX - NFS implications? */
  580                 oit.it_flags &= ~O_EXCL;
  581
  582                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
  583                  * created if necessary, then "IT_CREAT" should be set to keep
  584                  * consistent with it */
  585                 if (oit.it_flags & O_CREAT)
  586                         oit.it_op |= IT_CREAT;
  587
  588                 it = &oit;
  589         }
  590
  591 restart:
  592         /* Let's see if we have file open on MDS already. */
  593         if (it->it_flags & FMODE_WRITE) {
  594                 och_p = &lli->lli_mds_write_och;
  595                 och_usecount = &lli->lli_open_fd_write_count;
  596         } else if (it->it_flags & FMODE_EXEC) {
  597                 och_p = &lli->lli_mds_exec_och;
  598                 och_usecount = &lli->lli_open_fd_exec_count;
  599          } else {
  600                 och_p = &lli->lli_mds_read_och;
  601                 och_usecount = &lli->lli_open_fd_read_count;
  602         }
  603
  604         mutex_lock(&lli->lli_och_mutex);
  605         if (*och_p) { /* Open handle is present */
  606                 if (it_disposition(it, DISP_OPEN_OPEN)) {
  607                         /* Well, there's extra open request that we do not need,
  608                            let's close it somehow. This will decref request. */
  609                         rc = it_open_error(DISP_OPEN_OPEN, it);
  610                         if (rc) {
  611                                 mutex_unlock(&lli->lli_och_mutex);
  612                                 GOTO(out_openerr, rc);
  613                         }
  614
  615                         ll_release_openhandle(file->f_dentry, it);
  616                 }
  617                 (*och_usecount)++;
  618
                      /* NULL och: only attach fd, the cached handle is shared. */
  619                 rc = ll_local_open(file, it, fd, NULL);
  620                 if (rc) {
  621                         (*och_usecount)--;
  622                         mutex_unlock(&lli->lli_och_mutex);
  623                         GOTO(out_openerr, rc);
  624                 }
  625         } else {
  626                 LASSERT(*och_usecount == 0);
  627                 if (!it->d.lustre.it_disposition) {
  628                         /* We cannot just request lock handle now, new ELC code
  629                            means that one of other OPEN locks for this file
  630                            could be cancelled, and since blocking ast handler
  631                            would attempt to grab och_mutex as well, that would
  632                            result in a deadlock */
  633                         mutex_unlock(&lli->lli_och_mutex);
  634                         it->it_create_mode |= M_CHECK_STALE;
  635                         rc = ll_intent_file_open(file, NULL, 0, it);
  636                         it->it_create_mode &= ~M_CHECK_STALE;
  637                         if (rc)
  638                                 GOTO(out_openerr, rc);
  639
                              /* The mutex was dropped, so the cached-handle
                               * state is looked up again from the top. */
  640                         goto restart;
  641                 }
  642                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
  643                 if (!*och_p)
  644                         GOTO(out_och_free, rc = -ENOMEM);
  645
  646                 (*och_usecount)++;
  647
  648                 /* md_intent_lock() didn't get a request ref if there was an
  649                  * open error, so don't do cleanup on the request here
  650                  * (bug 3430) */
  651                 /* XXX (green): Should not we bail out on any error here, not
  652                  * just open error? */
  653                 rc = it_open_error(DISP_OPEN_OPEN, it);
  654                 if (rc)
  655                         GOTO(out_och_free, rc);
  656
  657                 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
  658
  659                 rc = ll_local_open(file, it, fd, *och_p);
  660                 if (rc)
  661                         GOTO(out_och_free, rc);
  662         }
  663         mutex_unlock(&lli->lli_och_mutex);
              /* ll_local_open() stored fd in the file's private data, so the
               * exit code below must not free it any more. */
  664         fd = NULL;
  665
  666         /* Must do this outside lli_och_mutex lock to prevent deadlock where
  667            different kind of OPEN lock for this same inode gets cancelled
  668            by ldlm_cancel_lru */
  669         if (!S_ISREG(inode->i_mode))
  670                 GOTO(out_och_free, rc);
  671
  672         ll_capa_open(inode);
  673
  674         if (!lli->lli_has_smd) {
  675                 if (file->f_flags & O_LOV_DELAY_CREATE ||
  676                     !(file->f_mode & FMODE_WRITE)) {
  677                         CDEBUG(D_INODE, "object creation was delayed\n");
  678                         GOTO(out_och_free, rc);
  679                 }
  680         }
  681         file->f_flags &= ~O_LOV_DELAY_CREATE;
  682         GOTO(out_och_free, rc);
  683
              /* out_och_free is reached either with rc != 0 while lli_och_mutex
               * is still held (the error branch frees the new handle and
               * unlocks), or with rc == 0 after the mutex was released (the
               * error branch is skipped). */
  684 out_och_free:
  685         if (rc) {
  686                 if (och_p && *och_p) {
  687                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
  688                         *och_p = NULL; /* OBD_FREE writes some magic there */
  689                         (*och_usecount)--;
  690                 }
  691                 mutex_unlock(&lli->lli_och_mutex);
  692
                      /* out_openerr sits inside the error branch: jumps to it
                       * bypass the rc test and the unlock above, and are made
                       * only when lli_och_mutex is not held. */
  693 out_openerr:
  694                 if (opendir_set != 0)
  695                         ll_stop_statahead(inode, lli->lli_opendir_key);
  696                 if (fd != NULL)
  697                         ll_file_data_put(fd);
  698         } else {
  699                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
  700         }
  701
              /* Drop the request reference held by the intent, if any. */
  702         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
  703                 ptlrpc_req_finished(it->d.lustre.it_data);
  704                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
  705         }
  706
  707         return rc;
  708 }
 709
 710 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 711                         struct ldlm_lock_desc *desc, void *data, int flag)
 712 {
 713         int rc;
 714         struct lustre_handle lockh;
 715         ENTRY;
 716
 717         switch (flag) {
 718         case LDLM_CB_BLOCKING:
 719                 ldlm_lock2handle(lock, &lockh);
 720                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 721                 if (rc < 0) {
 722                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 723                         RETURN(rc);
 724                 }
 725                 break;
 726         case LDLM_CB_CANCELING:
 727                 /* do nothing */
 728                 break;
 729         }
 730         RETURN(0);
 731 }
 732
 733 /**
 734  * Acquire a lease and open the file.
 735  */
 736 struct obd_client_handle *ll_lease_open(struct inode *inode, struct file *file,
 737                                         fmode_t fmode)
 738 {
 739         struct lookup_intent it = { .it_op = IT_OPEN };
 740         struct ll_sb_info *sbi = ll_i2sbi(inode);
 741         struct md_op_data *op_data;
 742         struct ptlrpc_request *req;
 743         struct lustre_handle old_handle = { 0 };
 744         struct obd_client_handle *och = NULL;
 745         int rc;
 746         int rc2;
 747         ENTRY;
 748
 749         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 750                 RETURN(ERR_PTR(-EINVAL));
 751
 752         if (file != NULL) {
 753                 struct ll_inode_info *lli = ll_i2info(inode);
 754                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 755                 struct obd_client_handle **och_p;
 756                 __u64 *och_usecount;
 757
 758                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 759                         RETURN(ERR_PTR(-EPERM));
 760
 761                 /* Get the openhandle of the file */
 762                 rc = -EBUSY;
 763                 mutex_lock(&lli->lli_och_mutex);
 764                 if (fd->fd_lease_och != NULL) {
 765                         mutex_unlock(&lli->lli_och_mutex);
 766                         RETURN(ERR_PTR(rc));
 767                 }
 768
 769                 if (fd->fd_och == NULL) {
 770                         if (file->f_mode & FMODE_WRITE) {
 771                                 LASSERT(lli->lli_mds_write_och != NULL);
 772                                 och_p = &lli->lli_mds_write_och;
 773                                 och_usecount = &lli->lli_open_fd_write_count;
 774                         } else {
 775                                 LASSERT(lli->lli_mds_read_och != NULL);
 776                                 och_p = &lli->lli_mds_read_och;
 777                                 och_usecount = &lli->lli_open_fd_read_count;
 778                         }
 779                         if (*och_usecount == 1) {
 780                                 fd->fd_och = *och_p;
 781                                 *och_p = NULL;
 782                                 *och_usecount = 0;
 783                                 rc = 0;
 784                         }
 785                 }
 786                 mutex_unlock(&lli->lli_och_mutex);
 787                 if (rc < 0) /* more than 1 opener */
 788                         RETURN(ERR_PTR(rc));
 789
 790                 LASSERT(fd->fd_och != NULL);
 791                 old_handle = fd->fd_och->och_fh;
 792         }
 793
 794         OBD_ALLOC_PTR(och);
 795         if (och == NULL)
 796                 RETURN(ERR_PTR(-ENOMEM));
 797
 798         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 799                                         LUSTRE_OPC_ANY, NULL);
 800         if (IS_ERR(op_data))
 801                 GOTO(out, rc = PTR_ERR(op_data));
 802
 803         /* To tell the MDT this openhandle is from the same owner */
 804         op_data->op_handle = old_handle;
 805
 806         it.it_flags = fmode | MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 807         rc = md_intent_lock(sbi->ll_md_exp, op_data, NULL, 0, &it, 0, &req,
 808                                 ll_md_blocking_lease_ast,
 809         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 810          * it can be cancelled which may mislead applications that the lease is
 811          * broken;
 812          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 813          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 814          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 815                                 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 816         ll_finish_md_op_data(op_data);
 817         if (req != NULL) {
 818                 ptlrpc_req_finished(req);
 819                 it_clear_disposition(&it, DISP_ENQ_COMPLETE);
 820         }
 821         if (rc < 0)
 822                 GOTO(out_release_it, rc);
 823
 824         if (it_disposition(&it, DISP_LOOKUP_NEG))
 825                 GOTO(out_release_it, rc = -ENOENT);
 826
 827         rc = it_open_error(DISP_OPEN_OPEN, &it);
 828         if (rc)
 829                 GOTO(out_release_it, rc);
 830
 831         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 832         ll_och_fill(sbi->ll_md_exp, &it, och);
 833
 834         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 835                 GOTO(out_close, rc = -EOPNOTSUPP);
 836
 837         /* already get lease, handle lease lock */
 838         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 839         if (it.d.lustre.it_lock_mode == 0 ||
 840             it.d.lustre.it_lock_bits != MDS_INODELOCK_OPEN) {
 841                 /* open lock must return for lease */
 842                 CERROR(DFID "lease granted but no open lock, %d/%Lu.\n",
 843                         PFID(ll_inode2fid(inode)), it.d.lustre.it_lock_mode,
 844                         it.d.lustre.it_lock_bits);
 845                 GOTO(out_close, rc = -EPROTO);
 846         }
 847
 848         ll_intent_release(&it);
 849         RETURN(och);
 850
 851 out_close:
 852         rc2 = ll_close_inode_openhandle(sbi->ll_md_exp, inode, och);
 853         if (rc2)
 854                 CERROR("Close openhandle returned %d\n", rc2);
 855
 856         /* cancel open lock */
 857         if (it.d.lustre.it_lock_mode != 0) {
 858                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 859                                                 it.d.lustre.it_lock_mode);
 860                 it.d.lustre.it_lock_mode = 0;
 861         }
 862 out_release_it:
 863         ll_intent_release(&it);
 864 out:
 865         OBD_FREE_PTR(och);
 866         RETURN(ERR_PTR(rc));
 867 }
 868 EXPORT_SYMBOL(ll_lease_open);
 869
 870 /**
 871  * Release lease and close the file.
 872  * It will check if the lease has ever broken.
 873  */
 874 int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
 875                         bool *lease_broken)
 876 {
 877         struct ldlm_lock *lock;
 878         bool cancelled = true;
 879         int rc;
 880         ENTRY;
 881
 882         lock = ldlm_handle2lock(&och->och_lease_handle);
 883         if (lock != NULL) {
 884                 lock_res_and_lock(lock);
 885                 cancelled = ldlm_is_cancel(lock);
 886                 unlock_res_and_lock(lock);
 887                 ldlm_lock_put(lock);
 888         }
 889
 890         CDEBUG(D_INODE, "lease for "DFID" broken? %d\n",
 891                 PFID(&ll_i2info(inode)->lli_fid), cancelled);
 892
 893         if (!cancelled)
 894                 ldlm_cli_cancel(&och->och_lease_handle, 0);
 895         if (lease_broken != NULL)
 896                 *lease_broken = cancelled;
 897
 898         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp, inode, och);
 899         RETURN(rc);
 900 }
 901 EXPORT_SYMBOL(ll_lease_close);
 902
 903 /* Fills the obdo with the attributes for the lsm */
 904 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
 905                           struct obd_capa *capa, struct obdo *obdo,
 906                           __u64 ioepoch, int sync)
 907 {
 908         struct ptlrpc_request_set *set;
 909         struct obd_info            oinfo = { { { 0 } } };
 910         int                        rc;
 911
 912         ENTRY;
 913
 914         LASSERT(lsm != NULL);
 915
 916         oinfo.oi_md = lsm;
 917         oinfo.oi_oa = obdo;
 918         oinfo.oi_oa->o_oi = lsm->lsm_oi;
 919         oinfo.oi_oa->o_mode = S_IFREG;
 920         oinfo.oi_oa->o_ioepoch = ioepoch;
 921         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
 922                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
 923                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
 924                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
 925                                OBD_MD_FLGROUP | OBD_MD_FLEPOCH |
 926                                OBD_MD_FLDATAVERSION;
 927         oinfo.oi_capa = capa;
 928         if (sync) {
 929                 oinfo.oi_oa->o_valid |= OBD_MD_FLFLAGS;
 930                 oinfo.oi_oa->o_flags |= OBD_FL_SRVLOCK;
 931         }
 932
 933         set = ptlrpc_prep_set();
 934         if (set == NULL) {
 935                 CERROR("can't allocate ptlrpc set\n");
 936                 rc = -ENOMEM;
 937         } else {
 938                 rc = obd_getattr_async(exp, &oinfo, set);
 939                 if (rc == 0)
 940                         rc = ptlrpc_set_wait(set);
 941                 ptlrpc_set_destroy(set);
 942         }
 943         if (rc == 0)
 944                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
 945                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
 946                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE |
 947                                          OBD_MD_FLDATAVERSION);
 948         RETURN(rc);
 949 }
 950
 951 /**
 952   * Performs the getattr on the inode and updates its fields.
 953   * If @sync != 0, perform the getattr under the server-side lock.
 954   */
 955 int ll_inode_getattr(struct inode *inode, struct obdo *obdo,
 956                      __u64 ioepoch, int sync)
 957 {
 958         struct obd_capa      *capa = ll_mdscapa_get(inode);
 959         struct lov_stripe_md *lsm;
 960         int rc;
 961         ENTRY;
 962
 963         lsm = ccc_inode_lsm_get(inode);
 964         rc = ll_lsm_getattr(lsm, ll_i2dtexp(inode),
 965                             capa, obdo, ioepoch, sync);
 966         capa_put(capa);
 967         if (rc == 0) {
 968                 struct ost_id *oi = lsm ? &lsm->lsm_oi : &obdo->o_oi;
 969
 970                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
 971                 CDEBUG(D_INODE, "objid "DOSTID" size %llu, blocks %llu,"
 972                        " blksize %lu\n", POSTID(oi), i_size_read(inode),
 973                        (unsigned long long)inode->i_blocks,
 974                        (unsigned long)ll_inode_blksize(inode));
 975         }
 976         ccc_inode_lsm_put(inode, lsm);
 977         RETURN(rc);
 978 }
 979
 980 int ll_merge_lvb(const struct lu_env *env, struct inode *inode)
 981 {
 982         struct ll_inode_info *lli = ll_i2info(inode);
 983         struct cl_object *obj = lli->lli_clob;
 984         struct cl_attr *attr = ccc_env_thread_attr(env);
 985         struct ost_lvb lvb;
 986         int rc = 0;
 987
 988         ENTRY;
 989
 990         ll_inode_size_lock(inode);
 991         /* merge timestamps the most recently obtained from mds with
 992            timestamps obtained from osts */
 993         LTIME_S(inode->i_atime) = lli->lli_lvb.lvb_atime;
 994         LTIME_S(inode->i_mtime) = lli->lli_lvb.lvb_mtime;
 995         LTIME_S(inode->i_ctime) = lli->lli_lvb.lvb_ctime;
 996         inode_init_lvb(inode, &lvb);
 997
 998         cl_object_attr_lock(obj);
 999         rc = cl_object_attr_get(env, obj, attr);
1000         cl_object_attr_unlock(obj);
1001
1002         if (rc == 0) {
1003                 if (lvb.lvb_atime < attr->cat_atime)
1004                         lvb.lvb_atime = attr->cat_atime;
1005                 if (lvb.lvb_ctime < attr->cat_ctime)
1006                         lvb.lvb_ctime = attr->cat_ctime;
1007                 if (lvb.lvb_mtime < attr->cat_mtime)
1008                         lvb.lvb_mtime = attr->cat_mtime;
1009
1010                 CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
1011                                 PFID(&lli->lli_fid), attr->cat_size);
1012                 cl_isize_write_nolock(inode, attr->cat_size);
1013
1014                 inode->i_blocks = attr->cat_blocks;
1015
1016                 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1017                 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1018                 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1019         }
1020         ll_inode_size_unlock(inode);
1021
1022         RETURN(rc);
1023 }
1024
1025 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1026                      lstat_t *st)
1027 {
1028         struct obdo obdo = { 0 };
1029         int rc;
1030
1031         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo, 0, 0);
1032         if (rc == 0) {
1033                 st->st_size   = obdo.o_size;
1034                 st->st_blocks = obdo.o_blocks;
1035                 st->st_mtime  = obdo.o_mtime;
1036                 st->st_atime  = obdo.o_atime;
1037                 st->st_ctime  = obdo.o_ctime;
1038         }
1039         return rc;
1040 }
1041
1042 void ll_io_init(struct cl_io *io, const struct file *file, int write)
1043 {
1044         struct inode *inode = file->f_dentry->d_inode;
1045
1046         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1047         if (write) {
1048                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1049                 io->u.ci_wr.wr_sync = file->f_flags & O_SYNC ||
1050                                       file->f_flags & O_DIRECT ||
1051                                       IS_SYNC(inode);
1052         }
1053         io->ci_obj     = ll_i2info(inode)->lli_clob;
1054         io->ci_lockreq = CILR_MAYBE;
1055         if (ll_file_nolock(file)) {
1056                 io->ci_lockreq = CILR_NEVER;
1057                 io->ci_no_srvlock = 1;
1058         } else if (file->f_flags & O_APPEND) {
1059                 io->ci_lockreq = CILR_MANDATORY;
1060         }
1061 }
1062
1063 static ssize_t
1064 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1065                    struct file *file, enum cl_io_type iot,
1066                    loff_t *ppos, size_t count)
1067 {
1068         struct ll_inode_info *lli = ll_i2info(file->f_dentry->d_inode);
1069         struct ll_file_data  *fd  = LUSTRE_FPRIVATE(file);
1070         struct cl_io         *io;
1071         ssize_t               result;
1072         ENTRY;
1073
1074 restart:
1075         io = ccc_env_thread_io(env);
1076         ll_io_init(io, file, iot == CIT_WRITE);
1077
1078         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1079                 struct vvp_io *vio = vvp_env_io(env);
1080                 struct ccc_io *cio = ccc_env_io(env);
1081                 int write_mutex_locked = 0;
1082
1083                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
1084                 vio->cui_io_subtype = args->via_io_subtype;
1085
1086                 switch (vio->cui_io_subtype) {
1087                 case IO_NORMAL:
1088                         cio->cui_iov = args->u.normal.via_iov;
1089                         cio->cui_nrsegs = args->u.normal.via_nrsegs;
1090                         cio->cui_tot_nrsegs = cio->cui_nrsegs;
1091                         cio->cui_iocb = args->u.normal.via_iocb;
1092                         if ((iot == CIT_WRITE) &&
1093                             !(cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1094                                 if (mutex_lock_interruptible(&lli->
1095                                                                lli_write_mutex))
1096                                         GOTO(out, result = -ERESTARTSYS);
1097                                 write_mutex_locked = 1;
1098                         } else if (iot == CIT_READ) {
1099                                 down_read(&lli->lli_trunc_sem);
1100                         }
1101                         break;
1102                 case IO_SENDFILE:
1103                         vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
1104                         vio->u.sendfile.cui_target = args->u.sendfile.via_target;
1105                         break;
1106                 case IO_SPLICE:
1107                         vio->u.splice.cui_pipe = args->u.splice.via_pipe;
1108                         vio->u.splice.cui_flags = args->u.splice.via_flags;
1109                         break;
1110                 default:
1111                         CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
1112                         LBUG();
1113                 }
1114                 result = cl_io_loop(env, io);
1115                 if (write_mutex_locked)
1116                         mutex_unlock(&lli->lli_write_mutex);
1117                 else if (args->via_io_subtype == IO_NORMAL && iot == CIT_READ)
1118                         up_read(&lli->lli_trunc_sem);
1119         } else {
1120                 /* cl_io_rw_init() handled IO */
1121                 result = io->ci_result;
1122         }
1123
1124         if (io->ci_nob > 0) {
1125                 result = io->ci_nob;
1126                 *ppos = io->u.ci_wr.wr.crw_pos;
1127         }
1128         GOTO(out, result);
1129 out:
1130         cl_io_fini(env, io);
1131         /* If any bit been read/written (result != 0), we just return
1132          * short read/write instead of restart io. */
1133         if ((result == 0 || result == -ENODATA) && io->ci_need_restart) {
1134                 CDEBUG(D_VFSTRACE, "Restart %s on %s from %lld, count:%zd\n",
1135                        iot == CIT_READ ? "read" : "write",
1136                        file->f_dentry->d_name.name, *ppos, count);
1137                 LASSERTF(io->ci_nob == 0, "%zd", io->ci_nob);
1138                 goto restart;
1139         }
1140
1141         if (iot == CIT_READ) {
1142                 if (result >= 0)
1143                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1144                                            LPROC_LL_READ_BYTES, result);
1145         } else if (iot == CIT_WRITE) {
1146                 if (result >= 0) {
1147                         ll_stats_ops_tally(ll_i2sbi(file->f_dentry->d_inode),
1148                                            LPROC_LL_WRITE_BYTES, result);
1149                         fd->fd_write_failed = false;
1150                 } else if (result != -ERESTARTSYS) {
1151                         fd->fd_write_failed = true;
1152                 }
1153         }
1154
1155         return result;
1156 }
1157
1158
1159 /*
1160  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1161  */
1162 static int ll_file_get_iov_count(const struct iovec *iov,
1163                                  unsigned long *nr_segs, size_t *count)
1164 {
1165         size_t cnt = 0;
1166         unsigned long seg;
1167
1168         for (seg = 0; seg < *nr_segs; seg++) {
1169                 const struct iovec *iv = &iov[seg];
1170
1171                 /*
1172                  * If any segment has a negative length, or the cumulative
1173                  * length ever wraps negative then return -EINVAL.
1174                  */
1175                 cnt += iv->iov_len;
1176                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1177                         return -EINVAL;
1178                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1179                         continue;
1180                 if (seg == 0)
1181                         return -EFAULT;
1182                 *nr_segs = seg;
1183                 cnt -= iv->iov_len;   /* This segment is no good */
1184                 break;
1185         }
1186         *count = cnt;
1187         return 0;
1188 }
1189
1190 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1191                                 unsigned long nr_segs, loff_t pos)
1192 {
1193         struct lu_env      *env;
1194         struct vvp_io_args *args;
1195         size_t              count;
1196         ssize_t             result;
1197         int                 refcheck;
1198         ENTRY;
1199
1200         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1201         if (result)
1202                 RETURN(result);
1203
1204         env = cl_env_get(&refcheck);
1205         if (IS_ERR(env))
1206                 RETURN(PTR_ERR(env));
1207
1208         args = vvp_env_args(env, IO_NORMAL);
1209         args->u.normal.via_iov = (struct iovec *)iov;
1210         args->u.normal.via_nrsegs = nr_segs;
1211         args->u.normal.via_iocb = iocb;
1212
1213         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1214                                     &iocb->ki_pos, count);
1215         cl_env_put(env, &refcheck);
1216         RETURN(result);
1217 }
1218
1219 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1220                             loff_t *ppos)
1221 {
1222         struct lu_env *env;
1223         struct iovec  *local_iov;
1224         struct kiocb  *kiocb;
1225         ssize_t        result;
1226         int            refcheck;
1227         ENTRY;
1228
1229         env = cl_env_get(&refcheck);
1230         if (IS_ERR(env))
1231                 RETURN(PTR_ERR(env));
1232
1233         local_iov = &vvp_env_info(env)->vti_local_iov;
1234         kiocb = &vvp_env_info(env)->vti_kiocb;
1235         local_iov->iov_base = (void __user *)buf;
1236         local_iov->iov_len = count;
1237         init_sync_kiocb(kiocb, file);
1238         kiocb->ki_pos = *ppos;
1239         kiocb->ki_left = count;
1240
1241         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
1242         *ppos = kiocb->ki_pos;
1243
1244         cl_env_put(env, &refcheck);
1245         RETURN(result);
1246 }
1247
1248 /*
1249  * Write to a file (through the page cache).
1250  * AIO stuff
1251  */
1252 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1253                                  unsigned long nr_segs, loff_t pos)
1254 {
1255         struct lu_env      *env;
1256         struct vvp_io_args *args;
1257         size_t              count;
1258         ssize_t             result;
1259         int                 refcheck;
1260         ENTRY;
1261
1262         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1263         if (result)
1264                 RETURN(result);
1265
1266         env = cl_env_get(&refcheck);
1267         if (IS_ERR(env))
1268                 RETURN(PTR_ERR(env));
1269
1270         args = vvp_env_args(env, IO_NORMAL);
1271         args->u.normal.via_iov = (struct iovec *)iov;
1272         args->u.normal.via_nrsegs = nr_segs;
1273         args->u.normal.via_iocb = iocb;
1274
1275         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1276                                   &iocb->ki_pos, count);
1277         cl_env_put(env, &refcheck);
1278         RETURN(result);
1279 }
1280
1281 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1282                              loff_t *ppos)
1283 {
1284         struct lu_env *env;
1285         struct iovec  *local_iov;
1286         struct kiocb  *kiocb;
1287         ssize_t        result;
1288         int            refcheck;
1289         ENTRY;
1290
1291         env = cl_env_get(&refcheck);
1292         if (IS_ERR(env))
1293                 RETURN(PTR_ERR(env));
1294
1295         local_iov = &vvp_env_info(env)->vti_local_iov;
1296         kiocb = &vvp_env_info(env)->vti_kiocb;
1297         local_iov->iov_base = (void __user *)buf;
1298         local_iov->iov_len = count;
1299         init_sync_kiocb(kiocb, file);
1300         kiocb->ki_pos = *ppos;
1301         kiocb->ki_left = count;
1302
1303         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1304         *ppos = kiocb->ki_pos;
1305
1306         cl_env_put(env, &refcheck);
1307         RETURN(result);
1308 }
1309
1310 /*
1311  * Send file content (through pagecache) somewhere with helper
1312  */
1313 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1314                                    struct pipe_inode_info *pipe, size_t count,
1315                                    unsigned int flags)
1316 {
1317         struct lu_env      *env;
1318         struct vvp_io_args *args;
1319         ssize_t             result;
1320         int                 refcheck;
1321         ENTRY;
1322
1323         env = cl_env_get(&refcheck);
1324         if (IS_ERR(env))
1325                 RETURN(PTR_ERR(env));
1326
1327         args = vvp_env_args(env, IO_SPLICE);
1328         args->u.splice.via_pipe = pipe;
1329         args->u.splice.via_flags = flags;
1330
1331         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1332         cl_env_put(env, &refcheck);
1333         RETURN(result);
1334 }
1335
1336 static int ll_lov_recreate(struct inode *inode, struct ost_id *oi,
1337                            obd_count ost_idx)
1338 {
1339         struct obd_export *exp = ll_i2dtexp(inode);
1340         struct obd_trans_info oti = { 0 };
1341         struct obdo *oa = NULL;
1342         int lsm_size;
1343         int rc = 0;
1344         struct lov_stripe_md *lsm = NULL, *lsm2;
1345         ENTRY;
1346
1347         OBDO_ALLOC(oa);
1348         if (oa == NULL)
1349                 RETURN(-ENOMEM);
1350
1351         lsm = ccc_inode_lsm_get(inode);
1352         if (!lsm_has_objects(lsm))
1353                 GOTO(out, rc = -ENOENT);
1354
1355         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1356                    (lsm->lsm_stripe_count));
1357
1358         OBD_ALLOC_LARGE(lsm2, lsm_size);
1359         if (lsm2 == NULL)
1360                 GOTO(out, rc = -ENOMEM);
1361
1362         oa->o_oi = *oi;
1363         oa->o_nlink = ost_idx;
1364         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1365         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1366         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1367                                    OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1368         obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid);
1369         memcpy(lsm2, lsm, lsm_size);
1370         ll_inode_size_lock(inode);
1371         rc = obd_create(NULL, exp, oa, &lsm2, &oti);
1372         ll_inode_size_unlock(inode);
1373
1374         OBD_FREE_LARGE(lsm2, lsm_size);
1375         GOTO(out, rc);
1376 out:
1377         ccc_inode_lsm_put(inode, lsm);
1378         OBDO_FREE(oa);
1379         return rc;
1380 }
1381
1382 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
1383 {
1384         struct ll_recreate_obj ucreat;
1385         struct ost_id           oi;
1386         ENTRY;
1387
1388         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1389                 RETURN(-EPERM);
1390
1391         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
1392                            sizeof(ucreat)))
1393                 RETURN(-EFAULT);
1394
1395         ostid_set_seq_mdt0(&oi);
1396         ostid_set_id(&oi, ucreat.lrc_id);
1397         RETURN(ll_lov_recreate(inode, &oi, ucreat.lrc_ost_idx));
1398 }
1399
1400 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
1401 {
1402         struct lu_fid   fid;
1403         struct ost_id   oi;
1404         obd_count       ost_idx;
1405         ENTRY;
1406
1407         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1408                 RETURN(-EPERM);
1409
1410         if (copy_from_user(&fid, (struct lu_fid *)arg, sizeof(fid)))
1411                 RETURN(-EFAULT);
1412
1413         fid_to_ostid(&fid, &oi);
1414         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
1415         RETURN(ll_lov_recreate(inode, &oi, ost_idx));
1416 }
1417
1418 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1419                              int flags, struct lov_user_md *lum, int lum_size)
1420 {
1421         struct lov_stripe_md *lsm = NULL;
1422         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1423         int rc = 0;
1424         ENTRY;
1425
1426         lsm = ccc_inode_lsm_get(inode);
1427         if (lsm != NULL) {
1428                 ccc_inode_lsm_put(inode, lsm);
1429                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1430                        inode->i_ino);
1431                 RETURN(-EEXIST);
1432         }
1433
1434         ll_inode_size_lock(inode);
1435         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1436         if (rc)
1437                 GOTO(out, rc);
1438         rc = oit.d.lustre.it_status;
1439         if (rc < 0)
1440                 GOTO(out_req_free, rc);
1441
1442         ll_release_openhandle(file->f_dentry, &oit);
1443
1444  out:
1445         ll_inode_size_unlock(inode);
1446         ll_intent_release(&oit);
1447         ccc_inode_lsm_put(inode, lsm);
1448         RETURN(rc);
1449 out_req_free:
1450         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1451         goto out;
1452 }
1453
1454 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1455                              struct lov_mds_md **lmmp, int *lmm_size,
1456                              struct ptlrpc_request **request)
1457 {
1458         struct ll_sb_info *sbi = ll_i2sbi(inode);
1459         struct mdt_body  *body;
1460         struct lov_mds_md *lmm = NULL;
1461         struct ptlrpc_request *req = NULL;
1462         struct md_op_data *op_data;
1463         int rc, lmmsize;
1464
1465         rc = ll_get_max_mdsize(sbi, &lmmsize);
1466         if (rc)
1467                 RETURN(rc);
1468
1469         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1470                                      strlen(filename), lmmsize,
1471                                      LUSTRE_OPC_ANY, NULL);
1472         if (IS_ERR(op_data))
1473                 RETURN(PTR_ERR(op_data));
1474
1475         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1476         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1477         ll_finish_md_op_data(op_data);
1478         if (rc < 0) {
1479                 CDEBUG(D_INFO, "md_getattr_name failed "
1480                        "on %s: rc %d\n", filename, rc);
1481                 GOTO(out, rc);
1482         }
1483
1484         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1485         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1486
1487         lmmsize = body->eadatasize;
1488
1489         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1490                         lmmsize == 0) {
1491                 GOTO(out, rc = -ENODATA);
1492         }
1493
1494         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1495         LASSERT(lmm != NULL);
1496
1497         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1498             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) {
1499                 GOTO(out, rc = -EPROTO);
1500         }
1501
1502         /*
1503          * This is coming from the MDS, so is probably in
1504          * little endian.  We convert it to host endian before
1505          * passing it to userspace.
1506          */
1507         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1508                 int stripe_count;
1509
1510                 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1511                 if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
1512                         stripe_count = 0;
1513
1514                 /* if function called for directory - we should
1515                  * avoid swab not existent lsm objects */
1516                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1517                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1518                         if (S_ISREG(body->mode))
1519                                 lustre_swab_lov_user_md_objects(
1520                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1521                                  stripe_count);
1522                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1523                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1524                         if (S_ISREG(body->mode))
1525                                 lustre_swab_lov_user_md_objects(
1526                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1527                                  stripe_count);
1528                 }
1529         }
1530
1531 out:
1532         *lmmp = lmm;
1533         *lmm_size = lmmsize;
1534         *request = req;
1535         return rc;
1536 }
1537
1538 static int ll_lov_setea(struct inode *inode, struct file *file,
1539                             unsigned long arg)
1540 {
1541         int                      flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1542         struct lov_user_md      *lump;
1543         int                      lum_size = sizeof(struct lov_user_md) +
1544                                             sizeof(struct lov_user_ost_data);
1545         int                      rc;
1546         ENTRY;
1547
1548         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1549                 RETURN(-EPERM);
1550
1551         OBD_ALLOC_LARGE(lump, lum_size);
1552         if (lump == NULL)
1553                 RETURN(-ENOMEM);
1554
1555         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1556                 OBD_FREE_LARGE(lump, lum_size);
1557                 RETURN(-EFAULT);
1558         }
1559
1560         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1561
1562         OBD_FREE_LARGE(lump, lum_size);
1563         RETURN(rc);
1564 }
1565
1566 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1567                             unsigned long arg)
1568 {
1569         struct lov_user_md_v3    lumv3;
1570         struct lov_user_md_v1   *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1571         struct lov_user_md_v1   *lumv1p = (struct lov_user_md_v1 *)arg;
1572         struct lov_user_md_v3   *lumv3p = (struct lov_user_md_v3 *)arg;
1573         int                      lum_size, rc;
1574         int                      flags = FMODE_WRITE;
1575         ENTRY;
1576
1577         /* first try with v1 which is smaller than v3 */
1578         lum_size = sizeof(struct lov_user_md_v1);
1579         if (copy_from_user(lumv1, lumv1p, lum_size))
1580                 RETURN(-EFAULT);
1581
1582         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1583                 lum_size = sizeof(struct lov_user_md_v3);
1584                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1585                         RETURN(-EFAULT);
1586         }
1587
1588         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1589         if (rc == 0) {
1590                 struct lov_stripe_md *lsm;
1591                 __u32 gen;
1592
1593                 put_user(0, &lumv1p->lmm_stripe_count);
1594
1595                 ll_layout_refresh(inode, &gen);
1596                 lsm = ccc_inode_lsm_get(inode);
1597                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1598                                    0, lsm, (void *)arg);
1599                 ccc_inode_lsm_put(inode, lsm);
1600         }
1601         RETURN(rc);
1602 }
1603
1604 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1605 {
1606         struct lov_stripe_md *lsm;
1607         int rc = -ENODATA;
1608         ENTRY;
1609
1610         lsm = ccc_inode_lsm_get(inode);
1611         if (lsm != NULL)
1612                 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0,
1613                                    lsm, (void *)arg);
1614         ccc_inode_lsm_put(inode, lsm);
1615         RETURN(rc);
1616 }
1617
1618 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1619 {
1620         struct ll_inode_info   *lli = ll_i2info(inode);
1621         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1622         struct ccc_grouplock    grouplock;
1623         int                     rc;
1624         ENTRY;
1625
1626         if (ll_file_nolock(file))
1627                 RETURN(-EOPNOTSUPP);
1628
1629         spin_lock(&lli->lli_lock);
1630         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1631                 CWARN("group lock already existed with gid %lu\n",
1632                       fd->fd_grouplock.cg_gid);
1633                 spin_unlock(&lli->lli_lock);
1634                 RETURN(-EINVAL);
1635         }
1636         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1637         spin_unlock(&lli->lli_lock);
1638
1639         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1640                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1641         if (rc)
1642                 RETURN(rc);
1643
1644         spin_lock(&lli->lli_lock);
1645         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1646                 spin_unlock(&lli->lli_lock);
1647                 CERROR("another thread just won the race\n");
1648                 cl_put_grouplock(&grouplock);
1649                 RETURN(-EINVAL);
1650         }
1651
1652         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1653         fd->fd_grouplock = grouplock;
1654         spin_unlock(&lli->lli_lock);
1655
1656         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1657         RETURN(0);
1658 }
1659
1660 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1661 {
1662         struct ll_inode_info   *lli = ll_i2info(inode);
1663         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1664         struct ccc_grouplock    grouplock;
1665         ENTRY;
1666
1667         spin_lock(&lli->lli_lock);
1668         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1669                 spin_unlock(&lli->lli_lock);
1670                 CWARN("no group lock held\n");
1671                 RETURN(-EINVAL);
1672         }
1673         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1674
1675         if (fd->fd_grouplock.cg_gid != arg) {
1676                 CWARN("group lock %lu doesn't match current id %lu\n",
1677                        arg, fd->fd_grouplock.cg_gid);
1678                 spin_unlock(&lli->lli_lock);
1679                 RETURN(-EINVAL);
1680         }
1681
1682         grouplock = fd->fd_grouplock;
1683         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
1684         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1685         spin_unlock(&lli->lli_lock);
1686
1687         cl_put_grouplock(&grouplock);
1688         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1689         RETURN(0);
1690 }
1691
1692 /**
1693  * Close inode open handle
1694  *
1695  * \param dentry [in]     dentry which contains the inode
1696  * \param it     [in,out] intent which contains open info and result
1697  *
1698  * \retval 0     success
1699  * \retval <0    failure
1700  */
1701 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1702 {
1703         struct inode *inode = dentry->d_inode;
1704         struct obd_client_handle *och;
1705         int rc;
1706         ENTRY;
1707
1708         LASSERT(inode);
1709
1710         /* Root ? Do nothing. */
1711         if (dentry->d_inode->i_sb->s_root == dentry)
1712                 RETURN(0);
1713
1714         /* No open handle to close? Move away */
1715         if (!it_disposition(it, DISP_OPEN_OPEN))
1716                 RETURN(0);
1717
1718         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1719
1720         OBD_ALLOC(och, sizeof(*och));
1721         if (!och)
1722                 GOTO(out, rc = -ENOMEM);
1723
1724         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
1725
1726         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1727                                        inode, och);
1728  out:
1729         /* this one is in place of ll_file_open */
1730         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
1731                 ptlrpc_req_finished(it->d.lustre.it_data);
1732                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1733         }
1734         RETURN(rc);
1735 }
1736
1737 /**
1738  * Get size for inode for which FIEMAP mapping is requested.
1739  * Make the FIEMAP get_info call and returns the result.
1740  */
1741 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1742               int num_bytes)
1743 {
1744         struct obd_export *exp = ll_i2dtexp(inode);
1745         struct lov_stripe_md *lsm = NULL;
1746         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1747         int vallen = num_bytes;
1748         int rc;
1749         ENTRY;
1750
1751         /* Checks for fiemap flags */
1752         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1753                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1754                 return -EBADR;
1755         }
1756
1757         /* Check for FIEMAP_FLAG_SYNC */
1758         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
1759                 rc = filemap_fdatawrite(inode->i_mapping);
1760                 if (rc)
1761                         return rc;
1762         }
1763
1764         lsm = ccc_inode_lsm_get(inode);
1765         if (lsm == NULL)
1766                 return -ENOENT;
1767
1768         /* If the stripe_count > 1 and the application does not understand
1769          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1770          */
1771         if (lsm->lsm_stripe_count > 1 &&
1772             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1773                 GOTO(out, rc = -EOPNOTSUPP);
1774
1775         fm_key.oa.o_oi = lsm->lsm_oi;
1776         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1777
1778         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLSIZE);
1779         obdo_set_parent_fid(&fm_key.oa, &ll_i2info(inode)->lli_fid);
1780         /* If filesize is 0, then there would be no objects for mapping */
1781         if (fm_key.oa.o_size == 0) {
1782                 fiemap->fm_mapped_extents = 0;
1783                 GOTO(out, rc = 0);
1784         }
1785
1786         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1787
1788         rc = obd_get_info(NULL, exp, sizeof(fm_key), &fm_key, &vallen,
1789                           fiemap, lsm);
1790         if (rc)
1791                 CERROR("obd_get_info failed: rc = %d\n", rc);
1792
1793 out:
1794         ccc_inode_lsm_put(inode, lsm);
1795         RETURN(rc);
1796 }
1797
1798 int ll_fid2path(struct inode *inode, void *arg)
1799 {
1800         struct obd_export       *exp = ll_i2mdexp(inode);
1801         struct getinfo_fid2path *gfout, *gfin;
1802         int                      outsize, rc;
1803         ENTRY;
1804
1805         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
1806             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
1807                 RETURN(-EPERM);
1808
1809         /* Need to get the buflen */
1810         OBD_ALLOC_PTR(gfin);
1811         if (gfin == NULL)
1812                 RETURN(-ENOMEM);
1813         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1814                 OBD_FREE_PTR(gfin);
1815                 RETURN(-EFAULT);
1816         }
1817
1818         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1819         OBD_ALLOC(gfout, outsize);
1820         if (gfout == NULL) {
1821                 OBD_FREE_PTR(gfin);
1822                 RETURN(-ENOMEM);
1823         }
1824         memcpy(gfout, gfin, sizeof(*gfout));
1825         OBD_FREE_PTR(gfin);
1826
1827         /* Call mdc_iocontrol */
1828         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1829         if (rc)
1830                 GOTO(gf_free, rc);
1831
1832         if (copy_to_user(arg, gfout, outsize))
1833                 rc = -EFAULT;
1834
1835 gf_free:
1836         OBD_FREE(gfout, outsize);
1837         RETURN(rc);
1838 }
1839
1840 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
1841 {
1842         struct ll_user_fiemap *fiemap_s;
1843         size_t num_bytes, ret_bytes;
1844         unsigned int extent_count;
1845         int rc = 0;
1846
1847         /* Get the extent count so we can calculate the size of
1848          * required fiemap buffer */
1849         if (get_user(extent_count,
1850             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1851                 RETURN(-EFAULT);
1852         num_bytes = sizeof(*fiemap_s) + (extent_count *
1853                                          sizeof(struct ll_fiemap_extent));
1854
1855         OBD_ALLOC_LARGE(fiemap_s, num_bytes);
1856         if (fiemap_s == NULL)
1857                 RETURN(-ENOMEM);
1858
1859         /* get the fiemap value */
1860         if (copy_from_user(fiemap_s, (struct ll_user_fiemap __user *)arg,
1861                            sizeof(*fiemap_s)))
1862                 GOTO(error, rc = -EFAULT);
1863
1864         /* If fm_extent_count is non-zero, read the first extent since
1865          * it is used to calculate end_offset and device from previous
1866          * fiemap call. */
1867         if (extent_count) {
1868                 if (copy_from_user(&fiemap_s->fm_extents[0],
1869                     (char __user *)arg + sizeof(*fiemap_s),
1870                     sizeof(struct ll_fiemap_extent)))
1871                         GOTO(error, rc = -EFAULT);
1872         }
1873
1874         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
1875         if (rc)
1876                 GOTO(error, rc);
1877
1878         ret_bytes = sizeof(struct ll_user_fiemap);
1879
1880         if (extent_count != 0)
1881                 ret_bytes += (fiemap_s->fm_mapped_extents *
1882                                  sizeof(struct ll_fiemap_extent));
1883
1884         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1885                 rc = -EFAULT;
1886
1887 error:
1888         OBD_FREE_LARGE(fiemap_s, num_bytes);
1889         RETURN(rc);
1890 }
1891
1892 /*
1893  * Read the data_version for inode.
1894  *
1895  * This value is computed using stripe object version on OST.
1896  * Version is computed using server side locking.
1897  *
1898  * @param extent_lock  Take extent lock. Not needed if a process is already
1899  *                     holding the OST object group locks.
1900  */
1901 int ll_data_version(struct inode *inode, __u64 *data_version,
1902                     int extent_lock)
1903 {
1904         struct lov_stripe_md    *lsm = NULL;
1905         struct ll_sb_info       *sbi = ll_i2sbi(inode);
1906         struct obdo             *obdo = NULL;
1907         int                      rc;
1908         ENTRY;
1909
1910         /* If no stripe, we consider version is 0. */
1911         lsm = ccc_inode_lsm_get(inode);
1912         if (!lsm_has_objects(lsm)) {
1913                 *data_version = 0;
1914                 CDEBUG(D_INODE, "No object for inode\n");
1915                 GOTO(out, rc = 0);
1916         }
1917
1918         OBD_ALLOC_PTR(obdo);
1919         if (obdo == NULL)
1920                 GOTO(out, rc = -ENOMEM);
1921
1922         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, obdo, 0, extent_lock);
1923         if (rc == 0) {
1924                 if (!(obdo->o_valid & OBD_MD_FLDATAVERSION))
1925                         rc = -EOPNOTSUPP;
1926                 else
1927                         *data_version = obdo->o_data_version;
1928         }
1929
1930         OBD_FREE_PTR(obdo);
1931         EXIT;
1932 out:
1933         ccc_inode_lsm_put(inode, lsm);
1934         RETURN(rc);
1935 }
1936
1937 struct ll_swap_stack {
1938         struct iattr             ia1, ia2;
1939         __u64                    dv1, dv2;
1940         struct inode            *inode1, *inode2;
1941         bool                     check_dv1, check_dv2;
1942 };
1943
1944 static int ll_swap_layouts(struct file *file1, struct file *file2,
1945                            struct lustre_swap_layouts *lsl)
1946 {
1947         struct mdc_swap_layouts  msl;
1948         struct md_op_data       *op_data;
1949         __u32                    gid;
1950         __u64                    dv;
1951         struct ll_swap_stack    *llss = NULL;
1952         int                      rc;
1953
1954         OBD_ALLOC_PTR(llss);
1955         if (llss == NULL)
1956                 RETURN(-ENOMEM);
1957
1958         llss->inode1 = file1->f_dentry->d_inode;
1959         llss->inode2 = file2->f_dentry->d_inode;
1960
1961         if (!S_ISREG(llss->inode2->i_mode))
1962                 GOTO(free, rc = -EINVAL);
1963
1964         if (inode_permission(llss->inode1, MAY_WRITE) ||
1965             inode_permission(llss->inode2, MAY_WRITE))
1966                 GOTO(free, rc = -EPERM);
1967
1968         if (llss->inode2->i_sb != llss->inode1->i_sb)
1969                 GOTO(free, rc = -EXDEV);
1970
1971         /* we use 2 bool because it is easier to swap than 2 bits */
1972         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
1973                 llss->check_dv1 = true;
1974
1975         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
1976                 llss->check_dv2 = true;
1977
1978         /* we cannot use lsl->sl_dvX directly because we may swap them */
1979         llss->dv1 = lsl->sl_dv1;
1980         llss->dv2 = lsl->sl_dv2;
1981
1982         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
1983         if (rc == 0) /* same file, done! */
1984                 GOTO(free, rc = 0);
1985
1986         if (rc < 0) { /* sequentialize it */
1987                 swap(llss->inode1, llss->inode2);
1988                 swap(file1, file2);
1989                 swap(llss->dv1, llss->dv2);
1990                 swap(llss->check_dv1, llss->check_dv2);
1991         }
1992
1993         gid = lsl->sl_gid;
1994         if (gid != 0) { /* application asks to flush dirty cache */
1995                 rc = ll_get_grouplock(llss->inode1, file1, gid);
1996                 if (rc < 0)
1997                         GOTO(free, rc);
1998
1999                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2000                 if (rc < 0) {
2001                         ll_put_grouplock(llss->inode1, file1, gid);
2002                         GOTO(free, rc);
2003                 }
2004         }
2005
2006         /* to be able to restore mtime and atime after swap
2007          * we need to first save them */
2008         if (lsl->sl_flags &
2009             (SWAP_LAYOUTS_KEEP_MTIME | SWAP_LAYOUTS_KEEP_ATIME)) {
2010                 llss->ia1.ia_mtime = llss->inode1->i_mtime;
2011                 llss->ia1.ia_atime = llss->inode1->i_atime;
2012                 llss->ia1.ia_valid = ATTR_MTIME | ATTR_ATIME;
2013                 llss->ia2.ia_mtime = llss->inode2->i_mtime;
2014                 llss->ia2.ia_atime = llss->inode2->i_atime;
2015                 llss->ia2.ia_valid = ATTR_MTIME | ATTR_ATIME;
2016         }
2017
2018         /* ultimate check, before swaping the layouts we check if
2019          * dataversion has changed (if requested) */
2020         if (llss->check_dv1) {
2021                 rc = ll_data_version(llss->inode1, &dv, 0);
2022                 if (rc)
2023                         GOTO(putgl, rc);
2024                 if (dv != llss->dv1)
2025                         GOTO(putgl, rc = -EAGAIN);
2026         }
2027
2028         if (llss->check_dv2) {
2029                 rc = ll_data_version(llss->inode2, &dv, 0);
2030                 if (rc)
2031                         GOTO(putgl, rc);
2032                 if (dv != llss->dv2)
2033                         GOTO(putgl, rc = -EAGAIN);
2034         }
2035
2036         /* struct md_op_data is used to send the swap args to the mdt
2037          * only flags is missing, so we use struct mdc_swap_layouts
2038          * through the md_op_data->op_data */
2039         /* flags from user space have to be converted before they are send to
2040          * server, no flag is sent today, they are only used on the client */
2041         msl.msl_flags = 0;
2042         rc = -ENOMEM;
2043         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2044                                      0, LUSTRE_OPC_ANY, &msl);
2045         if (IS_ERR(op_data))
2046                 GOTO(free, rc = PTR_ERR(op_data));
2047
2048         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2049                            sizeof(*op_data), op_data, NULL);
2050         ll_finish_md_op_data(op_data);
2051
2052 putgl:
2053         if (gid != 0) {
2054                 ll_put_grouplock(llss->inode2, file2, gid);
2055                 ll_put_grouplock(llss->inode1, file1, gid);
2056         }
2057
2058         /* rc can be set from obd_iocontrol() or from a GOTO(putgl, ...) */
2059         if (rc != 0)
2060                 GOTO(free, rc);
2061
2062         /* clear useless flags */
2063         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_MTIME)) {
2064                 llss->ia1.ia_valid &= ~ATTR_MTIME;
2065                 llss->ia2.ia_valid &= ~ATTR_MTIME;
2066         }
2067
2068         if (!(lsl->sl_flags & SWAP_LAYOUTS_KEEP_ATIME)) {
2069                 llss->ia1.ia_valid &= ~ATTR_ATIME;
2070                 llss->ia2.ia_valid &= ~ATTR_ATIME;
2071         }
2072
2073         /* update time if requested */
2074         rc = 0;
2075         if (llss->ia2.ia_valid != 0) {
2076                 mutex_lock(&llss->inode1->i_mutex);
2077                 rc = ll_setattr(file1->f_dentry, &llss->ia2);
2078                 mutex_unlock(&llss->inode1->i_mutex);
2079         }
2080
2081         if (llss->ia1.ia_valid != 0) {
2082                 int rc1;
2083
2084                 mutex_lock(&llss->inode2->i_mutex);
2085                 rc1 = ll_setattr(file2->f_dentry, &llss->ia1);
2086                 mutex_unlock(&llss->inode2->i_mutex);
2087                 if (rc == 0)
2088                         rc = rc1;
2089         }
2090
2091 free:
2092         if (llss != NULL)
2093                 OBD_FREE_PTR(llss);
2094
2095         RETURN(rc);
2096 }
2097
2098 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2099 {
2100         struct inode            *inode = file->f_dentry->d_inode;
2101         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2102         int                      flags, rc;
2103         ENTRY;
2104
2105         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2106                inode->i_generation, inode, cmd);
2107         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2108
2109         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2110         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2111                 RETURN(-ENOTTY);
2112
2113         switch(cmd) {
2114         case LL_IOC_GETFLAGS:
2115                 /* Get the current value of the file flags */
2116                 return put_user(fd->fd_flags, (int *)arg);
2117         case LL_IOC_SETFLAGS:
2118         case LL_IOC_CLRFLAGS:
2119                 /* Set or clear specific file flags */
2120                 /* XXX This probably needs checks to ensure the flags are
2121                  *     not abused, and to handle any flag side effects.
2122                  */
2123                 if (get_user(flags, (int *) arg))
2124                         RETURN(-EFAULT);
2125
2126                 if (cmd == LL_IOC_SETFLAGS) {
2127                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2128                             !(file->f_flags & O_DIRECT)) {
2129                                 CERROR("%s: unable to disable locking on "
2130                                        "non-O_DIRECT file\n", current->comm);
2131                                 RETURN(-EINVAL);
2132                         }
2133
2134                         fd->fd_flags |= flags;
2135                 } else {
2136                         fd->fd_flags &= ~flags;
2137                 }
2138                 RETURN(0);
2139         case LL_IOC_LOV_SETSTRIPE:
2140                 RETURN(ll_lov_setstripe(inode, file, arg));
2141         case LL_IOC_LOV_SETEA:
2142                 RETURN(ll_lov_setea(inode, file, arg));
2143         case LL_IOC_LOV_SWAP_LAYOUTS: {
2144                 struct file *file2;
2145                 struct lustre_swap_layouts lsl;
2146
2147                 if (copy_from_user(&lsl, (char *)arg,
2148                                        sizeof(struct lustre_swap_layouts)))
2149                         RETURN(-EFAULT);
2150
2151                 if ((file->f_flags & O_ACCMODE) == 0) /* O_RDONLY */
2152                         RETURN(-EPERM);
2153
2154                 file2 = fget(lsl.sl_fd);
2155                 if (file2 == NULL)
2156                         RETURN(-EBADF);
2157
2158                 rc = -EPERM;
2159                 if ((file2->f_flags & O_ACCMODE) != 0) /* O_WRONLY or O_RDWR */
2160                         rc = ll_swap_layouts(file, file2, &lsl);
2161                 fput(file2);
2162                 RETURN(rc);
2163         }
2164         case LL_IOC_LOV_GETSTRIPE:
2165                 RETURN(ll_lov_getstripe(inode, arg));
2166         case LL_IOC_RECREATE_OBJ:
2167                 RETURN(ll_lov_recreate_obj(inode, arg));
2168         case LL_IOC_RECREATE_FID:
2169                 RETURN(ll_lov_recreate_fid(inode, arg));
2170         case FSFILT_IOC_FIEMAP:
2171                 RETURN(ll_ioctl_fiemap(inode, arg));
2172         case FSFILT_IOC_GETFLAGS:
2173         case FSFILT_IOC_SETFLAGS:
2174                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2175         case FSFILT_IOC_GETVERSION_OLD:
2176         case FSFILT_IOC_GETVERSION:
2177                 RETURN(put_user(inode->i_generation, (int *)arg));
2178         case LL_IOC_GROUP_LOCK:
2179                 RETURN(ll_get_grouplock(inode, file, arg));
2180         case LL_IOC_GROUP_UNLOCK:
2181                 RETURN(ll_put_grouplock(inode, file, arg));
2182         case IOC_OBD_STATFS:
2183                 RETURN(ll_obd_statfs(inode, (void *)arg));
2184
2185         /* We need to special case any other ioctls we want to handle,
2186          * to send them to the MDS/OST as appropriate and to properly
2187          * network encode the arg field.
2188         case FSFILT_IOC_SETVERSION_OLD:
2189         case FSFILT_IOC_SETVERSION:
2190         */
2191         case LL_IOC_FLUSHCTX:
2192                 RETURN(ll_flush_ctx(inode));
2193         case LL_IOC_PATH2FID: {
2194                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
2195                                  sizeof(struct lu_fid)))
2196                         RETURN(-EFAULT);
2197
2198                 RETURN(0);
2199         }
2200         case OBD_IOC_FID2PATH:
2201                 RETURN(ll_fid2path(inode, (void *)arg));
2202         case LL_IOC_DATA_VERSION: {
2203                 struct ioc_data_version idv;
2204                 int                     rc;
2205
2206                 if (copy_from_user(&idv, (char *)arg, sizeof(idv)))
2207                         RETURN(-EFAULT);
2208
2209                 rc = ll_data_version(inode, &idv.idv_version,
2210                                 !(idv.idv_flags & LL_DV_NOFLUSH));
2211
2212                 if (rc == 0 && copy_to_user((char *) arg, &idv, sizeof(idv)))
2213                         RETURN(-EFAULT);
2214
2215                 RETURN(rc);
2216         }
2217
2218         case LL_IOC_GET_MDTIDX: {
2219                 int mdtidx;
2220
2221                 mdtidx = ll_get_mdt_idx(inode);
2222                 if (mdtidx < 0)
2223                         RETURN(mdtidx);
2224
2225                 if (put_user((int)mdtidx, (int*)arg))
2226                         RETURN(-EFAULT);
2227
2228                 RETURN(0);
2229         }
2230         case OBD_IOC_GETDTNAME:
2231         case OBD_IOC_GETMDNAME:
2232                 RETURN(ll_get_obd_name(inode, cmd, arg));
2233         case LL_IOC_HSM_STATE_GET: {
2234                 struct md_op_data       *op_data;
2235                 struct hsm_user_state   *hus;
2236                 int                      rc;
2237
2238                 OBD_ALLOC_PTR(hus);
2239                 if (hus == NULL)
2240                         RETURN(-ENOMEM);
2241
2242                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2243                                              LUSTRE_OPC_ANY, hus);
2244                 if (IS_ERR(op_data)) {
2245                         OBD_FREE_PTR(hus);
2246                         RETURN(PTR_ERR(op_data));
2247                 }
2248
2249                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2250                                    op_data, NULL);
2251
2252                 if (copy_to_user((void *)arg, hus, sizeof(*hus)))
2253                         rc = -EFAULT;
2254
2255                 ll_finish_md_op_data(op_data);
2256                 OBD_FREE_PTR(hus);
2257                 RETURN(rc);
2258         }
2259         case LL_IOC_HSM_STATE_SET: {
2260                 struct md_op_data       *op_data;
2261                 struct hsm_state_set    *hss;
2262                 int                      rc;
2263
2264                 OBD_ALLOC_PTR(hss);
2265                 if (hss == NULL)
2266                         RETURN(-ENOMEM);
2267                 if (copy_from_user(hss, (char *)arg, sizeof(*hss))) {
2268                         OBD_FREE_PTR(hss);
2269                         RETURN(-EFAULT);
2270                 }
2271
2272                 /* Non-root users are forbidden to set or clear flags which are
2273                  * NOT defined in HSM_USER_MASK. */
2274                 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK)
2275                     && !cfs_capable(CFS_CAP_SYS_ADMIN)) {
2276                         OBD_FREE_PTR(hss);
2277                         RETURN(-EPERM);
2278                 }
2279
2280                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2281                                              LUSTRE_OPC_ANY, hss);
2282                 if (IS_ERR(op_data)) {
2283                         OBD_FREE_PTR(hss);
2284                         RETURN(PTR_ERR(op_data));
2285                 }
2286
2287                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2288                                    op_data, NULL);
2289
2290                 ll_finish_md_op_data(op_data);
2291
2292                 OBD_FREE_PTR(hss);
2293                 RETURN(rc);
2294         }
2295         case LL_IOC_HSM_ACTION: {
2296                 struct md_op_data               *op_data;
2297                 struct hsm_current_action       *hca;
2298                 int                              rc;
2299
2300                 OBD_ALLOC_PTR(hca);
2301                 if (hca == NULL)
2302                         RETURN(-ENOMEM);
2303
2304                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2305                                              LUSTRE_OPC_ANY, hca);
2306                 if (IS_ERR(op_data)) {
2307                         OBD_FREE_PTR(hca);
2308                         RETURN(PTR_ERR(op_data));
2309                 }
2310
2311                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
2312                                    op_data, NULL);
2313
2314                 if (copy_to_user((char *)arg, hca, sizeof(*hca)))
2315                         rc = -EFAULT;
2316
2317                 ll_finish_md_op_data(op_data);
2318                 OBD_FREE_PTR(hca);
2319                 RETURN(rc);
2320         }
2321         case LL_IOC_SET_LEASE: {
2322                 struct ll_inode_info *lli = ll_i2info(inode);
2323                 struct obd_client_handle *och = NULL;
2324                 bool lease_broken;
2325                 fmode_t mode = 0;
2326
2327                 switch (arg) {
2328                 case F_WRLCK:
2329                         if (!(file->f_mode & FMODE_WRITE))
2330                                 RETURN(-EPERM);
2331                         mode = FMODE_WRITE;
2332                         break;
2333                 case F_RDLCK:
2334                         if (!(file->f_mode & FMODE_READ))
2335                                 RETURN(-EPERM);
2336                         mode = FMODE_READ;
2337                         break;
2338                 case F_UNLCK:
2339                         mutex_lock(&lli->lli_och_mutex);
2340                         if (fd->fd_lease_och != NULL) {
2341                                 och = fd->fd_lease_och;
2342                                 fd->fd_lease_och = NULL;
2343                         }
2344                         mutex_unlock(&lli->lli_och_mutex);
2345
2346                         if (och != NULL) {
2347                                 mode = och->och_flags &(FMODE_READ|FMODE_WRITE);
2348                                 rc = ll_lease_close(och, inode, &lease_broken);
2349                                 if (rc == 0 && lease_broken)
2350                                         mode = 0;
2351                         } else {
2352                                 rc = -ENOLCK;
2353                         }
2354
2355                         /* return the type of lease or error */
2356                         RETURN(rc < 0 ? rc : (int)mode);
2357                 default:
2358                         RETURN(-EINVAL);
2359                 }
2360
2361                 CDEBUG(D_INODE, "Set lease with mode %d\n", mode);
2362
2363                 /* apply for lease */
2364                 och = ll_lease_open(inode, file, mode);
2365                 if (IS_ERR(och))
2366                         RETURN(PTR_ERR(och));
2367
2368                 rc = 0;
2369                 mutex_lock(&lli->lli_och_mutex);
2370                 if (fd->fd_lease_och == NULL) {
2371                         fd->fd_lease_och = och;
2372                         och = NULL;
2373                 }
2374                 mutex_unlock(&lli->lli_och_mutex);
2375                 if (och != NULL) {
2376                         /* impossible now that only excl is supported for now */
2377                         ll_lease_close(och, inode, &lease_broken);
2378                         rc = -EBUSY;
2379                 }
2380                 RETURN(rc);
2381         }
2382         case LL_IOC_GET_LEASE: {
2383                 struct ll_inode_info *lli = ll_i2info(inode);
2384                 struct ldlm_lock *lock = NULL;
2385
2386                 rc = 0;
2387                 mutex_lock(&lli->lli_och_mutex);
2388                 if (fd->fd_lease_och != NULL) {
2389                         struct obd_client_handle *och = fd->fd_lease_och;
2390
2391                         lock = ldlm_handle2lock(&och->och_lease_handle);
2392                         if (lock != NULL) {
2393                                 lock_res_and_lock(lock);
2394                                 if (!ldlm_is_cancel(lock))
2395                                         rc = och->och_flags &
2396                                                 (FMODE_READ | FMODE_WRITE);
2397                                 unlock_res_and_lock(lock);
2398                                 ldlm_lock_put(lock);
2399                         }
2400                 }
2401                 mutex_unlock(&lli->lli_och_mutex);
2402
2403                 RETURN(rc);
2404         }
2405         default: {
2406                 int err;
2407
2408                 if (LLIOC_STOP ==
2409                      ll_iocontrol_call(inode, file, cmd, arg, &err))
2410                         RETURN(err);
2411
2412                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
2413                                      (void *)arg));
2414         }
2415         }
2416 }
2417
2418 #ifndef HAVE_FILE_LLSEEK_SIZE
2419 static inline loff_t
2420 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
2421 {
2422         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2423                 return -EINVAL;
2424         if (offset > maxsize)
2425                 return -EINVAL;
2426
2427         if (offset != file->f_pos) {
2428                 file->f_pos = offset;
2429                 file->f_version = 0;
2430         }
2431         return offset;
2432 }
2433
2434 static loff_t
2435 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
2436                 loff_t maxsize, loff_t eof)
2437 {
2438         struct inode *inode = file->f_dentry->d_inode;
2439
2440         switch (origin) {
2441         case SEEK_END:
2442                 offset += eof;
2443                 break;
2444         case SEEK_CUR:
2445                 /*
2446                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
2447                  * position-querying operation.  Avoid rewriting the "same"
2448                  * f_pos value back to the file because a concurrent read(),
2449                  * write() or lseek() might have altered it
2450                  */
2451                 if (offset == 0)
2452                         return file->f_pos;
2453                 /*
2454                  * f_lock protects against read/modify/write race with other
2455                  * SEEK_CURs. Note that parallel writes and reads behave
2456                  * like SEEK_SET.
2457                  */
2458                 mutex_lock(&inode->i_mutex);
2459                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
2460                 mutex_unlock(&inode->i_mutex);
2461                 return offset;
2462         case SEEK_DATA:
2463                 /*
2464                  * In the generic case the entire file is data, so as long as
2465                  * offset isn't at the end of the file then the offset is data.
2466                  */
2467                 if (offset >= eof)
2468                         return -ENXIO;
2469                 break;
2470         case SEEK_HOLE:
2471                 /*
2472                  * There is a virtual hole at the end of the file, so as long as
2473                  * offset isn't i_size or larger, return i_size.
2474                  */
2475                 if (offset >= eof)
2476                         return -ENXIO;
2477                 offset = eof;
2478                 break;
2479         }
2480
2481         return llseek_execute(file, offset, maxsize);
2482 }
2483 #endif
2484
2485 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2486 {
2487         struct inode *inode = file->f_dentry->d_inode;
2488         loff_t retval, eof = 0;
2489
2490         ENTRY;
2491         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
2492                            (origin == SEEK_CUR) ? file->f_pos : 0);
2493         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%llu=%#llx(%d)\n",
2494                inode->i_ino, inode->i_generation, inode, retval, retval,
2495                origin);
2496         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2497
2498         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
2499                 retval = ll_glimpse_size(inode);
2500                 if (retval != 0)
2501                         RETURN(retval);
2502                 eof = i_size_read(inode);
2503         }
2504
2505         retval = ll_generic_file_llseek_size(file, offset, origin,
2506                                           ll_file_maxbytes(inode), eof);
2507         RETURN(retval);
2508 }
2509
2510 int ll_flush(struct file *file, fl_owner_t id)
2511 {
2512         struct inode *inode = file->f_dentry->d_inode;
2513         struct ll_inode_info *lli = ll_i2info(inode);
2514         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2515         int rc, err;
2516
2517         LASSERT(!S_ISDIR(inode->i_mode));
2518
2519         /* catch async errors that were recorded back when async writeback
2520          * failed for pages in this mapping. */
2521         rc = lli->lli_async_rc;
2522         lli->lli_async_rc = 0;
2523         err = lov_read_and_clear_async_rc(lli->lli_clob);
2524         if (rc == 0)
2525                 rc = err;
2526
2527         /* The application has been told write failure already.
2528          * Do not report failure again. */
2529         if (fd->fd_write_failed)
2530                 return 0;
2531         return rc ? -EIO : 0;
2532 }
2533
2534 /**
2535  * Called to make sure a portion of file has been written out.
2536  * if @local_only is not true, it will send OST_SYNC RPCs to ost.
2537  *
2538  * Return how many pages have been written.
2539  */
2540 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
2541                        enum cl_fsync_mode mode, int ignore_layout)
2542 {
2543         struct cl_env_nest nest;
2544         struct lu_env *env;
2545         struct cl_io *io;
2546         struct obd_capa *capa = NULL;
2547         struct cl_fsync_io *fio;
2548         int result;
2549         ENTRY;
2550
2551         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
2552             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
2553                 RETURN(-EINVAL);
2554
2555         env = cl_env_nested_get(&nest);
2556         if (IS_ERR(env))
2557                 RETURN(PTR_ERR(env));
2558
2559         capa = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2560
2561         io = ccc_env_thread_io(env);
2562         io->ci_obj = cl_i2info(inode)->lli_clob;
2563         io->ci_ignore_layout = ignore_layout;
2564
2565         /* initialize parameters for sync */
2566         fio = &io->u.ci_fsync;
2567         fio->fi_capa = capa;
2568         fio->fi_start = start;
2569         fio->fi_end = end;
2570         fio->fi_fid = ll_inode2fid(inode);
2571         fio->fi_mode = mode;
2572         fio->fi_nr_written = 0;
2573
2574         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
2575                 result = cl_io_loop(env, io);
2576         else
2577                 result = io->ci_result;
2578         if (result == 0)
2579                 result = fio->fi_nr_written;
2580         cl_io_fini(env, io);
2581         cl_env_nested_put(&nest, env);
2582
2583         capa_put(capa);
2584
2585         RETURN(result);
2586 }
2587
2588 /*
2589  * When dentry is provided (the 'else' case), *file->f_dentry may be
2590  * null and dentry must be used directly rather than pulled from
2591  * *file->f_dentry as is done otherwise.
2592  */
2593
2594 #ifdef HAVE_FILE_FSYNC_4ARGS
2595 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2596 {
2597         struct dentry *dentry = file->f_dentry;
2598 #elif defined(HAVE_FILE_FSYNC_2ARGS)
2599 int ll_fsync(struct file *file, int datasync)
2600 {
2601         struct dentry *dentry = file->f_dentry;
2602 #else
2603 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
2604 {
2605 #endif
2606         struct inode *inode = dentry->d_inode;
2607         struct ll_inode_info *lli = ll_i2info(inode);
2608         struct ptlrpc_request *req;
2609         struct obd_capa *oc;
2610         int rc, err;
2611         ENTRY;
2612
2613         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2614                inode->i_generation, inode);
2615         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2616
2617 #ifdef HAVE_FILE_FSYNC_4ARGS
2618         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
2619         mutex_lock(&inode->i_mutex);
2620 #else
2621         /* fsync's caller has already called _fdata{sync,write}, we want
2622          * that IO to finish before calling the osc and mdc sync methods */
2623         rc = filemap_fdatawait(inode->i_mapping);
2624 #endif
2625
2626         /* catch async errors that were recorded back when async writeback
2627          * failed for pages in this mapping. */
2628         if (!S_ISDIR(inode->i_mode)) {
2629                 err = lli->lli_async_rc;
2630                 lli->lli_async_rc = 0;
2631                 if (rc == 0)
2632                         rc = err;
2633                 err = lov_read_and_clear_async_rc(lli->lli_clob);
2634                 if (rc == 0)
2635                         rc = err;
2636         }
2637
2638         oc = ll_mdscapa_get(inode);
2639         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2640                       &req);
2641         capa_put(oc);
2642         if (!rc)
2643                 rc = err;
2644         if (!err)
2645                 ptlrpc_req_finished(req);
2646
2647         if (datasync && S_ISREG(inode->i_mode)) {
2648                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2649
2650                 err = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
2651                                 CL_FSYNC_ALL, 0);
2652                 if (rc == 0 && err < 0)
2653                         rc = err;
2654                 if (rc < 0)
2655                         fd->fd_write_failed = true;
2656                 else
2657                         fd->fd_write_failed = false;
2658         }
2659
2660 #ifdef HAVE_FILE_FSYNC_4ARGS
2661         mutex_unlock(&inode->i_mutex);
2662 #endif
2663         RETURN(rc);
2664 }
2665
2666 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2667 {
2668         struct inode *inode = file->f_dentry->d_inode;
2669         struct ll_sb_info *sbi = ll_i2sbi(inode);
2670         struct ldlm_enqueue_info einfo = {
2671                 .ei_type        = LDLM_FLOCK,
2672                 .ei_cb_cp       = ldlm_flock_completion_ast,
2673                 .ei_cbdata      = file_lock,
2674         };
2675         struct md_op_data *op_data;
2676         struct lustre_handle lockh = {0};
2677         ldlm_policy_data_t flock = {{0}};
2678         int flags = 0;
2679         int rc;
2680         int rc2 = 0;
2681         ENTRY;
2682
2683         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2684                inode->i_ino, file_lock);
2685
2686         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2687
2688         if (file_lock->fl_flags & FL_FLOCK) {
2689                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2690                 /* flocks are whole-file locks */
2691                 flock.l_flock.end = OFFSET_MAX;
2692                 /* For flocks owner is determined by the local file desctiptor*/
2693                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
2694         } else if (file_lock->fl_flags & FL_POSIX) {
2695                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
2696                 flock.l_flock.start = file_lock->fl_start;
2697                 flock.l_flock.end = file_lock->fl_end;
2698         } else {
2699                 RETURN(-EINVAL);
2700         }
2701         flock.l_flock.pid = file_lock->fl_pid;
2702
2703         /* Somewhat ugly workaround for svc lockd.
2704          * lockd installs custom fl_lmops->lm_compare_owner that checks
2705          * for the fl_owner to be the same (which it always is on local node
2706          * I guess between lockd processes) and then compares pid.
2707          * As such we assign pid to the owner field to make it all work,
2708          * conflict with normal locks is unlikely since pid space and
2709          * pointer space for current->files are not intersecting */
2710         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
2711                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
2712
2713         switch (file_lock->fl_type) {
2714         case F_RDLCK:
2715                 einfo.ei_mode = LCK_PR;
2716                 break;
2717         case F_UNLCK:
2718                 /* An unlock request may or may not have any relation to
2719                  * existing locks so we may not be able to pass a lock handle
2720                  * via a normal ldlm_lock_cancel() request. The request may even
2721                  * unlock a byte range in the middle of an existing lock. In
2722                  * order to process an unlock request we need all of the same
2723                  * information that is given with a normal read or write record
2724                  * lock request. To avoid creating another ldlm unlock (cancel)
2725                  * message we'll treat a LCK_NL flock request as an unlock. */
2726                 einfo.ei_mode = LCK_NL;
2727                 break;
2728         case F_WRLCK:
2729                 einfo.ei_mode = LCK_PW;
2730                 break;
2731         default:
2732                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n",
2733                         file_lock->fl_type);
2734                 RETURN (-ENOTSUPP);
2735         }
2736
2737         switch (cmd) {
2738         case F_SETLKW:
2739 #ifdef F_SETLKW64
2740         case F_SETLKW64:
2741 #endif
2742                 flags = 0;
2743                 break;
2744         case F_SETLK:
2745 #ifdef F_SETLK64
2746         case F_SETLK64:
2747 #endif
2748                 flags = LDLM_FL_BLOCK_NOWAIT;
2749                 break;
2750         case F_GETLK:
2751 #ifdef F_GETLK64
2752         case F_GETLK64:
2753 #endif
2754                 flags = LDLM_FL_TEST_LOCK;
2755                 /* Save the old mode so that if the mode in the lock changes we
2756                  * can decrement the appropriate reader or writer refcount. */
2757                 file_lock->fl_type = einfo.ei_mode;
2758                 break;
2759         default:
2760                 CERROR("unknown fcntl lock command: %d\n", cmd);
2761                 RETURN (-EINVAL);
2762         }
2763
2764         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2765                                      LUSTRE_OPC_ANY, NULL);
2766         if (IS_ERR(op_data))
2767                 RETURN(PTR_ERR(op_data));
2768
2769         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2770                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2771                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2772
2773         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2774                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2775
2776         if ((file_lock->fl_flags & FL_FLOCK) &&
2777             (rc == 0 || file_lock->fl_type == F_UNLCK))
2778                 rc2  = flock_lock_file_wait(file, file_lock);
2779         if ((file_lock->fl_flags & FL_POSIX) &&
2780             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2781             !(flags & LDLM_FL_TEST_LOCK))
2782                 rc2  = posix_lock_file_wait(file, file_lock);
2783
2784         if (rc2 && file_lock->fl_type != F_UNLCK) {
2785                 einfo.ei_mode = LCK_NL;
2786                 md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2787                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2788                 rc = rc2;
2789         }
2790
2791         ll_finish_md_op_data(op_data);
2792
2793         RETURN(rc);
2794 }
2795
2796 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2797 {
2798         ENTRY;
2799
2800         RETURN(-ENOSYS);
2801 }
2802
2803 /**
2804  * test if some locks matching bits and l_req_mode are acquired
2805  * - bits can be in different locks
2806  * - if found clear the common lock bits in *bits
2807  * - the bits not found, are kept in *bits
2808  * \param inode [IN]
2809  * \param bits [IN] searched lock bits [IN]
2810  * \param l_req_mode [IN] searched lock mode
2811  * \retval boolean, true iff all bits are found
2812  */
2813 int ll_have_md_lock(struct inode *inode, __u64 *bits,  ldlm_mode_t l_req_mode)
2814 {
2815         struct lustre_handle lockh;
2816         ldlm_policy_data_t policy;
2817         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
2818                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
2819         struct lu_fid *fid;
2820         __u64 flags;
2821         int i;
2822         ENTRY;
2823
2824         if (!inode)
2825                RETURN(0);
2826
2827         fid = &ll_i2info(inode)->lli_fid;
2828         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
2829                ldlm_lockname[mode]);
2830
2831         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2832         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
2833                 policy.l_inodebits.bits = *bits & (1 << i);
2834                 if (policy.l_inodebits.bits == 0)
2835                         continue;
2836
2837                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
2838                                   &policy, mode, &lockh)) {
2839                         struct ldlm_lock *lock;
2840
2841                         lock = ldlm_handle2lock(&lockh);
2842                         if (lock) {
2843                                 *bits &=
2844                                       ~(lock->l_policy_data.l_inodebits.bits);
2845                                 LDLM_LOCK_PUT(lock);
2846                         } else {
2847                                 *bits &= ~policy.l_inodebits.bits;
2848                         }
2849                 }
2850         }
2851         RETURN(*bits == 0);
2852 }
2853
2854 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2855                             struct lustre_handle *lockh, __u64 flags)
2856 {
2857         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2858         struct lu_fid *fid;
2859         ldlm_mode_t rc;
2860         ENTRY;
2861
2862         fid = &ll_i2info(inode)->lli_fid;
2863         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2864
2865         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
2866                            fid, LDLM_IBITS, &policy,
2867                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2868         RETURN(rc);
2869 }
2870
2871 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
2872 {
2873         /* Already unlinked. Just update nlink and return success */
2874         if (rc == -ENOENT) {
2875                 clear_nlink(inode);
2876                 /* This path cannot be hit for regular files unless in
2877                  * case of obscure races, so no need to to validate
2878                  * size. */
2879                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
2880                         return 0;
2881         } else if (rc != 0) {
2882                 CERROR("%s: revalidate FID "DFID" error: rc = %d\n",
2883                        ll_get_fsname(inode->i_sb, NULL, 0),
2884                        PFID(ll_inode2fid(inode)), rc);
2885         }
2886
2887         return rc;
2888 }
2889
2890 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2891                              __u64 ibits)
2892 {
2893         struct inode *inode = dentry->d_inode;
2894         struct ptlrpc_request *req = NULL;
2895         struct obd_export *exp;
2896         int rc = 0;
2897         ENTRY;
2898
2899         LASSERT(inode != NULL);
2900
2901         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2902                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2903
2904         exp = ll_i2mdexp(inode);
2905
2906         /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC.
2907          *      But under CMD case, it caused some lock issues, should be fixed
2908          *      with new CMD ibits lock. See bug 12718 */
2909         if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) {
2910                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2911                 struct md_op_data *op_data;
2912
2913                 if (ibits == MDS_INODELOCK_LOOKUP)
2914                         oit.it_op = IT_LOOKUP;
2915
2916                 /* Call getattr by fid, so do not provide name at all. */
2917                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2918                                              dentry->d_inode, NULL, 0, 0,
2919                                              LUSTRE_OPC_ANY, NULL);
2920                 if (IS_ERR(op_data))
2921                         RETURN(PTR_ERR(op_data));
2922
2923                 oit.it_create_mode |= M_CHECK_STALE;
2924                 rc = md_intent_lock(exp, op_data, NULL, 0,
2925                                     /* we are not interested in name
2926                                        based lookup */
2927                                     &oit, 0, &req,
2928                                     ll_md_blocking_ast, 0);
2929                 ll_finish_md_op_data(op_data);
2930                 oit.it_create_mode &= ~M_CHECK_STALE;
2931                 if (rc < 0) {
2932                         rc = ll_inode_revalidate_fini(inode, rc);
2933                         GOTO (out, rc);
2934                 }
2935
2936                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2937                 if (rc != 0) {
2938                         ll_intent_release(&oit);
2939                         GOTO(out, rc);
2940                 }
2941
2942                 /* Unlinked? Unhash dentry, so it is not picked up later by
2943                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2944                    here to preserve get_cwd functionality on 2.6.
2945                    Bug 10503 */
2946                 if (!dentry->d_inode->i_nlink)
2947                         d_lustre_invalidate(dentry, 0);
2948
2949                 ll_lookup_finish_locks(&oit, dentry);
2950         } else if (!ll_have_md_lock(dentry->d_inode, &ibits, LCK_MINMODE)) {
2951                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2952                 obd_valid valid = OBD_MD_FLGETATTR;
2953                 struct md_op_data *op_data;
2954                 int ealen = 0;
2955
2956                 if (S_ISREG(inode->i_mode)) {
2957                         rc = ll_get_max_mdsize(sbi, &ealen);
2958                         if (rc)
2959                                 RETURN(rc);
2960                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2961                 }
2962
2963                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL,
2964                                              0, ealen, LUSTRE_OPC_ANY,
2965                                              NULL);
2966                 if (IS_ERR(op_data))
2967                         RETURN(PTR_ERR(op_data));
2968
2969                 op_data->op_valid = valid;
2970                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2971                  * capa for this inode. Because we only keep capas of dirs
2972                  * fresh. */
2973                 rc = md_getattr(sbi->ll_md_exp, op_data, &req);
2974                 ll_finish_md_op_data(op_data);
2975                 if (rc) {
2976                         rc = ll_inode_revalidate_fini(inode, rc);
2977                         RETURN(rc);
2978                 }
2979
2980                 rc = ll_prep_inode(&inode, req, NULL, NULL);
2981         }
2982 out:
2983         ptlrpc_req_finished(req);
2984         return rc;
2985 }
2986
2987 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2988                            __u64 ibits)
2989 {
2990         struct inode    *inode = dentry->d_inode;
2991         int              rc;
2992         ENTRY;
2993
2994         rc = __ll_inode_revalidate_it(dentry, it, ibits);
2995         if (rc != 0)
2996                 RETURN(rc);
2997
2998         /* if object isn't regular file, don't validate size */
2999         if (!S_ISREG(inode->i_mode)) {
3000                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3001                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3002                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3003         } else {
3004                 /* In case of restore, the MDT has the right size and has
3005                  * already send it back without granting the layout lock,
3006                  * inode is up-to-date so glimpse is useless.
3007                  * Also to glimpse we need the layout, in case of a running
3008                  * restore the MDT holds the layout lock so the glimpse will
3009                  * block up to the end of restore (getattr will block)
3010                  */
3011                 if (!(ll_i2info(inode)->lli_flags & LLIF_FILE_RESTORING))
3012                         rc = ll_glimpse_size(inode);
3013         }
3014         RETURN(rc);
3015 }
3016
3017 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3018                   struct lookup_intent *it, struct kstat *stat)
3019 {
3020         struct inode *inode = de->d_inode;
3021         struct ll_sb_info *sbi = ll_i2sbi(inode);
3022         struct ll_inode_info *lli = ll_i2info(inode);
3023         int res = 0;
3024
3025         res = ll_inode_revalidate_it(de, it, MDS_INODELOCK_UPDATE |
3026                                              MDS_INODELOCK_LOOKUP);
3027         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3028
3029         if (res)
3030                 return res;
3031
3032         stat->dev = inode->i_sb->s_dev;
3033         if (ll_need_32bit_api(sbi))
3034                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
3035         else
3036                 stat->ino = inode->i_ino;
3037         stat->mode = inode->i_mode;
3038         stat->nlink = inode->i_nlink;
3039         stat->uid = inode->i_uid;
3040         stat->gid = inode->i_gid;
3041         stat->rdev = inode->i_rdev;
3042         stat->atime = inode->i_atime;
3043         stat->mtime = inode->i_mtime;
3044         stat->ctime = inode->i_ctime;
3045         stat->blksize = 1 << inode->i_blkbits;
3046
3047         stat->size = i_size_read(inode);
3048         stat->blocks = inode->i_blocks;
3049
3050         return 0;
3051 }
3052 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3053 {
3054         struct lookup_intent it = { .it_op = IT_GETATTR };
3055
3056         return ll_getattr_it(mnt, de, &it, stat);
3057 }
3058
3059 #ifdef HAVE_LINUX_FIEMAP_H
3060 int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3061                 __u64 start, __u64 len)
3062 {
3063         int rc;
3064         size_t num_bytes;
3065         struct ll_user_fiemap *fiemap;
3066         unsigned int extent_count = fieinfo->fi_extents_max;
3067
3068         num_bytes = sizeof(*fiemap) + (extent_count *
3069                                        sizeof(struct ll_fiemap_extent));
3070         OBD_ALLOC_LARGE(fiemap, num_bytes);
3071
3072         if (fiemap == NULL)
3073                 RETURN(-ENOMEM);
3074
3075         fiemap->fm_flags = fieinfo->fi_flags;
3076         fiemap->fm_extent_count = fieinfo->fi_extents_max;
3077         fiemap->fm_start = start;
3078         fiemap->fm_length = len;
3079         memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
3080                sizeof(struct ll_fiemap_extent));
3081
3082         rc = ll_do_fiemap(inode, fiemap, num_bytes);
3083
3084         fieinfo->fi_flags = fiemap->fm_flags;
3085         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
3086         memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
3087                fiemap->fm_mapped_extents * sizeof(struct ll_fiemap_extent));
3088
3089         OBD_FREE_LARGE(fiemap, num_bytes);
3090         return rc;
3091 }
3092 #endif
3093
3094 struct posix_acl * ll_get_acl(struct inode *inode, int type)
3095 {
3096         struct ll_inode_info *lli = ll_i2info(inode);
3097         struct posix_acl *acl = NULL;
3098         ENTRY;
3099
3100         spin_lock(&lli->lli_lock);
3101         /* VFS' acl_permission_check->check_acl will release the refcount */
3102         acl = posix_acl_dup(lli->lli_posix_acl);
3103         spin_unlock(&lli->lli_lock);
3104
3105         RETURN(acl);
3106 }
3107
3108 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
3109 static int
3110 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
3111 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
3112 # else
3113 ll_check_acl(struct inode *inode, int mask)
3114 # endif
3115 {
3116 # ifdef CONFIG_FS_POSIX_ACL
3117         struct posix_acl *acl;
3118         int rc;
3119         ENTRY;
3120
3121 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
3122         if (flags & IPERM_FLAG_RCU)
3123                 return -ECHILD;
3124 #  endif
3125         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
3126
3127         if (!acl)
3128                 RETURN(-EAGAIN);
3129
3130         rc = posix_acl_permission(inode, acl, mask);
3131         posix_acl_release(acl);
3132
3133         RETURN(rc);
3134 # else /* !CONFIG_FS_POSIX_ACL */
3135         return -EAGAIN;
3136 # endif /* CONFIG_FS_POSIX_ACL */
3137 }
3138 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
3139
3140 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
3141 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
3142 #else
3143 # ifdef HAVE_INODE_PERMISION_2ARGS
3144 int ll_inode_permission(struct inode *inode, int mask)
3145 # else
3146 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3147 # endif
3148 #endif
3149 {
3150         int rc = 0;
3151         ENTRY;
3152
3153 #ifdef MAY_NOT_BLOCK
3154         if (mask & MAY_NOT_BLOCK)
3155                 return -ECHILD;
3156 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
3157         if (flags & IPERM_FLAG_RCU)
3158                 return -ECHILD;
3159 #endif
3160
3161        /* as root inode are NOT getting validated in lookup operation,
3162         * need to do it before permission check. */
3163
3164         if (inode == inode->i_sb->s_root->d_inode) {
3165                 struct lookup_intent it = { .it_op = IT_LOOKUP };
3166
3167                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3168                                               MDS_INODELOCK_LOOKUP);
3169                 if (rc)
3170                         RETURN(rc);
3171         }
3172
3173         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3174                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3175
3176         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3177                 return lustre_check_remote_perm(inode, mask);
3178
3179         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3180         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
3181
3182         RETURN(rc);
3183 }
3184
3185 /* -o localflock - only provides locally consistent flock locks */
3186 struct file_operations ll_file_operations = {
3187         .read           = ll_file_read,
3188         .aio_read    = ll_file_aio_read,
3189         .write          = ll_file_write,
3190         .aio_write   = ll_file_aio_write,
3191         .unlocked_ioctl = ll_file_ioctl,
3192         .open           = ll_file_open,
3193         .release        = ll_file_release,
3194         .mmap           = ll_file_mmap,
3195         .llseek         = ll_file_seek,
3196         .splice_read    = ll_file_splice_read,
3197         .fsync          = ll_fsync,
3198         .flush          = ll_flush
3199 };
3200
3201 struct file_operations ll_file_operations_flock = {
3202         .read           = ll_file_read,
3203         .aio_read    = ll_file_aio_read,
3204         .write          = ll_file_write,
3205         .aio_write   = ll_file_aio_write,
3206         .unlocked_ioctl = ll_file_ioctl,
3207         .open           = ll_file_open,
3208         .release        = ll_file_release,
3209         .mmap           = ll_file_mmap,
3210         .llseek         = ll_file_seek,
3211         .splice_read    = ll_file_splice_read,
3212         .fsync          = ll_fsync,
3213         .flush          = ll_flush,
3214         .flock          = ll_file_flock,
3215         .lock           = ll_file_flock
3216 };
3217
3218 /* These are for -o noflock - to return ENOSYS on flock calls */
3219 struct file_operations ll_file_operations_noflock = {
3220         .read           = ll_file_read,
3221         .aio_read    = ll_file_aio_read,
3222         .write          = ll_file_write,
3223         .aio_write   = ll_file_aio_write,
3224         .unlocked_ioctl = ll_file_ioctl,
3225         .open           = ll_file_open,
3226         .release        = ll_file_release,
3227         .mmap           = ll_file_mmap,
3228         .llseek         = ll_file_seek,
3229         .splice_read    = ll_file_splice_read,
3230         .fsync          = ll_fsync,
3231         .flush          = ll_flush,
3232         .flock          = ll_file_noflock,
3233         .lock           = ll_file_noflock
3234 };
3235
3236 struct inode_operations ll_file_inode_operations = {
3237         .setattr        = ll_setattr,
3238         .getattr        = ll_getattr,
3239         .permission     = ll_inode_permission,
3240         .setxattr       = ll_setxattr,
3241         .getxattr       = ll_getxattr,
3242         .listxattr      = ll_listxattr,
3243         .removexattr    = ll_removexattr,
3244 #ifdef  HAVE_LINUX_FIEMAP_H
3245         .fiemap         = ll_fiemap,
3246 #endif
3247 #ifdef HAVE_IOP_GET_ACL
3248         .get_acl        = ll_get_acl,
3249 #endif
3250 };
3251
3252 /* dynamic ioctl number support routins */
3253 static struct llioc_ctl_data {
3254         struct rw_semaphore     ioc_sem;
3255         cfs_list_t              ioc_head;
3256 } llioc = {
3257         __RWSEM_INITIALIZER(llioc.ioc_sem),
3258         CFS_LIST_HEAD_INIT(llioc.ioc_head)
3259 };
3260
3261
3262 struct llioc_data {
3263         cfs_list_t              iocd_list;
3264         unsigned int            iocd_size;
3265         llioc_callback_t        iocd_cb;
3266         unsigned int            iocd_count;
3267         unsigned int            iocd_cmd[0];
3268 };
3269
3270 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3271 {
3272         unsigned int size;
3273         struct llioc_data *in_data = NULL;
3274         ENTRY;
3275
3276         if (cb == NULL || cmd == NULL ||
3277             count > LLIOC_MAX_CMD || count < 0)
3278                 RETURN(NULL);
3279
3280         size = sizeof(*in_data) + count * sizeof(unsigned int);
3281         OBD_ALLOC(in_data, size);
3282         if (in_data == NULL)
3283                 RETURN(NULL);
3284
3285         memset(in_data, 0, sizeof(*in_data));
3286         in_data->iocd_size = size;
3287         in_data->iocd_cb = cb;
3288         in_data->iocd_count = count;
3289         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3290
3291         down_write(&llioc.ioc_sem);
3292         cfs_list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3293         up_write(&llioc.ioc_sem);
3294
3295         RETURN(in_data);
3296 }
3297
3298 void ll_iocontrol_unregister(void *magic)
3299 {
3300         struct llioc_data *tmp;
3301
3302         if (magic == NULL)
3303                 return;
3304
3305         down_write(&llioc.ioc_sem);
3306         cfs_list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3307                 if (tmp == magic) {
3308                         unsigned int size = tmp->iocd_size;
3309
3310                         cfs_list_del(&tmp->iocd_list);
3311                         up_write(&llioc.ioc_sem);
3312
3313                         OBD_FREE(tmp, size);
3314                         return;
3315                 }
3316         }
3317         up_write(&llioc.ioc_sem);
3318
3319         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3320 }
3321
3322 EXPORT_SYMBOL(ll_iocontrol_register);
3323 EXPORT_SYMBOL(ll_iocontrol_unregister);
3324
3325 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3326                         unsigned int cmd, unsigned long arg, int *rcp)
3327 {
3328         enum llioc_iter ret = LLIOC_CONT;
3329         struct llioc_data *data;
3330         int rc = -EINVAL, i;
3331
3332         down_read(&llioc.ioc_sem);
3333         cfs_list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3334                 for (i = 0; i < data->iocd_count; i++) {
3335                         if (cmd != data->iocd_cmd[i])
3336                                 continue;
3337
3338                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3339                         break;
3340                 }
3341
3342                 if (ret == LLIOC_STOP)
3343                         break;
3344         }
3345         up_read(&llioc.ioc_sem);
3346
3347         if (rcp)
3348                 *rcp = rc;
3349         return ret;
3350 }
3351
3352 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
3353 {
3354         struct ll_inode_info *lli = ll_i2info(inode);
3355         struct cl_env_nest nest;
3356         struct lu_env *env;
3357         int result;
3358         ENTRY;
3359
3360         if (lli->lli_clob == NULL)
3361                 RETURN(0);
3362
3363         env = cl_env_nested_get(&nest);
3364         if (IS_ERR(env))
3365                 RETURN(PTR_ERR(env));
3366
3367         result = cl_conf_set(env, lli->lli_clob, conf);
3368         cl_env_nested_put(&nest, env);
3369
3370         if (conf->coc_opc == OBJECT_CONF_SET) {
3371                 struct ldlm_lock *lock = conf->coc_lock;
3372
3373                 LASSERT(lock != NULL);
3374                 LASSERT(ldlm_has_layout(lock));
3375                 if (result == 0) {
3376                         /* it can only be allowed to match after layout is
3377                          * applied to inode otherwise false layout would be
3378                          * seen. Applying layout shoud happen before dropping
3379                          * the intent lock. */
3380                         ldlm_lock_allow_match(lock);
3381                 }
3382         }
3383         RETURN(result);
3384 }
3385
3386 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
3387 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
3388
3389 {
3390         struct ll_sb_info *sbi = ll_i2sbi(inode);
3391         struct obd_capa *oc;
3392         struct ptlrpc_request *req;
3393         struct mdt_body *body;
3394         void *lvbdata;
3395         void *lmm;
3396         int lmmsize;
3397         int rc;
3398         ENTRY;
3399
3400         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
3401                PFID(ll_inode2fid(inode)), !!(lock->l_flags & LDLM_FL_LVB_READY),
3402                lock->l_lvb_data, lock->l_lvb_len);
3403
3404         if ((lock->l_lvb_data != NULL) && (lock->l_flags & LDLM_FL_LVB_READY))
3405                 RETURN(0);
3406
3407         /* if layout lock was granted right away, the layout is returned
3408          * within DLM_LVB of dlm reply; otherwise if the lock was ever
3409          * blocked and then granted via completion ast, we have to fetch
3410          * layout here. Please note that we can't use the LVB buffer in
3411          * completion AST because it doesn't have a large enough buffer */
3412         oc = ll_mdscapa_get(inode);
3413         rc = ll_get_max_mdsize(sbi, &lmmsize);
3414         if (rc == 0)
3415                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), oc,
3416                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
3417                                 lmmsize, 0, &req);
3418         capa_put(oc);
3419         if (rc < 0)
3420                 RETURN(rc);
3421
3422         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3423         if (body == NULL || body->eadatasize > lmmsize)
3424                 GOTO(out, rc = -EPROTO);
3425
3426         lmmsize = body->eadatasize;
3427         if (lmmsize == 0) /* empty layout */
3428                 GOTO(out, rc = 0);
3429
3430         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
3431         if (lmm == NULL)
3432                 GOTO(out, rc = -EFAULT);
3433
3434         OBD_ALLOC_LARGE(lvbdata, lmmsize);
3435         if (lvbdata == NULL)
3436                 GOTO(out, rc = -ENOMEM);
3437
3438         memcpy(lvbdata, lmm, lmmsize);
3439         lock_res_and_lock(lock);
3440         if (lock->l_lvb_data != NULL)
3441                 OBD_FREE_LARGE(lock->l_lvb_data, lock->l_lvb_len);
3442
3443         lock->l_lvb_data = lvbdata;
3444         lock->l_lvb_len = lmmsize;
3445         unlock_res_and_lock(lock);
3446
3447         EXIT;
3448
3449 out:
3450         ptlrpc_req_finished(req);
3451         return rc;
3452 }
3453
3454 /**
3455  * Apply the layout to the inode. Layout lock is held and will be released
3456  * in this function.
3457  */
3458 static int ll_layout_lock_set(struct lustre_handle *lockh, ldlm_mode_t mode,
3459                                 struct inode *inode, __u32 *gen, bool reconf)
3460 {
3461         struct ll_inode_info *lli = ll_i2info(inode);
3462         struct ll_sb_info    *sbi = ll_i2sbi(inode);
3463         struct ldlm_lock *lock;
3464         struct lustre_md md = { NULL };
3465         struct cl_object_conf conf;
3466         int rc = 0;
3467         bool lvb_ready;
3468         bool wait_layout = false;
3469         ENTRY;
3470
3471         LASSERT(lustre_handle_is_used(lockh));
3472
3473         lock = ldlm_handle2lock(lockh);
3474         LASSERT(lock != NULL);
3475         LASSERT(ldlm_has_layout(lock));
3476
3477         LDLM_DEBUG(lock, "File %p/"DFID" being reconfigured: %d.\n",
3478                    inode, PFID(&lli->lli_fid), reconf);
3479
3480         /* in case this is a caching lock and reinstate with new inode */
3481         md_set_lock_data(sbi->ll_md_exp, &lockh->cookie, inode, NULL);
3482
3483         lock_res_and_lock(lock);
3484         lvb_ready = !!(lock->l_flags & LDLM_FL_LVB_READY);
3485         unlock_res_and_lock(lock);
3486         /* checking lvb_ready is racy but this is okay. The worst case is
3487          * that multi processes may configure the file on the same time. */
3488
3489         if (lvb_ready || !reconf) {
3490                 rc = -ENODATA;
3491                 if (lvb_ready) {
3492                         /* layout_gen must be valid if layout lock is not
3493                          * cancelled and stripe has already set */
3494                         *gen = lli->lli_layout_gen;
3495                         rc = 0;
3496                 }
3497                 GOTO(out, rc);
3498         }
3499
3500         rc = ll_layout_fetch(inode, lock);
3501         if (rc < 0)
3502                 GOTO(out, rc);
3503
3504         /* for layout lock, lmm is returned in lock's lvb.
3505          * lvb_data is immutable if the lock is held so it's safe to access it
3506          * without res lock. See the description in ldlm_lock_decref_internal()
3507          * for the condition to free lvb_data of layout lock */
3508         if (lock->l_lvb_data != NULL) {
3509                 rc = obd_unpackmd(sbi->ll_dt_exp, &md.lsm,
3510                                   lock->l_lvb_data, lock->l_lvb_len);
3511                 if (rc >= 0) {
3512                         *gen = LL_LAYOUT_GEN_EMPTY;
3513                         if (md.lsm != NULL)
3514                                 *gen = md.lsm->lsm_layout_gen;
3515                         rc = 0;
3516                 } else {
3517                         CERROR("%s: file "DFID" unpackmd error: %d\n",
3518                                 ll_get_fsname(inode->i_sb, NULL, 0),
3519                                 PFID(&lli->lli_fid), rc);
3520                 }
3521         }
3522         if (rc < 0)
3523                 GOTO(out, rc);
3524
3525         /* set layout to file. Unlikely this will fail as old layout was
3526          * surely eliminated */
3527         memset(&conf, 0, sizeof conf);
3528         conf.coc_opc = OBJECT_CONF_SET;
3529         conf.coc_inode = inode;
3530         conf.coc_lock = lock;
3531         conf.u.coc_md = &md;
3532         rc = ll_layout_conf(inode, &conf);
3533
3534         if (md.lsm != NULL)
3535                 obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
3536
3537         /* refresh layout failed, need to wait */
3538         wait_layout = rc == -EBUSY;
3539         EXIT;
3540
3541 out:
3542         LDLM_LOCK_PUT(lock);
3543         ldlm_lock_decref(lockh, mode);
3544
3545         /* wait for IO to complete if it's still being used. */
3546         if (wait_layout) {
3547                 CDEBUG(D_INODE, "%s: %p/"DFID" wait for layout reconf.\n",
3548                         ll_get_fsname(inode->i_sb, NULL, 0),
3549                         inode, PFID(&lli->lli_fid));
3550
3551                 memset(&conf, 0, sizeof conf);
3552                 conf.coc_opc = OBJECT_CONF_WAIT;
3553                 conf.coc_inode = inode;
3554                 rc = ll_layout_conf(inode, &conf);
3555                 if (rc == 0)
3556                         rc = -EAGAIN;
3557
3558                 CDEBUG(D_INODE, "file: "DFID" waiting layout return: %d.\n",
3559                         PFID(&lli->lli_fid), rc);
3560         }
3561         RETURN(rc);
3562 }
3563
3564 /**
3565  * This function checks if there exists a LAYOUT lock on the client side,
3566  * or enqueues it if it doesn't have one in cache.
3567  *
3568  * This function will not hold layout lock so it may be revoked any time after
3569  * this function returns. Any operations depend on layout should be redone
3570  * in that case.
3571  *
3572  * This function should be called before lov_io_init() to get an uptodate
3573  * layout version, the caller should save the version number and after IO
3574  * is finished, this function should be called again to verify that layout
3575  * is not changed during IO time.
3576  */
3577 int ll_layout_refresh(struct inode *inode, __u32 *gen)
3578 {
3579         struct ll_inode_info  *lli = ll_i2info(inode);
3580         struct ll_sb_info     *sbi = ll_i2sbi(inode);
3581         struct md_op_data     *op_data;
3582         struct lookup_intent   it;
3583         struct lustre_handle   lockh;
3584         ldlm_mode_t            mode;
3585         struct ldlm_enqueue_info einfo = {
3586                 .ei_type = LDLM_IBITS,
3587                 .ei_mode = LCK_CR,
3588                 .ei_cb_bl = ll_md_blocking_ast,
3589                 .ei_cb_cp = ldlm_completion_ast,
3590         };
3591         int rc;
3592         ENTRY;
3593
3594         *gen = lli->lli_layout_gen;
3595         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK))
3596                 RETURN(0);
3597
3598         /* sanity checks */
3599         LASSERT(fid_is_sane(ll_inode2fid(inode)));
3600         LASSERT(S_ISREG(inode->i_mode));
3601
3602         /* mostly layout lock is caching on the local side, so try to match
3603          * it before grabbing layout lock mutex. */
3604         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3605         if (mode != 0) { /* hit cached lock */
3606                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, false);
3607                 if (rc == 0)
3608                         RETURN(0);
3609
3610                 /* better hold lli_layout_mutex to try again otherwise
3611                  * it will have starvation problem. */
3612         }
3613
3614         /* take layout lock mutex to enqueue layout lock exclusively. */
3615         mutex_lock(&lli->lli_layout_mutex);
3616
3617 again:
3618         /* try again. Maybe somebody else has done this. */
3619         mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0);
3620         if (mode != 0) { /* hit cached lock */
3621                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3622                 if (rc == -EAGAIN)
3623                         goto again;
3624
3625                 mutex_unlock(&lli->lli_layout_mutex);
3626                 RETURN(rc);
3627         }
3628
3629         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
3630                         0, 0, LUSTRE_OPC_ANY, NULL);
3631         if (IS_ERR(op_data)) {
3632                 mutex_unlock(&lli->lli_layout_mutex);
3633                 RETURN(PTR_ERR(op_data));
3634         }
3635
3636         /* have to enqueue one */
3637         memset(&it, 0, sizeof(it));
3638         it.it_op = IT_LAYOUT;
3639         lockh.cookie = 0ULL;
3640
3641         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file %p/"DFID".\n",
3642                         ll_get_fsname(inode->i_sb, NULL, 0), inode,
3643                         PFID(&lli->lli_fid));
3644
3645         rc = md_enqueue(sbi->ll_md_exp, &einfo, &it, op_data, &lockh,
3646                         NULL, 0, NULL, 0);
3647         if (it.d.lustre.it_data != NULL)
3648                 ptlrpc_req_finished(it.d.lustre.it_data);
3649         it.d.lustre.it_data = NULL;
3650
3651         ll_finish_md_op_data(op_data);
3652
3653         mode = it.d.lustre.it_lock_mode;
3654         it.d.lustre.it_lock_mode = 0;
3655         ll_intent_drop_lock(&it);
3656
3657         if (rc == 0) {
3658                 /* set lock data in case this is a new lock */
3659                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
3660                 rc = ll_layout_lock_set(&lockh, mode, inode, gen, true);
3661                 if (rc == -EAGAIN)
3662                         goto again;
3663         }
3664         mutex_unlock(&lli->lli_layout_mutex);
3665
3666         RETURN(rc);
3667 }
3668
3669 /**
3670  *  This function send a restore request to the MDT
3671  */
3672 int ll_layout_restore(struct inode *inode)
3673 {
3674         struct hsm_user_request *hur;
3675         int                      len, rc;
3676         ENTRY;
3677
3678         len = sizeof(struct hsm_user_request) +
3679               sizeof(struct hsm_user_item);
3680         OBD_ALLOC(hur, len);
3681         if (hur == NULL)
3682                 RETURN(-ENOMEM);
3683
3684         hur->hur_request.hr_action = HUA_RESTORE;
3685         hur->hur_request.hr_archive_id = 0;
3686         hur->hur_request.hr_flags = 0;
3687         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
3688                sizeof(hur->hur_user_item[0].hui_fid));
3689         hur->hur_user_item[0].hui_extent.length = -1;
3690         hur->hur_request.hr_itemcount = 1;
3691         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, cl_i2sbi(inode)->ll_md_exp,
3692                            len, hur, NULL);
3693         OBD_FREE(hur, len);
3694         RETURN(rc);
3695 }
3696