4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): this excerpt is missing lines; the fragments below appear to
 * be the tail of a struct definition (sp_inode) and forward declarations of
 * ll_put_grouplock() / ll_lease_close() — confirm against the full source. */
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate per-open file state from the dedicated slab cache.
 * GFP_NOFS avoids recursing back into the filesystem under memory pressure.
 * (NULL check / return statement elided from this excerpt.) */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
/* Return per-open file state to its slab cache. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Snapshot the inode's current attributes (mode, times, size, blocks, flags)
 * into @op_data and record the open handle being closed, so the MDT can
 * update the file's metadata at close time. */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
/* Mark every copied attribute valid so the server applies all of them. */
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 op_data->op_handle = och->och_fh;
/* Only write opens can have dirtied data; the flag is test-and-clear so the
 * HSM dirty notification is sent at most once per modification. */
109 if (och->och_flags & FMODE_WRITE &&
110 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
111 /* For HSM: if inode data has been modified, pack it so that
112 * MDT can set data dirty flag in the archive. */
113 op_data->op_bias |= MDS_DATA_MODIFIED;
119 * Perform a close, possibly with a bias.
120 * The meaning of "data" depends on the value of "bias".
122 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
123 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Send the MDS_CLOSE RPC for @och on @inode, optionally carrying a
 * close "intent" (@bias): HSM release, layout merge/split/swap, or resync
 * done.  @data is interpreted per-bias as documented above.
 * NOTE(review): several lines (switch head, GOTO targets, return) are
 * elided from this excerpt — confirm control flow against full source. */
126 static int ll_close_inode_openhandle(struct inode *inode,
127 struct obd_client_handle *och,
128 enum mds_op_bias bias, void *data)
130 struct obd_export *md_exp = ll_i2mdexp(inode);
131 const struct ll_inode_info *lli = ll_i2info(inode);
132 struct md_op_data *op_data;
133 struct ptlrpc_request *req = NULL;
/* Bail out early if the MDC export is already disconnected. */
137 if (class_exp2obd(md_exp) == NULL) {
138 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
139 ll_get_fsname(inode->i_sb, NULL, 0),
140 PFID(&lli->lli_fid));
144 OBD_ALLOC_PTR(op_data);
145 /* We leak openhandle and request here on error, but not much to be
146 * done in OOM case since app won't retry close on error either. */
148 GOTO(out, rc = -ENOMEM);
150 ll_prepare_close(inode, op_data, och);
/* Per-bias packing of the close intent (apparent switch on @bias). */
152 case MDS_CLOSE_LAYOUT_MERGE:
153 /* merge blocks from the victim inode */
154 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
155 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* NOTE(review): MERGE appears to fall through into SPLIT/SWAP packing. */
156 case MDS_CLOSE_LAYOUT_SPLIT:
157 case MDS_CLOSE_LAYOUT_SWAP: {
158 struct split_param *sp = data;
160 LASSERT(data != NULL);
161 op_data->op_bias |= bias;
162 op_data->op_data_version = 0;
163 op_data->op_lease_handle = och->och_lease_handle;
164 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
165 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
166 op_data->op_mirror_id = sp->sp_mirror_id;
/* SWAP/MERGE: @data is the victim inode itself (else branch elided). */
168 op_data->op_fid2 = *ll_inode2fid(data);
173 case MDS_CLOSE_RESYNC_DONE: {
174 struct ll_ioc_lease *ioc = data;
176 LASSERT(data != NULL);
177 op_data->op_attr_blocks +=
178 ioc->lil_count * op_data->op_attr_blocks;
179 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
180 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
182 op_data->op_lease_handle = och->och_lease_handle;
183 op_data->op_data = &ioc->lil_ids[0];
184 op_data->op_data_size =
185 ioc->lil_count * sizeof(ioc->lil_ids[0]);
189 case MDS_HSM_RELEASE:
190 LASSERT(data != NULL);
191 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data is the expected data version; MDT refuses release on mismatch. */
192 op_data->op_data_version = *(__u64 *)data;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* default case: a plain close must not carry intent data. */
198 LASSERT(data == NULL);
202 rc = md_close(md_exp, op_data, och->och_mod, &req);
203 if (rc != 0 && rc != -EINTR)
204 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
205 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* If an intent was requested, verify the server actually executed it. */
207 if (rc == 0 && op_data->op_bias & bias) {
208 struct mdt_body *body;
210 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
211 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
215 ll_finish_md_op_data(op_data);
/* Drop replay data and poison the handle so reuse is detectable. */
219 md_clear_open_replay_data(md_exp, och);
220 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
223 ptlrpc_req_finished(req); /* This is close request */
/* Really close the MDS open handle for the given open mode (read, write,
 * or exec), but only when no other local users still hold it.  Selects the
 * per-mode handle pointer and use count under lli_och_mutex. */
227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
229 struct ll_inode_info *lli = ll_i2info(inode);
230 struct obd_client_handle **och_p;
231 struct obd_client_handle *och;
236 if (fmode & FMODE_WRITE) {
237 och_p = &lli->lli_mds_write_och;
238 och_usecount = &lli->lli_open_fd_write_count;
239 } else if (fmode & FMODE_EXEC) {
240 och_p = &lli->lli_mds_exec_och;
241 och_usecount = &lli->lli_open_fd_exec_count;
/* else branch (read mode) — brace elided in this excerpt. */
243 LASSERT(fmode & FMODE_READ);
244 och_p = &lli->lli_mds_read_och;
245 och_usecount = &lli->lli_open_fd_read_count;
248 mutex_lock(&lli->lli_och_mutex);
249 if (*och_usecount > 0) {
250 /* There are still users of this handle, so skip
252 mutex_unlock(&lli->lli_och_mutex);
258 mutex_unlock(&lli->lli_och_mutex);
261 /* There might be a race and this handle may already
/* Plain close (bias 0, no intent data). */
263 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock and any lease held on this
 * fd, drop the per-mode open count, and only talk to the MDS (via
 * ll_md_real_close()) when we do not hold a matching OPEN DLM lock that
 * lets us skip the RPC.  Finally frees the ll_file_data. */
269 static int ll_md_close(struct inode *inode, struct file *file)
271 union ldlm_policy_data policy = {
272 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: match only, do not take a new reference on the lock. */
274 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
275 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
276 struct ll_inode_info *lli = ll_i2info(inode);
277 struct lustre_handle lockh;
278 enum ldlm_mode lockmode;
282 /* clear group lock, if present */
283 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
284 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
286 if (fd->fd_lease_och != NULL) {
289 /* Usually the lease is not released when the
290 * application crashed, we need to release here. */
291 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
292 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
293 PFID(&lli->lli_fid), rc, lease_broken);
295 fd->fd_lease_och = NULL;
/* fd_och is set when a lease transferred ownership of the MDS handle. */
298 if (fd->fd_och != NULL) {
299 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
304 /* Let's see if we have good enough OPEN lock on the file and if
305 we can skip talking to MDS */
306 mutex_lock(&lli->lli_och_mutex);
307 if (fd->fd_omode & FMODE_WRITE) {
309 LASSERT(lli->lli_open_fd_write_count);
310 lli->lli_open_fd_write_count--;
311 } else if (fd->fd_omode & FMODE_EXEC) {
313 LASSERT(lli->lli_open_fd_exec_count);
314 lli->lli_open_fd_exec_count--;
/* else branch (read mode) — brace elided in this excerpt. */
317 LASSERT(lli->lli_open_fd_read_count);
318 lli->lli_open_fd_read_count--;
320 mutex_unlock(&lli->lli_och_mutex);
/* No matching OPEN lock cached -> must do the real MDS close now. */
322 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
323 LDLM_IBITS, &policy, lockmode, &lockh))
324 rc = ll_md_real_close(inode, fd->fd_omode);
327 LUSTRE_FPRIVATE(file) = NULL;
328 ll_file_data_put(fd);
333 /* While this returns an error code, fput() the caller does not, so we need
334 * to make every effort to clean up all of our state here. Also, applications
335 * rarely check close errors and even if an error is returned they will not
336 * re-try the close call.
/* VFS ->release() entry point: deauthorize statahead if this fd owned it,
 * short-circuit for the root dentry (no MDS close needed), fold async I/O
 * errors into the inode, then perform the MD close. */
338 int ll_file_release(struct inode *inode, struct file *file)
340 struct ll_file_data *fd;
341 struct ll_sb_info *sbi = ll_i2sbi(inode);
342 struct ll_inode_info *lli = ll_i2info(inode);
346 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
347 PFID(ll_inode2fid(inode)), inode);
/* Do not count releases of the root dentry in the stats. */
349 if (inode->i_sb->s_root != file_dentry(file))
350 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
351 fd = LUSTRE_FPRIVATE(file);
354 /* The last ref on @file, maybe not the the owner pid of statahead,
355 * because parent and child process can share the same file handle. */
356 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
357 ll_deauthorize_statahead(inode, fd);
/* Root dentry: just drop the private data; no open handle to close. */
359 if (inode->i_sb->s_root == file_dentry(file)) {
360 LUSTRE_FPRIVATE(file) = NULL;
361 ll_file_data_put(fd);
/* Regular files: surface any asynchronous write error at close time. */
365 if (!S_ISDIR(inode->i_mode)) {
366 if (lli->lli_clob != NULL)
367 lov_read_and_clear_async_rc(lli->lli_clob);
368 lli->lli_async_rc = 0;
371 rc = ll_md_close(inode, file);
/* Fault-injection hook to dump debug logs during close testing. */
373 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
374 libcfs_debug_dumplog();
/* Issue an intent-based open (IT_OPEN) RPC to the MDT for @de, packing the
 * name only when the server lacks open-by-fid support.  On success, fills
 * the inode from the reply and installs the returned lock data.  ENOENT on
 * a create is translated to -ESTALE so the VFS retries via lookup. */
379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
380 struct lookup_intent *itp)
382 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
383 struct dentry *parent = de->d_parent;
384 const char *name = NULL;
386 struct md_op_data *op_data;
387 struct ptlrpc_request *req = NULL;
391 LASSERT(parent != NULL);
392 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
394 /* if server supports open-by-fid, or file name is invalid, don't pack
395 * name in open request */
396 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
397 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
398 name = de->d_name.name;
399 len = de->d_name.len;
402 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
403 name, len, 0, LUSTRE_OPC_ANY, NULL);
405 RETURN(PTR_ERR(op_data));
/* Hand any locally-known striping to the server with the open. */
406 op_data->op_data = lmm;
407 op_data->op_data_size = lmmsize;
409 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
410 &ll_md_blocking_ast, 0);
411 ll_finish_md_op_data(op_data);
413 /* reason for keep own exit path - don`t flood log
414 * with messages with -ESTALE errors.
/* On -ESTALE-ish failure, drop any openhandle the server did grant. */
416 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
417 it_open_error(DISP_OPEN_OPEN, itp))
419 ll_release_openhandle(de, itp);
423 if (it_disposition(itp, DISP_LOOKUP_NEG))
424 GOTO(out, rc = -ENOENT);
426 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
427 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
428 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
432 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
433 if (!rc && itp->it_lock_mode)
434 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* out: label — cleanup path (label itself elided in this excerpt). */
437 ptlrpc_req_finished(req);
438 ll_intent_drop_lock(itp);
440 /* We did open by fid, but by the time we got to the server,
441 * the object disappeared. If this is a create, we cannot really
442 * tell the userspace that the file it was trying to create
443 * does not exist. Instead let's return -ESTALE, and the VFS will
444 * retry the create with LOOKUP_REVAL that we are going to catch
445 * in ll_revalidate_dentry() and use lookup then.
447 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply carried in @it (open
 * handle, fid, lease lock cookie, flags) and register it for replay. */
453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
454 struct obd_client_handle *och)
456 struct mdt_body *body;
458 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
459 och->och_fh = body->mbo_handle;
460 och->och_fid = body->mbo_fid1;
461 och->och_lease_handle.cookie = it->it_lock_handle;
462 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
463 och->och_flags = it->it_flags;
465 return md_set_open_replay_data(md_exp, och, it);
/* Finish the local (client-side) part of an open: attach @fd as the file's
 * private data, init readahead state and the cl_context bookkeeping.
 * @och may be NULL when an existing MDS handle is being reused. */
468 static int ll_local_open(struct file *file, struct lookup_intent *it,
469 struct ll_file_data *fd, struct obd_client_handle *och)
471 struct inode *inode = file_inode(file);
474 LASSERT(!LUSTRE_FPRIVATE(file));
481 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
486 LUSTRE_FPRIVATE(file) = fd;
487 ll_readahead_init(inode, &fd->fd_ras);
488 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
490 /* ll_cl_context initialize */
491 rwlock_init(&fd->fd_lock);
492 INIT_LIST_HEAD(&fd->fd_lccs);
497 /* Open a file, and (for the very first open) create objects on the OSTs at
498 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
499 * creation or open until ll_lov_setstripe() ioctl is called.
501 * If we already have the stripe MD locally then we don't request it in
502 * md_open(), by passing a lmm_size = 0.
504 * It is up to the application to ensure no other processes open this file
505 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
506 * used. We might be able to avoid races of that sort by getting lli_open_sem
507 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
508 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Reuses an existing per-mode MDS open handle
 * when one is cached on the inode; otherwise performs an intent open RPC.
 * NOTE(review): many lines (braces, GOTO targets, success path) are elided
 * from this excerpt — confirm control flow against full source. */
510 int ll_file_open(struct inode *inode, struct file *file)
512 struct ll_inode_info *lli = ll_i2info(inode);
513 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
514 .it_flags = file->f_flags };
515 struct obd_client_handle **och_p = NULL;
516 __u64 *och_usecount = NULL;
517 struct ll_file_data *fd;
521 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
522 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent may have been stashed by the lookup path (atomic_open). */
524 it = file->private_data; /* XXX: compat macro */
525 file->private_data = NULL; /* prevent ll_local_open assertion */
527 fd = ll_file_data_get();
529 GOTO(out_openerr, rc = -ENOMEM);
532 if (S_ISDIR(inode->i_mode))
533 ll_authorize_statahead(inode, fd);
/* Root dentry: no MDS open handle needed, just install private data. */
535 if (inode->i_sb->s_root == file_dentry(file)) {
536 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup -> build our own open intent (oit). */
540 if (!it || !it->it_disposition) {
541 /* Convert f_flags into access mode. We cannot use file->f_mode,
542 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: (flags + 1) maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits. */
544 if ((oit.it_flags + 1) & O_ACCMODE)
546 if (file->f_flags & O_TRUNC)
547 oit.it_flags |= FMODE_WRITE;
549 /* kernel only call f_op->open in dentry_open. filp_open calls
550 * dentry_open after call to open_namei that checks permissions.
551 * Only nfsd_open call dentry_open directly without checking
552 * permissions and because of that this code below is safe. */
553 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
554 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
556 /* We do not want O_EXCL here, presumably we opened the file
557 * already? XXX - NFS implications? */
558 oit.it_flags &= ~O_EXCL;
560 /* bug20584, if "it_flags" contains O_CREAT, the file will be
561 * created if necessary, then "IT_CREAT" should be set to keep
562 * consistent with it */
563 if (oit.it_flags & O_CREAT)
564 oit.it_op |= IT_CREAT;
570 /* Let's see if we have file open on MDS already. */
571 if (it->it_flags & FMODE_WRITE) {
572 och_p = &lli->lli_mds_write_och;
573 och_usecount = &lli->lli_open_fd_write_count;
574 } else if (it->it_flags & FMODE_EXEC) {
575 och_p = &lli->lli_mds_exec_och;
576 och_usecount = &lli->lli_open_fd_exec_count;
/* else branch (read mode) — brace elided in this excerpt. */
578 och_p = &lli->lli_mds_read_och;
579 och_usecount = &lli->lli_open_fd_read_count;
582 mutex_lock(&lli->lli_och_mutex);
583 if (*och_p) { /* Open handle is present */
584 if (it_disposition(it, DISP_OPEN_OPEN)) {
585 /* Well, there's extra open request that we do not need,
586 let's close it somehow. This will decref request. */
587 rc = it_open_error(DISP_OPEN_OPEN, it);
589 mutex_unlock(&lli->lli_och_mutex);
590 GOTO(out_openerr, rc);
593 ll_release_openhandle(file_dentry(file), it);
/* Reuse cached handle: NULL och means ll_local_open skips ll_och_fill. */
597 rc = ll_local_open(file, it, fd, NULL);
600 mutex_unlock(&lli->lli_och_mutex);
601 GOTO(out_openerr, rc);
/* No cached handle: must obtain one from the MDS. */
604 LASSERT(*och_usecount == 0);
605 if (!it->it_disposition) {
606 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
607 /* We cannot just request lock handle now, new ELC code
608 means that one of other OPEN locks for this file
609 could be cancelled, and since blocking ast handler
610 would attempt to grab och_mutex as well, that would
611 result in a deadlock */
612 mutex_unlock(&lli->lli_och_mutex);
614 * Normally called under two situations:
616 * 2. A race/condition on MDS resulting in no open
617 * handle to be returned from LOOKUP|OPEN request,
618 * for example if the target entry was a symlink.
620 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
621 * marked by a bit set in ll_iget_for_nfs. Clear the
622 * bit so that it's not confusing later callers.
624 * NB; when ldd is NULL, it must have come via normal
625 * lookup path only, since ll_iget_for_nfs always calls
628 if (ldd && ldd->lld_nfs_dentry) {
629 ldd->lld_nfs_dentry = 0;
630 it->it_flags |= MDS_OPEN_LOCK;
634 * Always specify MDS_OPEN_BY_FID because we don't want
635 * to get file with different fid.
637 it->it_flags |= MDS_OPEN_BY_FID;
638 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
641 GOTO(out_openerr, rc);
645 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
647 GOTO(out_och_free, rc = -ENOMEM);
651 /* md_intent_lock() didn't get a request ref if there was an
652 * open error, so don't do cleanup on the request here
654 /* XXX (green): Should not we bail out on any error here, not
655 * just open error? */
656 rc = it_open_error(DISP_OPEN_OPEN, it);
658 GOTO(out_och_free, rc);
660 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
661 "inode %p: disposition %x, status %d\n", inode,
662 it_disposition(it, ~0), it->it_status);
664 rc = ll_local_open(file, it, fd, *och_p);
666 GOTO(out_och_free, rc);
668 mutex_unlock(&lli->lli_och_mutex);
671 /* Must do this outside lli_och_mutex lock to prevent deadlock where
672 different kind of OPEN lock for this same inode gets cancelled
673 by ldlm_cancel_lru */
674 if (!S_ISREG(inode->i_mode))
675 GOTO(out_och_free, rc);
677 cl_lov_delay_create_clear(&file->f_flags);
678 GOTO(out_och_free, rc);
/* out_och_free: error/exit path — frees the unused handle. */
682 if (och_p && *och_p) {
683 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
684 *och_p = NULL; /* OBD_FREE writes some magic there */
687 mutex_unlock(&lli->lli_och_mutex);
/* out_openerr: undo statahead authorization and drop file data. */
690 if (lli->lli_opendir_key == fd)
691 ll_deauthorize_statahead(inode, fd);
693 ll_file_data_put(fd);
695 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference taken by the open enqueue. */
698 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
699 ptlrpc_req_finished(it->it_request);
700 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a conflicting request, cancel the lease
 * lock asynchronously (which is what "breaks" the lease); the CANCELING
 * case body is elided from this excerpt. */
706 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
707 struct ldlm_lock_desc *desc, void *data, int flag)
710 struct lustre_handle lockh;
714 case LDLM_CB_BLOCKING:
715 ldlm_lock2handle(lock, &lockh);
716 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
718 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
722 case LDLM_CB_CANCELING:
730 * When setting a lease on a file, we take ownership of the lli_mds_*_och
731 * and save it as fd->fd_och so as to force client to reopen the file even
732 * if it has an open lock in cache already.
/* Transfer the inode's per-mode MDS open handle to this fd (fd_och) so a
 * lease can be granted, returning the old open handle cookie in
 * @old_handle.  Fails with -EBUSY if a lease already exists on this fd or
 * other local opens still share the handle. */
734 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
735 struct lustre_handle *old_handle)
737 struct ll_inode_info *lli = ll_i2info(inode);
738 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
739 struct obd_client_handle **och_p;
744 /* Get the openhandle of the file */
745 mutex_lock(&lli->lli_och_mutex);
746 if (fd->fd_lease_och != NULL)
747 GOTO(out_unlock, rc = -EBUSY);
749 if (fd->fd_och == NULL) {
750 if (file->f_mode & FMODE_WRITE) {
751 LASSERT(lli->lli_mds_write_och != NULL);
752 och_p = &lli->lli_mds_write_och;
753 och_usecount = &lli->lli_open_fd_write_count;
/* else branch (read mode) — brace elided in this excerpt. */
755 LASSERT(lli->lli_mds_read_och != NULL);
756 och_p = &lli->lli_mds_read_och;
757 och_usecount = &lli->lli_open_fd_read_count;
/* More than one local user -> cannot take exclusive ownership. */
760 if (*och_usecount > 1)
761 GOTO(out_unlock, rc = -EBUSY);
768 *old_handle = fd->fd_och->och_fh;
772 mutex_unlock(&lli->lli_och_mutex);
777 * Release ownership on lli_mds_*_och when putting back a file lease.
/* Inverse of ll_lease_och_acquire(): give fd_och back to the inode, or —
 * if another open re-created the per-mode handle meanwhile (broken lease
 * race) — close our now-redundant handle instead. */
779 static int ll_lease_och_release(struct inode *inode, struct file *file)
781 struct ll_inode_info *lli = ll_i2info(inode);
782 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
783 struct obd_client_handle **och_p;
784 struct obd_client_handle *old_och = NULL;
789 mutex_lock(&lli->lli_och_mutex);
790 if (file->f_mode & FMODE_WRITE) {
791 och_p = &lli->lli_mds_write_och;
792 och_usecount = &lli->lli_open_fd_write_count;
/* else branch (read mode) — brace elided in this excerpt. */
794 och_p = &lli->lli_mds_read_och;
795 och_usecount = &lli->lli_open_fd_read_count;
798 /* The file may have been open by another process (broken lease) so
799 * *och_p is not NULL. In this case we should simply increase usecount
802 if (*och_p != NULL) {
803 old_och = fd->fd_och;
810 mutex_unlock(&lli->lli_och_mutex);
/* Close the redundant handle outside the mutex. */
813 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
819 * Acquire a lease and open the file.
/* Take a read or write lease on @file by issuing an IT_OPEN intent with
 * MDS_OPEN_LEASE.  On success returns an obd_client_handle that owns the
 * lease lock; on failure the openhandle/lock are cleaned up (out_close /
 * out_release_it paths).  fmode must be exactly FMODE_READ or FMODE_WRITE. */
821 static struct obd_client_handle *
822 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
825 struct lookup_intent it = { .it_op = IT_OPEN };
826 struct ll_sb_info *sbi = ll_i2sbi(inode);
827 struct md_op_data *op_data;
828 struct ptlrpc_request *req = NULL;
829 struct lustre_handle old_handle = { 0 };
830 struct obd_client_handle *och = NULL;
835 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
836 RETURN(ERR_PTR(-EINVAL));
/* Lease mode must be covered by the file's open mode; exec opens refuse. */
839 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
840 RETURN(ERR_PTR(-EPERM));
842 rc = ll_lease_och_acquire(inode, file, &old_handle);
849 RETURN(ERR_PTR(-ENOMEM));
851 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
852 LUSTRE_OPC_ANY, NULL);
854 GOTO(out, rc = PTR_ERR(op_data));
856 /* To tell the MDT this openhandle is from the same owner */
857 op_data->op_handle = old_handle;
859 it.it_flags = fmode | open_flags;
860 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
861 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
862 &ll_md_blocking_lease_ast,
863 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
864 * it can be cancelled which may mislead applications that the lease is
866 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
867 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
868 * doesn't deal with openhandle, so normal openhandle will be leaked. */
869 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
870 ll_finish_md_op_data(op_data);
871 ptlrpc_req_finished(req);
873 GOTO(out_release_it, rc);
875 if (it_disposition(&it, DISP_LOOKUP_NEG))
876 GOTO(out_release_it, rc = -ENOENT);
878 rc = it_open_error(DISP_OPEN_OPEN, &it);
880 GOTO(out_release_it, rc);
882 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
883 ll_och_fill(sbi->ll_md_exp, &it, och);
885 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
886 GOTO(out_close, rc = -EOPNOTSUPP);
888 /* already get lease, handle lease lock */
889 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
/* A lease must come with an OPEN ibits lock; anything else is a
 * protocol violation from the server. */
890 if (it.it_lock_mode == 0 ||
891 it.it_lock_bits != MDS_INODELOCK_OPEN) {
892 /* open lock must return for lease */
893 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
894 PFID(ll_inode2fid(inode)), it.it_lock_mode,
896 GOTO(out_close, rc = -EPROTO);
899 ll_intent_release(&it);
/* out_close: error path — cancel the lease lock and close the handle. */
903 /* Cancel open lock */
904 if (it.it_lock_mode != 0) {
905 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
908 och->och_lease_handle.cookie = 0ULL;
910 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
912 CERROR("%s: error closing file "DFID": %d\n",
913 ll_get_fsname(inode->i_sb, NULL, 0),
914 PFID(&ll_i2info(inode)->lli_fid), rc2);
915 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
917 ll_intent_release(&it);
925 * Check whether a layout swap can be done between two inodes.
927 * \param[in] inode1 First inode to check
928 * \param[in] inode2 Second inode to check
930 * \retval 0 on success, layout swap can be performed between both inodes
931 * \retval negative error code if requirements are not met
/* Both inodes must be regular files, writable by the caller, and on the
 * same filesystem. */
933 static int ll_check_swap_layouts_validity(struct inode *inode1,
934 struct inode *inode2)
936 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
939 if (inode_permission(inode1, MAY_WRITE) ||
940 inode_permission(inode2, MAY_WRITE))
943 if (inode1->i_sb != inode2->i_sb)
/* Close @och with a MDS_CLOSE_LAYOUT_SWAP bias so the MDT atomically swaps
 * the layouts of @inode and @inode2 as part of the close. */
949 static int ll_swap_layouts_close(struct obd_client_handle *och,
950 struct inode *inode, struct inode *inode2)
952 const struct lu_fid *fid1 = ll_inode2fid(inode);
953 const struct lu_fid *fid2;
957 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
958 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
960 rc = ll_check_swap_layouts_validity(inode, inode2);
962 GOTO(out_free_och, rc);
964 /* We now know that inode2 is a lustre inode */
965 fid2 = ll_inode2fid(inode2);
/* Swapping a layout with itself is meaningless — reject equal fids. */
967 rc = lu_fid_cmp(fid1, fid2);
969 GOTO(out_free_och, rc = -EINVAL);
971 /* Close the file and {swap,merge} layouts between inode & inode2.
972 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
973 * because we still need it to pack l_remote_handle to MDT. */
974 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
977 och = NULL; /* freed in ll_close_inode_openhandle() */
987 * Release lease and close the file.
988 * It will check if the lease has ever broken.
/* Determine whether the lease lock was already cancelled (lease broken);
 * if not, cancel it ourselves (unless a close intent @bias will consume
 * it), then close the openhandle with the requested bias/data. */
990 static int ll_lease_close_intent(struct obd_client_handle *och,
992 bool *lease_broken, enum mds_op_bias bias,
995 struct ldlm_lock *lock;
996 bool cancelled = true;
1000 lock = ldlm_handle2lock(&och->och_lease_handle);
1002 lock_res_and_lock(lock);
1003 cancelled = ldlm_is_cancel(lock);
1004 unlock_res_and_lock(lock);
1005 LDLM_LOCK_PUT(lock);
1008 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1009 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1011 if (lease_broken != NULL)
1012 *lease_broken = cancelled;
1014 if (!cancelled && !bias)
1015 ldlm_cli_cancel(&och->och_lease_handle, 0);
1017 if (cancelled) { /* no need to excute intent */
1022 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Convenience wrapper: plain lease close with no intent. */
1026 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1029 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1033 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/* Flush and invalidate cached pages first (layout version will change),
 * then ask the MDT to start mirror resync under the held lease. */
1035 static int ll_lease_file_resync(struct obd_client_handle *och,
1036 struct inode *inode)
1038 struct ll_sb_info *sbi = ll_i2sbi(inode);
1039 struct md_op_data *op_data;
1040 __u64 data_version_unused;
1044 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1045 LUSTRE_OPC_ANY, NULL);
1046 if (IS_ERR(op_data))
1047 RETURN(PTR_ERR(op_data));
1049 /* before starting file resync, it's necessary to clean up page cache
1050 * in client memory, otherwise once the layout version is increased,
1051 * writing back cached data will be denied the OSTs. */
1052 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1056 op_data->op_handle = och->och_lease_handle;
1057 rc = md_file_resync(sbi->ll_md_exp, op_data);
1063 ll_finish_md_op_data(op_data);
/* Merge MDS-provided timestamps cached in lli with OST-side attributes
 * (size, blocks, times) obtained via cl_object_attr_get(), keeping the
 * newest of each timestamp, under the inode size lock. */
1067 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1069 struct ll_inode_info *lli = ll_i2info(inode);
1070 struct cl_object *obj = lli->lli_clob;
1071 struct cl_attr *attr = vvp_env_thread_attr(env);
1079 ll_inode_size_lock(inode);
1081 /* Merge timestamps the most recently obtained from MDS with
1082 * timestamps obtained from OSTs.
1084 * Do not overwrite atime of inode because it may be refreshed
1085 * by file_accessed() function. If the read was served by cache
1086 * data, there is no RPC to be sent so that atime may not be
1087 * transferred to OSTs at all. MDT only updates atime at close time
1088 * if it's at least 'mdd.*.atime_diff' older.
1089 * All in all, the atime in Lustre does not strictly comply with
1090 * POSIX. Solving this problem needs to send an RPC to MDT for each
1091 * read, this will hurt performance. */
1092 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1093 LTIME_S(inode->i_atime) = lli->lli_atime;
1094 lli->lli_update_atime = 0;
1096 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1097 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Working copies; compared below against the OST-side attributes. */
1099 atime = LTIME_S(inode->i_atime);
1100 mtime = LTIME_S(inode->i_mtime);
1101 ctime = LTIME_S(inode->i_ctime);
1103 cl_object_attr_lock(obj);
1104 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1107 rc = cl_object_attr_get(env, obj, attr);
1108 cl_object_attr_unlock(obj);
/* -ENODATA (no layout/objects yet) is not an error for this merge. */
1111 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1113 if (atime < attr->cat_atime)
1114 atime = attr->cat_atime;
1116 if (ctime < attr->cat_ctime)
1117 ctime = attr->cat_ctime;
1119 if (mtime < attr->cat_mtime)
1120 mtime = attr->cat_mtime;
1122 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1123 PFID(&lli->lli_fid), attr->cat_size);
1125 i_size_write(inode, attr->cat_size);
1126 inode->i_blocks = attr->cat_blocks;
1128 LTIME_S(inode->i_atime) = atime;
1129 LTIME_S(inode->i_mtime) = mtime;
1130 LTIME_S(inode->i_ctime) = ctime;
1133 ll_inode_size_unlock(inode);
1139 * Set designated mirror for I/O.
1141 * So far only read, write, and truncated can support to issue I/O to
1142 * designated mirror.
/* FLR: copy the fd's designated-mirror selection and layout version into
 * the cl_io; designated-mirror I/O disables parallel I/O (ci_pio). */
1144 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1146 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148 /* clear layout version for generic(non-resync) I/O in case it carries
1149 * stale layout version due to I/O restart */
1150 io->ci_layout_version = 0;
1152 /* FLR: disable non-delay for designated mirror I/O because obviously
1153 * only one mirror is available */
1154 if (fd->fd_designated_mirror > 0) {
1156 io->ci_designated_mirror = fd->fd_designated_mirror;
1157 io->ci_layout_version = fd->fd_layout_version;
1158 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1162 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1163 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/* Decide whether atime updates are suppressed for this file, checking
 * O_NOATIME, inode flags, and mount/superblock noatime options in turn. */
1166 static bool file_is_noatime(const struct file *file)
1168 const struct vfsmount *mnt = file->f_path.mnt;
1169 const struct inode *inode = file_inode((struct file *)file);
1171 /* Adapted from file_accessed() and touch_atime().*/
1172 if (file->f_flags & O_NOATIME)
1175 if (inode->i_flags & S_NOATIME)
1178 if (IS_NOATIME(inode))
1181 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1184 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1187 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1193 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/* Initialize a cl_io for a read or write on @file: kiocb, nonblock/append/
 * sync flags, lock policy (never for nolock files, mandatory for O_APPEND),
 * noatime, parallel-I/O eligibility, and FLR mirror selection. */
1195 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1197 struct inode *inode = file_inode(file);
1198 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1200 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1201 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1202 io->u.ci_rw.rw_file = file;
1203 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1204 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1205 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1207 if (iot == CIT_WRITE) {
1208 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1209 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1210 file->f_flags & O_DIRECT ||
1213 io->ci_obj = ll_i2info(inode)->lli_clob;
1214 io->ci_lockreq = CILR_MAYBE;
1215 if (ll_file_nolock(file)) {
1216 io->ci_lockreq = CILR_NEVER;
1217 io->ci_no_srvlock = 1;
1218 } else if (file->f_flags & O_APPEND) {
1219 io->ci_lockreq = CILR_MANDATORY;
1221 io->ci_noatime = file_is_noatime(file);
/* Parallel I/O only when enabled on the SB and not an append write. */
1222 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1223 io->ci_pio = !io->u.ci_rw.rw_append;
1227 /* FLR: only use non-delay I/O for read as there is only one
1228 * avaliable mirror for write. */
1229 io->ci_ndelay = !(iot == CIT_WRITE);
1231 ll_io_set_mirror(io, file);
/* Parallel-task worker for a split read/write: runs one chunk of the
 * parent IO described by @ptask->pt_cbdata (struct cl_io_pt) inside its
 * own cl_io, then folds progress back into the cl_io_pt so a restart can
 * resume where this chunk stopped.
 * Returns 0 if any bytes were transferred, otherwise the cl_io_loop() rc.
 * NOTE(review): source is elided in this view; some declarations, braces
 * and error paths are not shown. */
1234 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1236 struct cl_io_pt *pt = ptask->pt_cbdata;
1237 struct file *file = pt->cip_file;
1240 loff_t pos = pt->cip_pos;
1245 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1246 file_dentry(file)->d_name.name,
1247 pt->cip_iot == CIT_READ ? "read" : "write",
1248 pos, pos + pt->cip_count);
1250 env = cl_env_get(&refcheck);
1252 RETURN(PTR_ERR(env));
1254 io = vvp_env_thread_io(env);
1255 ll_io_init(io, file, pt->cip_iot);
/* inherit the parent IO's iterator and iocb state */
1256 io->u.ci_rw.rw_iter = pt->cip_iter;
1257 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1258 io->ci_pio = 0; /* It's already in parallel task */
1260 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1261 pt->cip_count - pt->cip_result);
1263 struct vvp_io *vio = vvp_env_io(env);
1265 vio->vui_io_subtype = IO_NORMAL;
1266 vio->vui_fd = LUSTRE_FPRIVATE(file);
1268 ll_cl_add(file, env, io, LCC_RW);
1269 rc = cl_io_loop(env, io);
1270 ll_cl_remove(file, env);
1272 /* cl_io_rw_init() handled IO */
1276 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* account transferred bytes and advance the saved iterator/iocb so a
 * restarted task resumes after the data already moved */
1282 if (io->ci_nob > 0) {
1283 pt->cip_result += io->ci_nob;
1284 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1286 pt->cip_iocb.ki_pos = pos;
1287 #ifdef HAVE_KIOCB_KI_LEFT
1288 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1289 #elif defined(HAVE_KI_NBYTES)
1290 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1294 cl_io_fini(env, io);
1295 cl_env_put(env, &refcheck);
1297 pt->cip_need_restart = io->ci_need_restart;
1299 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1300 file_dentry(file)->d_name.name,
1301 pt->cip_iot == CIT_READ ? "read" : "write",
1302 pt->cip_result, rc);
1304 RETURN(pt->cip_result > 0 ? 0 : rc);
/* Common driver for all llite read/write variants (iter, splice): builds a
 * cl_io from @args, takes the per-inode range lock where required, runs
 * cl_io_loop() and handles IO restart (e.g. layout change / FLR retry) by
 * looping with the advanced position and remaining count.
 * Returns bytes transferred if > 0, otherwise the cl_io rc.
 * NOTE(review): source is heavily elided in this view; the restart loop
 * head, GOTO labels and several braces are not shown. */
1308 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1309 struct file *file, enum cl_io_type iot,
1310 loff_t *ppos, size_t count)
1312 struct range_lock range;
1313 struct vvp_io *vio = vvp_env_io(env);
1314 struct inode *inode = file_inode(file);
1315 struct ll_inode_info *lli = ll_i2info(inode);
1316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1321 unsigned retried = 0;
1322 bool restarted = false;
1326 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1327 file_dentry(file)->d_name.name,
1328 iot == CIT_READ ? "read" : "write", pos, pos + count);
1331 io = vvp_env_thread_io(env);
1332 ll_io_init(io, file, iot);
1333 if (args->via_io_subtype == IO_NORMAL) {
1334 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1335 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1336 if (args->via_io_subtype != IO_NORMAL || restarted)
1339 io->ci_ndelay_tried = retried;
1341 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1342 bool range_locked = false;
/* O_APPEND writes must lock to EOF; otherwise lock just this range */
1344 if (file->f_flags & O_APPEND)
1345 range_lock_init(&range, 0, LUSTRE_EOF);
1347 range_lock_init(&range, pos, pos + count - 1);
1349 vio->vui_fd = LUSTRE_FPRIVATE(file);
1350 vio->vui_io_subtype = args->via_io_subtype;
1352 switch (vio->vui_io_subtype) {
1354 /* Direct IO reads must also take range lock,
1355 * or multiple reads will try to work on the same pages
1356 * See LU-6227 for details. */
1357 if (((iot == CIT_WRITE) ||
1358 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1359 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1360 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1362 rc = range_lock(&lli->lli_write_tree, &range);
1366 range_locked = true;
1370 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1371 vio->u.splice.vui_flags = args->u.splice.via_flags;
1374 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1378 ll_cl_add(file, env, io, LCC_RW);
/* parallel writes take i_mutex here so sub-tasks need not */
1379 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1380 !lli->lli_inode_locked) {
1382 lli->lli_inode_locked = 1;
1384 rc = cl_io_loop(env, io);
1385 if (lli->lli_inode_locked) {
1386 lli->lli_inode_locked = 0;
1387 inode_unlock(inode);
1389 ll_cl_remove(file, env);
1392 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1394 range_unlock(&lli->lli_write_tree, &range);
1397 /* cl_io_rw_init() handled IO */
/* fold this pass's progress into result/count and sync the caller's
 * iterator and iocb so a restart continues from the right place */
1401 if (io->ci_nob > 0) {
1402 result += io->ci_nob;
1403 count -= io->ci_nob;
1405 if (args->via_io_subtype == IO_NORMAL) {
1406 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1408 args->u.normal.via_iocb->ki_pos = pos;
1409 #ifdef HAVE_KIOCB_KI_LEFT
1410 args->u.normal.via_iocb->ki_left = count;
1411 #elif defined(HAVE_KI_NBYTES)
1412 args->u.normal.via_iocb->ki_nbytes = count;
1416 pos = io->u.ci_rw.rw_range.cir_pos;
1420 cl_io_fini(env, io);
1423 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1424 file->f_path.dentry->d_name.name,
1425 iot, rc, result, io->ci_need_restart);
/* retry when the IO asked for a restart and there is work left */
1427 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1429 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1430 file_dentry(file)->d_name.name,
1431 iot == CIT_READ ? "read" : "write",
1432 pos, pos + count, result, rc);
1433 /* preserve the tried count for FLR */
1434 retried = io->ci_ndelay_tried;
/* per-mount stats and sticky write-failure flag for fsync semantics */
1439 if (iot == CIT_READ) {
1441 ll_stats_ops_tally(ll_i2sbi(inode),
1442 LPROC_LL_READ_BYTES, result);
1443 } else if (iot == CIT_WRITE) {
1445 ll_stats_ops_tally(ll_i2sbi(inode),
1446 LPROC_LL_WRITE_BYTES, result);
1447 fd->fd_write_failed = false;
1448 } else if (result == 0 && rc == 0) {
1451 fd->fd_write_failed = true;
1453 fd->fd_write_failed = false;
1454 } else if (rc != -ERESTARTSYS) {
1455 fd->fd_write_failed = true;
1459 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1460 file_dentry(file)->d_name.name,
1461 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1465 RETURN(result > 0 ? result : rc);
1469 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1470 * especially for small I/O.
1472 * To serve a read request, CLIO has to create and initialize a cl_io and
1473 * then request DLM lock. This has turned out to have significant overhead
1474 * and affects the performance of small I/O dramatically.
1476 * It's not necessary to create a cl_io for each I/O. Under the help of read
1477 * ahead, most of the pages being read are already in memory cache and we can
1478 * read those pages directly because if the pages exist, the corresponding DLM
1479 * lock must exist so that page content must be valid.
1481 * In fast read implementation, the llite speculatively finds and reads pages
1482 * in memory cache. There are three scenarios for fast read:
1483 * - If the page exists and is uptodate, kernel VM will provide the data and
1484 * CLIO won't be intervened;
1485 * - If the page was brought into memory by read ahead, it will be exported
1486 * and read ahead parameters will be updated;
1487 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1488 * it will go back and invoke normal read, i.e., a cl_io will be created
1489 * and DLM lock will be requested.
1491 * POSIX compliance: posix standard states that read is intended to be atomic.
1492 * Lustre read implementation is in line with Linux kernel read implementation
1493 * and neither of them complies with POSIX standard in this matter. Fast read
1494 * doesn't make the situation worse on single node but it may interleave write
1495 * results from multiple nodes due to short read handling in ll_file_aio_read().
1497 * \param env - lu_env
1498 * \param iocb - kiocb from kernel
1499 * \param iter - user space buffers where the data will be copied
1501 * \retval - number of bytes have been read, or error code if error occurred.
/* Speculative lockless read: serve the request straight from the page
 * cache via generic_file_read_iter(), skipping cl_io/DLM setup entirely.
 * Skipped when the fast_read mount option is off or for O_DIRECT; a
 * -ENODATA result (set by ll_readpage()) means "page not cached — caller
 * must fall back to the normal read path".
 * NOTE(review): source is elided in this view; the return-type line and
 * early returns are not shown. */
1504 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1508 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1511 /* NB: we can't do direct IO for fast read because it will need a lock
1512 * to make IO engine happy. */
1513 if (iocb->ki_filp->f_flags & O_DIRECT)
1516 result = generic_file_read_iter(iocb, iter);
1518 /* If the first page is not in cache, generic_file_aio_read() will be
1519 * returned with -ENODATA.
1520 * See corresponding code in ll_readpage(). */
1521 if (result == -ENODATA)
1525 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1526 LPROC_LL_READ_BYTES, result);
1532 * Read from a file (through the page cache).
/* read_iter entry point: try the lockless fast-read path first, then run
 * a full cl_io read (ll_file_io_generic) for whatever bytes remain in @to.
 * NOTE(review): source is elided in this view; some declarations, the
 * result-combining logic and the final return are not shown. */
1534 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1537 struct vvp_io_args *args;
1542 result = ll_do_fast_read(iocb, to);
1543 if (result < 0 || iov_iter_count(to) == 0)
1546 env = cl_env_get(&refcheck);
1548 return PTR_ERR(env);
1550 args = ll_env_args(env, IO_NORMAL);
1551 args->u.normal.via_iter = to;
1552 args->u.normal.via_iocb = iocb;
1554 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1555 &iocb->ki_pos, iov_iter_count(to));
1558 else if (result == 0)
1561 cl_env_put(env, &refcheck);
1567 * Write to a file (through the page cache).
/* write_iter entry point: set up IO_NORMAL vvp args and run the generic
 * cl_io write loop.
 * NOTE(review): source is elided in this view. */
1569 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1571 struct vvp_io_args *args;
1576 env = cl_env_get(&refcheck);
1578 return PTR_ERR(env);
1580 args = ll_env_args(env, IO_NORMAL);
1581 args->u.normal.via_iter = from;
1582 args->u.normal.via_iocb = iocb;
1584 result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1585 &iocb->ki_pos, iov_iter_count(from));
1586 cl_env_put(env, &refcheck);
1590 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1592 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/* Validate an iovec array for the legacy aio paths: reject negative or
 * cumulatively-overflowing lengths, and truncate *nr_segs/*count at the
 * first segment that fails access_ok().
 * NOTE(review): source is elided in this view; the continue/break and
 * return statements between these checks are not shown. */
1594 static int ll_file_get_iov_count(const struct iovec *iov,
1595 unsigned long *nr_segs, size_t *count)
1600 for (seg = 0; seg < *nr_segs; seg++) {
1601 const struct iovec *iv = &iov[seg];
1604 * If any segment has a negative length, or the cumulative
1605 * length ever wraps negative then return -EINVAL.
1608 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1610 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1615 cnt -= iv->iov_len; /* This segment is no good */
/* Legacy aio_read entry for kernels without ->read_iter: validate the
 * iovec array, build an iov_iter over it (API differs per kernel version)
 * and forward to ll_file_read_iter().
 * NOTE(review): source is elided in this view. */
1622 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1623 unsigned long nr_segs, loff_t pos)
1630 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1634 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1635 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1636 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1637 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1638 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1640 result = ll_file_read_iter(iocb, &to);
/* Legacy read(2) entry for kernels without ->read_iter: wrap the user
 * buffer in a single iovec, build a synchronous kiocb at *ppos, forward
 * to ll_file_aio_read() and copy the final position back to *ppos. */
1645 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1648 struct iovec iov = { .iov_base = buf, .iov_len = count };
1653 init_sync_kiocb(&kiocb, file);
1654 kiocb.ki_pos = *ppos;
1655 #ifdef HAVE_KIOCB_KI_LEFT
1656 kiocb.ki_left = count;
1657 #elif defined(HAVE_KI_NBYTES)
/* fix: the field is ki_nbytes — "i_nbytes" does not exist in struct kiocb
 * and broke the build whenever HAVE_KI_NBYTES was defined (compare the
 * identical branch in ll_file_write, which uses kiocb->ki_nbytes) */
1658 kiocb.ki_nbytes = count;
1661 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1662 *ppos = kiocb.ki_pos;
1668 * Write to a file (through the page cache).
/* Legacy aio_write entry: validate the iovec array, build an iov_iter
 * (API differs per kernel version) and forward to ll_file_write_iter().
 * NOTE(review): source is elided in this view. */
1671 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1672 unsigned long nr_segs, loff_t pos)
1674 struct iov_iter from;
1679 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1683 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1684 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1685 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1686 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1687 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1689 result = ll_file_write_iter(iocb, &from);
/* Legacy write(2) entry for kernels without ->write_iter: unlike
 * ll_file_read this uses the per-env kiocb (lti_kiocb) rather than a
 * stack kiocb, then forwards to ll_file_aio_write() and propagates the
 * updated file position back to *ppos.
 * NOTE(review): source is elided in this view; some declarations and the
 * final return are not shown. */
1694 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1695 size_t count, loff_t *ppos)
1698 struct iovec iov = { .iov_base = (void __user *)buf,
1700 struct kiocb *kiocb;
1705 env = cl_env_get(&refcheck);
1707 RETURN(PTR_ERR(env));
1709 kiocb = &ll_env_info(env)->lti_kiocb;
1710 init_sync_kiocb(kiocb, file);
1711 kiocb->ki_pos = *ppos;
1712 #ifdef HAVE_KIOCB_KI_LEFT
1713 kiocb->ki_left = count;
1714 #elif defined(HAVE_KI_NBYTES)
1715 kiocb->ki_nbytes = count;
1718 result = ll_file_aio_write(kiocb, &iov, 1, kiocb->ki_pos);
1719 *ppos = kiocb->ki_pos;
1721 cl_env_put(env, &refcheck);
1724 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1727 * Send file content (through pagecache) somewhere with helper
/* splice_read entry: run a CIT_READ cl_io with the IO_SPLICE subtype so
 * pages are handed straight to @pipe instead of a user iterator.
 * NOTE(review): source is elided in this view. */
1729 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1730 struct pipe_inode_info *pipe, size_t count,
1734 struct vvp_io_args *args;
1739 env = cl_env_get(&refcheck);
1741 RETURN(PTR_ERR(env));
1743 args = ll_env_args(env, IO_SPLICE);
1744 args->u.splice.via_pipe = pipe;
1745 args->u.splice.via_flags = flags;
1747 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1748 cl_env_put(env, &refcheck);
1752 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1753 __u64 flags, struct lov_user_md *lum, int lum_size)
1755 struct lookup_intent oit = {
1757 .it_flags = flags | MDS_OPEN_BY_FID,
1762 ll_inode_size_lock(inode);
1763 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1765 GOTO(out_unlock, rc);
1767 ll_release_openhandle(dentry, &oit);
1770 ll_inode_size_unlock(inode);
1771 ll_intent_release(&oit);
1776 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1777 struct lov_mds_md **lmmp, int *lmm_size,
1778 struct ptlrpc_request **request)
1780 struct ll_sb_info *sbi = ll_i2sbi(inode);
1781 struct mdt_body *body;
1782 struct lov_mds_md *lmm = NULL;
1783 struct ptlrpc_request *req = NULL;
1784 struct md_op_data *op_data;
1787 rc = ll_get_default_mdsize(sbi, &lmmsize);
1791 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1792 strlen(filename), lmmsize,
1793 LUSTRE_OPC_ANY, NULL);
1794 if (IS_ERR(op_data))
1795 RETURN(PTR_ERR(op_data));
1797 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1798 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1799 ll_finish_md_op_data(op_data);
1801 CDEBUG(D_INFO, "md_getattr_name failed "
1802 "on %s: rc %d\n", filename, rc);
1806 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1807 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1809 lmmsize = body->mbo_eadatasize;
1811 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1813 GOTO(out, rc = -ENODATA);
1816 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1817 LASSERT(lmm != NULL);
1819 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1820 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1821 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1822 GOTO(out, rc = -EPROTO);
1825 * This is coming from the MDS, so is probably in
1826 * little endian. We convert it to host endian before
1827 * passing it to userspace.
1829 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1832 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1833 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1834 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1835 if (le32_to_cpu(lmm->lmm_pattern) &
1836 LOV_PATTERN_F_RELEASED)
1840 /* if function called for directory - we should
1841 * avoid swab not existent lsm objects */
1842 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1843 lustre_swab_lov_user_md_v1(
1844 (struct lov_user_md_v1 *)lmm);
1845 if (S_ISREG(body->mbo_mode))
1846 lustre_swab_lov_user_md_objects(
1847 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1849 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1850 lustre_swab_lov_user_md_v3(
1851 (struct lov_user_md_v3 *)lmm);
1852 if (S_ISREG(body->mbo_mode))
1853 lustre_swab_lov_user_md_objects(
1854 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1856 } else if (lmm->lmm_magic ==
1857 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1858 lustre_swab_lov_comp_md_v1(
1859 (struct lov_comp_md_v1 *)lmm);
1865 *lmm_size = lmmsize;
1870 static int ll_lov_setea(struct inode *inode, struct file *file,
1873 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1874 struct lov_user_md *lump;
1875 int lum_size = sizeof(struct lov_user_md) +
1876 sizeof(struct lov_user_ost_data);
1880 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1883 OBD_ALLOC_LARGE(lump, lum_size);
1887 if (copy_from_user(lump, arg, lum_size))
1888 GOTO(out_lump, rc = -EFAULT);
1890 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1892 cl_lov_delay_create_clear(&file->f_flags);
1895 OBD_FREE_LARGE(lump, lum_size);
1899 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1906 env = cl_env_get(&refcheck);
1908 RETURN(PTR_ERR(env));
1910 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1911 cl_env_put(env, &refcheck);
1915 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1918 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
1919 struct lov_user_md *klum;
1921 __u64 flags = FMODE_WRITE;
1924 rc = ll_copy_user_md(lum, &klum);
1929 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
1934 rc = put_user(0, &lum->lmm_stripe_count);
1938 rc = ll_layout_refresh(inode, &gen);
1942 rc = ll_file_getstripe(inode, arg, lum_size);
1944 cl_lov_delay_create_clear(&file->f_flags);
1947 OBD_FREE(klum, lum_size);
1952 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1954 struct ll_inode_info *lli = ll_i2info(inode);
1955 struct cl_object *obj = lli->lli_clob;
1956 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1957 struct ll_grouplock grouplock;
1962 CWARN("group id for group lock must not be 0\n");
1966 if (ll_file_nolock(file))
1967 RETURN(-EOPNOTSUPP);
1969 spin_lock(&lli->lli_lock);
1970 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1971 CWARN("group lock already existed with gid %lu\n",
1972 fd->fd_grouplock.lg_gid);
1973 spin_unlock(&lli->lli_lock);
1976 LASSERT(fd->fd_grouplock.lg_lock == NULL);
1977 spin_unlock(&lli->lli_lock);
1980 * XXX: group lock needs to protect all OST objects while PFL
1981 * can add new OST objects during the IO, so we'd instantiate
1982 * all OST objects before getting its group lock.
1987 struct cl_layout cl = {
1988 .cl_is_composite = false,
1990 struct lu_extent ext = {
1992 .e_end = OBD_OBJECT_EOF,
1995 env = cl_env_get(&refcheck);
1997 RETURN(PTR_ERR(env));
1999 rc = cl_object_layout_get(env, obj, &cl);
2000 if (!rc && cl.cl_is_composite)
2001 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2004 cl_env_put(env, &refcheck);
2009 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2010 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2014 spin_lock(&lli->lli_lock);
2015 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2016 spin_unlock(&lli->lli_lock);
2017 CERROR("another thread just won the race\n");
2018 cl_put_grouplock(&grouplock);
2022 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2023 fd->fd_grouplock = grouplock;
2024 spin_unlock(&lli->lli_lock);
2026 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/* Release the group lock identified by @arg on @file, taken earlier by
 * ll_get_grouplock(). Warns and fails if no group lock is held or the gid
 * does not match. Per-fd state is detached under lli_lock; the cl-layer
 * lock itself is dropped only after the spinlock is released.
 * NOTE(review): source is elided in this view; the error returns after
 * the CWARNs are not shown. */
2030 static int ll_put_grouplock(struct inode *inode, struct file *file,
2033 struct ll_inode_info *lli = ll_i2info(inode);
2034 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2035 struct ll_grouplock grouplock;
2038 spin_lock(&lli->lli_lock);
2039 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2040 spin_unlock(&lli->lli_lock);
2041 CWARN("no group lock held\n");
2045 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2047 if (fd->fd_grouplock.lg_gid != arg) {
2048 CWARN("group lock %lu doesn't match current id %lu\n",
2049 arg, fd->fd_grouplock.lg_gid);
2050 spin_unlock(&lli->lli_lock);
/* detach under the spinlock; drop the cl lock outside it */
2054 grouplock = fd->fd_grouplock;
2055 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2056 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2057 spin_unlock(&lli->lli_lock);
2059 cl_put_grouplock(&grouplock);
2060 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2065 * Close inode open handle
2067 * \param dentry [in] dentry which contains the inode
2068 * \param it [in,out] intent which contains open info and result
2071 * \retval <0 failure
2073 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2075 struct inode *inode = dentry->d_inode;
2076 struct obd_client_handle *och;
2082 /* Root ? Do nothing. */
2083 if (dentry->d_inode->i_sb->s_root == dentry)
2086 /* No open handle to close? Move away */
2087 if (!it_disposition(it, DISP_OPEN_OPEN))
2090 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2092 OBD_ALLOC(och, sizeof(*och));
2094 GOTO(out, rc = -ENOMEM);
2096 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2098 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2100 /* this one is in place of ll_file_open */
2101 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2102 ptlrpc_req_finished(it->it_request);
2103 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2109 * Get size for inode for which FIEMAP mapping is requested.
2110 * Make the FIEMAP get_info call and returns the result.
2111 * \param fiemap kernel buffer to hold extents
2112 * \param num_bytes kernel buffer size
2114 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2120 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2123 /* Checks for fiemap flags */
2124 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2125 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2129 /* Check for FIEMAP_FLAG_SYNC */
2130 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2131 rc = filemap_fdatawrite(inode->i_mapping);
2136 env = cl_env_get(&refcheck);
2138 RETURN(PTR_ERR(env));
2140 if (i_size_read(inode) == 0) {
2141 rc = ll_glimpse_size(inode);
2146 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2147 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2148 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2150 /* If filesize is 0, then there would be no objects for mapping */
2151 if (fmkey.lfik_oa.o_size == 0) {
2152 fiemap->fm_mapped_extents = 0;
2156 fmkey.lfik_fiemap = *fiemap;
2158 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2159 &fmkey, fiemap, &num_bytes);
2161 cl_env_put(env, &refcheck);
2165 int ll_fid2path(struct inode *inode, void __user *arg)
2167 struct obd_export *exp = ll_i2mdexp(inode);
2168 const struct getinfo_fid2path __user *gfin = arg;
2170 struct getinfo_fid2path *gfout;
2176 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2177 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2180 /* Only need to get the buflen */
2181 if (get_user(pathlen, &gfin->gf_pathlen))
2184 if (pathlen > PATH_MAX)
2187 outsize = sizeof(*gfout) + pathlen;
2188 OBD_ALLOC(gfout, outsize);
2192 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2193 GOTO(gf_free, rc = -EFAULT);
2194 /* append root FID after gfout to let MDT know the root FID so that it
2195 * can lookup the correct path, this is mainly for fileset.
2196 * old server without fileset mount support will ignore this. */
2197 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2199 /* Call mdc_iocontrol */
2200 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2204 if (copy_to_user(arg, gfout, outsize))
2208 OBD_FREE(gfout, outsize);
2213 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2215 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2223 ioc->idv_version = 0;
2224 ioc->idv_layout_version = UINT_MAX;
2226 /* If no file object initialized, we consider its version is 0. */
2230 env = cl_env_get(&refcheck);
2232 RETURN(PTR_ERR(env));
2234 io = vvp_env_thread_io(env);
2236 io->u.ci_data_version.dv_data_version = 0;
2237 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2238 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2241 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2242 result = cl_io_loop(env, io);
2244 result = io->ci_result;
2246 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2247 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2249 cl_io_fini(env, io);
2251 if (unlikely(io->ci_need_restart))
2254 cl_env_put(env, &refcheck);
2260 * Read the data_version for inode.
2262 * This value is computed using stripe object version on OST.
2263 * Version is computed using server side locking.
2265 * @param flags if do sync on the OST side;
2267 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2268 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2270 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2272 struct ioc_data_version ioc = { .idv_flags = flags };
2275 rc = ll_ioc_data_version(inode, &ioc);
2277 *data_version = ioc.idv_version;
2283 * Trigger a HSM release request for the provided inode.
2285 int ll_hsm_release(struct inode *inode)
2288 struct obd_client_handle *och = NULL;
2289 __u64 data_version = 0;
2294 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2295 ll_get_fsname(inode->i_sb, NULL, 0),
2296 PFID(&ll_i2info(inode)->lli_fid));
2298 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2300 GOTO(out, rc = PTR_ERR(och));
2302 /* Grab latest data_version and [am]time values */
2303 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2307 env = cl_env_get(&refcheck);
2309 GOTO(out, rc = PTR_ERR(env));
2311 rc = ll_merge_attr(env, inode);
2312 cl_env_put(env, &refcheck);
2314 /* If error happen, we have the wrong size for a file.
2320 /* Release the file.
2321 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2322 * we still need it to pack l_remote_handle to MDT. */
2323 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2329 if (och != NULL && !IS_ERR(och)) /* close the file */
2330 ll_lease_close(och, inode, NULL);
2335 struct ll_swap_stack {
2338 struct inode *inode1;
2339 struct inode *inode2;
2344 static int ll_swap_layouts(struct file *file1, struct file *file2,
2345 struct lustre_swap_layouts *lsl)
2347 struct mdc_swap_layouts msl;
2348 struct md_op_data *op_data;
2351 struct ll_swap_stack *llss = NULL;
2354 OBD_ALLOC_PTR(llss);
2358 llss->inode1 = file_inode(file1);
2359 llss->inode2 = file_inode(file2);
2361 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2365 /* we use 2 bool because it is easier to swap than 2 bits */
2366 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2367 llss->check_dv1 = true;
2369 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2370 llss->check_dv2 = true;
2372 /* we cannot use lsl->sl_dvX directly because we may swap them */
2373 llss->dv1 = lsl->sl_dv1;
2374 llss->dv2 = lsl->sl_dv2;
2376 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2377 if (rc == 0) /* same file, done! */
2380 if (rc < 0) { /* sequentialize it */
2381 swap(llss->inode1, llss->inode2);
2383 swap(llss->dv1, llss->dv2);
2384 swap(llss->check_dv1, llss->check_dv2);
2388 if (gid != 0) { /* application asks to flush dirty cache */
2389 rc = ll_get_grouplock(llss->inode1, file1, gid);
2393 rc = ll_get_grouplock(llss->inode2, file2, gid);
2395 ll_put_grouplock(llss->inode1, file1, gid);
2400 /* ultimate check, before swapping the layouts we check if
2401 * dataversion has changed (if requested) */
2402 if (llss->check_dv1) {
2403 rc = ll_data_version(llss->inode1, &dv, 0);
2406 if (dv != llss->dv1)
2407 GOTO(putgl, rc = -EAGAIN);
2410 if (llss->check_dv2) {
2411 rc = ll_data_version(llss->inode2, &dv, 0);
2414 if (dv != llss->dv2)
2415 GOTO(putgl, rc = -EAGAIN);
2418 /* struct md_op_data is used to send the swap args to the mdt
2419 * only flags is missing, so we use struct mdc_swap_layouts
2420 * through the md_op_data->op_data */
2421 /* flags from user space have to be converted before they are send to
2422 * server, no flag is sent today, they are only used on the client */
2425 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2426 0, LUSTRE_OPC_ANY, &msl);
2427 if (IS_ERR(op_data))
2428 GOTO(free, rc = PTR_ERR(op_data));
2430 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2431 sizeof(*op_data), op_data, NULL);
2432 ll_finish_md_op_data(op_data);
2439 ll_put_grouplock(llss->inode2, file2, gid);
2440 ll_put_grouplock(llss->inode1, file1, gid);
2450 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2452 struct md_op_data *op_data;
2456 /* Detect out-of range masks */
2457 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2460 /* Non-root users are forbidden to set or clear flags which are
2461 * NOT defined in HSM_USER_MASK. */
2462 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2463 !cfs_capable(CFS_CAP_SYS_ADMIN))
2466 /* Detect out-of range archive id */
2467 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2468 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2471 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2472 LUSTRE_OPC_ANY, hss);
2473 if (IS_ERR(op_data))
2474 RETURN(PTR_ERR(op_data));
2476 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2477 sizeof(*op_data), op_data, NULL);
2479 ll_finish_md_op_data(op_data);
2484 static int ll_hsm_import(struct inode *inode, struct file *file,
2485 struct hsm_user_import *hui)
2487 struct hsm_state_set *hss = NULL;
2488 struct iattr *attr = NULL;
2492 if (!S_ISREG(inode->i_mode))
2498 GOTO(out, rc = -ENOMEM);
2500 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2501 hss->hss_archive_id = hui->hui_archive_id;
2502 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2503 rc = ll_hsm_state_set(inode, hss);
2507 OBD_ALLOC_PTR(attr);
2509 GOTO(out, rc = -ENOMEM);
2511 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2512 attr->ia_mode |= S_IFREG;
2513 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2514 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2515 attr->ia_size = hui->hui_size;
2516 attr->ia_mtime.tv_sec = hui->hui_mtime;
2517 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2518 attr->ia_atime.tv_sec = hui->hui_atime;
2519 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2521 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2522 ATTR_UID | ATTR_GID |
2523 ATTR_MTIME | ATTR_MTIME_SET |
2524 ATTR_ATIME | ATTR_ATIME_SET;
2528 rc = ll_setattr_raw(file_dentry(file), attr, true);
2532 inode_unlock(inode);
/* Translate a kernel fmode_t into the Lustre lease lock-type bitmask:
 * FMODE_READ -> LL_LEASE_RDLCK, FMODE_WRITE -> LL_LEASE_WRLCK. */
2544 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2546 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2547 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2550 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2552 struct inode *inode = file_inode(file);
2554 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2555 ATTR_MTIME | ATTR_MTIME_SET |
2556 ATTR_CTIME | ATTR_CTIME_SET,
2558 .tv_sec = lfu->lfu_atime_sec,
2559 .tv_nsec = lfu->lfu_atime_nsec,
2562 .tv_sec = lfu->lfu_mtime_sec,
2563 .tv_nsec = lfu->lfu_mtime_nsec,
2566 .tv_sec = lfu->lfu_ctime_sec,
2567 .tv_nsec = lfu->lfu_ctime_nsec,
2573 if (!capable(CAP_SYS_ADMIN))
2576 if (!S_ISREG(inode->i_mode))
2580 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2581 inode_unlock(inode);
2586 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2589 case MODE_READ_USER:
2591 case MODE_WRITE_USER:
2598 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2600 /* Used to allow the upper layers of the client to request an LDLM lock
2601 * without doing an actual read or write.
2603 * Used for ladvise lockahead to manually request specific locks.
2605 * \param[in] file file this ladvise lock request is on
2606 * \param[in] ladvise ladvise struct describing this lock request
2608 * \retval 0 success, no detailed result available (sync requests
2609 * and requests sent to the server [not handled locally]
2610 * cannot return detailed results)
2611 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2612 * see definitions for details.
2613 * \retval negative negative errno on error
2615 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2617 struct lu_env *env = NULL;
2618 struct cl_io *io = NULL;
2619 struct cl_lock *lock = NULL;
2620 struct cl_lock_descr *descr = NULL;
2621 struct dentry *dentry = file->f_path.dentry;
2622 struct inode *inode = dentry->d_inode;
2623 enum cl_lock_mode cl_mode;
2624 off_t start = ladvise->lla_start;
2625 off_t end = ladvise->lla_end;
2631 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2632 "start=%llu, end=%llu\n", dentry->d_name.len,
2633 dentry->d_name.name, dentry->d_inode,
2634 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
/* Convert the user-supplied mode; on failure the (negative) value is
 * propagated out through 'result'. */
2637 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2639 GOTO(out, result = cl_mode);
2641 /* Get IO environment */
2642 result = cl_io_get(inode, &env, &io, &refcheck);
2646 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2649 * nothing to do for this io. This currently happens when
2650 * stripe sub-object's are not yet created.
2652 result = io->ci_result;
2653 } else if (result == 0) {
/* Build the lock descriptor for a manual (no actual I/O) lock request. */
2654 lock = vvp_env_lock(env);
2655 descr = &lock->cll_descr;
2657 descr->cld_obj = io->ci_obj;
2658 /* Convert byte offsets to pages */
2659 descr->cld_start = cl_index(io->ci_obj, start);
2660 descr->cld_end = cl_index(io->ci_obj, end);
2661 descr->cld_mode = cl_mode;
2662 /* CEF_MUST is used because we do not want to convert a
2663 * lockahead request to a lockless lock */
2664 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests a speculative (glimpse-less, async) enqueue. */
2667 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2668 descr->cld_enq_flags |= CEF_SPECULATIVE;
2670 result = cl_lock_request(env, io, lock);
2672 /* On success, we need to release the lock */
2674 cl_lock_release(env, lock);
2676 cl_io_fini(env, io);
2677 cl_env_put(env, &refcheck);
2679 /* -ECANCELED indicates a matching lock with a different extent
2680 * was already present, and -EEXIST indicates a matching lock
2681 * on exactly the same extent was already present.
2682 * We convert them to positive values for userspace to make
2683 * recognizing true errors easier.
2684 * Note we can only return these detailed results on async requests,
2685 * as sync requests look the same as i/o requests for locking. */
2686 if (result == -ECANCELED)
2687 result = LLA_RESULT_DIFFERENT;
2688 else if (result == -EEXIST)
2689 result = LLA_RESULT_SAME;
/* Printable names for enum lu_ladvise_type; used in sanity-check debug
 * messages in ll_ladvise_sanity() below. */
2694 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/*
 * Validate a single ladvise entry before it is acted on:
 *  - the advice value must be a known lu_ladvise_type;
 *  - per-advice flags must be within the mask allowed for that advice;
 *  - LOCKAHEAD additionally requires a valid lock mode;
 *  - the byte range must satisfy start < end.
 * On any violation an rc is set and a D_VFSTRACE message is logged.
 */
2696 static int ll_ladvise_sanity(struct inode *inode,
2697 struct llapi_lu_ladvise *ladvise)
2699 enum lu_ladvise_type advice = ladvise->lla_advice;
2700 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2701 * be in the first 32 bits of enum ladvise_flags */
2702 __u32 flags = ladvise->lla_peradvice_flags;
2703 /* 3 lines at 80 characters per line, should be plenty */
2706 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2708 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2709 "last supported advice is %s (value '%d'): rc = %d\n",
2710 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2711 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2715 /* Per-advice checks */
2717 case LU_LADVISE_LOCKNOEXPAND:
2718 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2720 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2722 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2723 ladvise_names[advice], rc);
2727 case LU_LADVISE_LOCKAHEAD:
2728 /* Currently only READ and WRITE modes can be requested */
2729 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2730 ladvise->lla_lockahead_mode == 0) {
2732 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2734 ll_get_fsname(inode->i_sb, NULL, 0),
2735 ladvise->lla_lockahead_mode,
2736 ladvise_names[advice], rc);
2739 case LU_LADVISE_WILLREAD:
2740 case LU_LADVISE_DONTNEED:
2742 /* Note fall through above - These checks apply to all advices
2743 * except LOCKNOEXPAND */
2744 if (flags & ~LF_DEFAULT_MASK) {
2746 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2748 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2749 ladvise_names[advice], rc);
2752 if (ladvise->lla_start >= ladvise->lla_end) {
2754 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2755 "for %s: rc = %d\n",
2756 ll_get_fsname(inode->i_sb, NULL, 0),
2757 ladvise->lla_start, ladvise->lla_end,
2758 ladvise_names[advice], rc);
2770 * Give file access advices
2772 * The ladvise interface is similar to Linux fadvise() system call, except it
2773 * forwards the advices directly from Lustre client to server. The server side
2774 * codes will apply appropriate read-ahead and caching techniques for the
2775 * corresponding files.
2777 * A typical workload for ladvise is e.g. a bunch of different clients are
2778 * doing small random reads of a file, so prefetching pages into OSS cache
2779 * with big linear reads before the random IO is a net benefit. Fetching
2780 * all that data into each client cache with fadvise() may not be, due to
2781 * much more data being sent to the client.
2783 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2784 struct llapi_lu_ladvise *ladvise)
2788 struct cl_ladvise_io *lio;
/* Acquire a cl environment; released via cl_env_put() below. */
2793 env = cl_env_get(&refcheck);
2795 RETURN(PTR_ERR(env));
2797 io = vvp_env_thread_io(env);
2798 io->ci_obj = ll_i2info(inode)->lli_clob;
2800 /* initialize parameters for ladvise */
2801 lio = &io->u.ci_ladvise;
2802 lio->li_start = ladvise->lla_start;
2803 lio->li_end = ladvise->lla_end;
2804 lio->li_fid = ll_inode2fid(inode);
2805 lio->li_advice = ladvise->lla_advice;
2806 lio->li_flags = flags;
/* Run the CIT_LADVISE io; the advice is forwarded to the servers by the
 * lower layers rather than applied locally. */
2808 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2809 rc = cl_io_loop(env, io);
2813 cl_io_fini(env, io);
2814 cl_env_put(env, &refcheck);
/*
 * Record the per-open-file "no lock expansion" setting: LF_UNSET in @flags
 * clears ll_lock_no_expand, any other value sets it.
 */
2818 static int ll_lock_noexpand(struct file *file, int flags)
2820 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2822 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/*
 * FS_IOC_FSGETXATTR-style handler: fill a struct fsxattr with the inode's
 * extended flags and project ID and copy it back to userspace at @arg.
 * Returns -EFAULT on a failed user copy (error value lines not visible in
 * this excerpt).
 */
2827 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2830 struct fsxattr fsxattr;
2832 if (copy_from_user(&fsxattr,
2833 (const struct fsxattr __user *)arg,
2837 fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2838 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2839 if (copy_to_user((struct fsxattr __user *)arg,
2840 &fsxattr, sizeof(fsxattr)))
/*
 * FS_IOC_FSSETXATTR-style handler: set the file's extended flags and
 * project ID.  The change is pushed to the MDS via md_setattr(), then the
 * flags are propagated to the OST objects via cl_setattr_ost() when the
 * file has a cl_object.  Root (CFS_CAP_SYS_ADMIN) only.
 */
2846 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2850 struct md_op_data *op_data;
2851 struct ptlrpc_request *req = NULL;
2853 struct fsxattr fsxattr;
2854 struct cl_object *obj;
2856 /* only root could change project ID */
2857 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2860 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2861 LUSTRE_OPC_ANY, NULL);
2862 if (IS_ERR(op_data))
2863 RETURN(PTR_ERR(op_data));
2865 if (copy_from_user(&fsxattr,
2866 (const struct fsxattr __user *)arg,
2868 GOTO(out_fsxattr1, rc = -EFAULT);
2870 op_data->op_attr_flags = fsxattr.fsx_xflags;
2871 op_data->op_projid = fsxattr.fsx_projid;
2872 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2873 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2875 ptlrpc_req_finished(req);
/* Mirror the new flags on the OST objects (data side), if any. */
2877 obj = ll_i2info(inode)->lli_clob;
2881 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2882 OBD_ALLOC_PTR(attr);
2884 GOTO(out_fsxattr1, rc = -ENOMEM);
2885 attr->ia_valid = ATTR_ATTR_FLAG;
2886 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2891 ll_finish_md_op_data(op_data);
/*
 * Release a file lease (LL_LEASE_UNLCK path of the lease ioctl), optionally
 * carrying a close intent selected by ioc->lil_flags:
 *  - LL_LEASE_RESYNC_DONE: pass the user-supplied resync id array to the
 *    close (MDS_CLOSE_RESYNC_DONE);
 *  - LL_LEASE_LAYOUT_MERGE: merge layout with the victim file whose fd
 *    follows the ioc struct in userspace (MDS_CLOSE_LAYOUT_MERGE);
 *  - LL_LEASE_LAYOUT_SPLIT: split out a mirror, given victim fd and mirror
 *    id (MDS_CLOSE_LAYOUT_SPLIT);
 *  - otherwise a plain lease close with no intent.
 * Returns the lease type held, or a negative errno (-ENOLCK when this fd
 * holds no lease).
 */
2895 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2898 struct inode *inode = file_inode(file);
2899 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2900 struct ll_inode_info *lli = ll_i2info(inode);
2901 struct obd_client_handle *och = NULL;
2902 struct split_param sp;
2905 enum mds_op_bias bias = 0;
2906 struct file *layout_file = NULL;
2908 size_t data_size = 0;
/* Detach the lease handle from the fd under lli_och_mutex. */
2912 mutex_lock(&lli->lli_och_mutex);
2913 if (fd->fd_lease_och != NULL) {
2914 och = fd->fd_lease_och;
2915 fd->fd_lease_och = NULL;
2917 mutex_unlock(&lli->lli_och_mutex);
2920 GOTO(out, rc = -ENOLCK);
2922 fmode = och->och_flags;
2924 switch (ioc->lil_flags) {
2925 case LL_LEASE_RESYNC_DONE:
2926 if (ioc->lil_count > IOC_IDS_MAX)
2927 GOTO(out, rc = -EINVAL);
2929 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
2930 OBD_ALLOC(data, data_size);
2932 GOTO(out, rc = -ENOMEM);
2934 if (copy_from_user(data, (void __user *)arg, data_size))
2935 GOTO(out, rc = -EFAULT);
2937 bias = MDS_CLOSE_RESYNC_DONE;
2939 case LL_LEASE_LAYOUT_MERGE: {
2942 if (ioc->lil_count != 1)
2943 GOTO(out, rc = -EINVAL);
/* The victim fd (a __u32) immediately follows the ioc struct. */
2945 arg += sizeof(*ioc);
2946 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
2947 GOTO(out, rc = -EFAULT);
2949 layout_file = fget(fd);
2951 GOTO(out, rc = -EBADF);
/* Both files must be open for write to allow a layout merge. */
2953 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
2954 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
2955 GOTO(out, rc = -EPERM);
2957 data = file_inode(layout_file);
2958 bias = MDS_CLOSE_LAYOUT_MERGE;
2961 case LL_LEASE_LAYOUT_SPLIT: {
2965 if (ioc->lil_count != 2)
2966 GOTO(out, rc = -EINVAL);
/* Userspace layout: ioc struct, then victim fd, then mirror id. */
2968 arg += sizeof(*ioc);
2969 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
2970 GOTO(out, rc = -EFAULT);
2972 arg += sizeof(__u32);
2973 if (copy_from_user(&mirror_id, (void __user *)arg,
2975 GOTO(out, rc = -EFAULT);
2977 layout_file = fget(fdv);
2979 GOTO(out, rc = -EBADF);
2981 sp.sp_inode = file_inode(layout_file);
2982 sp.sp_mirror_id = (__u16)mirror_id;
2984 bias = MDS_CLOSE_LAYOUT_SPLIT;
2988 /* without close intent */
2992 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
2996 rc = ll_lease_och_release(inode, file);
/* Per-intent cleanup of resources acquired above. */
3005 switch (ioc->lil_flags) {
3006 case LL_LEASE_RESYNC_DONE:
3008 OBD_FREE(data, data_size);
3010 case LL_LEASE_LAYOUT_MERGE:
3011 case LL_LEASE_LAYOUT_SPLIT:
3018 rc = ll_lease_type_from_fmode(fmode);
/*
 * Acquire (or, for LL_LEASE_UNLCK, release) a file lease on behalf of the
 * LL_IOC_SET_LEASE* ioctls.  The requested mode must be compatible with the
 * file's open mode.  With LL_LEASE_RESYNC the open carries MDS_OPEN_RESYNC
 * and the mirror resync state is initialized before the lease handle is
 * stored in fd->fd_lease_och.
 */
3022 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3025 struct inode *inode = file_inode(file);
3026 struct ll_inode_info *lli = ll_i2info(inode);
3027 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3028 struct obd_client_handle *och = NULL;
3029 __u64 open_flags = 0;
3035 switch (ioc->lil_mode) {
3036 case LL_LEASE_WRLCK:
3037 if (!(file->f_mode & FMODE_WRITE))
3039 fmode = FMODE_WRITE;
3041 case LL_LEASE_RDLCK:
3042 if (!(file->f_mode & FMODE_READ))
3046 case LL_LEASE_UNLCK:
3047 RETURN(ll_file_unlock_lease(file, ioc, arg));
3052 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3054 /* apply for lease */
3055 if (ioc->lil_flags & LL_LEASE_RESYNC)
3056 open_flags = MDS_OPEN_RESYNC;
3057 och = ll_lease_open(inode, file, fmode, open_flags);
3059 RETURN(PTR_ERR(och));
3061 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3062 rc = ll_lease_file_resync(och, inode);
/* On resync setup failure the just-opened lease is closed again. */
3064 ll_lease_close(och, inode, NULL);
3067 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3069 ll_lease_close(och, inode, NULL);
/* Publish the lease handle on this fd; only one lease per fd. */
3075 mutex_lock(&lli->lli_och_mutex);
3076 if (fd->fd_lease_och == NULL) {
3077 fd->fd_lease_och = och;
3080 mutex_unlock(&lli->lli_och_mutex);
3082 /* impossible now that only excl is supported for now */
3083 ll_lease_close(och, inode, &lease_broken);
/*
 * Main ioctl dispatcher for regular files on the llite client.  Handles the
 * LL_IOC_* / OBD_IOC_* / FSFILT_IOC_* command families (stripe get/set,
 * layout swap, group locks, FID/path translation, data version, HSM,
 * leases, ladvise, project xattrs, FLR mirror selection) and falls through
 * to obd_iocontrol() on the data export for anything unrecognized.
 */
3090 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3092 struct inode *inode = file_inode(file);
3093 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3097 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3098 PFID(ll_inode2fid(inode)), inode, cmd);
3099 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3101 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3102 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3106 case LL_IOC_GETFLAGS:
3107 /* Get the current value of the file flags */
3108 return put_user(fd->fd_flags, (int __user *)arg);
3109 case LL_IOC_SETFLAGS:
3110 case LL_IOC_CLRFLAGS:
3111 /* Set or clear specific file flags */
3112 /* XXX This probably needs checks to ensure the flags are
3113 * not abused, and to handle any flag side effects.
3115 if (get_user(flags, (int __user *) arg))
3118 if (cmd == LL_IOC_SETFLAGS) {
3119 if ((flags & LL_FILE_IGNORE_LOCK) &&
3120 !(file->f_flags & O_DIRECT)) {
3121 CERROR("%s: unable to disable locking on "
3122 "non-O_DIRECT file\n", current->comm);
3126 fd->fd_flags |= flags;
3128 fd->fd_flags &= ~flags;
3131 case LL_IOC_LOV_SETSTRIPE:
3132 case LL_IOC_LOV_SETSTRIPE_NEW:
3133 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3134 case LL_IOC_LOV_SETEA:
3135 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3136 case LL_IOC_LOV_SWAP_LAYOUTS: {
3138 struct lustre_swap_layouts lsl;
3140 if (copy_from_user(&lsl, (char __user *)arg,
3141 sizeof(struct lustre_swap_layouts)))
/* Both files involved in the swap must be writable. */
3144 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3147 file2 = fget(lsl.sl_fd);
3151 /* O_WRONLY or O_RDWR */
3152 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3153 GOTO(out, rc = -EPERM);
3155 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3156 struct inode *inode2;
3157 struct ll_inode_info *lli;
3158 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE requires holding a lease on this fd;
 * the swap is then performed as part of the lease close. */
3160 lli = ll_i2info(inode);
3161 mutex_lock(&lli->lli_och_mutex);
3162 if (fd->fd_lease_och != NULL) {
3163 och = fd->fd_lease_och;
3164 fd->fd_lease_och = NULL;
3166 mutex_unlock(&lli->lli_och_mutex);
3168 GOTO(out, rc = -ENOLCK);
3169 inode2 = file_inode(file2);
3170 rc = ll_swap_layouts_close(och, inode, inode2);
3172 rc = ll_swap_layouts(file, file2, &lsl);
3178 case LL_IOC_LOV_GETSTRIPE:
3179 case LL_IOC_LOV_GETSTRIPE_NEW:
3180 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3181 case FSFILT_IOC_GETFLAGS:
3182 case FSFILT_IOC_SETFLAGS:
3183 RETURN(ll_iocontrol(inode, file, cmd, arg));
3184 case FSFILT_IOC_GETVERSION_OLD:
3185 case FSFILT_IOC_GETVERSION:
3186 RETURN(put_user(inode->i_generation, (int __user *)arg));
3187 case LL_IOC_GROUP_LOCK:
3188 RETURN(ll_get_grouplock(inode, file, arg));
3189 case LL_IOC_GROUP_UNLOCK:
3190 RETURN(ll_put_grouplock(inode, file, arg));
3191 case IOC_OBD_STATFS:
3192 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3194 /* We need to special case any other ioctls we want to handle,
3195 * to send them to the MDS/OST as appropriate and to properly
3196 * network encode the arg field.
3197 case FSFILT_IOC_SETVERSION_OLD:
3198 case FSFILT_IOC_SETVERSION:
3200 case LL_IOC_FLUSHCTX:
3201 RETURN(ll_flush_ctx(inode));
3202 case LL_IOC_PATH2FID: {
3203 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3204 sizeof(struct lu_fid)))
3209 case LL_IOC_GETPARENT:
3210 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3212 case OBD_IOC_FID2PATH:
3213 RETURN(ll_fid2path(inode, (void __user *)arg));
3214 case LL_IOC_DATA_VERSION: {
3215 struct ioc_data_version idv;
3218 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask out anything but the supported flush flags. */
3221 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3222 rc = ll_ioc_data_version(inode, &idv);
3225 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3231 case LL_IOC_GET_MDTIDX: {
3234 mdtidx = ll_get_mdt_idx(inode);
3238 if (put_user((int)mdtidx, (int __user *)arg))
3243 case OBD_IOC_GETDTNAME:
3244 case OBD_IOC_GETMDNAME:
3245 RETURN(ll_get_obd_name(inode, cmd, arg));
3246 case LL_IOC_HSM_STATE_GET: {
3247 struct md_op_data *op_data;
3248 struct hsm_user_state *hus;
3255 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3256 LUSTRE_OPC_ANY, hus);
3257 if (IS_ERR(op_data)) {
3259 RETURN(PTR_ERR(op_data));
3262 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3265 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3268 ll_finish_md_op_data(op_data);
3272 case LL_IOC_HSM_STATE_SET: {
3273 struct hsm_state_set *hss;
3280 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3285 rc = ll_hsm_state_set(inode, hss);
3290 case LL_IOC_HSM_ACTION: {
3291 struct md_op_data *op_data;
3292 struct hsm_current_action *hca;
3299 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3300 LUSTRE_OPC_ANY, hca);
3301 if (IS_ERR(op_data)) {
3303 RETURN(PTR_ERR(op_data));
3306 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3309 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3312 ll_finish_md_op_data(op_data);
3316 case LL_IOC_SET_LEASE_OLD: {
/* Legacy variant: the mode is passed directly in 'arg'. */
3317 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3319 RETURN(ll_file_set_lease(file, &ioc, 0));
3321 case LL_IOC_SET_LEASE: {
3322 struct ll_ioc_lease ioc;
3324 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3327 RETURN(ll_file_set_lease(file, &ioc, arg));
3329 case LL_IOC_GET_LEASE: {
3330 struct ll_inode_info *lli = ll_i2info(inode);
3331 struct ldlm_lock *lock = NULL;
3334 mutex_lock(&lli->lli_och_mutex);
3335 if (fd->fd_lease_och != NULL) {
3336 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease only if its DLM lock is still valid. */
3338 lock = ldlm_handle2lock(&och->och_lease_handle);
3340 lock_res_and_lock(lock);
3341 if (!ldlm_is_cancel(lock))
3342 fmode = och->och_flags;
3344 unlock_res_and_lock(lock);
3345 LDLM_LOCK_PUT(lock);
3348 mutex_unlock(&lli->lli_och_mutex);
3350 RETURN(ll_lease_type_from_fmode(fmode));
3352 case LL_IOC_HSM_IMPORT: {
3353 struct hsm_user_import *hui;
3359 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3364 rc = ll_hsm_import(inode, file, hui);
3369 case LL_IOC_FUTIMES_3: {
3370 struct ll_futimes_3 lfu;
3372 if (copy_from_user(&lfu,
3373 (const struct ll_futimes_3 __user *)arg,
3377 RETURN(ll_file_futimes_3(file, &lfu));
3379 case LL_IOC_LADVISE: {
3380 struct llapi_ladvise_hdr *k_ladvise_hdr;
3381 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3384 int alloc_size = sizeof(*k_ladvise_hdr);
/* First read just the fixed header to learn lah_count, then
 * re-allocate for the full header + advice array and re-copy. */
3387 u_ladvise_hdr = (void __user *)arg;
3388 OBD_ALLOC_PTR(k_ladvise_hdr);
3389 if (k_ladvise_hdr == NULL)
3392 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3393 GOTO(out_ladvise, rc = -EFAULT);
3395 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3396 k_ladvise_hdr->lah_count < 1)
3397 GOTO(out_ladvise, rc = -EINVAL);
3399 num_advise = k_ladvise_hdr->lah_count;
3400 if (num_advise >= LAH_COUNT_MAX)
3401 GOTO(out_ladvise, rc = -EFBIG);
3403 OBD_FREE_PTR(k_ladvise_hdr);
3404 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3405 lah_advise[num_advise]);
3406 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3407 if (k_ladvise_hdr == NULL)
3411 * TODO: submit multiple advices to one server in a single RPC
3413 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3414 GOTO(out_ladvise, rc = -EFAULT);
3416 for (i = 0; i < num_advise; i++) {
3417 struct llapi_lu_ladvise *k_ladvise =
3418 &k_ladvise_hdr->lah_advise[i];
3419 struct llapi_lu_ladvise __user *u_ladvise =
3420 &u_ladvise_hdr->lah_advise[i];
3422 rc = ll_ladvise_sanity(inode, k_ladvise);
3424 GOTO(out_ladvise, rc);
3426 switch (k_ladvise->lla_advice) {
3427 case LU_LADVISE_LOCKNOEXPAND:
3428 rc = ll_lock_noexpand(file,
3429 k_ladvise->lla_peradvice_flags);
3430 GOTO(out_ladvise, rc);
3431 case LU_LADVISE_LOCKAHEAD:
3433 rc = ll_file_lock_ahead(file, k_ladvise);
3436 GOTO(out_ladvise, rc);
/* Write the per-advice result back into the user's array. */
3439 &u_ladvise->lla_lockahead_result))
3440 GOTO(out_ladvise, rc = -EFAULT);
3443 rc = ll_ladvise(inode, file,
3444 k_ladvise_hdr->lah_flags,
3447 GOTO(out_ladvise, rc);
3454 OBD_FREE(k_ladvise_hdr, alloc_size);
3457 case LL_IOC_FLR_SET_MIRROR: {
3458 /* mirror I/O must be direct to avoid polluting page cache
3460 if (!(file->f_flags & O_DIRECT))
3463 fd->fd_designated_mirror = (__u32)arg;
3466 case LL_IOC_FSGETXATTR:
3467 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3468 case LL_IOC_FSSETXATTR:
3469 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3471 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Default: forward unknown commands to the data (OST) export. */
3473 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3474 (void __user *)arg));
3478 #ifndef HAVE_FILE_LLSEEK_SIZE
/*
 * Fallback helper (pre generic_file_llseek_size kernels): validate @offset
 * against sign and @maxsize, then commit it to file->f_pos, resetting
 * f_version so readdir-style users notice the reposition.
 */
3479 static inline loff_t
3480 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3482 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3484 if (offset > maxsize)
3487 if (offset != file->f_pos) {
3488 file->f_pos = offset;
3489 file->f_version = 0;
/*
 * Local copy of generic_file_llseek_size() for kernels lacking it:
 * implements SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against the caller-
 * supplied @maxsize and @eof, funneling the final update through
 * llseek_execute().
 */
3495 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3496 loff_t maxsize, loff_t eof)
3498 struct inode *inode = file_inode(file);
3506 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3507 * position-querying operation. Avoid rewriting the "same"
3508 * f_pos value back to the file because a concurrent read(),
3509 * write() or lseek() might have altered it
3514 * f_lock protects against read/modify/write race with other
3515 * SEEK_CURs. Note that parallel writes and reads behave
3519 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3520 inode_unlock(inode);
3524 * In the generic case the entire file is data, so as long as
3525 * offset isn't at the end of the file then the offset is data.
3532 * There is a virtual hole at the end of the file, so as long as
3533 * offset isn't i_size or larger, return i_size.
3541 return llseek_execute(file, offset, maxsize);
/*
 * llseek entry point: for SEEK_END/SEEK_HOLE/SEEK_DATA the file size must
 * be current, so glimpse it from the OSTs first, then delegate to
 * ll_generic_file_llseek_size() bounded by ll_file_maxbytes().
 */
3545 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3547 struct inode *inode = file_inode(file);
3548 loff_t retval, eof = 0;
3551 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3552 (origin == SEEK_CUR) ? file->f_pos : 0);
3553 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3554 PFID(ll_inode2fid(inode)), inode, retval, retval,
3556 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3558 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3559 retval = ll_glimpse_size(inode);
3562 eof = i_size_read(inode);
3565 retval = ll_generic_file_llseek_size(file, offset, origin,
3566 ll_file_maxbytes(inode), eof);
/*
 * flush (close(2)-time) handler: surface any async writeback error that was
 * recorded for this inode (lli_async_rc and the per-OSC async rc's), unless
 * the failure was already reported to this fd (fd_write_failed).
 * Returns -EIO if an unreported error is pending, 0 otherwise.
 */
3570 static int ll_flush(struct file *file, fl_owner_t id)
3572 struct inode *inode = file_inode(file);
3573 struct ll_inode_info *lli = ll_i2info(inode);
3574 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3577 LASSERT(!S_ISDIR(inode->i_mode));
3579 /* catch async errors that were recorded back when async writeback
3580 * failed for pages in this mapping. */
3581 rc = lli->lli_async_rc;
3582 lli->lli_async_rc = 0;
3583 if (lli->lli_clob != NULL) {
3584 err = lov_read_and_clear_async_rc(lli->lli_clob);
3589 /* The application has been told write failure already.
3590 * Do not report failure again. */
3591 if (fd->fd_write_failed)
3593 return rc ? -EIO : 0;
3597 * Called to make sure a portion of file has been written out.
3598 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3600 * Return how many pages have been written.
3602 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3603 enum cl_fsync_mode mode, int ignore_layout)
3607 struct cl_fsync_io *fio;
/* Only the four defined fsync modes are accepted. */
3612 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3613 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3616 env = cl_env_get(&refcheck);
3618 RETURN(PTR_ERR(env));
3620 io = vvp_env_thread_io(env);
3621 io->ci_obj = ll_i2info(inode)->lli_clob;
3622 io->ci_ignore_layout = ignore_layout;
3624 /* initialize parameters for sync */
3625 fio = &io->u.ci_fsync;
3626 fio->fi_start = start;
3628 fio->fi_fid = ll_inode2fid(inode);
3629 fio->fi_mode = mode;
3630 fio->fi_nr_written = 0;
3632 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3633 result = cl_io_loop(env, io);
3635 result = io->ci_result;
/* On success, report the number of pages written (see fi_nr_written). */
3637 result = fio->fi_nr_written;
3638 cl_io_fini(env, io);
3639 cl_env_put(env, &refcheck);
3645 * When dentry is provided (the 'else' case), file_dentry() may be
3646 * null and dentry must be used directly rather than pulled from
3647 * file_dentry() as is done otherwise.
/*
 * fsync(2)/fdatasync(2) entry point; the three signature variants below
 * match the kernel API generations (4-arg range fsync, 2-arg, and the old
 * dentry-based form).  Flow: wait for dirty pages, collect recorded async
 * writeback errors, fsync the MDS state via md_fsync(), then force an
 * OST-side CL_FSYNC_ALL for regular files, updating fd_write_failed.
 */
3650 #ifdef HAVE_FILE_FSYNC_4ARGS
3651 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3653 struct dentry *dentry = file_dentry(file);
3655 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3656 int ll_fsync(struct file *file, int datasync)
3658 struct dentry *dentry = file_dentry(file);
3660 loff_t end = LLONG_MAX;
3662 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3665 loff_t end = LLONG_MAX;
3667 struct inode *inode = dentry->d_inode;
3668 struct ll_inode_info *lli = ll_i2info(inode);
3669 struct ptlrpc_request *req;
3673 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3674 PFID(ll_inode2fid(inode)), inode);
3675 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3677 #ifdef HAVE_FILE_FSYNC_4ARGS
3678 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* Avoid re-taking the inode lock if the caller already holds it. */
3679 lock_inode = !lli->lli_inode_locked;
3683 /* fsync's caller has already called _fdata{sync,write}, we want
3684 * that IO to finish before calling the osc and mdc sync methods */
3685 rc = filemap_fdatawait(inode->i_mapping);
3688 /* catch async errors that were recorded back when async writeback
3689 * failed for pages in this mapping. */
3690 if (!S_ISDIR(inode->i_mode)) {
3691 err = lli->lli_async_rc;
3692 lli->lli_async_rc = 0;
3695 if (lli->lli_clob != NULL) {
3696 err = lov_read_and_clear_async_rc(lli->lli_clob);
3702 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3706 ptlrpc_req_finished(req);
3708 if (S_ISREG(inode->i_mode)) {
3709 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3711 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3712 if (rc == 0 && err < 0)
3715 fd->fd_write_failed = true;
3717 fd->fd_write_failed = false;
3720 #ifdef HAVE_FILE_FSYNC_4ARGS
3722 inode_unlock(inode);
/*
 * flock(2)/fcntl(2) byte-range and whole-file locking: translate the VFS
 * file_lock into an LDLM_FLOCK enqueue against the MDS, then mirror the
 * result into the local VFS lock tables (locks_lock_file_wait() or its
 * older split variants).  If the local step fails after a successful
 * enqueue, the server lock is dropped again with an LCK_NL enqueue.
 */
3728 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3730 struct inode *inode = file_inode(file);
3731 struct ll_sb_info *sbi = ll_i2sbi(inode);
3732 struct ldlm_enqueue_info einfo = {
3733 .ei_type = LDLM_FLOCK,
3734 .ei_cb_cp = ldlm_flock_completion_ast,
3735 .ei_cbdata = file_lock,
3737 struct md_op_data *op_data;
3738 struct lustre_handle lockh = { 0 };
3739 union ldlm_policy_data flock = { { 0 } };
3740 int fl_type = file_lock->fl_type;
3746 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3747 PFID(ll_inode2fid(inode)), file_lock);
3749 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3751 if (file_lock->fl_flags & FL_FLOCK) {
3752 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3753 /* flocks are whole-file locks */
3754 flock.l_flock.end = OFFSET_MAX;
3755 /* For flocks owner is determined by the local file desctiptor*/
3756 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3757 } else if (file_lock->fl_flags & FL_POSIX) {
3758 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3759 flock.l_flock.start = file_lock->fl_start;
3760 flock.l_flock.end = file_lock->fl_end;
3764 flock.l_flock.pid = file_lock->fl_pid;
3766 /* Somewhat ugly workaround for svc lockd.
3767 * lockd installs custom fl_lmops->lm_compare_owner that checks
3768 * for the fl_owner to be the same (which it always is on local node
3769 * I guess between lockd processes) and then compares pid.
3770 * As such we assign pid to the owner field to make it all work,
3771 * conflict with normal locks is unlikely since pid space and
3772 * pointer space for current->files are not intersecting */
3773 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3774 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map the fcntl lock type to an LDLM mode (read -> PR, write -> PW). */
3778 einfo.ei_mode = LCK_PR;
3781 /* An unlock request may or may not have any relation to
3782 * existing locks so we may not be able to pass a lock handle
3783 * via a normal ldlm_lock_cancel() request. The request may even
3784 * unlock a byte range in the middle of an existing lock. In
3785 * order to process an unlock request we need all of the same
3786 * information that is given with a normal read or write record
3787 * lock request. To avoid creating another ldlm unlock (cancel)
3788 * message we'll treat a LCK_NL flock request as an unlock. */
3789 einfo.ei_mode = LCK_NL;
3792 einfo.ei_mode = LCK_PW;
3795 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Map the fcntl command to enqueue flags: non-blocking set requests
 * use BLOCK_NOWAIT, GETLK-style queries use TEST_LOCK. */
3810 flags = LDLM_FL_BLOCK_NOWAIT;
3816 flags = LDLM_FL_TEST_LOCK;
3819 CERROR("unknown fcntl lock command: %d\n", cmd);
3823 /* Save the old mode so that if the mode in the lock changes we
3824 * can decrement the appropriate reader or writer refcount. */
3825 file_lock->fl_type = einfo.ei_mode;
3827 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3828 LUSTRE_OPC_ANY, NULL);
3829 if (IS_ERR(op_data))
3830 RETURN(PTR_ERR(op_data));
3832 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3833 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3834 flock.l_flock.pid, flags, einfo.ei_mode,
3835 flock.l_flock.start, flock.l_flock.end);
3837 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3840 /* Restore the file lock type if not TEST lock. */
3841 if (!(flags & LDLM_FL_TEST_LOCK))
3842 file_lock->fl_type = fl_type;
3844 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3845 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3846 !(flags & LDLM_FL_TEST_LOCK))
3847 rc2 = locks_lock_file_wait(file, file_lock);
3849 if ((file_lock->fl_flags & FL_FLOCK) &&
3850 (rc == 0 || file_lock->fl_type == F_UNLCK))
3851 rc2 = flock_lock_file_wait(file, file_lock);
3852 if ((file_lock->fl_flags & FL_POSIX) &&
3853 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3854 !(flags & LDLM_FL_TEST_LOCK))
3855 rc2 = posix_lock_file_wait(file, file_lock);
3856 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock with LCK_NL. */
3858 if (rc2 && file_lock->fl_type != F_UNLCK) {
3859 einfo.ei_mode = LCK_NL;
3860 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3865 ll_finish_md_op_data(op_data);
/*
 * Resolve @name (length @namelen) under @parent to its FID by asking the
 * MDS (md_getattr_name with OBD_MD_FLID|OBD_MD_FLTYPE).  On success *fid is
 * filled in and, when @inode is non-NULL, the inode is instantiated from
 * the reply via ll_prep_inode().
 */
3870 int ll_get_fid_by_name(struct inode *parent, const char *name,
3871 int namelen, struct lu_fid *fid,
3872 struct inode **inode)
3874 struct md_op_data *op_data = NULL;
3875 struct mdt_body *body;
3876 struct ptlrpc_request *req;
3880 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3881 LUSTRE_OPC_ANY, NULL);
3882 if (IS_ERR(op_data))
3883 RETURN(PTR_ERR(op_data));
3885 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3886 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3887 ll_finish_md_op_data(op_data);
3891 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3893 GOTO(out_req, rc = -EFAULT);
3895 *fid = body->mbo_fid1;
3898 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3900 ptlrpc_req_finished(req);
3904 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3905 const char *name, int namelen)
3907 struct dentry *dchild = NULL;
3908 struct inode *child_inode = NULL;
3909 struct md_op_data *op_data;
3910 struct ptlrpc_request *request = NULL;
3911 struct obd_client_handle *och = NULL;
3913 struct mdt_body *body;
3915 __u64 data_version = 0;
3918 CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
3919 name, PFID(ll_inode2fid(parent)), mdtidx);
3921 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
3922 0, LUSTRE_OPC_ANY, NULL);
3923 if (IS_ERR(op_data))
3924 RETURN(PTR_ERR(op_data));
3926 /* Get child FID first */
3927 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
3930 dchild = d_lookup(file_dentry(file), &qstr);
3931 if (dchild != NULL) {
3932 if (dchild->d_inode != NULL)
3933 child_inode = igrab(dchild->d_inode);
3937 if (child_inode == NULL) {
3938 rc = ll_get_fid_by_name(parent, name, namelen,
3939 &op_data->op_fid3, &child_inode);
3944 if (child_inode == NULL)
3945 GOTO(out_free, rc = -EINVAL);
3948 * lfs migrate command needs to be blocked on the client
3949 * by checking the migrate FID against the FID of the
3952 if (child_inode == parent->i_sb->s_root->d_inode)
3953 GOTO(out_iput, rc = -EINVAL);
3955 inode_lock(child_inode);
3956 op_data->op_fid3 = *ll_inode2fid(child_inode);
3957 if (!fid_is_sane(&op_data->op_fid3)) {
3958 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
3959 ll_get_fsname(parent->i_sb, NULL, 0), name,
3960 PFID(&op_data->op_fid3));
3961 GOTO(out_unlock, rc = -EINVAL);
3964 rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
3966 GOTO(out_unlock, rc);
3969 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
3970 PFID(&op_data->op_fid3), mdtidx);
3971 GOTO(out_unlock, rc = 0);
3974 if (S_ISREG(child_inode->i_mode)) {
3975 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
3979 GOTO(out_unlock, rc);
3982 rc = ll_data_version(child_inode, &data_version,
3985 GOTO(out_close, rc);
3987 op_data->op_handle = och->och_fh;
3988 op_data->op_data = och->och_mod;
3989 op_data->op_data_version = data_version;
3990 op_data->op_lease_handle = och->och_lease_handle;
3991 op_data->op_bias |= MDS_RENAME_MIGRATE;
3994 op_data->op_mds = mdtidx;
3995 op_data->op_cli_flags = CLI_MIGRATE;
3996 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
3997 namelen, name, namelen, &request);
3999 LASSERT(request != NULL);
4000 ll_update_times(request, parent);
4002 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4003 LASSERT(body != NULL);
4005 /* If the server does release layout lock, then we cleanup
4006 * the client och here, otherwise release it in out_close: */
4008 body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4009 obd_mod_put(och->och_mod);
4010 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4012 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4018 if (request != NULL) {
4019 ptlrpc_req_finished(request);
4023 /* Try again if the file layout has changed. */
4024 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4028 if (och != NULL) /* close the file */
4029 ll_lease_close(och, child_inode, NULL);
4031 clear_nlink(child_inode);
4033 inode_unlock(child_inode);
4037 ll_finish_md_op_data(op_data);
/* flock/lock handler for "-o noflock" mounts; only the signature is
 * visible in this extraction -- presumably returns -ENOSYS (it is wired
 * into ll_file_operations_noflock below). TODO(review): confirm body. */
4042 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4050 * test if some locks matching bits and l_req_mode are acquired
4051 * - bits can be in different locks
4052 * - if found clear the common lock bits in *bits
4053 * - the bits not found, are kept in *bits
4055 * \param bits [IN] searched lock bits
4056 * \param l_req_mode [IN] searched lock mode
4057 * \retval boolean, true iff all bits are found
/* Test whether MD inode-bits DLM locks covering *bits are already cached.
 * Bits that are found are cleared from *bits; the bits left set on return
 * were NOT matched. LCK_MINMODE widens the search to CR|CW|PR|PW.
 * NOTE(review): some interior lines are missing from this extraction. */
4059 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4061 struct lustre_handle lockh;
4062 union ldlm_policy_data policy;
4063 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4064 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4073 fid = &ll_i2info(inode)->lli_fid;
4074 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4075 ldlm_lockname[mode]);
/* LDLM_FL_TEST_LOCK: match only tests for a lock, no reference is taken */
4077 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each inodebit separately; stop early once all bits are matched */
4078 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4079 policy.l_inodebits.bits = *bits & (1 << i);
4080 if (policy.l_inodebits.bits == 0)
4083 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4084 &policy, mode, &lockh)) {
4085 struct ldlm_lock *lock;
4087 lock = ldlm_handle2lock(&lockh);
4090 ~(lock->l_policy_data.l_inodebits.bits);
4091 LDLM_LOCK_PUT(lock);
4093 *bits &= ~policy.l_inodebits.bits;
/* Match (and, unlike ll_have_md_lock, keep a reference on) a cached MD
 * inode-bits lock covering @bits. Returns the matched mode, or 0 when no
 * suitable lock is cached; on success the handle is returned in @lockh. */
4100 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4101 struct lustre_handle *lockh, __u64 flags,
4102 enum ldlm_mode mode)
4104 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4109 fid = &ll_i2info(inode)->lli_fid;
4110 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4112 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4113 fid, LDLM_IBITS, &policy, mode, lockh);
/* Post-process the result of an MD revalidation RPC: tolerate -ENOENT for
 * already-unlinked objects (and for striped dirs with a bad stripe), log
 * other failures. Returns the possibly-remapped rc. */
4118 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4120 /* Already unlinked. Just update nlink and return success */
4121 if (rc == -ENOENT) {
4123 /* If it is striped directory, and there is bad stripe
4124 * Let's revalidate the dentry again, instead of returning
4126 if (S_ISDIR(inode->i_mode) &&
4127 ll_i2info(inode)->lli_lsm_md != NULL)
4130 /* This path cannot be hit for regular files unless in
4131 * case of obscure races, so no need to validate
4133 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4135 } else if (rc != 0) {
/* -EACCES/-EIDRM are expected (permission / identity revoked) -> D_INFO */
4136 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4137 "%s: revalidate FID "DFID" error: rc = %d\n",
4138 ll_get_fsname(inode->i_sb, NULL, 0),
4139 PFID(ll_inode2fid(inode)), rc);
/* Revalidate inode attributes against the MDT with an intent lock RPC
 * (getattr by FID, no name). Unhashes the dentry if the file turned out
 * to be unlinked (i_nlink == 0). Returns 0 or negative errno. */
4145 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4147 struct inode *inode = dentry->d_inode;
4148 struct obd_export *exp = ll_i2mdexp(inode);
4149 struct lookup_intent oit = {
4152 struct ptlrpc_request *req = NULL;
4153 struct md_op_data *op_data;
4157 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4158 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4160 /* Call getattr by fid, so do not provide name at all. */
4161 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4162 LUSTRE_OPC_ANY, NULL);
4163 if (IS_ERR(op_data))
4164 RETURN(PTR_ERR(op_data));
4166 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4167 ll_finish_md_op_data(op_data);
4169 rc = ll_inode_revalidate_fini(inode, rc);
4173 rc = ll_revalidate_it_finish(req, &oit, dentry);
4175 ll_intent_release(&oit);
4179 /* Unlinked? Unhash dentry, so it is not picked up later by
4180 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4181 * here to preserve get_cwd functionality on 2.6.
4183 if (!dentry->d_inode->i_nlink) {
4184 ll_lock_dcache(inode);
4185 d_lustre_invalidate(dentry, 0);
4186 ll_unlock_dcache(inode);
4189 ll_lookup_finish_locks(&oit, dentry);
4191 ptlrpc_req_finished(req);
/* Merge per-stripe MDT attributes of a striped directory (nlink, blocks,
 * size, times) into the master inode. Caller must ensure lli_lsm_md is
 * set (asserted below). */
4196 static int ll_merge_md_attr(struct inode *inode)
4198 struct cl_attr attr = { 0 };
4201 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4202 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4203 &attr, ll_md_blocking_ast);
4207 set_nlink(inode, attr.cat_nlink);
4208 inode->i_blocks = attr.cat_blocks;
4209 i_size_write(inode, attr.cat_size);
/* cache merged times in lli; ll_getattr copies them into the inode */
4211 ll_i2info(inode)->lli_atime = attr.cat_atime;
4212 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4213 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Squash a dev_t so 32-bit compat stat syscalls accept it (major/minor
 * each truncated to 8 bits -- not a faithful device number, just one that
 * passes old_valid_dev()). */
4218 static inline dev_t ll_compat_encode_dev(dev_t dev)
4220 /* The compat_sys_*stat*() syscalls will fail unless the
4221 * device majors and minors are both less than 256. Note that
4222 * the value returned here will be passed through
4223 * old_encode_dev() in cp_compat_stat(). And so we are not
4224 * trying to return a valid compat (u16) device number, just
4225 * one that will pass the old_valid_dev() check. */
4227 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/* .getattr: revalidate with the MDT, glimpse file size from OSTs for
 * regular files (skipped during HSM restore), merge striped-dir attrs,
 * then fill *stat. Two signatures depending on kernel API. */
4230 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4231 int ll_getattr(const struct path *path, struct kstat *stat,
4232 u32 request_mask, unsigned int flags)
4234 struct dentry *de = path->dentry;
4236 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4239 struct inode *inode = de->d_inode;
4240 struct ll_sb_info *sbi = ll_i2sbi(inode);
4241 struct ll_inode_info *lli = ll_i2info(inode);
4244 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4246 rc = ll_inode_revalidate(de, IT_GETATTR);
4250 if (S_ISREG(inode->i_mode)) {
4251 /* In case of restore, the MDT has the right size and has
4252 * already sent it back without granting the layout lock,
4253 * inode is up-to-date so glimpse is useless.
4254 * Also to glimpse we need the layout, in case of a running
4255 * restore the MDT holds the layout lock so the glimpse will
4256 * block up to the end of restore (getattr will block)
4258 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4259 rc = ll_glimpse_size(inode);
4264 /* If object isn't a regular file then don't validate size. */
4265 if (S_ISDIR(inode->i_mode) &&
4266 lli->lli_lsm_md != NULL) {
/* striped directory: fold per-stripe attributes into the inode */
4267 rc = ll_merge_md_attr(inode);
4272 LTIME_S(inode->i_atime) = lli->lli_atime;
4273 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4274 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* fault-injection hook for testing delayed getattr */
4277 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4279 if (ll_need_32bit_api(sbi)) {
/* 32-bit clients: build a 32-bit-safe inode number and squashed dev */
4280 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4281 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4282 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4284 stat->ino = inode->i_ino;
4285 stat->dev = inode->i_sb->s_dev;
4286 stat->rdev = inode->i_rdev;
4289 stat->mode = inode->i_mode;
4290 stat->uid = inode->i_uid;
4291 stat->gid = inode->i_gid;
4292 stat->atime = inode->i_atime;
4293 stat->mtime = inode->i_mtime;
4294 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when the admin configured one */
4295 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4297 stat->nlink = inode->i_nlink;
4298 stat->size = i_size_read(inode);
4299 stat->blocks = inode->i_blocks;
/* .fiemap: marshal the kernel fiemap_extent_info into a struct fiemap,
 * run ll_do_fiemap(), and copy mapped extents back to userspace.
 * NOTE(review): only the first user extent is copied in -- this appears
 * intentional (the FIEMAP ABI passes continuation state in extent[0]),
 * but confirm against the full source. */
4304 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4305 __u64 start, __u64 len)
4309 struct fiemap *fiemap;
4310 unsigned int extent_count = fieinfo->fi_extents_max;
4312 num_bytes = sizeof(*fiemap) + (extent_count *
4313 sizeof(struct fiemap_extent));
4314 OBD_ALLOC_LARGE(fiemap, num_bytes);
4319 fiemap->fm_flags = fieinfo->fi_flags;
4320 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4321 fiemap->fm_start = start;
4322 fiemap->fm_length = len;
4323 if (extent_count > 0 &&
4324 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4325 sizeof(struct fiemap_extent)) != 0)
4326 GOTO(out, rc = -EFAULT);
4328 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4330 fieinfo->fi_flags = fiemap->fm_flags;
4331 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4332 if (extent_count > 0 &&
4333 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4334 fiemap->fm_mapped_extents *
4335 sizeof(struct fiemap_extent)) != 0)
4336 GOTO(out, rc = -EFAULT);
4338 OBD_FREE_LARGE(fiemap, num_bytes);
/* .get_acl: return a referenced copy of the cached POSIX ACL under
 * lli_lock. The VFS caller drops the reference (see comment below). */
4342 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4344 struct ll_inode_info *lli = ll_i2info(inode);
4345 struct posix_acl *acl = NULL;
4348 spin_lock(&lli->lli_lock);
4349 /* VFS' acl_permission_check->check_acl will release the refcount */
4350 acl = posix_acl_dup(lli->lli_posix_acl);
4351 spin_unlock(&lli->lli_lock);
/* .set_acl: serialize @acl to xattr form and store it via __vfs_setxattr
 * (access ACLs also update i_mode through posix_acl_update_mode); update
 * or drop the local ACL cache accordingly. Default ACLs only apply to
 * directories. Compiled only when the kernel has inode_operations.set_acl. */
4356 #ifdef HAVE_IOP_SET_ACL
4357 #ifdef CONFIG_FS_POSIX_ACL
4358 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4360 const char *name = NULL;
4367 case ACL_TYPE_ACCESS:
/* may clear @acl if the ACL collapses into plain mode bits */
4369 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4373 name = XATTR_NAME_POSIX_ACL_ACCESS;
4375 case ACL_TYPE_DEFAULT:
4376 if (!S_ISDIR(inode->i_mode))
4377 GOTO(out, rc = acl ? -EACCES : 0);
4378 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4381 GOTO(out, rc = -EINVAL);
4385 size = posix_acl_xattr_size(acl->a_count);
4386 value = kmalloc(size, GFP_NOFS);
4388 GOTO(out, rc = -ENOMEM);
4390 rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
4395 /* dentry is only used for *.lov attributes so it's safe to be NULL */
4396 rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
4401 set_cached_acl(inode, type, acl);
4403 forget_cached_acl(inode, type);
4406 #endif /* CONFIG_FS_POSIX_ACL */
4407 #endif /* HAVE_IOP_SET_ACL */
/* ACL hook for generic_permission() on older kernels (pre 2-arg API):
 * fetch the access ACL and evaluate it with posix_acl_permission().
 * Under RCU walk (IPERM_FLAG_RCU) it presumably bails out early --
 * the return on that path is not visible in this extraction. */
4409 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4411 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4412 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4414 ll_check_acl(struct inode *inode, int mask)
4417 # ifdef CONFIG_FS_POSIX_ACL
4418 struct posix_acl *acl;
4422 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4423 if (flags & IPERM_FLAG_RCU)
4426 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4431 rc = posix_acl_permission(inode, acl, mask);
4432 posix_acl_release(acl);
4435 # else /* !CONFIG_FS_POSIX_ACL */
4437 # endif /* CONFIG_FS_POSIX_ACL */
4439 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/* .permission: revalidate the root inode on first touch, apply root
 * squash (temporarily overriding fsuid/fsgid and dropping FS
 * capabilities), then defer to generic permission checking. Multiple
 * signatures depending on kernel API generation. */
4441 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4442 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4444 # ifdef HAVE_INODE_PERMISION_2ARGS
4445 int ll_inode_permission(struct inode *inode, int mask)
4447 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4452 struct ll_sb_info *sbi;
4453 struct root_squash_info *squash;
4454 struct cred *cred = NULL;
4455 const struct cred *old_cred = NULL;
4457 bool squash_id = false;
/* cannot block (revalidate RPC) during RCU path walk */
4460 #ifdef MAY_NOT_BLOCK
4461 if (mask & MAY_NOT_BLOCK)
4463 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4464 if (flags & IPERM_FLAG_RCU)
4468 /* as the root inode is NOT validated by lookup, do it
4469 * here before the permission check. */
4471 if (inode == inode->i_sb->s_root->d_inode) {
4472 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4477 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4478 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4480 /* squash fsuid/fsgid if needed */
4481 sbi = ll_i2sbi(inode);
4482 squash = &sbi->ll_squash;
4483 if (unlikely(squash->rsi_uid != 0 &&
4484 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4485 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4489 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4490 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4491 squash->rsi_uid, squash->rsi_gid);
4493 /* update current process's credentials
4494 * and FS capability */
4495 cred = prepare_creds();
4499 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4500 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop every filesystem-related capability from the squashed creds */
4501 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4502 if ((1 << cap) & CFS_CAP_FS_MASK)
4503 cap_lower(cred->cap_effective, cap);
4505 old_cred = override_creds(cred);
4508 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4509 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4510 /* restore current process's credentials and FS capability */
4512 revert_creds(old_cred);
/* Default file_operations ("-o localflock"): no .flock/.lock entries, so
 * flock falls back to the VFS's locally-consistent implementation. */
4519 /* -o localflock - only provides locally consistent flock locks */
4520 struct file_operations ll_file_operations = {
4521 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4522 # ifdef HAVE_SYNC_READ_WRITE
4523 .read = new_sync_read,
4524 .write = new_sync_write,
4526 .read_iter = ll_file_read_iter,
4527 .write_iter = ll_file_write_iter,
4528 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4529 .read = ll_file_read,
4530 .aio_read = ll_file_aio_read,
4531 .write = ll_file_write,
4532 .aio_write = ll_file_aio_write,
4533 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4534 .unlocked_ioctl = ll_file_ioctl,
4535 .open = ll_file_open,
4536 .release = ll_file_release,
4537 .mmap = ll_file_mmap,
4538 .llseek = ll_file_seek,
4539 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock": identical to the default table but
 * routes .flock and .lock through ll_file_flock for cluster-wide
 * consistent advisory locking. */
4544 struct file_operations ll_file_operations_flock = {
4545 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4546 # ifdef HAVE_SYNC_READ_WRITE
4547 .read = new_sync_read,
4548 .write = new_sync_write,
4549 # endif /* HAVE_SYNC_READ_WRITE */
4550 .read_iter = ll_file_read_iter,
4551 .write_iter = ll_file_write_iter,
4552 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4553 .read = ll_file_read,
4554 .aio_read = ll_file_aio_read,
4555 .write = ll_file_write,
4556 .aio_write = ll_file_aio_write,
4557 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4558 .unlocked_ioctl = ll_file_ioctl,
4559 .open = ll_file_open,
4560 .release = ll_file_release,
4561 .mmap = ll_file_mmap,
4562 .llseek = ll_file_seek,
4563 .splice_read = ll_file_splice_read,
4566 .flock = ll_file_flock,
4567 .lock = ll_file_flock
/* file_operations for "-o noflock": .flock/.lock wired to
 * ll_file_noflock so lock requests fail rather than silently being
 * locally scoped. */
4570 /* These are for -o noflock - to return ENOSYS on flock calls */
4571 struct file_operations ll_file_operations_noflock = {
4572 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4573 # ifdef HAVE_SYNC_READ_WRITE
4574 .read = new_sync_read,
4575 .write = new_sync_write,
4576 # endif /* HAVE_SYNC_READ_WRITE */
4577 .read_iter = ll_file_read_iter,
4578 .write_iter = ll_file_write_iter,
4579 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4580 .read = ll_file_read,
4581 .aio_read = ll_file_aio_read,
4582 .write = ll_file_write,
4583 .aio_write = ll_file_aio_write,
4584 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4585 .unlocked_ioctl = ll_file_ioctl,
4586 .open = ll_file_open,
4587 .release = ll_file_release,
4588 .mmap = ll_file_mmap,
4589 .llseek = ll_file_seek,
4590 .splice_read = ll_file_splice_read,
4593 .flock = ll_file_noflock,
4594 .lock = ll_file_noflock
/* inode_operations for regular files; xattr/get_acl/set_acl entries are
 * compiled in only when the running kernel exposes those hooks. */
4597 struct inode_operations ll_file_inode_operations = {
4598 .setattr = ll_setattr,
4599 .getattr = ll_getattr,
4600 .permission = ll_inode_permission,
4601 #ifdef HAVE_IOP_XATTR
4602 .setxattr = ll_setxattr,
4603 .getxattr = ll_getxattr,
4604 .removexattr = ll_removexattr,
4606 .listxattr = ll_listxattr,
4607 .fiemap = ll_fiemap,
4608 #ifdef HAVE_IOP_GET_ACL
4609 .get_acl = ll_get_acl,
4611 #ifdef HAVE_IOP_SET_ACL
4612 .set_acl = ll_set_acl,
/* Push a layout configuration into the cl_object stack. For
 * OBJECT_CONF_SET, also allow DLM lock matching (only safe once the
 * layout is applied) and record the new layout generation. */
4616 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4618 struct ll_inode_info *lli = ll_i2info(inode);
4619 struct cl_object *obj = lli->lli_clob;
4628 env = cl_env_get(&refcheck);
4630 RETURN(PTR_ERR(env));
4632 rc = cl_conf_set(env, lli->lli_clob, conf);
4636 if (conf->coc_opc == OBJECT_CONF_SET) {
4637 struct ldlm_lock *lock = conf->coc_lock;
4638 struct cl_layout cl = {
4642 LASSERT(lock != NULL);
4643 LASSERT(ldlm_has_layout(lock));
4645 /* it can only be allowed to match after layout is
4646 * applied to inode otherwise false layout would be
4647 * seen. Applying layout should happen before dropping
4648 * the intent lock. */
4649 ldlm_lock_allow_match(lock);
4651 rc = cl_object_layout_get(env, obj, &cl);
4656 DFID": layout version change: %u -> %u\n",
4657 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4659 ll_layout_version_set(lli, cl.cl_layout_gen);
4663 cl_env_put(env, &refcheck);
/* Fetch the file layout via a getxattr(XATTR_NAME_LOV) RPC and attach it
 * to the DLM lock's LVB, for locks granted via completion AST whose LVB
 * buffer was too small to carry the layout. No-op if l_lvb_data is set. */
4668 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4669 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4672 struct ll_sb_info *sbi = ll_i2sbi(inode);
4673 struct ptlrpc_request *req;
4674 struct mdt_body *body;
4681 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4682 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4683 lock->l_lvb_data, lock->l_lvb_len);
4685 if (lock->l_lvb_data != NULL)
4688 /* if layout lock was granted right away, the layout is returned
4689 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4690 * blocked and then granted via completion ast, we have to fetch
4691 * layout here. Please note that we can't use the LVB buffer in
4692 * completion AST because it doesn't have a large enough buffer */
4693 rc = ll_get_default_mdsize(sbi, &lmmsize);
4695 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4696 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4701 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4703 GOTO(out, rc = -EPROTO);
4705 lmmsize = body->mbo_eadatasize;
4706 if (lmmsize == 0) /* empty layout */
4709 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4711 GOTO(out, rc = -EFAULT);
/* copy out of the RPC buffer; the lock may outlive the request */
4713 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4714 if (lvbdata == NULL)
4715 GOTO(out, rc = -ENOMEM);
4717 memcpy(lvbdata, lmm, lmmsize);
4718 lock_res_and_lock(lock);
/* re-check under the lock: another thread may have attached an LVB */
4719 if (unlikely(lock->l_lvb_data == NULL)) {
4720 lock->l_lvb_type = LVB_T_LAYOUT;
4721 lock->l_lvb_data = lvbdata;
4722 lock->l_lvb_len = lmmsize;
4725 unlock_res_and_lock(lock);
4728 OBD_FREE_LARGE(lvbdata, lmmsize);
4733 ptlrpc_req_finished(req);
/*
 * Apply the layout to the inode. Layout lock is held and will be released
 * before this function returns (decref'ed below after ll_layout_conf).
 * If the new layout cannot be applied because the object is busy (-EBUSY),
 * issue an OBJECT_CONF_WAIT to block until outstanding IO drains.
 */
4738 * Apply the layout to the inode. Layout lock is held and will be released
4741 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4742 struct inode *inode)
4744 struct ll_inode_info *lli = ll_i2info(inode);
4745 struct ll_sb_info *sbi = ll_i2sbi(inode);
4746 struct ldlm_lock *lock;
4747 struct cl_object_conf conf;
4750 bool wait_layout = false;
4753 LASSERT(lustre_handle_is_used(lockh));
4755 lock = ldlm_handle2lock(lockh);
4756 LASSERT(lock != NULL);
4757 LASSERT(ldlm_has_layout(lock));
4759 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4760 PFID(&lli->lli_fid), inode);
4762 /* in case this is a caching lock and reinstate with new inode */
4763 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4765 lock_res_and_lock(lock);
4766 lvb_ready = ldlm_is_lvb_ready(lock);
4767 unlock_res_and_lock(lock);
4769 /* checking lvb_ready is racy but this is okay. The worst case is
4770 * that multi processes may configure the file on the same time. */
4774 rc = ll_layout_fetch(inode, lock);
4778 /* for layout lock, lmm is stored in lock's lvb.
4779 * lvb_data is immutable if the lock is held so it's safe to access it
4782 * set layout to file. Unlikely this will fail as old layout was
4783 * surely eliminated */
4784 memset(&conf, 0, sizeof conf);
4785 conf.coc_opc = OBJECT_CONF_SET;
4786 conf.coc_inode = inode;
4787 conf.coc_lock = lock;
4788 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4789 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4790 rc = ll_layout_conf(inode, &conf);
4792 /* refresh layout failed, need to wait */
4793 wait_layout = rc == -EBUSY;
4796 LDLM_LOCK_PUT(lock);
4797 ldlm_lock_decref(lockh, mode);
4799 /* wait for IO to complete if it's still being used. */
4801 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4802 ll_get_fsname(inode->i_sb, NULL, 0),
4803 PFID(&lli->lli_fid), inode);
4805 memset(&conf, 0, sizeof conf);
4806 conf.coc_opc = OBJECT_CONF_WAIT;
4807 conf.coc_inode = inode;
4808 rc = ll_layout_conf(inode, &conf);
4812 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4813 ll_get_fsname(inode->i_sb, NULL, 0),
4814 PFID(&lli->lli_fid), rc);
4820 * Issue layout intent RPC to MDS.
4821 * \param inode [in] file inode
4822 * \param intent [in] layout intent
4824 * \retval 0 on success
4825 * \retval < 0 error code
/* Send an IT_LAYOUT intent RPC to the MDS describing the intended layout
 * operation; write/truncate intents request FMODE_WRITE. The resulting
 * lock (if any) has its data set on the inode and is then dropped. */
4827 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4829 struct ll_inode_info *lli = ll_i2info(inode);
4830 struct ll_sb_info *sbi = ll_i2sbi(inode);
4831 struct md_op_data *op_data;
4832 struct lookup_intent it;
4833 struct ptlrpc_request *req;
4837 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4838 0, 0, LUSTRE_OPC_ANY, NULL);
4839 if (IS_ERR(op_data))
4840 RETURN(PTR_ERR(op_data));
/* the intent itself rides along as op_data payload */
4842 op_data->op_data = intent;
4843 op_data->op_data_size = sizeof(*intent);
4845 memset(&it, 0, sizeof(it));
4846 it.it_op = IT_LAYOUT;
4847 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4848 intent->li_opc == LAYOUT_INTENT_TRUNC)
4849 it.it_flags = FMODE_WRITE;
4851 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4852 ll_get_fsname(inode->i_sb, NULL, 0),
4853 PFID(&lli->lli_fid), inode);
4855 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4856 &ll_md_blocking_ast, 0);
4857 if (it.it_request != NULL)
4858 ptlrpc_req_finished(it.it_request);
4859 it.it_request = NULL;
4861 ll_finish_md_op_data(op_data);
4863 /* set lock data in case this is a new lock */
4865 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4867 ll_intent_drop_lock(&it);
4873 * This function checks if there exists a LAYOUT lock on the client side,
4874 * or enqueues it if it doesn't have one in cache.
4876 * This function will not hold layout lock so it may be revoked any time after
4877 * this function returns. Any operations that depend on the layout should be redone
4880 * This function should be called before lov_io_init() to get an uptodate
4881 * layout version, the caller should save the version number and after IO
4882 * is finished, this function should be called again to verify that layout
4883 * is not changed during IO time.
/* Ensure an up-to-date layout is applied to @inode and return its
 * generation in *gen. Fast path: generation already valid (or layout
 * locks disabled). Otherwise, under lli_layout_mutex, try to reuse a
 * cached LAYOUT lock, falling back to a LAYOUT_INTENT_ACCESS RPC. */
4885 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4887 struct ll_inode_info *lli = ll_i2info(inode);
4888 struct ll_sb_info *sbi = ll_i2sbi(inode);
4889 struct lustre_handle lockh;
4890 struct layout_intent intent = {
4891 .li_opc = LAYOUT_INTENT_ACCESS,
4893 enum ldlm_mode mode;
4897 *gen = ll_layout_version_get(lli);
4898 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4902 LASSERT(fid_is_sane(ll_inode2fid(inode)));
4903 LASSERT(S_ISREG(inode->i_mode));
4905 /* take layout lock mutex to enqueue layout lock exclusively. */
4906 mutex_lock(&lli->lli_layout_mutex);
4909 /* mostly layout lock is caching on the local side, so try to
4910 * match it before grabbing layout lock mutex. */
4911 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
4912 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
4913 if (mode != 0) { /* hit cached lock */
/* ll_layout_lock_set applies the layout and releases the reference */
4914 rc = ll_layout_lock_set(&lockh, mode, inode);
4920 rc = ll_layout_intent(inode, &intent);
4926 *gen = ll_layout_version_get(lli);
4927 mutex_unlock(&lli->lli_layout_mutex);
4933 * Issue layout intent RPC indicating where in a file an IO is about to write.
4935 * \param[in] inode file inode.
4936 * \param[in] ext write range with start offset of file in bytes where
4937 * an IO is about to write, and exclusive end offset in
4940 * \retval 0 on success
4941 * \retval < 0 error code
/* Notify the MDS, via a layout intent, of the byte range [e_start,
 * e_end) an IO is about to write (e.g. to instantiate PFL components).
 * Thin wrapper around ll_layout_intent(). */
4943 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
4944 struct lu_extent *ext)
4946 struct layout_intent intent = {
4948 .li_extent.e_start = ext->e_start,
4949 .li_extent.e_end = ext->e_end,
4954 rc = ll_layout_intent(inode, &intent);
4960 * This function send a restore request to the MDT
4962 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
4964 struct hsm_user_request *hur;
4968 len = sizeof(struct hsm_user_request) +
4969 sizeof(struct hsm_user_item);
4970 OBD_ALLOC(hur, len);
4974 hur->hur_request.hr_action = HUA_RESTORE;
4975 hur->hur_request.hr_archive_id = 0;
4976 hur->hur_request.hr_flags = 0;
4977 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
4978 sizeof(hur->hur_user_item[0].hui_fid));
4979 hur->hur_user_item[0].hui_extent.offset = offset;
4980 hur->hur_user_item[0].hui_extent.length = length;
4981 hur->hur_request.hr_itemcount = 1;
4982 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,