lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         op_data->op_handle = och->och_fh;
 108
 109         if (och->och_flags & FMODE_WRITE &&
 110             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 111                 /* For HSM: if inode data has been modified, pack it so that
 112                  * MDT can set data dirty flag in the archive. */
 113                 op_data->op_bias |= MDS_DATA_MODIFIED;
 114
 115         EXIT;
 116 }
 117
 118 /**
 119  * Perform a close, possibly with a bias.
 120  * The meaning of "data" depends on the value of "bias".
 121  *
 122  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 123  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 124  * swap layouts with.
 125  */
 126 static int ll_close_inode_openhandle(struct inode *inode,
 127                                      struct obd_client_handle *och,
 128                                      enum mds_op_bias bias, void *data)
 129 {
 130         struct obd_export *md_exp = ll_i2mdexp(inode);
 131         const struct ll_inode_info *lli = ll_i2info(inode);
 132         struct md_op_data *op_data;
 133         struct ptlrpc_request *req = NULL;
 134         int rc;
 135         ENTRY;
 136
 137         if (class_exp2obd(md_exp) == NULL) {
 138                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 139                        ll_get_fsname(inode->i_sb, NULL, 0),
 140                        PFID(&lli->lli_fid));
 141                 GOTO(out, rc = 0);
 142         }
 143
 144         OBD_ALLOC_PTR(op_data);
 145         /* We leak openhandle and request here on error, but not much to be
 146          * done in OOM case since app won't retry close on error either. */
 147         if (op_data == NULL)
 148                 GOTO(out, rc = -ENOMEM);
 149
 150         ll_prepare_close(inode, op_data, och);
 151         switch (bias) {
 152         case MDS_CLOSE_LAYOUT_MERGE:
 153                 /* merge blocks from the victim inode */
 154                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 155                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 156         case MDS_CLOSE_LAYOUT_SPLIT:
 157         case MDS_CLOSE_LAYOUT_SWAP: {
 158                 struct split_param *sp = data;
 159
 160                 LASSERT(data != NULL);
 161                 op_data->op_bias |= bias;
 162                 op_data->op_data_version = 0;
 163                 op_data->op_lease_handle = och->och_lease_handle;
 164                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 165                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 166                         op_data->op_mirror_id = sp->sp_mirror_id;
 167                 } else {
 168                         op_data->op_fid2 = *ll_inode2fid(data);
 169                 }
 170                 break;
 171         }
 172
 173         case MDS_CLOSE_RESYNC_DONE: {
 174                 struct ll_ioc_lease *ioc = data;
 175
 176                 LASSERT(data != NULL);
 177                 op_data->op_attr_blocks +=
 178                         ioc->lil_count * op_data->op_attr_blocks;
 179                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 180                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 181
 182                 op_data->op_lease_handle = och->och_lease_handle;
 183                 op_data->op_data = &ioc->lil_ids[0];
 184                 op_data->op_data_size =
 185                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 186                 break;
 187         }
 188
 189         case MDS_HSM_RELEASE:
 190                 LASSERT(data != NULL);
 191                 op_data->op_bias |= MDS_HSM_RELEASE;
 192                 op_data->op_data_version = *(__u64 *)data;
 193                 op_data->op_lease_handle = och->och_lease_handle;
 194                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 195                 break;
 196
 197         default:
 198                 LASSERT(data == NULL);
 199                 break;
 200         }
 201
 202         rc = md_close(md_exp, op_data, och->och_mod, &req);
 203         if (rc != 0 && rc != -EINTR)
 204                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 205                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 206
 207         if (rc == 0 && op_data->op_bias & bias) {
 208                 struct mdt_body *body;
 209
 210                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 211                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 212                         rc = -EBUSY;
 213         }
 214
 215         ll_finish_md_op_data(op_data);
 216         EXIT;
 217 out:
 218
 219         md_clear_open_replay_data(md_exp, och);
 220         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 221         OBD_FREE_PTR(och);
 222
 223         ptlrpc_req_finished(req);       /* This is close request */
 224         return rc;
 225 }
 226
 227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 228 {
 229         struct ll_inode_info *lli = ll_i2info(inode);
 230         struct obd_client_handle **och_p;
 231         struct obd_client_handle *och;
 232         __u64 *och_usecount;
 233         int rc = 0;
 234         ENTRY;
 235
 236         if (fmode & FMODE_WRITE) {
 237                 och_p = &lli->lli_mds_write_och;
 238                 och_usecount = &lli->lli_open_fd_write_count;
 239         } else if (fmode & FMODE_EXEC) {
 240                 och_p = &lli->lli_mds_exec_och;
 241                 och_usecount = &lli->lli_open_fd_exec_count;
 242         } else {
 243                 LASSERT(fmode & FMODE_READ);
 244                 och_p = &lli->lli_mds_read_och;
 245                 och_usecount = &lli->lli_open_fd_read_count;
 246         }
 247
 248         mutex_lock(&lli->lli_och_mutex);
 249         if (*och_usecount > 0) {
 250                 /* There are still users of this handle, so skip
 251                  * freeing it. */
 252                 mutex_unlock(&lli->lli_och_mutex);
 253                 RETURN(0);
 254         }
 255
 256         och = *och_p;
 257         *och_p = NULL;
 258         mutex_unlock(&lli->lli_och_mutex);
 259
 260         if (och != NULL) {
 261                 /* There might be a race and this handle may already
 262                  * be closed. */
 263                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 264         }
 265
 266         RETURN(rc);
 267 }
 268
 269 static int ll_md_close(struct inode *inode, struct file *file)
 270 {
 271         union ldlm_policy_data policy = {
 272                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 273         };
 274         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 275         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 276         struct ll_inode_info *lli = ll_i2info(inode);
 277         struct lustre_handle lockh;
 278         enum ldlm_mode lockmode;
 279         int rc = 0;
 280         ENTRY;
 281
 282         /* clear group lock, if present */
 283         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 284                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 285
 286         if (fd->fd_lease_och != NULL) {
 287                 bool lease_broken;
 288
 289                 /* Usually the lease is not released when the
 290                  * application crashed, we need to release here. */
 291                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 292                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 293                         PFID(&lli->lli_fid), rc, lease_broken);
 294
 295                 fd->fd_lease_och = NULL;
 296         }
 297
 298         if (fd->fd_och != NULL) {
 299                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 300                 fd->fd_och = NULL;
 301                 GOTO(out, rc);
 302         }
 303
 304         /* Let's see if we have good enough OPEN lock on the file and if
 305            we can skip talking to MDS */
 306         mutex_lock(&lli->lli_och_mutex);
 307         if (fd->fd_omode & FMODE_WRITE) {
 308                 lockmode = LCK_CW;
 309                 LASSERT(lli->lli_open_fd_write_count);
 310                 lli->lli_open_fd_write_count--;
 311         } else if (fd->fd_omode & FMODE_EXEC) {
 312                 lockmode = LCK_PR;
 313                 LASSERT(lli->lli_open_fd_exec_count);
 314                 lli->lli_open_fd_exec_count--;
 315         } else {
 316                 lockmode = LCK_CR;
 317                 LASSERT(lli->lli_open_fd_read_count);
 318                 lli->lli_open_fd_read_count--;
 319         }
 320         mutex_unlock(&lli->lli_och_mutex);
 321
 322         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 323                            LDLM_IBITS, &policy, lockmode, &lockh))
 324                 rc = ll_md_real_close(inode, fd->fd_omode);
 325
 326 out:
 327         LUSTRE_FPRIVATE(file) = NULL;
 328         ll_file_data_put(fd);
 329
 330         RETURN(rc);
 331 }
 332
 333 /* While this returns an error code, fput() the caller does not, so we need
 334  * to make every effort to clean up all of our state here.  Also, applications
 335  * rarely check close errors and even if an error is returned they will not
 336  * re-try the close call.
 337  */
 338 int ll_file_release(struct inode *inode, struct file *file)
 339 {
 340         struct ll_file_data *fd;
 341         struct ll_sb_info *sbi = ll_i2sbi(inode);
 342         struct ll_inode_info *lli = ll_i2info(inode);
 343         int rc;
 344         ENTRY;
 345
 346         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 347                PFID(ll_inode2fid(inode)), inode);
 348
 349         if (inode->i_sb->s_root != file_dentry(file))
 350                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 351         fd = LUSTRE_FPRIVATE(file);
 352         LASSERT(fd != NULL);
 353
 354         /* The last ref on @file, maybe not the the owner pid of statahead,
 355          * because parent and child process can share the same file handle. */
 356         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 357                 ll_deauthorize_statahead(inode, fd);
 358
 359         if (inode->i_sb->s_root == file_dentry(file)) {
 360                 LUSTRE_FPRIVATE(file) = NULL;
 361                 ll_file_data_put(fd);
 362                 RETURN(0);
 363         }
 364
 365         if (!S_ISDIR(inode->i_mode)) {
 366                 if (lli->lli_clob != NULL)
 367                         lov_read_and_clear_async_rc(lli->lli_clob);
 368                 lli->lli_async_rc = 0;
 369         }
 370
 371         rc = ll_md_close(inode, file);
 372
 373         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 374                 libcfs_debug_dumplog();
 375
 376         RETURN(rc);
 377 }
 378
 379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 380                                 struct lookup_intent *itp)
 381 {
 382         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 383         struct dentry *parent = de->d_parent;
 384         const char *name = NULL;
 385         int len = 0;
 386         struct md_op_data *op_data;
 387         struct ptlrpc_request *req = NULL;
 388         int rc;
 389         ENTRY;
 390
 391         LASSERT(parent != NULL);
 392         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 393
 394         /* if server supports open-by-fid, or file name is invalid, don't pack
 395          * name in open request */
 396         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 397             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 398                 name = de->d_name.name;
 399                 len = de->d_name.len;
 400         }
 401
 402         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 403                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 404         if (IS_ERR(op_data))
 405                 RETURN(PTR_ERR(op_data));
 406         op_data->op_data = lmm;
 407         op_data->op_data_size = lmmsize;
 408
 409         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 410                             &ll_md_blocking_ast, 0);
 411         ll_finish_md_op_data(op_data);
 412         if (rc == -ESTALE) {
 413                 /* reason for keep own exit path - don`t flood log
 414                  * with messages with -ESTALE errors.
 415                  */
 416                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 417                      it_open_error(DISP_OPEN_OPEN, itp))
 418                         GOTO(out, rc);
 419                 ll_release_openhandle(de, itp);
 420                 GOTO(out, rc);
 421         }
 422
 423         if (it_disposition(itp, DISP_LOOKUP_NEG))
 424                 GOTO(out, rc = -ENOENT);
 425
 426         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 427                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 428                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 429                 GOTO(out, rc);
 430         }
 431
 432         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 433         if (!rc && itp->it_lock_mode)
 434                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 435
 436 out:
 437         ptlrpc_req_finished(req);
 438         ll_intent_drop_lock(itp);
 439
 440         /* We did open by fid, but by the time we got to the server,
 441          * the object disappeared. If this is a create, we cannot really
 442          * tell the userspace that the file it was trying to create
 443          * does not exist. Instead let's return -ESTALE, and the VFS will
 444          * retry the create with LOOKUP_REVAL that we are going to catch
 445          * in ll_revalidate_dentry() and use lookup then.
 446          */
 447         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 448                 rc = -ESTALE;
 449
 450         RETURN(rc);
 451 }
 452
 453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 454                        struct obd_client_handle *och)
 455 {
 456         struct mdt_body *body;
 457
 458         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 459         och->och_fh = body->mbo_handle;
 460         och->och_fid = body->mbo_fid1;
 461         och->och_lease_handle.cookie = it->it_lock_handle;
 462         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 463         och->och_flags = it->it_flags;
 464
 465         return md_set_open_replay_data(md_exp, och, it);
 466 }
 467
 468 static int ll_local_open(struct file *file, struct lookup_intent *it,
 469                          struct ll_file_data *fd, struct obd_client_handle *och)
 470 {
 471         struct inode *inode = file_inode(file);
 472         ENTRY;
 473
 474         LASSERT(!LUSTRE_FPRIVATE(file));
 475
 476         LASSERT(fd != NULL);
 477
 478         if (och) {
 479                 int rc;
 480
 481                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 482                 if (rc != 0)
 483                         RETURN(rc);
 484         }
 485
 486         LUSTRE_FPRIVATE(file) = fd;
 487         ll_readahead_init(inode, &fd->fd_ras);
 488         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 489
 490         /* ll_cl_context initialize */
 491         rwlock_init(&fd->fd_lock);
 492         INIT_LIST_HEAD(&fd->fd_lccs);
 493
 494         RETURN(0);
 495 }
 496
  497 /* Open a file, and (for the very first open) create objects on the OSTs at
  498  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
  499  * creation or open until ll_lov_setstripe() ioctl is called.
  500  *
  501  * If we already have the stripe MD locally then we don't request it in
  502  * md_open(), by passing a lmm_size = 0.
  503  *
  504  * It is up to the application to ensure no other processes open this file
  505  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
  506  * used.  We might be able to avoid races of that sort by getting lli_open_sem
  507  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
  508  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
      *
      * On entry file->private_data may carry a lookup intent left by an earlier
      * lookup; it is taken over here and private_data is reset.  On success a
      * freshly allocated ll_file_data is attached to the file instead.
      *
      * Returns 0 on success, -ENOMEM if the ll_file_data or the open handle
      * cannot be allocated, or the error reported by the open intent,
      * ll_intent_file_open() or ll_local_open().
  509  */
  510 int ll_file_open(struct inode *inode, struct file *file)
  511 {
  512         struct ll_inode_info *lli = ll_i2info(inode);
  513         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
  514                                           .it_flags = file->f_flags };
  515         struct obd_client_handle **och_p = NULL;
  516         __u64 *och_usecount = NULL;
  517         struct ll_file_data *fd;
  518         int rc = 0;
  519         ENTRY;
  520
  521         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
  522                PFID(ll_inode2fid(inode)), inode, file->f_flags);
  523
  524         it = file->private_data; /* XXX: compat macro */
  525         file->private_data = NULL; /* prevent ll_local_open assertion */
  526
  527         fd = ll_file_data_get();
  528         if (fd == NULL)
  529                 GOTO(out_nofiledata, rc = -ENOMEM);
  530
  531         fd->fd_file = file;
  532         if (S_ISDIR(inode->i_mode))
  533                 ll_authorize_statahead(inode, fd);
  534
             /* The filesystem root only gets its ll_file_data attached; no
              * open handle is looked up or requested for it. */
  535         if (inode->i_sb->s_root == file_dentry(file)) {
  536                 LUSTRE_FPRIVATE(file) = fd;
  537                 RETURN(0);
  538         }
  539
             /* No usable intent was handed over: build one in the local
              * "oit" from the file flags. */
  540         if (!it || !it->it_disposition) {
  541                 /* Convert f_flags into access mode. We cannot use file->f_mode,
  542                  * because everything but O_ACCMODE mask was stripped from
  543                  * there */
  544                 if ((oit.it_flags + 1) & O_ACCMODE)
  545                         oit.it_flags++;
  546                 if (file->f_flags & O_TRUNC)
  547                         oit.it_flags |= FMODE_WRITE;
  548
  549                 /* kernel only call f_op->open in dentry_open.  filp_open calls
  550                  * dentry_open after call to open_namei that checks permissions.
  551                  * Only nfsd_open call dentry_open directly without checking
  552                  * permissions and because of that this code below is safe. */
  553                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
  554                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
  555
  556                 /* We do not want O_EXCL here, presumably we opened the file
  557                  * already? XXX - NFS implications? */
  558                 oit.it_flags &= ~O_EXCL;
  559
  560                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
  561                  * created if necessary, then "IT_CREAT" should be set to keep
  562                  * consistent with it */
  563                 if (oit.it_flags & O_CREAT)
  564                         oit.it_op |= IT_CREAT;
  565
  566                 it = &oit;
  567         }
  568
             /* Re-entered after ll_intent_file_open() has filled in the
              * intent, so that the handle slot is re-checked under the
              * mutex. */
  569 restart:
  570         /* Let's see if we have file open on MDS already. */
  571         if (it->it_flags & FMODE_WRITE) {
  572                 och_p = &lli->lli_mds_write_och;
  573                 och_usecount = &lli->lli_open_fd_write_count;
  574         } else if (it->it_flags & FMODE_EXEC) {
  575                 och_p = &lli->lli_mds_exec_och;
  576                 och_usecount = &lli->lli_open_fd_exec_count;
  577          } else {
  578                 och_p = &lli->lli_mds_read_och;
  579                 och_usecount = &lli->lli_open_fd_read_count;
  580         }
  581
  582         mutex_lock(&lli->lli_och_mutex);
  583         if (*och_p) { /* Open handle is present */
  584                 if (it_disposition(it, DISP_OPEN_OPEN)) {
  585                         /* Well, there's extra open request that we do not need,
  586                            let's close it somehow. This will decref request. */
  587                         rc = it_open_error(DISP_OPEN_OPEN, it);
  588                         if (rc) {
  589                                 mutex_unlock(&lli->lli_och_mutex);
  590                                 GOTO(out_openerr, rc);
  591                         }
  592
  593                         ll_release_openhandle(file_dentry(file), it);
  594                 }
                     /* share the existing handle: just count one more user */
  595                 (*och_usecount)++;
  596
  597                 rc = ll_local_open(file, it, fd, NULL);
  598                 if (rc) {
  599                         (*och_usecount)--;
  600                         mutex_unlock(&lli->lli_och_mutex);
  601                         GOTO(out_openerr, rc);
  602                 }
  603         } else {
  604                 LASSERT(*och_usecount == 0);
  605                 if (!it->it_disposition) {
  606                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
  607                         /* We cannot just request lock handle now, new ELC code
  608                            means that one of other OPEN locks for this file
  609                            could be cancelled, and since blocking ast handler
  610                            would attempt to grab och_mutex as well, that would
  611                            result in a deadlock */
  612                         mutex_unlock(&lli->lli_och_mutex);
  613                         /*
  614                          * Normally called under two situations:
  615                          * 1. NFS export.
  616                          * 2. A race/condition on MDS resulting in no open
  617                          *    handle to be returned from LOOKUP|OPEN request,
  618                          *    for example if the target entry was a symlink.
  619                          *
  620                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
  621                          *  marked by a bit set in ll_iget_for_nfs. Clear the
  622                          *  bit so that it's not confusing later callers.
  623                          *
  624                          *  NB; when ldd is NULL, it must have come via normal
  625                          *  lookup path only, since ll_iget_for_nfs always calls
  626                          *  ll_d_init().
  627                          */
  628                         if (ldd && ldd->lld_nfs_dentry) {
  629                                 ldd->lld_nfs_dentry = 0;
  630                                 it->it_flags |= MDS_OPEN_LOCK;
  631                         }
  632
  633                          /*
  634                          * Always specify MDS_OPEN_BY_FID because we don't want
  635                          * to get file with different fid.
  636                          */
  637                         it->it_flags |= MDS_OPEN_BY_FID;
  638                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
  639                                                  it);
  640                         if (rc)
  641                                 GOTO(out_openerr, rc);
  642
  643                         goto restart;
  644                 }
                     /* The intent already carries an open: allocate the
                      * per-inode handle that will record it. */
  645                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
  646                 if (!*och_p)
  647                         GOTO(out_och_free, rc = -ENOMEM);
  648
  649                 (*och_usecount)++;
  650
  651                 /* md_intent_lock() didn't get a request ref if there was an
  652                  * open error, so don't do cleanup on the request here
  653                  * (bug 3430) */
  654                 /* XXX (green): Should not we bail out on any error here, not
  655                  * just open error? */
  656                 rc = it_open_error(DISP_OPEN_OPEN, it);
  657                 if (rc != 0)
  658                         GOTO(out_och_free, rc);
  659
  660                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
  661                          "inode %p: disposition %x, status %d\n", inode,
  662                          it_disposition(it, ~0), it->it_status);
  663
  664                 rc = ll_local_open(file, it, fd, *och_p);
  665                 if (rc)
  666                         GOTO(out_och_free, rc);
  667         }
  668         mutex_unlock(&lli->lli_och_mutex);
             /* fd is attached to the file now (done by ll_local_open());
              * clearing the local keeps the cleanup code below from
              * touching it. */
  669         fd = NULL;
  670
  671         /* Must do this outside lli_och_mutex lock to prevent deadlock where
  672            different kind of OPEN lock for this same inode gets cancelled
  673            by ldlm_cancel_lru */
  674         if (!S_ISREG(inode->i_mode))
  675                 GOTO(out_och_free, rc);
  676
  677         cl_lov_delay_create_clear(&file->f_flags);
  678         GOTO(out_och_free, rc);
  679
             /* Reached with rc == 0 on success (mutex already released) or
              * with rc != 0 and lli_och_mutex still held. */
  680 out_och_free:
  681         if (rc) {
  682                 if (och_p && *och_p) {
  683                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
  684                         *och_p = NULL; /* OBD_FREE writes some magic there */
  685                         (*och_usecount)--;
  686                 }
  687                 mutex_unlock(&lli->lli_och_mutex);
  688
                     /* Error exit for paths that have already released
                      * lli_och_mutex; always entered with rc != 0. */
  689 out_openerr:
  690                 if (lli->lli_opendir_key == fd)
  691                         ll_deauthorize_statahead(inode, fd);
  692                 if (fd != NULL)
  693                         ll_file_data_put(fd);
  694         } else {
  695                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
  696         }
  697
             /* Common exit: drop the request reference still held by the
              * intent, if any. */
  698 out_nofiledata:
  699         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
  700                 ptlrpc_req_finished(it->it_request);
  701                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
  702         }
  703
  704         return rc;
  705 }
 706
 707 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 708                         struct ldlm_lock_desc *desc, void *data, int flag)
 709 {
 710         int rc;
 711         struct lustre_handle lockh;
 712         ENTRY;
 713
 714         switch (flag) {
 715         case LDLM_CB_BLOCKING:
 716                 ldlm_lock2handle(lock, &lockh);
 717                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 718                 if (rc < 0) {
 719                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 720                         RETURN(rc);
 721                 }
 722                 break;
 723         case LDLM_CB_CANCELING:
 724                 /* do nothing */
 725                 break;
 726         }
 727         RETURN(0);
 728 }
 729
 730 /**
 731  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 732  * and save it as fd->fd_och so as to force client to reopen the file even
 733  * if it has an open lock in cache already.
 734  */
 735 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 736                                 struct lustre_handle *old_handle)
 737 {
 738         struct ll_inode_info *lli = ll_i2info(inode);
 739         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 740         struct obd_client_handle **och_p;
 741         __u64 *och_usecount;
 742         int rc = 0;
 743         ENTRY;
 744
 745         /* Get the openhandle of the file */
 746         mutex_lock(&lli->lli_och_mutex);
 747         if (fd->fd_lease_och != NULL)
 748                 GOTO(out_unlock, rc = -EBUSY);
 749
 750         if (fd->fd_och == NULL) {
 751                 if (file->f_mode & FMODE_WRITE) {
 752                         LASSERT(lli->lli_mds_write_och != NULL);
 753                         och_p = &lli->lli_mds_write_och;
 754                         och_usecount = &lli->lli_open_fd_write_count;
 755                 } else {
 756                         LASSERT(lli->lli_mds_read_och != NULL);
 757                         och_p = &lli->lli_mds_read_och;
 758                         och_usecount = &lli->lli_open_fd_read_count;
 759                 }
 760
 761                 if (*och_usecount > 1)
 762                         GOTO(out_unlock, rc = -EBUSY);
 763
 764                 fd->fd_och = *och_p;
 765                 *och_usecount = 0;
 766                 *och_p = NULL;
 767         }
 768
 769         *old_handle = fd->fd_och->och_fh;
 770
 771         EXIT;
 772 out_unlock:
 773         mutex_unlock(&lli->lli_och_mutex);
 774         return rc;
 775 }
 776
 777 /**
 778  * Release ownership on lli_mds_*_och when putting back a file lease.
 779  */
 780 static int ll_lease_och_release(struct inode *inode, struct file *file)
 781 {
 782         struct ll_inode_info *lli = ll_i2info(inode);
 783         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 784         struct obd_client_handle **och_p;
 785         struct obd_client_handle *old_och = NULL;
 786         __u64 *och_usecount;
 787         int rc = 0;
 788         ENTRY;
 789
 790         mutex_lock(&lli->lli_och_mutex);
 791         if (file->f_mode & FMODE_WRITE) {
 792                 och_p = &lli->lli_mds_write_och;
 793                 och_usecount = &lli->lli_open_fd_write_count;
 794         } else {
 795                 och_p = &lli->lli_mds_read_och;
 796                 och_usecount = &lli->lli_open_fd_read_count;
 797         }
 798
 799         /* The file may have been open by another process (broken lease) so
 800          * *och_p is not NULL. In this case we should simply increase usecount
 801          * and close fd_och.
 802          */
 803         if (*och_p != NULL) {
 804                 old_och = fd->fd_och;
 805                 (*och_usecount)++;
 806         } else {
 807                 *och_p = fd->fd_och;
 808                 *och_usecount = 1;
 809         }
 810         fd->fd_och = NULL;
 811         mutex_unlock(&lli->lli_och_mutex);
 812
 813         if (old_och != NULL)
 814                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 815
 816         RETURN(rc);
 817 }
 818
 819 /**
 820  * Acquire a lease and open the file.
 821  */
 822 static struct obd_client_handle *
 823 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 824               __u64 open_flags)
 825 {
 826         struct lookup_intent it = { .it_op = IT_OPEN };
 827         struct ll_sb_info *sbi = ll_i2sbi(inode);
 828         struct md_op_data *op_data;
 829         struct ptlrpc_request *req = NULL;
 830         struct lustre_handle old_handle = { 0 };
 831         struct obd_client_handle *och = NULL;
 832         int rc;
 833         int rc2;
 834         ENTRY;
 835
 836         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 837                 RETURN(ERR_PTR(-EINVAL));
 838
 839         if (file != NULL) {
 840                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 841                         RETURN(ERR_PTR(-EPERM));
 842
 843                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 844                 if (rc)
 845                         RETURN(ERR_PTR(rc));
 846         }
 847
 848         OBD_ALLOC_PTR(och);
 849         if (och == NULL)
 850                 RETURN(ERR_PTR(-ENOMEM));
 851
 852         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 853                                         LUSTRE_OPC_ANY, NULL);
 854         if (IS_ERR(op_data))
 855                 GOTO(out, rc = PTR_ERR(op_data));
 856
 857         /* To tell the MDT this openhandle is from the same owner */
 858         op_data->op_handle = old_handle;
 859
 860         it.it_flags = fmode | open_flags;
 861         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 862         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 863                             &ll_md_blocking_lease_ast,
 864         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 865          * it can be cancelled which may mislead applications that the lease is
 866          * broken;
 867          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 868          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 869          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 870                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 871         ll_finish_md_op_data(op_data);
 872         ptlrpc_req_finished(req);
 873         if (rc < 0)
 874                 GOTO(out_release_it, rc);
 875
 876         if (it_disposition(&it, DISP_LOOKUP_NEG))
 877                 GOTO(out_release_it, rc = -ENOENT);
 878
 879         rc = it_open_error(DISP_OPEN_OPEN, &it);
 880         if (rc)
 881                 GOTO(out_release_it, rc);
 882
 883         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 884         ll_och_fill(sbi->ll_md_exp, &it, och);
 885
 886         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 887                 GOTO(out_close, rc = -EOPNOTSUPP);
 888
 889         /* already get lease, handle lease lock */
 890         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 891         if (it.it_lock_mode == 0 ||
 892             it.it_lock_bits != MDS_INODELOCK_OPEN) {
 893                 /* open lock must return for lease */
 894                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 895                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
 896                         it.it_lock_bits);
 897                 GOTO(out_close, rc = -EPROTO);
 898         }
 899
 900         ll_intent_release(&it);
 901         RETURN(och);
 902
 903 out_close:
 904         /* Cancel open lock */
 905         if (it.it_lock_mode != 0) {
 906                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 907                                             it.it_lock_mode);
 908                 it.it_lock_mode = 0;
 909                 och->och_lease_handle.cookie = 0ULL;
 910         }
 911         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
 912         if (rc2 < 0)
 913                 CERROR("%s: error closing file "DFID": %d\n",
 914                        ll_get_fsname(inode->i_sb, NULL, 0),
 915                        PFID(&ll_i2info(inode)->lli_fid), rc2);
 916         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
 917 out_release_it:
 918         ll_intent_release(&it);
 919 out:
 920         if (och != NULL)
 921                 OBD_FREE_PTR(och);
 922         RETURN(ERR_PTR(rc));
 923 }
 924
 925 /**
 926  * Check whether a layout swap can be done between two inodes.
 927  *
 928  * \param[in] inode1  First inode to check
 929  * \param[in] inode2  Second inode to check
 930  *
 931  * \retval 0 on success, layout swap can be performed between both inodes
 932  * \retval negative error code if requirements are not met
 933  */
 934 static int ll_check_swap_layouts_validity(struct inode *inode1,
 935                                           struct inode *inode2)
 936 {
 937         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 938                 return -EINVAL;
 939
 940         if (inode_permission(inode1, MAY_WRITE) ||
 941             inode_permission(inode2, MAY_WRITE))
 942                 return -EPERM;
 943
 944         if (inode1->i_sb != inode2->i_sb)
 945                 return -EXDEV;
 946
 947         return 0;
 948 }
 949
 950 static int ll_swap_layouts_close(struct obd_client_handle *och,
 951                                  struct inode *inode, struct inode *inode2)
 952 {
 953         const struct lu_fid     *fid1 = ll_inode2fid(inode);
 954         const struct lu_fid     *fid2;
 955         int                      rc;
 956         ENTRY;
 957
 958         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
 959                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
 960
 961         rc = ll_check_swap_layouts_validity(inode, inode2);
 962         if (rc < 0)
 963                 GOTO(out_free_och, rc);
 964
 965         /* We now know that inode2 is a lustre inode */
 966         fid2 = ll_inode2fid(inode2);
 967
 968         rc = lu_fid_cmp(fid1, fid2);
 969         if (rc == 0)
 970                 GOTO(out_free_och, rc = -EINVAL);
 971
 972         /* Close the file and {swap,merge} layouts between inode & inode2.
 973          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
 974          * because we still need it to pack l_remote_handle to MDT. */
 975         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
 976                                        inode2);
 977
 978         och = NULL; /* freed in ll_close_inode_openhandle() */
 979
 980 out_free_och:
 981         if (och != NULL)
 982                 OBD_FREE_PTR(och);
 983
 984         RETURN(rc);
 985 }
 986
 987 /**
 988  * Release lease and close the file.
 989  * It will check if the lease has ever broken.
 990  */
 991 static int ll_lease_close_intent(struct obd_client_handle *och,
 992                                  struct inode *inode,
 993                                  bool *lease_broken, enum mds_op_bias bias,
 994                                  void *data)
 995 {
 996         struct ldlm_lock *lock;
 997         bool cancelled = true;
 998         int rc;
 999         ENTRY;
1000
1001         lock = ldlm_handle2lock(&och->och_lease_handle);
1002         if (lock != NULL) {
1003                 lock_res_and_lock(lock);
1004                 cancelled = ldlm_is_cancel(lock);
1005                 unlock_res_and_lock(lock);
1006                 LDLM_LOCK_PUT(lock);
1007         }
1008
1009         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1010                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1011
1012         if (lease_broken != NULL)
1013                 *lease_broken = cancelled;
1014
1015         if (!cancelled && !bias)
1016                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1017
1018         if (cancelled) { /* no need to excute intent */
1019                 bias = 0;
1020                 data = NULL;
1021         }
1022
1023         rc = ll_close_inode_openhandle(inode, och, bias, data);
1024         RETURN(rc);
1025 }
1026
1027 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1028                           bool *lease_broken)
1029 {
1030         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1031 }
1032
1033 /**
1034  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1035  */
1036 static int ll_lease_file_resync(struct obd_client_handle *och,
1037                                 struct inode *inode)
1038 {
1039         struct ll_sb_info *sbi = ll_i2sbi(inode);
1040         struct md_op_data *op_data;
1041         __u64 data_version_unused;
1042         int rc;
1043         ENTRY;
1044
1045         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1046                                      LUSTRE_OPC_ANY, NULL);
1047         if (IS_ERR(op_data))
1048                 RETURN(PTR_ERR(op_data));
1049
1050         /* before starting file resync, it's necessary to clean up page cache
1051          * in client memory, otherwise once the layout version is increased,
1052          * writing back cached data will be denied the OSTs. */
1053         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1054         if (rc)
1055                 GOTO(out, rc);
1056
1057         op_data->op_handle = och->och_lease_handle;
1058         rc = md_file_resync(sbi->ll_md_exp, op_data);
1059         if (rc)
1060                 GOTO(out, rc);
1061
1062         EXIT;
1063 out:
1064         ll_finish_md_op_data(op_data);
1065         return rc;
1066 }
1067
1068 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1069 {
1070         struct ll_inode_info *lli = ll_i2info(inode);
1071         struct cl_object *obj = lli->lli_clob;
1072         struct cl_attr *attr = vvp_env_thread_attr(env);
1073         s64 atime;
1074         s64 mtime;
1075         s64 ctime;
1076         int rc = 0;
1077
1078         ENTRY;
1079
1080         ll_inode_size_lock(inode);
1081
1082         /* Merge timestamps the most recently obtained from MDS with
1083          * timestamps obtained from OSTs.
1084          *
1085          * Do not overwrite atime of inode because it may be refreshed
1086          * by file_accessed() function. If the read was served by cache
1087          * data, there is no RPC to be sent so that atime may not be
1088          * transferred to OSTs at all. MDT only updates atime at close time
1089          * if it's at least 'mdd.*.atime_diff' older.
1090          * All in all, the atime in Lustre does not strictly comply with
1091          * POSIX. Solving this problem needs to send an RPC to MDT for each
1092          * read, this will hurt performance. */
1093         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1094                 LTIME_S(inode->i_atime) = lli->lli_atime;
1095                 lli->lli_update_atime = 0;
1096         }
1097         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1098         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1099
1100         atime = LTIME_S(inode->i_atime);
1101         mtime = LTIME_S(inode->i_mtime);
1102         ctime = LTIME_S(inode->i_ctime);
1103
1104         cl_object_attr_lock(obj);
1105         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1106                 rc = -EINVAL;
1107         else
1108                 rc = cl_object_attr_get(env, obj, attr);
1109         cl_object_attr_unlock(obj);
1110
1111         if (rc != 0)
1112                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1113
1114         if (atime < attr->cat_atime)
1115                 atime = attr->cat_atime;
1116
1117         if (ctime < attr->cat_ctime)
1118                 ctime = attr->cat_ctime;
1119
1120         if (mtime < attr->cat_mtime)
1121                 mtime = attr->cat_mtime;
1122
1123         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1124                PFID(&lli->lli_fid), attr->cat_size);
1125
1126         i_size_write(inode, attr->cat_size);
1127         inode->i_blocks = attr->cat_blocks;
1128
1129         LTIME_S(inode->i_atime) = atime;
1130         LTIME_S(inode->i_mtime) = mtime;
1131         LTIME_S(inode->i_ctime) = ctime;
1132
1133 out_size_unlock:
1134         ll_inode_size_unlock(inode);
1135
1136         RETURN(rc);
1137 }
1138
1139 /**
1140  * Set designated mirror for I/O.
1141  *
1142  * So far only read, write, and truncated can support to issue I/O to
1143  * designated mirror.
1144  */
1145 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1146 {
1147         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1148
1149         /* clear layout version for generic(non-resync) I/O in case it carries
1150          * stale layout version due to I/O restart */
1151         io->ci_layout_version = 0;
1152
1153         /* FLR: disable non-delay for designated mirror I/O because obviously
1154          * only one mirror is available */
1155         if (fd->fd_designated_mirror > 0) {
1156                 io->ci_ndelay = 0;
1157                 io->ci_designated_mirror = fd->fd_designated_mirror;
1158                 io->ci_layout_version = fd->fd_layout_version;
1159                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1160                                  * io to ptasks */
1161         }
1162
1163         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1164                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1165 }
1166
1167 static bool file_is_noatime(const struct file *file)
1168 {
1169         const struct vfsmount *mnt = file->f_path.mnt;
1170         const struct inode *inode = file_inode((struct file *)file);
1171
1172         /* Adapted from file_accessed() and touch_atime().*/
1173         if (file->f_flags & O_NOATIME)
1174                 return true;
1175
1176         if (inode->i_flags & S_NOATIME)
1177                 return true;
1178
1179         if (IS_NOATIME(inode))
1180                 return true;
1181
1182         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1183                 return true;
1184
1185         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1186                 return true;
1187
1188         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1189                 return true;
1190
1191         return false;
1192 }
1193
1194 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1195
1196 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1197 {
1198         struct inode *inode = file_inode(file);
1199         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1200
1201         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1202         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1203         io->u.ci_rw.rw_file = file;
1204         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1205         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1206         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1207
1208         if (iot == CIT_WRITE) {
1209                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1210                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1211                                            file->f_flags & O_DIRECT ||
1212                                            IS_SYNC(inode));
1213         }
1214         io->ci_obj = ll_i2info(inode)->lli_clob;
1215         io->ci_lockreq = CILR_MAYBE;
1216         if (ll_file_nolock(file)) {
1217                 io->ci_lockreq = CILR_NEVER;
1218                 io->ci_no_srvlock = 1;
1219         } else if (file->f_flags & O_APPEND) {
1220                 io->ci_lockreq = CILR_MANDATORY;
1221         }
1222         io->ci_noatime = file_is_noatime(file);
1223         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1224                 io->ci_pio = !io->u.ci_rw.rw_append;
1225         else
1226                 io->ci_pio = 0;
1227
1228         /* FLR: only use non-delay I/O for read as there is only one
1229          * avaliable mirror for write. */
1230         io->ci_ndelay = !(iot == CIT_WRITE);
1231
1232         ll_io_set_mirror(io, file);
1233 }
1234
1235 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1236 {
1237         struct cl_io_pt *pt = ptask->pt_cbdata;
1238         struct file *file = pt->cip_file;
1239         struct lu_env *env;
1240         struct cl_io *io;
1241         loff_t pos = pt->cip_pos;
1242         int rc;
1243         __u16 refcheck;
1244         ENTRY;
1245
1246         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1247                 file_dentry(file)->d_name.name,
1248                 pt->cip_iot == CIT_READ ? "read" : "write",
1249                 pos, pos + pt->cip_count);
1250
1251         env = cl_env_get(&refcheck);
1252         if (IS_ERR(env))
1253                 RETURN(PTR_ERR(env));
1254
1255         io = vvp_env_thread_io(env);
1256         ll_io_init(io, file, pt->cip_iot);
1257         io->u.ci_rw.rw_iter = pt->cip_iter;
1258         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1259         io->ci_pio = 0; /* It's already in parallel task */
1260
1261         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1262                            pt->cip_count - pt->cip_result);
1263         if (!rc) {
1264                 struct vvp_io *vio = vvp_env_io(env);
1265
1266                 vio->vui_io_subtype = IO_NORMAL;
1267                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1268
1269                 ll_cl_add(file, env, io, LCC_RW);
1270                 rc = cl_io_loop(env, io);
1271                 ll_cl_remove(file, env);
1272         } else {
1273                 /* cl_io_rw_init() handled IO */
1274                 rc = io->ci_result;
1275         }
1276
1277         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1278                 if (io->ci_nob > 0)
1279                         io->ci_nob /= 2;
1280                 rc = -EIO;
1281         }
1282
1283         if (io->ci_nob > 0) {
1284                 pt->cip_result += io->ci_nob;
1285                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1286                 pos += io->ci_nob;
1287                 pt->cip_iocb.ki_pos = pos;
1288 #ifdef HAVE_KIOCB_KI_LEFT
1289                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1290 #elif defined(HAVE_KI_NBYTES)
1291                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1292 #endif
1293         }
1294
1295         cl_io_fini(env, io);
1296         cl_env_put(env, &refcheck);
1297
1298         pt->cip_need_restart = io->ci_need_restart;
1299
1300         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1301                 file_dentry(file)->d_name.name,
1302                 pt->cip_iot == CIT_READ ? "read" : "write",
1303                 pt->cip_result, rc);
1304
1305         RETURN(pt->cip_result > 0 ? 0 : rc);
1306 }
1307
/**
 * Common read/write/splice path: set up a cl_io for \a file, run it, and
 * restart it for the remaining bytes while the I/O asks for a restart.
 *
 * \param[in]     env    execution environment supplied by the caller
 * \param[in]     args   I/O arguments; the subtype selects between normal
 *                       (iov_iter + kiocb) and splice I/O
 * \param[in]     file   open file the I/O is issued on
 * \param[in]     iot    I/O type, CIT_READ or CIT_WRITE
 * \param[in,out] ppos   file position; updated to the position reached
 * \param[in]     count  number of bytes requested
 *
 * \retval number of bytes transferred if positive
 * \retval otherwise the result code of the last I/O attempt
 */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                   struct file *file, enum cl_io_type iot,
                   loff_t *ppos, size_t count)
{
        struct range_lock       range;
        struct vvp_io           *vio = vvp_env_io(env);
        struct inode            *inode = file_inode(file);
        struct ll_inode_info    *lli = ll_i2info(inode);
        struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
        struct cl_io            *io;
        loff_t                  pos = *ppos;
        ssize_t                 result = 0;
        int                     rc = 0;
        unsigned                retried = 0;
        bool                    restarted = false;

        ENTRY;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
                file_dentry(file)->d_name.name,
                iot == CIT_READ ? "read" : "write", pos, pos + count);

restart:
        /* Each attempt starts from a freshly initialized cl_io; pos, count
         * and result carry the progress made by earlier attempts. */
        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot);
        if (args->via_io_subtype == IO_NORMAL) {
                io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
                io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
        }
        /* Parallel I/O is only attempted for the first pass of normal I/O */
        if (args->via_io_subtype != IO_NORMAL || restarted)
                io->ci_pio = 0;
        io->ci_ndelay_tried = retried;

        if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
                bool range_locked = false;

                /* Appending writes cover the whole file range */
                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                else
                        range_lock_init(&range, pos, pos + count - 1);

                vio->vui_fd  = LUSTRE_FPRIVATE(file);
                vio->vui_io_subtype = args->via_io_subtype;

                switch (vio->vui_io_subtype) {
                case IO_NORMAL:
                        /* Direct IO reads must also take range lock,
                         * or multiple reads will try to work on the same pages
                         * See LU-6227 for details. */
                        if (((iot == CIT_WRITE) ||
                            (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
                            !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                                CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
                                       RL_PARA(&range));
                                rc = range_lock(&lli->lli_write_tree, &range);
                                if (rc < 0)
                                        GOTO(out, rc);

                                range_locked = true;
                        }
                        break;
                case IO_SPLICE:
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        vio->u.splice.vui_flags = args->u.splice.via_flags;
                        break;
                default:
                        CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
                        LBUG();
                }

                ll_cl_add(file, env, io, LCC_RW);
                /* Parallel writes on an inode without S_NOSEC take the inode
                 * lock here; lli_inode_locked records that it is held so it
                 * is dropped right after the I/O loop. */
                if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
                    !lli->lli_inode_locked) {
                        inode_lock(inode);
                        lli->lli_inode_locked = 1;
                }
                rc = cl_io_loop(env, io);
                if (lli->lli_inode_locked) {
                        lli->lli_inode_locked = 0;
                        inode_unlock(inode);
                }
                ll_cl_remove(file, env);

                if (range_locked) {
                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
                        range_unlock(&lli->lli_write_tree, &range);
                }
        } else {
                /* cl_io_rw_init() handled IO */
                rc = io->ci_result;
        }

        /* Account for the bytes moved by this attempt */
        if (io->ci_nob > 0) {
                result += io->ci_nob;
                count  -= io->ci_nob;

                if (args->via_io_subtype == IO_NORMAL) {
                        iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
                        pos += io->ci_nob;
                        args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
                        args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
                        args->u.normal.via_iocb->ki_nbytes = count;
#endif
                } else {
                        /* for splice */
                        pos = io->u.ci_rw.rw_range.cir_pos;
                }
        }
out:
        cl_io_fini(env, io);

        CDEBUG(D_VFSTRACE,
               "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);

        /* Restart for the remaining bytes only if the attempt did not fail
         * (-ENODATA is tolerated) and the I/O asked for it. */
        if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                        "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
                        file_dentry(file)->d_name.name,
                        iot == CIT_READ ? "read" : "write",
                        pos, pos + count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;
                restarted = true;
                goto restart;
        }

        /* Statistics, and for writes the per-descriptor failure flag:
         * cleared on progress, set on failure except for -ERESTARTSYS. */
        if (iot == CIT_READ) {
                if (result > 0)
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_READ_BYTES, result);
        } else if (iot == CIT_WRITE) {
                if (result > 0) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_WRITE_BYTES, result);
                        fd->fd_write_failed = false;
                } else if (result == 0 && rc == 0) {
                        rc = io->ci_result;
                        if (rc < 0)
                                fd->fd_write_failed = true;
                        else
                                fd->fd_write_failed = false;
                } else if (rc != -ERESTARTSYS) {
                        fd->fd_write_failed = true;
                }
        }

        CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
                file_dentry(file)->d_name.name,
                iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

        *ppos = pos;

        RETURN(result > 0 ? result : rc);
}
1468
1469 /**
1470  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1471  * especially for small I/O.
1472  *
1473  * To serve a read request, CLIO has to create and initialize a cl_io and
1474  * then request DLM lock. This has turned out to have siginificant overhead
1475  * and affects the performance of small I/O dramatically.
1476  *
1477  * It's not necessary to create a cl_io for each I/O. Under the help of read
1478  * ahead, most of the pages being read are already in memory cache and we can
1479  * read those pages directly because if the pages exist, the corresponding DLM
1480  * lock must exist so that page content must be valid.
1481  *
1482  * In fast read implementation, the llite speculatively finds and reads pages
1483  * in memory cache. There are three scenarios for fast read:
1484  *   - If the page exists and is uptodate, kernel VM will provide the data and
1485  *     CLIO won't be intervened;
1486  *   - If the page was brought into memory by read ahead, it will be exported
1487  *     and read ahead parameters will be updated;
1488  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1489  *     it will go back and invoke normal read, i.e., a cl_io will be created
1490  *     and DLM lock will be requested.
1491  *
1492  * POSIX compliance: posix standard states that read is intended to be atomic.
1493  * Lustre read implementation is in line with Linux kernel read implementation
1494  * and neither of them complies with POSIX standard in this matter. Fast read
1495  * doesn't make the situation worse on single node but it may interleave write
1496  * results from multiple nodes due to short read handling in ll_file_aio_read().
1497  *
1498  * \param env - lu_env
1499  * \param iocb - kiocb from kernel
1500  * \param iter - user space buffers where the data will be copied
1501  *
1502  * \retval - number of bytes have been read, or error code if error occurred.
1503  */
1504 static ssize_t
1505 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1506 {
1507         ssize_t result;
1508
1509         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1510                 return 0;
1511
1512         /* NB: we can't do direct IO for fast read because it will need a lock
1513          * to make IO engine happy. */
1514         if (iocb->ki_filp->f_flags & O_DIRECT)
1515                 return 0;
1516
1517         result = generic_file_read_iter(iocb, iter);
1518
1519         /* If the first page is not in cache, generic_file_aio_read() will be
1520          * returned with -ENODATA.
1521          * See corresponding code in ll_readpage(). */
1522         if (result == -ENODATA)
1523                 result = 0;
1524
1525         if (result > 0)
1526                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1527                                 LPROC_LL_READ_BYTES, result);
1528
1529         return result;
1530 }
1531
1532 /*
1533  * Read from a file (through the page cache).
1534  */
1535 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1536 {
1537         struct lu_env *env;
1538         struct vvp_io_args *args;
1539         ssize_t result;
1540         ssize_t rc2;
1541         __u16 refcheck;
1542
1543         result = ll_do_fast_read(iocb, to);
1544         if (result < 0 || iov_iter_count(to) == 0)
1545                 GOTO(out, result);
1546
1547         env = cl_env_get(&refcheck);
1548         if (IS_ERR(env))
1549                 return PTR_ERR(env);
1550
1551         args = ll_env_args(env, IO_NORMAL);
1552         args->u.normal.via_iter = to;
1553         args->u.normal.via_iocb = iocb;
1554
1555         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1556                                  &iocb->ki_pos, iov_iter_count(to));
1557         if (rc2 > 0)
1558                 result += rc2;
1559         else if (result == 0)
1560                 result = rc2;
1561
1562         cl_env_put(env, &refcheck);
1563 out:
1564         return result;
1565 }
1566
1567 /**
1568  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1569  * If a page is already in the page cache and dirty (and some other things -
1570  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1571  * write to it without doing a full I/O, because Lustre already knows about it
1572  * and will write it out.  This saves a lot of processing time.
1573  *
1574  * All writes here are within one page, so exclusion is handled by the page
1575  * lock on the vm page.  Exception is appending, which requires locking the
1576  * full file to handle size issues.  We do not do tiny writes for writes which
1577  * touch multiple pages because it's very unlikely multiple sequential pages
1578  * are already dirty.
1579  *
1580  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1581  * and are unlikely to be to already dirty pages.
1582  *
1583  * Attribute updates are important here, we do it in ll_tiny_write_end.
1584  */
1585 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1586 {
1587         ssize_t count = iov_iter_count(iter);
1588         struct file *file = iocb->ki_filp;
1589         struct inode *inode = file_inode(file);
1590         struct ll_inode_info *lli = ll_i2info(inode);
1591         struct range_lock range;
1592         ssize_t result = 0;
1593         bool append = false;
1594
1595         ENTRY;
1596
1597         /* NB: we can't do direct IO for tiny writes because they use the page
1598          * cache, and we can't do sync writes because tiny writes can't flush
1599          * pages.
1600          */
1601         if (file->f_flags & (O_DIRECT | O_SYNC))
1602                 RETURN(0);
1603
1604         /* It is relatively unlikely we will overwrite a full dirty page, so
1605          * limit tiny writes to < PAGE_SIZE
1606          */
1607         if (count >= PAGE_SIZE)
1608                 RETURN(0);
1609
1610         /* For append writes, we must take the range lock to protect size
1611          * and also move pos to current size before writing.
1612          */
1613         if (file->f_flags & O_APPEND) {
1614                 struct lu_env *env;
1615                 __u16 refcheck;
1616
1617                 append = true;
1618                 range_lock_init(&range, 0, LUSTRE_EOF);
1619                 result = range_lock(&lli->lli_write_tree, &range);
1620                 if (result)
1621                         RETURN(result);
1622                 env = cl_env_get(&refcheck);
1623                 if (IS_ERR(env))
1624                         GOTO(out, result = PTR_ERR(env));
1625                 ll_merge_attr(env, inode);
1626                 cl_env_put(env, &refcheck);
1627                 iocb->ki_pos = i_size_read(inode);
1628         }
1629
1630         /* Does this write touch multiple pages?
1631          *
1632          * This partly duplicates the PAGE_SIZE check above, but must come
1633          * after range locking for append writes because it depends on the
1634          * write position (ki_pos).
1635          */
1636         if ((iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1637                 goto out;
1638
1639         result = __generic_file_write_iter(iocb, iter);
1640
1641         /* If the page is not already dirty, ll_tiny_write_begin returns
1642          * -ENODATA.  We continue on to normal write.
1643          */
1644         if (result == -ENODATA)
1645                 result = 0;
1646
1647         if (result > 0) {
1648                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1649                                    result);
1650                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1651         }
1652
1653 out:
1654         if (append)
1655                 range_unlock(&lli->lli_write_tree, &range);
1656
1657         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1658
1659         RETURN(result);
1660 }
1661
1662 /*
1663  * Write to a file (through the page cache).
1664  */
1665 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1666 {
1667         struct vvp_io_args *args;
1668         struct lu_env *env;
1669         ssize_t rc_tiny, rc_normal;
1670         __u16 refcheck;
1671
1672         ENTRY;
1673
1674         rc_tiny = ll_do_tiny_write(iocb, from);
1675
1676         /* In case of error, go on and try normal write - Only stop if tiny
1677          * write completed I/O.
1678          */
1679         if (iov_iter_count(from) == 0)
1680                 GOTO(out, rc_normal = rc_tiny);
1681
1682         env = cl_env_get(&refcheck);
1683         if (IS_ERR(env))
1684                 return PTR_ERR(env);
1685
1686         args = ll_env_args(env, IO_NORMAL);
1687         args->u.normal.via_iter = from;
1688         args->u.normal.via_iocb = iocb;
1689
1690         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1691                                     &iocb->ki_pos, iov_iter_count(from));
1692
1693         /* On success, combine bytes written. */
1694         if (rc_tiny >= 0 && rc_normal > 0)
1695                 rc_normal += rc_tiny;
1696         /* On error, only return error from normal write if tiny write did not
1697          * write any bytes.  Otherwise return bytes written by tiny write.
1698          */
1699         else if (rc_tiny > 0)
1700                 rc_normal = rc_tiny;
1701
1702         cl_env_put(env, &refcheck);
1703 out:
1704         RETURN(rc_normal);
1705 }
1706
1707 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1708 /*
1709  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1710  */
1711 static int ll_file_get_iov_count(const struct iovec *iov,
1712                                  unsigned long *nr_segs, size_t *count)
1713 {
1714         size_t cnt = 0;
1715         unsigned long seg;
1716
1717         for (seg = 0; seg < *nr_segs; seg++) {
1718                 const struct iovec *iv = &iov[seg];
1719
1720                 /*
1721                  * If any segment has a negative length, or the cumulative
1722                  * length ever wraps negative then return -EINVAL.
1723                  */
1724                 cnt += iv->iov_len;
1725                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1726                         return -EINVAL;
1727                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1728                         continue;
1729                 if (seg == 0)
1730                         return -EFAULT;
1731                 *nr_segs = seg;
1732                 cnt -= iv->iov_len;     /* This segment is no good */
1733                 break;
1734         }
1735         *count = cnt;
1736         return 0;
1737 }
1738
1739 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1740                                 unsigned long nr_segs, loff_t pos)
1741 {
1742         struct iov_iter to;
1743         size_t iov_count;
1744         ssize_t result;
1745         ENTRY;
1746
1747         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1748         if (result)
1749                 RETURN(result);
1750
1751 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1752         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1753 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1754         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1755 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1756
1757         result = ll_file_read_iter(iocb, &to);
1758
1759         RETURN(result);
1760 }
1761
1762 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1763                             loff_t *ppos)
1764 {
1765         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1766         struct kiocb   kiocb;
1767         ssize_t        result;
1768         ENTRY;
1769
1770         init_sync_kiocb(&kiocb, file);
1771         kiocb.ki_pos = *ppos;
1772 #ifdef HAVE_KIOCB_KI_LEFT
1773         kiocb.ki_left = count;
1774 #elif defined(HAVE_KI_NBYTES)
1775         kiocb.i_nbytes = count;
1776 #endif
1777
1778         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1779         *ppos = kiocb.ki_pos;
1780
1781         RETURN(result);
1782 }
1783
1784 /*
1785  * Write to a file (through the page cache).
1786  * AIO stuff
1787  */
1788 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1789                                  unsigned long nr_segs, loff_t pos)
1790 {
1791         struct iov_iter from;
1792         size_t iov_count;
1793         ssize_t result;
1794         ENTRY;
1795
1796         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1797         if (result)
1798                 RETURN(result);
1799
1800 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1801         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1802 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1803         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1804 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1805
1806         result = ll_file_write_iter(iocb, &from);
1807
1808         RETURN(result);
1809 }
1810
1811 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1812                              size_t count, loff_t *ppos)
1813 {
1814         struct iovec   iov = { .iov_base = (void __user *)buf,
1815                                .iov_len = count };
1816         struct kiocb   kiocb;
1817         ssize_t        result;
1818
1819         ENTRY;
1820
1821         init_sync_kiocb(&kiocb, file);
1822         kiocb.ki_pos = *ppos;
1823 #ifdef HAVE_KIOCB_KI_LEFT
1824         kiocb.ki_left = count;
1825 #elif defined(HAVE_KI_NBYTES)
1826         kiocb.ki_nbytes = count;
1827 #endif
1828
1829         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1830         *ppos = kiocb.ki_pos;
1831
1832         RETURN(result);
1833 }
1834 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1835
1836 /*
1837  * Send file content (through pagecache) somewhere with helper
1838  */
1839 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1840                                    struct pipe_inode_info *pipe, size_t count,
1841                                    unsigned int flags)
1842 {
1843         struct lu_env      *env;
1844         struct vvp_io_args *args;
1845         ssize_t             result;
1846         __u16               refcheck;
1847         ENTRY;
1848
1849         env = cl_env_get(&refcheck);
1850         if (IS_ERR(env))
1851                 RETURN(PTR_ERR(env));
1852
1853         args = ll_env_args(env, IO_SPLICE);
1854         args->u.splice.via_pipe = pipe;
1855         args->u.splice.via_flags = flags;
1856
1857         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1858         cl_env_put(env, &refcheck);
1859         RETURN(result);
1860 }
1861
1862 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1863                              __u64 flags, struct lov_user_md *lum, int lum_size)
1864 {
1865         struct lookup_intent oit = {
1866                 .it_op = IT_OPEN,
1867                 .it_flags = flags | MDS_OPEN_BY_FID,
1868         };
1869         int rc;
1870         ENTRY;
1871
1872         ll_inode_size_lock(inode);
1873         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1874         if (rc < 0)
1875                 GOTO(out_unlock, rc);
1876
1877         ll_release_openhandle(dentry, &oit);
1878
1879 out_unlock:
1880         ll_inode_size_unlock(inode);
1881         ll_intent_release(&oit);
1882
1883         RETURN(rc);
1884 }
1885
1886 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1887                              struct lov_mds_md **lmmp, int *lmm_size,
1888                              struct ptlrpc_request **request)
1889 {
1890         struct ll_sb_info *sbi = ll_i2sbi(inode);
1891         struct mdt_body  *body;
1892         struct lov_mds_md *lmm = NULL;
1893         struct ptlrpc_request *req = NULL;
1894         struct md_op_data *op_data;
1895         int rc, lmmsize;
1896
1897         rc = ll_get_default_mdsize(sbi, &lmmsize);
1898         if (rc)
1899                 RETURN(rc);
1900
1901         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1902                                      strlen(filename), lmmsize,
1903                                      LUSTRE_OPC_ANY, NULL);
1904         if (IS_ERR(op_data))
1905                 RETURN(PTR_ERR(op_data));
1906
1907         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1908         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1909         ll_finish_md_op_data(op_data);
1910         if (rc < 0) {
1911                 CDEBUG(D_INFO, "md_getattr_name failed "
1912                        "on %s: rc %d\n", filename, rc);
1913                 GOTO(out, rc);
1914         }
1915
1916         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1917         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1918
1919         lmmsize = body->mbo_eadatasize;
1920
1921         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1922                         lmmsize == 0) {
1923                 GOTO(out, rc = -ENODATA);
1924         }
1925
1926         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1927         LASSERT(lmm != NULL);
1928
1929         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1930             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1931             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1932                 GOTO(out, rc = -EPROTO);
1933
1934         /*
1935          * This is coming from the MDS, so is probably in
1936          * little endian.  We convert it to host endian before
1937          * passing it to userspace.
1938          */
1939         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1940                 int stripe_count;
1941
1942                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1943                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1944                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1945                         if (le32_to_cpu(lmm->lmm_pattern) &
1946                             LOV_PATTERN_F_RELEASED)
1947                                 stripe_count = 0;
1948                 }
1949
1950                 /* if function called for directory - we should
1951                  * avoid swab not existent lsm objects */
1952                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1953                         lustre_swab_lov_user_md_v1(
1954                                         (struct lov_user_md_v1 *)lmm);
1955                         if (S_ISREG(body->mbo_mode))
1956                                 lustre_swab_lov_user_md_objects(
1957                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1958                                     stripe_count);
1959                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1960                         lustre_swab_lov_user_md_v3(
1961                                         (struct lov_user_md_v3 *)lmm);
1962                         if (S_ISREG(body->mbo_mode))
1963                                 lustre_swab_lov_user_md_objects(
1964                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1965                                     stripe_count);
1966                 } else if (lmm->lmm_magic ==
1967                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1968                         lustre_swab_lov_comp_md_v1(
1969                                         (struct lov_comp_md_v1 *)lmm);
1970                 }
1971         }
1972
1973 out:
1974         *lmmp = lmm;
1975         *lmm_size = lmmsize;
1976         *request = req;
1977         return rc;
1978 }
1979
1980 static int ll_lov_setea(struct inode *inode, struct file *file,
1981                         void __user *arg)
1982 {
1983         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1984         struct lov_user_md      *lump;
1985         int                      lum_size = sizeof(struct lov_user_md) +
1986                                             sizeof(struct lov_user_ost_data);
1987         int                      rc;
1988         ENTRY;
1989
1990         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1991                 RETURN(-EPERM);
1992
1993         OBD_ALLOC_LARGE(lump, lum_size);
1994         if (lump == NULL)
1995                 RETURN(-ENOMEM);
1996
1997         if (copy_from_user(lump, arg, lum_size))
1998                 GOTO(out_lump, rc = -EFAULT);
1999
2000         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2001                                       lum_size);
2002         cl_lov_delay_create_clear(&file->f_flags);
2003
2004 out_lump:
2005         OBD_FREE_LARGE(lump, lum_size);
2006         RETURN(rc);
2007 }
2008
2009 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2010 {
2011         struct lu_env   *env;
2012         __u16           refcheck;
2013         int             rc;
2014         ENTRY;
2015
2016         env = cl_env_get(&refcheck);
2017         if (IS_ERR(env))
2018                 RETURN(PTR_ERR(env));
2019
2020         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2021         cl_env_put(env, &refcheck);
2022         RETURN(rc);
2023 }
2024
2025 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2026                             void __user *arg)
2027 {
2028         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2029         struct lov_user_md        *klum;
2030         int                        lum_size, rc;
2031         __u64                      flags = FMODE_WRITE;
2032         ENTRY;
2033
2034         rc = ll_copy_user_md(lum, &klum);
2035         if (rc < 0)
2036                 RETURN(rc);
2037
2038         lum_size = rc;
2039         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2040                                       lum_size);
2041         if (!rc) {
2042                 __u32 gen;
2043
2044                 rc = put_user(0, &lum->lmm_stripe_count);
2045                 if (rc)
2046                         GOTO(out, rc);
2047
2048                 rc = ll_layout_refresh(inode, &gen);
2049                 if (rc)
2050                         GOTO(out, rc);
2051
2052                 rc = ll_file_getstripe(inode, arg, lum_size);
2053         }
2054         cl_lov_delay_create_clear(&file->f_flags);
2055
2056 out:
2057         OBD_FREE(klum, lum_size);
2058         RETURN(rc);
2059 }
2060
2061 static int
2062 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2063 {
2064         struct ll_inode_info *lli = ll_i2info(inode);
2065         struct cl_object *obj = lli->lli_clob;
2066         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2067         struct ll_grouplock grouplock;
2068         int rc;
2069         ENTRY;
2070
2071         if (arg == 0) {
2072                 CWARN("group id for group lock must not be 0\n");
2073                 RETURN(-EINVAL);
2074         }
2075
2076         if (ll_file_nolock(file))
2077                 RETURN(-EOPNOTSUPP);
2078
2079         spin_lock(&lli->lli_lock);
2080         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2081                 CWARN("group lock already existed with gid %lu\n",
2082                       fd->fd_grouplock.lg_gid);
2083                 spin_unlock(&lli->lli_lock);
2084                 RETURN(-EINVAL);
2085         }
2086         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2087         spin_unlock(&lli->lli_lock);
2088
2089         /**
2090          * XXX: group lock needs to protect all OST objects while PFL
2091          * can add new OST objects during the IO, so we'd instantiate
2092          * all OST objects before getting its group lock.
2093          */
2094         if (obj) {
2095                 struct lu_env *env;
2096                 __u16 refcheck;
2097                 struct cl_layout cl = {
2098                         .cl_is_composite = false,
2099                 };
2100                 struct lu_extent ext = {
2101                         .e_start = 0,
2102                         .e_end = OBD_OBJECT_EOF,
2103                 };
2104
2105                 env = cl_env_get(&refcheck);
2106                 if (IS_ERR(env))
2107                         RETURN(PTR_ERR(env));
2108
2109                 rc = cl_object_layout_get(env, obj, &cl);
2110                 if (!rc && cl.cl_is_composite)
2111                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2112                                                     &ext);
2113
2114                 cl_env_put(env, &refcheck);
2115                 if (rc)
2116                         RETURN(rc);
2117         }
2118
2119         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2120                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2121         if (rc)
2122                 RETURN(rc);
2123
2124         spin_lock(&lli->lli_lock);
2125         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2126                 spin_unlock(&lli->lli_lock);
2127                 CERROR("another thread just won the race\n");
2128                 cl_put_grouplock(&grouplock);
2129                 RETURN(-EINVAL);
2130         }
2131
2132         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2133         fd->fd_grouplock = grouplock;
2134         spin_unlock(&lli->lli_lock);
2135
2136         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2137         RETURN(0);
2138 }
2139
2140 static int ll_put_grouplock(struct inode *inode, struct file *file,
2141                             unsigned long arg)
2142 {
2143         struct ll_inode_info   *lli = ll_i2info(inode);
2144         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2145         struct ll_grouplock     grouplock;
2146         ENTRY;
2147
2148         spin_lock(&lli->lli_lock);
2149         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2150                 spin_unlock(&lli->lli_lock);
2151                 CWARN("no group lock held\n");
2152                 RETURN(-EINVAL);
2153         }
2154
2155         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2156
2157         if (fd->fd_grouplock.lg_gid != arg) {
2158                 CWARN("group lock %lu doesn't match current id %lu\n",
2159                       arg, fd->fd_grouplock.lg_gid);
2160                 spin_unlock(&lli->lli_lock);
2161                 RETURN(-EINVAL);
2162         }
2163
2164         grouplock = fd->fd_grouplock;
2165         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2166         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2167         spin_unlock(&lli->lli_lock);
2168
2169         cl_put_grouplock(&grouplock);
2170         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2171         RETURN(0);
2172 }
2173
2174 /**
2175  * Close inode open handle
2176  *
2177  * \param dentry [in]     dentry which contains the inode
2178  * \param it     [in,out] intent which contains open info and result
2179  *
2180  * \retval 0     success
2181  * \retval <0    failure
2182  */
2183 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2184 {
2185         struct inode *inode = dentry->d_inode;
2186         struct obd_client_handle *och;
2187         int rc;
2188         ENTRY;
2189
2190         LASSERT(inode);
2191
2192         /* Root ? Do nothing. */
2193         if (dentry->d_inode->i_sb->s_root == dentry)
2194                 RETURN(0);
2195
2196         /* No open handle to close? Move away */
2197         if (!it_disposition(it, DISP_OPEN_OPEN))
2198                 RETURN(0);
2199
2200         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2201
2202         OBD_ALLOC(och, sizeof(*och));
2203         if (!och)
2204                 GOTO(out, rc = -ENOMEM);
2205
2206         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2207
2208         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2209 out:
2210         /* this one is in place of ll_file_open */
2211         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2212                 ptlrpc_req_finished(it->it_request);
2213                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2214         }
2215         RETURN(rc);
2216 }
2217
2218 /**
2219  * Get size for inode for which FIEMAP mapping is requested.
2220  * Make the FIEMAP get_info call and returns the result.
2221  * \param fiemap        kernel buffer to hold extens
2222  * \param num_bytes     kernel buffer size
2223  */
2224 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2225                         size_t num_bytes)
2226 {
2227         struct lu_env                   *env;
2228         __u16                           refcheck;
2229         int                             rc = 0;
2230         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2231         ENTRY;
2232
2233         /* Checks for fiemap flags */
2234         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2235                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2236                 return -EBADR;
2237         }
2238
2239         /* Check for FIEMAP_FLAG_SYNC */
2240         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2241                 rc = filemap_fdatawrite(inode->i_mapping);
2242                 if (rc)
2243                         return rc;
2244         }
2245
2246         env = cl_env_get(&refcheck);
2247         if (IS_ERR(env))
2248                 RETURN(PTR_ERR(env));
2249
2250         if (i_size_read(inode) == 0) {
2251                 rc = ll_glimpse_size(inode);
2252                 if (rc)
2253                         GOTO(out, rc);
2254         }
2255
2256         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2257         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2258         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2259
2260         /* If filesize is 0, then there would be no objects for mapping */
2261         if (fmkey.lfik_oa.o_size == 0) {
2262                 fiemap->fm_mapped_extents = 0;
2263                 GOTO(out, rc = 0);
2264         }
2265
2266         fmkey.lfik_fiemap = *fiemap;
2267
2268         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2269                               &fmkey, fiemap, &num_bytes);
2270 out:
2271         cl_env_put(env, &refcheck);
2272         RETURN(rc);
2273 }
2274
2275 int ll_fid2path(struct inode *inode, void __user *arg)
2276 {
2277         struct obd_export       *exp = ll_i2mdexp(inode);
2278         const struct getinfo_fid2path __user *gfin = arg;
2279         __u32                    pathlen;
2280         struct getinfo_fid2path *gfout;
2281         size_t                   outsize;
2282         int                      rc;
2283
2284         ENTRY;
2285
2286         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2287             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2288                 RETURN(-EPERM);
2289
2290         /* Only need to get the buflen */
2291         if (get_user(pathlen, &gfin->gf_pathlen))
2292                 RETURN(-EFAULT);
2293
2294         if (pathlen > PATH_MAX)
2295                 RETURN(-EINVAL);
2296
2297         outsize = sizeof(*gfout) + pathlen;
2298         OBD_ALLOC(gfout, outsize);
2299         if (gfout == NULL)
2300                 RETURN(-ENOMEM);
2301
2302         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2303                 GOTO(gf_free, rc = -EFAULT);
2304         /* append root FID after gfout to let MDT know the root FID so that it
2305          * can lookup the correct path, this is mainly for fileset.
2306          * old server without fileset mount support will ignore this. */
2307         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2308
2309         /* Call mdc_iocontrol */
2310         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2311         if (rc != 0)
2312                 GOTO(gf_free, rc);
2313
2314         if (copy_to_user(arg, gfout, outsize))
2315                 rc = -EFAULT;
2316
2317 gf_free:
2318         OBD_FREE(gfout, outsize);
2319         RETURN(rc);
2320 }
2321
2322 static int
2323 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2324 {
2325         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2326         struct lu_env *env;
2327         struct cl_io *io;
2328         __u16  refcheck;
2329         int result;
2330
2331         ENTRY;
2332
2333         ioc->idv_version = 0;
2334         ioc->idv_layout_version = UINT_MAX;
2335
2336         /* If no file object initialized, we consider its version is 0. */
2337         if (obj == NULL)
2338                 RETURN(0);
2339
2340         env = cl_env_get(&refcheck);
2341         if (IS_ERR(env))
2342                 RETURN(PTR_ERR(env));
2343
2344         io = vvp_env_thread_io(env);
2345         io->ci_obj = obj;
2346         io->u.ci_data_version.dv_data_version = 0;
2347         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2348         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2349
2350 restart:
2351         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2352                 result = cl_io_loop(env, io);
2353         else
2354                 result = io->ci_result;
2355
2356         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2357         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2358
2359         cl_io_fini(env, io);
2360
2361         if (unlikely(io->ci_need_restart))
2362                 goto restart;
2363
2364         cl_env_put(env, &refcheck);
2365
2366         RETURN(result);
2367 }
2368
2369 /*
2370  * Read the data_version for inode.
2371  *
2372  * This value is computed using stripe object version on OST.
2373  * Version is computed using server side locking.
2374  *
2375  * @param flags if do sync on the OST side;
2376  *              0: no sync
2377  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2378  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2379  */
2380 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2381 {
2382         struct ioc_data_version ioc = { .idv_flags = flags };
2383         int rc;
2384
2385         rc = ll_ioc_data_version(inode, &ioc);
2386         if (!rc)
2387                 *data_version = ioc.idv_version;
2388
2389         return rc;
2390 }
2391
2392 /*
2393  * Trigger a HSM release request for the provided inode.
2394  */
2395 int ll_hsm_release(struct inode *inode)
2396 {
2397         struct lu_env *env;
2398         struct obd_client_handle *och = NULL;
2399         __u64 data_version = 0;
2400         int rc;
2401         __u16 refcheck;
2402         ENTRY;
2403
2404         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2405                ll_get_fsname(inode->i_sb, NULL, 0),
2406                PFID(&ll_i2info(inode)->lli_fid));
2407
2408         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2409         if (IS_ERR(och))
2410                 GOTO(out, rc = PTR_ERR(och));
2411
2412         /* Grab latest data_version and [am]time values */
2413         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2414         if (rc != 0)
2415                 GOTO(out, rc);
2416
2417         env = cl_env_get(&refcheck);
2418         if (IS_ERR(env))
2419                 GOTO(out, rc = PTR_ERR(env));
2420
2421         rc = ll_merge_attr(env, inode);
2422         cl_env_put(env, &refcheck);
2423
2424         /* If error happen, we have the wrong size for a file.
2425          * Don't release it.
2426          */
2427         if (rc != 0)
2428                 GOTO(out, rc);
2429
2430         /* Release the file.
2431          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2432          * we still need it to pack l_remote_handle to MDT. */
2433         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2434                                        &data_version);
2435         och = NULL;
2436
2437         EXIT;
2438 out:
2439         if (och != NULL && !IS_ERR(och)) /* close the file */
2440                 ll_lease_close(och, inode, NULL);
2441
2442         return rc;
2443 }
2444
2445 struct ll_swap_stack {
2446         __u64                    dv1;
2447         __u64                    dv2;
2448         struct inode            *inode1;
2449         struct inode            *inode2;
2450         bool                     check_dv1;
2451         bool                     check_dv2;
2452 };
2453
2454 static int ll_swap_layouts(struct file *file1, struct file *file2,
2455                            struct lustre_swap_layouts *lsl)
2456 {
2457         struct mdc_swap_layouts  msl;
2458         struct md_op_data       *op_data;
2459         __u32                    gid;
2460         __u64                    dv;
2461         struct ll_swap_stack    *llss = NULL;
2462         int                      rc;
2463
2464         OBD_ALLOC_PTR(llss);
2465         if (llss == NULL)
2466                 RETURN(-ENOMEM);
2467
2468         llss->inode1 = file_inode(file1);
2469         llss->inode2 = file_inode(file2);
2470
2471         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2472         if (rc < 0)
2473                 GOTO(free, rc);
2474
2475         /* we use 2 bool because it is easier to swap than 2 bits */
2476         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2477                 llss->check_dv1 = true;
2478
2479         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2480                 llss->check_dv2 = true;
2481
2482         /* we cannot use lsl->sl_dvX directly because we may swap them */
2483         llss->dv1 = lsl->sl_dv1;
2484         llss->dv2 = lsl->sl_dv2;
2485
2486         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2487         if (rc == 0) /* same file, done! */
2488                 GOTO(free, rc);
2489
2490         if (rc < 0) { /* sequentialize it */
2491                 swap(llss->inode1, llss->inode2);
2492                 swap(file1, file2);
2493                 swap(llss->dv1, llss->dv2);
2494                 swap(llss->check_dv1, llss->check_dv2);
2495         }
2496
2497         gid = lsl->sl_gid;
2498         if (gid != 0) { /* application asks to flush dirty cache */
2499                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2500                 if (rc < 0)
2501                         GOTO(free, rc);
2502
2503                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2504                 if (rc < 0) {
2505                         ll_put_grouplock(llss->inode1, file1, gid);
2506                         GOTO(free, rc);
2507                 }
2508         }
2509
2510         /* ultimate check, before swaping the layouts we check if
2511          * dataversion has changed (if requested) */
2512         if (llss->check_dv1) {
2513                 rc = ll_data_version(llss->inode1, &dv, 0);
2514                 if (rc)
2515                         GOTO(putgl, rc);
2516                 if (dv != llss->dv1)
2517                         GOTO(putgl, rc = -EAGAIN);
2518         }
2519
2520         if (llss->check_dv2) {
2521                 rc = ll_data_version(llss->inode2, &dv, 0);
2522                 if (rc)
2523                         GOTO(putgl, rc);
2524                 if (dv != llss->dv2)
2525                         GOTO(putgl, rc = -EAGAIN);
2526         }
2527
2528         /* struct md_op_data is used to send the swap args to the mdt
2529          * only flags is missing, so we use struct mdc_swap_layouts
2530          * through the md_op_data->op_data */
2531         /* flags from user space have to be converted before they are send to
2532          * server, no flag is sent today, they are only used on the client */
2533         msl.msl_flags = 0;
2534         rc = -ENOMEM;
2535         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2536                                      0, LUSTRE_OPC_ANY, &msl);
2537         if (IS_ERR(op_data))
2538                 GOTO(free, rc = PTR_ERR(op_data));
2539
2540         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2541                            sizeof(*op_data), op_data, NULL);
2542         ll_finish_md_op_data(op_data);
2543
2544         if (rc < 0)
2545                 GOTO(putgl, rc);
2546
2547 putgl:
2548         if (gid != 0) {
2549                 ll_put_grouplock(llss->inode2, file2, gid);
2550                 ll_put_grouplock(llss->inode1, file1, gid);
2551         }
2552
2553 free:
2554         if (llss != NULL)
2555                 OBD_FREE_PTR(llss);
2556
2557         RETURN(rc);
2558 }
2559
2560 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2561 {
2562         struct md_op_data       *op_data;
2563         int                      rc;
2564         ENTRY;
2565
2566         /* Detect out-of range masks */
2567         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2568                 RETURN(-EINVAL);
2569
2570         /* Non-root users are forbidden to set or clear flags which are
2571          * NOT defined in HSM_USER_MASK. */
2572         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2573             !cfs_capable(CFS_CAP_SYS_ADMIN))
2574                 RETURN(-EPERM);
2575
2576         /* Detect out-of range archive id */
2577         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2578             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2579                 RETURN(-EINVAL);
2580
2581         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2582                                      LUSTRE_OPC_ANY, hss);
2583         if (IS_ERR(op_data))
2584                 RETURN(PTR_ERR(op_data));
2585
2586         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2587                            sizeof(*op_data), op_data, NULL);
2588
2589         ll_finish_md_op_data(op_data);
2590
2591         RETURN(rc);
2592 }
2593
2594 static int ll_hsm_import(struct inode *inode, struct file *file,
2595                          struct hsm_user_import *hui)
2596 {
2597         struct hsm_state_set    *hss = NULL;
2598         struct iattr            *attr = NULL;
2599         int                      rc;
2600         ENTRY;
2601
2602         if (!S_ISREG(inode->i_mode))
2603                 RETURN(-EINVAL);
2604
2605         /* set HSM flags */
2606         OBD_ALLOC_PTR(hss);
2607         if (hss == NULL)
2608                 GOTO(out, rc = -ENOMEM);
2609
2610         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2611         hss->hss_archive_id = hui->hui_archive_id;
2612         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2613         rc = ll_hsm_state_set(inode, hss);
2614         if (rc != 0)
2615                 GOTO(out, rc);
2616
2617         OBD_ALLOC_PTR(attr);
2618         if (attr == NULL)
2619                 GOTO(out, rc = -ENOMEM);
2620
2621         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2622         attr->ia_mode |= S_IFREG;
2623         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2624         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2625         attr->ia_size = hui->hui_size;
2626         attr->ia_mtime.tv_sec = hui->hui_mtime;
2627         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2628         attr->ia_atime.tv_sec = hui->hui_atime;
2629         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2630
2631         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2632                          ATTR_UID | ATTR_GID |
2633                          ATTR_MTIME | ATTR_MTIME_SET |
2634                          ATTR_ATIME | ATTR_ATIME_SET;
2635
2636         inode_lock(inode);
2637
2638         rc = ll_setattr_raw(file_dentry(file), attr, true);
2639         if (rc == -ENODATA)
2640                 rc = 0;
2641
2642         inode_unlock(inode);
2643
2644 out:
2645         if (hss != NULL)
2646                 OBD_FREE_PTR(hss);
2647
2648         if (attr != NULL)
2649                 OBD_FREE_PTR(attr);
2650
2651         RETURN(rc);
2652 }
2653
2654 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2655 {
2656         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2657                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2658 }
2659
2660 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2661 {
2662         struct inode *inode = file_inode(file);
2663         struct iattr ia = {
2664                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2665                             ATTR_MTIME | ATTR_MTIME_SET |
2666                             ATTR_CTIME | ATTR_CTIME_SET,
2667                 .ia_atime = {
2668                         .tv_sec = lfu->lfu_atime_sec,
2669                         .tv_nsec = lfu->lfu_atime_nsec,
2670                 },
2671                 .ia_mtime = {
2672                         .tv_sec = lfu->lfu_mtime_sec,
2673                         .tv_nsec = lfu->lfu_mtime_nsec,
2674                 },
2675                 .ia_ctime = {
2676                         .tv_sec = lfu->lfu_ctime_sec,
2677                         .tv_nsec = lfu->lfu_ctime_nsec,
2678                 },
2679         };
2680         int rc;
2681         ENTRY;
2682
2683         if (!capable(CAP_SYS_ADMIN))
2684                 RETURN(-EPERM);
2685
2686         if (!S_ISREG(inode->i_mode))
2687                 RETURN(-EINVAL);
2688
2689         inode_lock(inode);
2690         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2691         inode_unlock(inode);
2692
2693         RETURN(rc);
2694 }
2695
2696 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2697 {
2698         switch (mode) {
2699         case MODE_READ_USER:
2700                 return CLM_READ;
2701         case MODE_WRITE_USER:
2702                 return CLM_WRITE;
2703         default:
2704                 return -EINVAL;
2705         }
2706 }
2707
2708 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2709
2710 /* Used to allow the upper layers of the client to request an LDLM lock
2711  * without doing an actual read or write.
2712  *
2713  * Used for ladvise lockahead to manually request specific locks.
2714  *
2715  * \param[in] file      file this ladvise lock request is on
2716  * \param[in] ladvise   ladvise struct describing this lock request
2717  *
2718  * \retval 0            success, no detailed result available (sync requests
2719  *                      and requests sent to the server [not handled locally]
2720  *                      cannot return detailed results)
2721  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2722  *                                       see definitions for details.
2723  * \retval negative     negative errno on error
2724  */
2725 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2726 {
2727         struct lu_env *env = NULL;
2728         struct cl_io *io  = NULL;
2729         struct cl_lock *lock = NULL;
2730         struct cl_lock_descr *descr = NULL;
2731         struct dentry *dentry = file->f_path.dentry;
2732         struct inode *inode = dentry->d_inode;
2733         enum cl_lock_mode cl_mode;
2734         off_t start = ladvise->lla_start;
2735         off_t end = ladvise->lla_end;
2736         int result;
2737         __u16 refcheck;
2738
2739         ENTRY;
2740
2741         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2742                "start=%llu, end=%llu\n", dentry->d_name.len,
2743                dentry->d_name.name, dentry->d_inode,
2744                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2745                (__u64) end);
2746
2747         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2748         if (cl_mode < 0)
2749                 GOTO(out, result = cl_mode);
2750
2751         /* Get IO environment */
2752         result = cl_io_get(inode, &env, &io, &refcheck);
2753         if (result <= 0)
2754                 GOTO(out, result);
2755
2756         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2757         if (result > 0) {
2758                 /*
2759                  * nothing to do for this io. This currently happens when
2760                  * stripe sub-object's are not yet created.
2761                  */
2762                 result = io->ci_result;
2763         } else if (result == 0) {
2764                 lock = vvp_env_lock(env);
2765                 descr = &lock->cll_descr;
2766
2767                 descr->cld_obj   = io->ci_obj;
2768                 /* Convert byte offsets to pages */
2769                 descr->cld_start = cl_index(io->ci_obj, start);
2770                 descr->cld_end   = cl_index(io->ci_obj, end);
2771                 descr->cld_mode  = cl_mode;
2772                 /* CEF_MUST is used because we do not want to convert a
2773                  * lockahead request to a lockless lock */
2774                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2775                                        CEF_NONBLOCK;
2776
2777                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2778                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2779
2780                 result = cl_lock_request(env, io, lock);
2781
2782                 /* On success, we need to release the lock */
2783                 if (result >= 0)
2784                         cl_lock_release(env, lock);
2785         }
2786         cl_io_fini(env, io);
2787         cl_env_put(env, &refcheck);
2788
2789         /* -ECANCELED indicates a matching lock with a different extent
2790          * was already present, and -EEXIST indicates a matching lock
2791          * on exactly the same extent was already present.
2792          * We convert them to positive values for userspace to make
2793          * recognizing true errors easier.
2794          * Note we can only return these detailed results on async requests,
2795          * as sync requests look the same as i/o requests for locking. */
2796         if (result == -ECANCELED)
2797                 result = LLA_RESULT_DIFFERENT;
2798         else if (result == -EEXIST)
2799                 result = LLA_RESULT_SAME;
2800
2801 out:
2802         RETURN(result);
2803 }
2804 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2805
2806 static int ll_ladvise_sanity(struct inode *inode,
2807                              struct llapi_lu_ladvise *ladvise)
2808 {
2809         enum lu_ladvise_type advice = ladvise->lla_advice;
2810         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2811          * be in the first 32 bits of enum ladvise_flags */
2812         __u32 flags = ladvise->lla_peradvice_flags;
2813         /* 3 lines at 80 characters per line, should be plenty */
2814         int rc = 0;
2815
2816         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2817                 rc = -EINVAL;
2818                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2819                        "last supported advice is %s (value '%d'): rc = %d\n",
2820                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2821                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2822                 GOTO(out, rc);
2823         }
2824
2825         /* Per-advice checks */
2826         switch (advice) {
2827         case LU_LADVISE_LOCKNOEXPAND:
2828                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2829                         rc = -EINVAL;
2830                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2831                                "rc = %d\n",
2832                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2833                                ladvise_names[advice], rc);
2834                         GOTO(out, rc);
2835                 }
2836                 break;
2837         case LU_LADVISE_LOCKAHEAD:
2838                 /* Currently only READ and WRITE modes can be requested */
2839                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2840                     ladvise->lla_lockahead_mode == 0) {
2841                         rc = -EINVAL;
2842                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2843                                "rc = %d\n",
2844                                ll_get_fsname(inode->i_sb, NULL, 0),
2845                                ladvise->lla_lockahead_mode,
2846                                ladvise_names[advice], rc);
2847                         GOTO(out, rc);
2848                 }
2849         case LU_LADVISE_WILLREAD:
2850         case LU_LADVISE_DONTNEED:
2851         default:
2852                 /* Note fall through above - These checks apply to all advices
2853                  * except LOCKNOEXPAND */
2854                 if (flags & ~LF_DEFAULT_MASK) {
2855                         rc = -EINVAL;
2856                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2857                                "rc = %d\n",
2858                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2859                                ladvise_names[advice], rc);
2860                         GOTO(out, rc);
2861                 }
2862                 if (ladvise->lla_start >= ladvise->lla_end) {
2863                         rc = -EINVAL;
2864                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2865                                "for %s: rc = %d\n",
2866                                ll_get_fsname(inode->i_sb, NULL, 0),
2867                                ladvise->lla_start, ladvise->lla_end,
2868                                ladvise_names[advice], rc);
2869                         GOTO(out, rc);
2870                 }
2871                 break;
2872         }
2873
2874 out:
2875         return rc;
2876 }
2877 #undef ERRSIZE
2878
2879 /*
2880  * Give file access advices
2881  *
2882  * The ladvise interface is similar to Linux fadvise() system call, except it
2883  * forwards the advices directly from Lustre client to server. The server side
2884  * codes will apply appropriate read-ahead and caching techniques for the
2885  * corresponding files.
2886  *
2887  * A typical workload for ladvise is e.g. a bunch of different clients are
2888  * doing small random reads of a file, so prefetching pages into OSS cache
2889  * with big linear reads before the random IO is a net benefit. Fetching
2890  * all that data into each client cache with fadvise() may not be, due to
2891  * much more data being sent to the client.
2892  */
2893 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2894                       struct llapi_lu_ladvise *ladvise)
2895 {
2896         struct lu_env *env;
2897         struct cl_io *io;
2898         struct cl_ladvise_io *lio;
2899         int rc;
2900         __u16 refcheck;
2901         ENTRY;
2902
2903         env = cl_env_get(&refcheck);
2904         if (IS_ERR(env))
2905                 RETURN(PTR_ERR(env));
2906
2907         io = vvp_env_thread_io(env);
2908         io->ci_obj = ll_i2info(inode)->lli_clob;
2909
2910         /* initialize parameters for ladvise */
2911         lio = &io->u.ci_ladvise;
2912         lio->li_start = ladvise->lla_start;
2913         lio->li_end = ladvise->lla_end;
2914         lio->li_fid = ll_inode2fid(inode);
2915         lio->li_advice = ladvise->lla_advice;
2916         lio->li_flags = flags;
2917
2918         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2919                 rc = cl_io_loop(env, io);
2920         else
2921                 rc = io->ci_result;
2922
2923         cl_io_fini(env, io);
2924         cl_env_put(env, &refcheck);
2925         RETURN(rc);
2926 }
2927
2928 static int ll_lock_noexpand(struct file *file, int flags)
2929 {
2930         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2931
2932         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2933
2934         return 0;
2935 }
2936
2937 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2938                         unsigned long arg)
2939 {
2940         struct fsxattr fsxattr;
2941
2942         if (copy_from_user(&fsxattr,
2943                            (const struct fsxattr __user *)arg,
2944                            sizeof(fsxattr)))
2945                 RETURN(-EFAULT);
2946
2947         fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2948         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2949         if (copy_to_user((struct fsxattr __user *)arg,
2950                          &fsxattr, sizeof(fsxattr)))
2951                 RETURN(-EFAULT);
2952
2953         RETURN(0);
2954 }
2955
2956 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2957                         unsigned long arg)
2958 {
2959
2960         struct md_op_data *op_data;
2961         struct ptlrpc_request *req = NULL;
2962         int rc = 0;
2963         struct fsxattr fsxattr;
2964         struct cl_object *obj;
2965
2966         /* only root could change project ID */
2967         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2968                 RETURN(-EPERM);
2969
2970         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2971                                      LUSTRE_OPC_ANY, NULL);
2972         if (IS_ERR(op_data))
2973                 RETURN(PTR_ERR(op_data));
2974
2975         if (copy_from_user(&fsxattr,
2976                            (const struct fsxattr __user *)arg,
2977                            sizeof(fsxattr)))
2978                 GOTO(out_fsxattr1, rc = -EFAULT);
2979
2980         op_data->op_attr_flags = fsxattr.fsx_xflags;
2981         op_data->op_projid = fsxattr.fsx_projid;
2982         op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2983         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2984                         0, &req);
2985         ptlrpc_req_finished(req);
2986
2987         obj = ll_i2info(inode)->lli_clob;
2988         if (obj) {
2989                 struct iattr *attr;
2990
2991                 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2992                 OBD_ALLOC_PTR(attr);
2993                 if (attr == NULL)
2994                         GOTO(out_fsxattr1, rc = -ENOMEM);
2995                 attr->ia_valid = ATTR_ATTR_FLAG;
2996                 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2997
2998                 OBD_FREE_PTR(attr);
2999         }
3000 out_fsxattr1:
3001         ll_finish_md_op_data(op_data);
3002         RETURN(rc);
3003 }
3004
3005 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3006                                  unsigned long arg)
3007 {
3008         struct inode            *inode = file_inode(file);
3009         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3010         struct ll_inode_info    *lli = ll_i2info(inode);
3011         struct obd_client_handle *och = NULL;
3012         struct split_param sp;
3013         bool lease_broken;
3014         fmode_t fmode = 0;
3015         enum mds_op_bias bias = 0;
3016         struct file *layout_file = NULL;
3017         void *data = NULL;
3018         size_t data_size = 0;
3019         long rc;
3020         ENTRY;
3021
3022         mutex_lock(&lli->lli_och_mutex);
3023         if (fd->fd_lease_och != NULL) {
3024                 och = fd->fd_lease_och;
3025                 fd->fd_lease_och = NULL;
3026         }
3027         mutex_unlock(&lli->lli_och_mutex);
3028
3029         if (och == NULL)
3030                 GOTO(out, rc = -ENOLCK);
3031
3032         fmode = och->och_flags;
3033
3034         switch (ioc->lil_flags) {
3035         case LL_LEASE_RESYNC_DONE:
3036                 if (ioc->lil_count > IOC_IDS_MAX)
3037                         GOTO(out, rc = -EINVAL);
3038
3039                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3040                 OBD_ALLOC(data, data_size);
3041                 if (!data)
3042                         GOTO(out, rc = -ENOMEM);
3043
3044                 if (copy_from_user(data, (void __user *)arg, data_size))
3045                         GOTO(out, rc = -EFAULT);
3046
3047                 bias = MDS_CLOSE_RESYNC_DONE;
3048                 break;
3049         case LL_LEASE_LAYOUT_MERGE: {
3050                 int fd;
3051
3052                 if (ioc->lil_count != 1)
3053                         GOTO(out, rc = -EINVAL);
3054
3055                 arg += sizeof(*ioc);
3056                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3057                         GOTO(out, rc = -EFAULT);
3058
3059                 layout_file = fget(fd);
3060                 if (!layout_file)
3061                         GOTO(out, rc = -EBADF);
3062
3063                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3064                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3065                         GOTO(out, rc = -EPERM);
3066
3067                 data = file_inode(layout_file);
3068                 bias = MDS_CLOSE_LAYOUT_MERGE;
3069                 break;
3070         }
3071         case LL_LEASE_LAYOUT_SPLIT: {
3072                 int fdv;
3073                 int mirror_id;
3074
3075                 if (ioc->lil_count != 2)
3076                         GOTO(out, rc = -EINVAL);
3077
3078                 arg += sizeof(*ioc);
3079                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3080                         GOTO(out, rc = -EFAULT);
3081
3082                 arg += sizeof(__u32);
3083                 if (copy_from_user(&mirror_id, (void __user *)arg,
3084                                    sizeof(__u32)))
3085                         GOTO(out, rc = -EFAULT);
3086
3087                 layout_file = fget(fdv);
3088                 if (!layout_file)
3089                         GOTO(out, rc = -EBADF);
3090
3091                 sp.sp_inode = file_inode(layout_file);
3092                 sp.sp_mirror_id = (__u16)mirror_id;
3093                 data = &sp;
3094                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3095                 break;
3096         }
3097         default:
3098                 /* without close intent */
3099                 break;
3100         }
3101
3102         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3103         if (rc < 0)
3104                 GOTO(out, rc);
3105
3106         rc = ll_lease_och_release(inode, file);
3107         if (rc < 0)
3108                 GOTO(out, rc);
3109
3110         if (lease_broken)
3111                 fmode = 0;
3112         EXIT;
3113
3114 out:
3115         switch (ioc->lil_flags) {
3116         case LL_LEASE_RESYNC_DONE:
3117                 if (data)
3118                         OBD_FREE(data, data_size);
3119                 break;
3120         case LL_LEASE_LAYOUT_MERGE:
3121         case LL_LEASE_LAYOUT_SPLIT:
3122                 if (layout_file)
3123                         fput(layout_file);
3124                 break;
3125         }
3126
3127         if (!rc)
3128                 rc = ll_lease_type_from_fmode(fmode);
3129         RETURN(rc);
3130 }
3131
3132 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3133                               unsigned long arg)
3134 {
3135         struct inode *inode = file_inode(file);
3136         struct ll_inode_info *lli = ll_i2info(inode);
3137         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3138         struct obd_client_handle *och = NULL;
3139         __u64 open_flags = 0;
3140         bool lease_broken;
3141         fmode_t fmode;
3142         long rc;
3143         ENTRY;
3144
3145         switch (ioc->lil_mode) {
3146         case LL_LEASE_WRLCK:
3147                 if (!(file->f_mode & FMODE_WRITE))
3148                         RETURN(-EPERM);
3149                 fmode = FMODE_WRITE;
3150                 break;
3151         case LL_LEASE_RDLCK:
3152                 if (!(file->f_mode & FMODE_READ))
3153                         RETURN(-EPERM);
3154                 fmode = FMODE_READ;
3155                 break;
3156         case LL_LEASE_UNLCK:
3157                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3158         default:
3159                 RETURN(-EINVAL);
3160         }
3161
3162         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3163
3164         /* apply for lease */
3165         if (ioc->lil_flags & LL_LEASE_RESYNC)
3166                 open_flags = MDS_OPEN_RESYNC;
3167         och = ll_lease_open(inode, file, fmode, open_flags);
3168         if (IS_ERR(och))
3169                 RETURN(PTR_ERR(och));
3170
3171         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3172                 rc = ll_lease_file_resync(och, inode);
3173                 if (rc) {
3174                         ll_lease_close(och, inode, NULL);
3175                         RETURN(rc);
3176                 }
3177                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3178                 if (rc) {
3179                         ll_lease_close(och, inode, NULL);
3180                         RETURN(rc);
3181                 }
3182         }
3183
3184         rc = 0;
3185         mutex_lock(&lli->lli_och_mutex);
3186         if (fd->fd_lease_och == NULL) {
3187                 fd->fd_lease_och = och;
3188                 och = NULL;
3189         }
3190         mutex_unlock(&lli->lli_och_mutex);
3191         if (och != NULL) {
3192                 /* impossible now that only excl is supported for now */
3193                 ll_lease_close(och, inode, &lease_broken);
3194                 rc = -EBUSY;
3195         }
3196         RETURN(rc);
3197 }
3198
3199 static long
3200 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3201 {
3202         struct inode            *inode = file_inode(file);
3203         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3204         int                      flags, rc;
3205         ENTRY;
3206
3207         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3208                PFID(ll_inode2fid(inode)), inode, cmd);
3209         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3210
3211         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3212         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3213                 RETURN(-ENOTTY);
3214
3215         switch(cmd) {
3216         case LL_IOC_GETFLAGS:
3217                 /* Get the current value of the file flags */
3218                 return put_user(fd->fd_flags, (int __user *)arg);
3219         case LL_IOC_SETFLAGS:
3220         case LL_IOC_CLRFLAGS:
3221                 /* Set or clear specific file flags */
3222                 /* XXX This probably needs checks to ensure the flags are
3223                  *     not abused, and to handle any flag side effects.
3224                  */
3225                 if (get_user(flags, (int __user *) arg))
3226                         RETURN(-EFAULT);
3227
3228                 if (cmd == LL_IOC_SETFLAGS) {
3229                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3230                             !(file->f_flags & O_DIRECT)) {
3231                                 CERROR("%s: unable to disable locking on "
3232                                        "non-O_DIRECT file\n", current->comm);
3233                                 RETURN(-EINVAL);
3234                         }
3235
3236                         fd->fd_flags |= flags;
3237                 } else {
3238                         fd->fd_flags &= ~flags;
3239                 }
3240                 RETURN(0);
3241         case LL_IOC_LOV_SETSTRIPE:
3242         case LL_IOC_LOV_SETSTRIPE_NEW:
3243                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3244         case LL_IOC_LOV_SETEA:
3245                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3246         case LL_IOC_LOV_SWAP_LAYOUTS: {
3247                 struct file *file2;
3248                 struct lustre_swap_layouts lsl;
3249
3250                 if (copy_from_user(&lsl, (char __user *)arg,
3251                                    sizeof(struct lustre_swap_layouts)))
3252                         RETURN(-EFAULT);
3253
3254                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3255                         RETURN(-EPERM);
3256
3257                 file2 = fget(lsl.sl_fd);
3258                 if (file2 == NULL)
3259                         RETURN(-EBADF);
3260
3261                 /* O_WRONLY or O_RDWR */
3262                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3263                         GOTO(out, rc = -EPERM);
3264
3265                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3266                         struct inode                    *inode2;
3267                         struct ll_inode_info            *lli;
3268                         struct obd_client_handle        *och = NULL;
3269
3270                         lli = ll_i2info(inode);
3271                         mutex_lock(&lli->lli_och_mutex);
3272                         if (fd->fd_lease_och != NULL) {
3273                                 och = fd->fd_lease_och;
3274                                 fd->fd_lease_och = NULL;
3275                         }
3276                         mutex_unlock(&lli->lli_och_mutex);
3277                         if (och == NULL)
3278                                 GOTO(out, rc = -ENOLCK);
3279                         inode2 = file_inode(file2);
3280                         rc = ll_swap_layouts_close(och, inode, inode2);
3281                 } else {
3282                         rc = ll_swap_layouts(file, file2, &lsl);
3283                 }
3284 out:
3285                 fput(file2);
3286                 RETURN(rc);
3287         }
3288         case LL_IOC_LOV_GETSTRIPE:
3289         case LL_IOC_LOV_GETSTRIPE_NEW:
3290                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3291         case FSFILT_IOC_GETFLAGS:
3292         case FSFILT_IOC_SETFLAGS:
3293                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3294         case FSFILT_IOC_GETVERSION_OLD:
3295         case FSFILT_IOC_GETVERSION:
3296                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3297         case LL_IOC_GROUP_LOCK:
3298                 RETURN(ll_get_grouplock(inode, file, arg));
3299         case LL_IOC_GROUP_UNLOCK:
3300                 RETURN(ll_put_grouplock(inode, file, arg));
3301         case IOC_OBD_STATFS:
3302                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3303
3304         /* We need to special case any other ioctls we want to handle,
3305          * to send them to the MDS/OST as appropriate and to properly
3306          * network encode the arg field.
3307         case FSFILT_IOC_SETVERSION_OLD:
3308         case FSFILT_IOC_SETVERSION:
3309         */
3310         case LL_IOC_FLUSHCTX:
3311                 RETURN(ll_flush_ctx(inode));
3312         case LL_IOC_PATH2FID: {
3313                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3314                                  sizeof(struct lu_fid)))
3315                         RETURN(-EFAULT);
3316
3317                 RETURN(0);
3318         }
3319         case LL_IOC_GETPARENT:
3320                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3321
3322         case OBD_IOC_FID2PATH:
3323                 RETURN(ll_fid2path(inode, (void __user *)arg));
3324         case LL_IOC_DATA_VERSION: {
3325                 struct ioc_data_version idv;
3326                 int rc;
3327
3328                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3329                         RETURN(-EFAULT);
3330
3331                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3332                 rc = ll_ioc_data_version(inode, &idv);
3333
3334                 if (rc == 0 &&
3335                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3336                         RETURN(-EFAULT);
3337
3338                 RETURN(rc);
3339         }
3340
3341         case LL_IOC_GET_MDTIDX: {
3342                 int mdtidx;
3343
3344                 mdtidx = ll_get_mdt_idx(inode);
3345                 if (mdtidx < 0)
3346                         RETURN(mdtidx);
3347
3348                 if (put_user((int)mdtidx, (int __user *)arg))
3349                         RETURN(-EFAULT);
3350
3351                 RETURN(0);
3352         }
3353         case OBD_IOC_GETDTNAME:
3354         case OBD_IOC_GETMDNAME:
3355                 RETURN(ll_get_obd_name(inode, cmd, arg));
3356         case LL_IOC_HSM_STATE_GET: {
3357                 struct md_op_data       *op_data;
3358                 struct hsm_user_state   *hus;
3359                 int                      rc;
3360
3361                 OBD_ALLOC_PTR(hus);
3362                 if (hus == NULL)
3363                         RETURN(-ENOMEM);
3364
3365                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3366                                              LUSTRE_OPC_ANY, hus);
3367                 if (IS_ERR(op_data)) {
3368                         OBD_FREE_PTR(hus);
3369                         RETURN(PTR_ERR(op_data));
3370                 }
3371
3372                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3373                                    op_data, NULL);
3374
3375                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3376                         rc = -EFAULT;
3377
3378                 ll_finish_md_op_data(op_data);
3379                 OBD_FREE_PTR(hus);
3380                 RETURN(rc);
3381         }
3382         case LL_IOC_HSM_STATE_SET: {
3383                 struct hsm_state_set    *hss;
3384                 int                      rc;
3385
3386                 OBD_ALLOC_PTR(hss);
3387                 if (hss == NULL)
3388                         RETURN(-ENOMEM);
3389
3390                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3391                         OBD_FREE_PTR(hss);
3392                         RETURN(-EFAULT);
3393                 }
3394
3395                 rc = ll_hsm_state_set(inode, hss);
3396
3397                 OBD_FREE_PTR(hss);
3398                 RETURN(rc);
3399         }
3400         case LL_IOC_HSM_ACTION: {
3401                 struct md_op_data               *op_data;
3402                 struct hsm_current_action       *hca;
3403                 int                              rc;
3404
3405                 OBD_ALLOC_PTR(hca);
3406                 if (hca == NULL)
3407                         RETURN(-ENOMEM);
3408
3409                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3410                                              LUSTRE_OPC_ANY, hca);
3411                 if (IS_ERR(op_data)) {
3412                         OBD_FREE_PTR(hca);
3413                         RETURN(PTR_ERR(op_data));
3414                 }
3415
3416                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3417                                    op_data, NULL);
3418
3419                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3420                         rc = -EFAULT;
3421
3422                 ll_finish_md_op_data(op_data);
3423                 OBD_FREE_PTR(hca);
3424                 RETURN(rc);
3425         }
3426         case LL_IOC_SET_LEASE_OLD: {
3427                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3428
3429                 RETURN(ll_file_set_lease(file, &ioc, 0));
3430         }
3431         case LL_IOC_SET_LEASE: {
3432                 struct ll_ioc_lease ioc;
3433
3434                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3435                         RETURN(-EFAULT);
3436
3437                 RETURN(ll_file_set_lease(file, &ioc, arg));
3438         }
3439         case LL_IOC_GET_LEASE: {
3440                 struct ll_inode_info *lli = ll_i2info(inode);
3441                 struct ldlm_lock *lock = NULL;
3442                 fmode_t fmode = 0;
3443
3444                 mutex_lock(&lli->lli_och_mutex);
3445                 if (fd->fd_lease_och != NULL) {
3446                         struct obd_client_handle *och = fd->fd_lease_och;
3447
3448                         lock = ldlm_handle2lock(&och->och_lease_handle);
3449                         if (lock != NULL) {
3450                                 lock_res_and_lock(lock);
3451                                 if (!ldlm_is_cancel(lock))
3452                                         fmode = och->och_flags;
3453
3454                                 unlock_res_and_lock(lock);
3455                                 LDLM_LOCK_PUT(lock);
3456                         }
3457                 }
3458                 mutex_unlock(&lli->lli_och_mutex);
3459
3460                 RETURN(ll_lease_type_from_fmode(fmode));
3461         }
3462         case LL_IOC_HSM_IMPORT: {
3463                 struct hsm_user_import *hui;
3464
3465                 OBD_ALLOC_PTR(hui);
3466                 if (hui == NULL)
3467                         RETURN(-ENOMEM);
3468
3469                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3470                         OBD_FREE_PTR(hui);
3471                         RETURN(-EFAULT);
3472                 }
3473
3474                 rc = ll_hsm_import(inode, file, hui);
3475
3476                 OBD_FREE_PTR(hui);
3477                 RETURN(rc);
3478         }
3479         case LL_IOC_FUTIMES_3: {
3480                 struct ll_futimes_3 lfu;
3481
3482                 if (copy_from_user(&lfu,
3483                                    (const struct ll_futimes_3 __user *)arg,
3484                                    sizeof(lfu)))
3485                         RETURN(-EFAULT);
3486
3487                 RETURN(ll_file_futimes_3(file, &lfu));
3488         }
3489         case LL_IOC_LADVISE: {
3490                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3491                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3492                 int i;
3493                 int num_advise;
3494                 int alloc_size = sizeof(*k_ladvise_hdr);
3495
3496                 rc = 0;
3497                 u_ladvise_hdr = (void __user *)arg;
3498                 OBD_ALLOC_PTR(k_ladvise_hdr);
3499                 if (k_ladvise_hdr == NULL)
3500                         RETURN(-ENOMEM);
3501
3502                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3503                         GOTO(out_ladvise, rc = -EFAULT);
3504
3505                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3506                     k_ladvise_hdr->lah_count < 1)
3507                         GOTO(out_ladvise, rc = -EINVAL);
3508
3509                 num_advise = k_ladvise_hdr->lah_count;
3510                 if (num_advise >= LAH_COUNT_MAX)
3511                         GOTO(out_ladvise, rc = -EFBIG);
3512
3513                 OBD_FREE_PTR(k_ladvise_hdr);
3514                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3515                                       lah_advise[num_advise]);
3516                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3517                 if (k_ladvise_hdr == NULL)
3518                         RETURN(-ENOMEM);
3519
3520                 /*
3521                  * TODO: submit multiple advices to one server in a single RPC
3522                  */
3523                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3524                         GOTO(out_ladvise, rc = -EFAULT);
3525
3526                 for (i = 0; i < num_advise; i++) {
3527                         struct llapi_lu_ladvise *k_ladvise =
3528                                         &k_ladvise_hdr->lah_advise[i];
3529                         struct llapi_lu_ladvise __user *u_ladvise =
3530                                         &u_ladvise_hdr->lah_advise[i];
3531
3532                         rc = ll_ladvise_sanity(inode, k_ladvise);
3533                         if (rc)
3534                                 GOTO(out_ladvise, rc);
3535
3536                         switch (k_ladvise->lla_advice) {
3537                         case LU_LADVISE_LOCKNOEXPAND:
3538                                 rc = ll_lock_noexpand(file,
3539                                                k_ladvise->lla_peradvice_flags);
3540                                 GOTO(out_ladvise, rc);
3541                         case LU_LADVISE_LOCKAHEAD:
3542
3543                                 rc = ll_file_lock_ahead(file, k_ladvise);
3544
3545                                 if (rc < 0)
3546                                         GOTO(out_ladvise, rc);
3547
3548                                 if (put_user(rc,
3549                                              &u_ladvise->lla_lockahead_result))
3550                                         GOTO(out_ladvise, rc = -EFAULT);
3551                                 break;
3552                         default:
3553                                 rc = ll_ladvise(inode, file,
3554                                                 k_ladvise_hdr->lah_flags,
3555                                                 k_ladvise);
3556                                 if (rc)
3557                                         GOTO(out_ladvise, rc);
3558                                 break;
3559                         }
3560
3561                 }
3562
3563 out_ladvise:
3564                 OBD_FREE(k_ladvise_hdr, alloc_size);
3565                 RETURN(rc);
3566         }
3567         case LL_IOC_FLR_SET_MIRROR: {
3568                 /* mirror I/O must be direct to avoid polluting page cache
3569                  * by stale data. */
3570                 if (!(file->f_flags & O_DIRECT))
3571                         RETURN(-EINVAL);
3572
3573                 fd->fd_designated_mirror = (__u32)arg;
3574                 RETURN(0);
3575         }
3576         case LL_IOC_FSGETXATTR:
3577                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3578         case LL_IOC_FSSETXATTR:
3579                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3580         case BLKSSZGET:
3581                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3582         default:
3583                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3584                                      (void __user *)arg));
3585         }
3586 }
3587
3588 #ifndef HAVE_FILE_LLSEEK_SIZE
3589 static inline loff_t
3590 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3591 {
3592         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3593                 return -EINVAL;
3594         if (offset > maxsize)
3595                 return -EINVAL;
3596
3597         if (offset != file->f_pos) {
3598                 file->f_pos = offset;
3599                 file->f_version = 0;
3600         }
3601         return offset;
3602 }
3603
3604 static loff_t
3605 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3606                 loff_t maxsize, loff_t eof)
3607 {
3608         struct inode *inode = file_inode(file);
3609
3610         switch (origin) {
3611         case SEEK_END:
3612                 offset += eof;
3613                 break;
3614         case SEEK_CUR:
3615                 /*
3616                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3617                  * position-querying operation.  Avoid rewriting the "same"
3618                  * f_pos value back to the file because a concurrent read(),
3619                  * write() or lseek() might have altered it
3620                  */
3621                 if (offset == 0)
3622                         return file->f_pos;
3623                 /*
3624                  * f_lock protects against read/modify/write race with other
3625                  * SEEK_CURs. Note that parallel writes and reads behave
3626                  * like SEEK_SET.
3627                  */
3628                 inode_lock(inode);
3629                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3630                 inode_unlock(inode);
3631                 return offset;
3632         case SEEK_DATA:
3633                 /*
3634                  * In the generic case the entire file is data, so as long as
3635                  * offset isn't at the end of the file then the offset is data.
3636                  */
3637                 if (offset >= eof)
3638                         return -ENXIO;
3639                 break;
3640         case SEEK_HOLE:
3641                 /*
3642                  * There is a virtual hole at the end of the file, so as long as
3643                  * offset isn't i_size or larger, return i_size.
3644                  */
3645                 if (offset >= eof)
3646                         return -ENXIO;
3647                 offset = eof;
3648                 break;
3649         }
3650
3651         return llseek_execute(file, offset, maxsize);
3652 }
3653 #endif
3654
3655 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3656 {
3657         struct inode *inode = file_inode(file);
3658         loff_t retval, eof = 0;
3659
3660         ENTRY;
3661         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3662                            (origin == SEEK_CUR) ? file->f_pos : 0);
3663         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3664                PFID(ll_inode2fid(inode)), inode, retval, retval,
3665                origin);
3666         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3667
3668         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3669                 retval = ll_glimpse_size(inode);
3670                 if (retval != 0)
3671                         RETURN(retval);
3672                 eof = i_size_read(inode);
3673         }
3674
3675         retval = ll_generic_file_llseek_size(file, offset, origin,
3676                                           ll_file_maxbytes(inode), eof);
3677         RETURN(retval);
3678 }
3679
3680 static int ll_flush(struct file *file, fl_owner_t id)
3681 {
3682         struct inode *inode = file_inode(file);
3683         struct ll_inode_info *lli = ll_i2info(inode);
3684         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3685         int rc, err;
3686
3687         LASSERT(!S_ISDIR(inode->i_mode));
3688
3689         /* catch async errors that were recorded back when async writeback
3690          * failed for pages in this mapping. */
3691         rc = lli->lli_async_rc;
3692         lli->lli_async_rc = 0;
3693         if (lli->lli_clob != NULL) {
3694                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3695                 if (rc == 0)
3696                         rc = err;
3697         }
3698
3699         /* The application has been told write failure already.
3700          * Do not report failure again. */
3701         if (fd->fd_write_failed)
3702                 return 0;
3703         return rc ? -EIO : 0;
3704 }
3705
3706 /**
3707  * Called to make sure a portion of file has been written out.
3708  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3709  *
3710  * Return how many pages have been written.
3711  */
3712 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3713                        enum cl_fsync_mode mode, int ignore_layout)
3714 {
3715         struct lu_env *env;
3716         struct cl_io *io;
3717         struct cl_fsync_io *fio;
3718         int result;
3719         __u16 refcheck;
3720         ENTRY;
3721
3722         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3723             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3724                 RETURN(-EINVAL);
3725
3726         env = cl_env_get(&refcheck);
3727         if (IS_ERR(env))
3728                 RETURN(PTR_ERR(env));
3729
3730         io = vvp_env_thread_io(env);
3731         io->ci_obj = ll_i2info(inode)->lli_clob;
3732         io->ci_ignore_layout = ignore_layout;
3733
3734         /* initialize parameters for sync */
3735         fio = &io->u.ci_fsync;
3736         fio->fi_start = start;
3737         fio->fi_end = end;
3738         fio->fi_fid = ll_inode2fid(inode);
3739         fio->fi_mode = mode;
3740         fio->fi_nr_written = 0;
3741
3742         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3743                 result = cl_io_loop(env, io);
3744         else
3745                 result = io->ci_result;
3746         if (result == 0)
3747                 result = fio->fi_nr_written;
3748         cl_io_fini(env, io);
3749         cl_env_put(env, &refcheck);
3750
3751         RETURN(result);
3752 }
3753
3754 /*
3755  * When dentry is provided (the 'else' case), file_dentry() may be
3756  * null and dentry must be used directly rather than pulled from
3757  * file_dentry() as is done otherwise.
3758  */
3759
3760 #ifdef HAVE_FILE_FSYNC_4ARGS
3761 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3762 {
3763         struct dentry *dentry = file_dentry(file);
3764         bool lock_inode;
3765 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3766 int ll_fsync(struct file *file, int datasync)
3767 {
3768         struct dentry *dentry = file_dentry(file);
3769         loff_t start = 0;
3770         loff_t end = LLONG_MAX;
3771 #else
3772 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3773 {
3774         loff_t start = 0;
3775         loff_t end = LLONG_MAX;
3776 #endif
3777         struct inode *inode = dentry->d_inode;
3778         struct ll_inode_info *lli = ll_i2info(inode);
3779         struct ptlrpc_request *req;
3780         int rc, err;
3781         ENTRY;
3782
3783         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3784                PFID(ll_inode2fid(inode)), inode);
3785         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3786
3787 #ifdef HAVE_FILE_FSYNC_4ARGS
3788         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3789         lock_inode = !lli->lli_inode_locked;
3790         if (lock_inode)
3791                 inode_lock(inode);
3792 #else
3793         /* fsync's caller has already called _fdata{sync,write}, we want
3794          * that IO to finish before calling the osc and mdc sync methods */
3795         rc = filemap_fdatawait(inode->i_mapping);
3796 #endif
3797
3798         /* catch async errors that were recorded back when async writeback
3799          * failed for pages in this mapping. */
3800         if (!S_ISDIR(inode->i_mode)) {
3801                 err = lli->lli_async_rc;
3802                 lli->lli_async_rc = 0;
3803                 if (rc == 0)
3804                         rc = err;
3805                 if (lli->lli_clob != NULL) {
3806                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3807                         if (rc == 0)
3808                                 rc = err;
3809                 }
3810         }
3811
3812         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3813         if (!rc)
3814                 rc = err;
3815         if (!err)
3816                 ptlrpc_req_finished(req);
3817
3818         if (S_ISREG(inode->i_mode)) {
3819                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3820
3821                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3822                 if (rc == 0 && err < 0)
3823                         rc = err;
3824                 if (rc < 0)
3825                         fd->fd_write_failed = true;
3826                 else
3827                         fd->fd_write_failed = false;
3828         }
3829
3830 #ifdef HAVE_FILE_FSYNC_4ARGS
3831         if (lock_inode)
3832                 inode_unlock(inode);
3833 #endif
3834         RETURN(rc);
3835 }
3836
3837 static int
3838 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3839 {
3840         struct inode *inode = file_inode(file);
3841         struct ll_sb_info *sbi = ll_i2sbi(inode);
3842         struct ldlm_enqueue_info einfo = {
3843                 .ei_type        = LDLM_FLOCK,
3844                 .ei_cb_cp       = ldlm_flock_completion_ast,
3845                 .ei_cbdata      = file_lock,
3846         };
3847         struct md_op_data *op_data;
3848         struct lustre_handle lockh = { 0 };
3849         union ldlm_policy_data flock = { { 0 } };
3850         int fl_type = file_lock->fl_type;
3851         __u64 flags = 0;
3852         int rc;
3853         int rc2 = 0;
3854         ENTRY;
3855
3856         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3857                PFID(ll_inode2fid(inode)), file_lock);
3858
3859         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3860
3861         if (file_lock->fl_flags & FL_FLOCK) {
3862                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3863                 /* flocks are whole-file locks */
3864                 flock.l_flock.end = OFFSET_MAX;
3865                 /* For flocks owner is determined by the local file desctiptor*/
3866                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3867         } else if (file_lock->fl_flags & FL_POSIX) {
3868                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3869                 flock.l_flock.start = file_lock->fl_start;
3870                 flock.l_flock.end = file_lock->fl_end;
3871         } else {
3872                 RETURN(-EINVAL);
3873         }
3874         flock.l_flock.pid = file_lock->fl_pid;
3875
3876         /* Somewhat ugly workaround for svc lockd.
3877          * lockd installs custom fl_lmops->lm_compare_owner that checks
3878          * for the fl_owner to be the same (which it always is on local node
3879          * I guess between lockd processes) and then compares pid.
3880          * As such we assign pid to the owner field to make it all work,
3881          * conflict with normal locks is unlikely since pid space and
3882          * pointer space for current->files are not intersecting */
3883         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3884                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3885
3886         switch (fl_type) {
3887         case F_RDLCK:
3888                 einfo.ei_mode = LCK_PR;
3889                 break;
3890         case F_UNLCK:
3891                 /* An unlock request may or may not have any relation to
3892                  * existing locks so we may not be able to pass a lock handle
3893                  * via a normal ldlm_lock_cancel() request. The request may even
3894                  * unlock a byte range in the middle of an existing lock. In
3895                  * order to process an unlock request we need all of the same
3896                  * information that is given with a normal read or write record
3897                  * lock request. To avoid creating another ldlm unlock (cancel)
3898                  * message we'll treat a LCK_NL flock request as an unlock. */
3899                 einfo.ei_mode = LCK_NL;
3900                 break;
3901         case F_WRLCK:
3902                 einfo.ei_mode = LCK_PW;
3903                 break;
3904         default:
3905                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3906                 RETURN (-ENOTSUPP);
3907         }
3908
3909         switch (cmd) {
3910         case F_SETLKW:
3911 #ifdef F_SETLKW64
3912         case F_SETLKW64:
3913 #endif
3914                 flags = 0;
3915                 break;
3916         case F_SETLK:
3917 #ifdef F_SETLK64
3918         case F_SETLK64:
3919 #endif
3920                 flags = LDLM_FL_BLOCK_NOWAIT;
3921                 break;
3922         case F_GETLK:
3923 #ifdef F_GETLK64
3924         case F_GETLK64:
3925 #endif
3926                 flags = LDLM_FL_TEST_LOCK;
3927                 break;
3928         default:
3929                 CERROR("unknown fcntl lock command: %d\n", cmd);
3930                 RETURN (-EINVAL);
3931         }
3932
3933         /* Save the old mode so that if the mode in the lock changes we
3934          * can decrement the appropriate reader or writer refcount. */
3935         file_lock->fl_type = einfo.ei_mode;
3936
3937         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3938                                      LUSTRE_OPC_ANY, NULL);
3939         if (IS_ERR(op_data))
3940                 RETURN(PTR_ERR(op_data));
3941
3942         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3943                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3944                flock.l_flock.pid, flags, einfo.ei_mode,
3945                flock.l_flock.start, flock.l_flock.end);
3946
3947         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3948                         flags);
3949
3950         /* Restore the file lock type if not TEST lock. */
3951         if (!(flags & LDLM_FL_TEST_LOCK))
3952                 file_lock->fl_type = fl_type;
3953
3954 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3955         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3956             !(flags & LDLM_FL_TEST_LOCK))
3957                 rc2  = locks_lock_file_wait(file, file_lock);
3958 #else
3959         if ((file_lock->fl_flags & FL_FLOCK) &&
3960             (rc == 0 || file_lock->fl_type == F_UNLCK))
3961                 rc2  = flock_lock_file_wait(file, file_lock);
3962         if ((file_lock->fl_flags & FL_POSIX) &&
3963             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3964             !(flags & LDLM_FL_TEST_LOCK))
3965                 rc2  = posix_lock_file_wait(file, file_lock);
3966 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3967
3968         if (rc2 && file_lock->fl_type != F_UNLCK) {
3969                 einfo.ei_mode = LCK_NL;
3970                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3971                            &lockh, flags);
3972                 rc = rc2;
3973         }
3974
3975         ll_finish_md_op_data(op_data);
3976
3977         RETURN(rc);
3978 }
3979
3980 int ll_get_fid_by_name(struct inode *parent, const char *name,
3981                        int namelen, struct lu_fid *fid,
3982                        struct inode **inode)
3983 {
3984         struct md_op_data       *op_data = NULL;
3985         struct mdt_body         *body;
3986         struct ptlrpc_request   *req;
3987         int                     rc;
3988         ENTRY;
3989
3990         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3991                                      LUSTRE_OPC_ANY, NULL);
3992         if (IS_ERR(op_data))
3993                 RETURN(PTR_ERR(op_data));
3994
3995         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3996         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3997         ll_finish_md_op_data(op_data);
3998         if (rc < 0)
3999                 RETURN(rc);
4000
4001         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4002         if (body == NULL)
4003                 GOTO(out_req, rc = -EFAULT);
4004         if (fid != NULL)
4005                 *fid = body->mbo_fid1;
4006
4007         if (inode != NULL)
4008                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4009 out_req:
4010         ptlrpc_req_finished(req);
4011         RETURN(rc);
4012 }
4013
4014 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
4015                const char *name, int namelen)
4016 {
4017         struct dentry         *dchild = NULL;
4018         struct inode          *child_inode = NULL;
4019         struct md_op_data     *op_data;
4020         struct ptlrpc_request *request = NULL;
4021         struct obd_client_handle *och = NULL;
4022         struct qstr           qstr;
4023         struct mdt_body         *body;
4024         int                    rc;
4025         __u64                   data_version = 0;
4026         ENTRY;
4027
4028         CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
4029                name, PFID(ll_inode2fid(parent)), mdtidx);
4030
4031         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4032                                      0, LUSTRE_OPC_ANY, NULL);
4033         if (IS_ERR(op_data))
4034                 RETURN(PTR_ERR(op_data));
4035
4036         /* Get child FID first */
4037         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4038         qstr.name = name;
4039         qstr.len = namelen;
4040         dchild = d_lookup(file_dentry(file), &qstr);
4041         if (dchild != NULL) {
4042                 if (dchild->d_inode != NULL)
4043                         child_inode = igrab(dchild->d_inode);
4044                 dput(dchild);
4045         }
4046
4047         if (child_inode == NULL) {
4048                 rc = ll_get_fid_by_name(parent, name, namelen,
4049                                         &op_data->op_fid3, &child_inode);
4050                 if (rc != 0)
4051                         GOTO(out_free, rc);
4052         }
4053
4054         if (child_inode == NULL)
4055                 GOTO(out_free, rc = -EINVAL);
4056
4057         /*
4058          * lfs migrate command needs to be blocked on the client
4059          * by checking the migrate FID against the FID of the
4060          * filesystem root.
4061          */
4062         if (child_inode == parent->i_sb->s_root->d_inode)
4063                 GOTO(out_iput, rc = -EINVAL);
4064
4065         inode_lock(child_inode);
4066         op_data->op_fid3 = *ll_inode2fid(child_inode);
4067         if (!fid_is_sane(&op_data->op_fid3)) {
4068                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4069                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4070                        PFID(&op_data->op_fid3));
4071                 GOTO(out_unlock, rc = -EINVAL);
4072         }
4073
4074         rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4075         if (rc < 0)
4076                 GOTO(out_unlock, rc);
4077
4078         if (rc == mdtidx) {
4079                 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4080                        PFID(&op_data->op_fid3), mdtidx);
4081                 GOTO(out_unlock, rc = 0);
4082         }
4083 again:
4084         if (S_ISREG(child_inode->i_mode)) {
4085                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4086                 if (IS_ERR(och)) {
4087                         rc = PTR_ERR(och);
4088                         och = NULL;
4089                         GOTO(out_unlock, rc);
4090                 }
4091
4092                 rc = ll_data_version(child_inode, &data_version,
4093                                      LL_DV_WR_FLUSH);
4094                 if (rc != 0)
4095                         GOTO(out_close, rc);
4096
4097                 op_data->op_handle = och->och_fh;
4098                 op_data->op_data = och->och_mod;
4099                 op_data->op_data_version = data_version;
4100                 op_data->op_lease_handle = och->och_lease_handle;
4101                 op_data->op_bias |= MDS_RENAME_MIGRATE;
4102         }
4103
4104         op_data->op_mds = mdtidx;
4105         op_data->op_cli_flags = CLI_MIGRATE;
4106         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4107                        namelen, name, namelen, &request);
4108         if (rc == 0) {
4109                 LASSERT(request != NULL);
4110                 ll_update_times(request, parent);
4111
4112                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4113                 LASSERT(body != NULL);
4114
4115                 /* If the server does release layout lock, then we cleanup
4116                  * the client och here, otherwise release it in out_close: */
4117                 if (och != NULL &&
4118                     body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4119                         obd_mod_put(och->och_mod);
4120                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4121                                                   och);
4122                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4123                         OBD_FREE_PTR(och);
4124                         och = NULL;
4125                 }
4126         }
4127
4128         if (request != NULL) {
4129                 ptlrpc_req_finished(request);
4130                 request = NULL;
4131         }
4132
4133         /* Try again if the file layout has changed. */
4134         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4135                 goto again;
4136
4137 out_close:
4138         if (och != NULL) /* close the file */
4139                 ll_lease_close(och, child_inode, NULL);
4140         if (rc == 0)
4141                 clear_nlink(child_inode);
4142 out_unlock:
4143         inode_unlock(child_inode);
4144 out_iput:
4145         iput(child_inode);
4146 out_free:
4147         ll_finish_md_op_data(op_data);
4148         RETURN(rc);
4149 }
4150
4151 static int
4152 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4153 {
4154         ENTRY;
4155
4156         RETURN(-ENOSYS);
4157 }
4158
4159 /**
4160  * test if some locks matching bits and l_req_mode are acquired
4161  * - bits can be in different locks
4162  * - if found clear the common lock bits in *bits
4163  * - the bits not found, are kept in *bits
4164  * \param inode [IN]
4165  * \param bits [IN] searched lock bits [IN]
4166  * \param l_req_mode [IN] searched lock mode
4167  * \retval boolean, true iff all bits are found
4168  */
4169 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4170 {
4171         struct lustre_handle lockh;
4172         union ldlm_policy_data policy;
4173         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4174                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4175         struct lu_fid *fid;
4176         __u64 flags;
4177         int i;
4178         ENTRY;
4179
4180         if (!inode)
4181                RETURN(0);
4182
4183         fid = &ll_i2info(inode)->lli_fid;
4184         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4185                ldlm_lockname[mode]);
4186
4187         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4188         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4189                 policy.l_inodebits.bits = *bits & (1 << i);
4190                 if (policy.l_inodebits.bits == 0)
4191                         continue;
4192
4193                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4194                                   &policy, mode, &lockh)) {
4195                         struct ldlm_lock *lock;
4196
4197                         lock = ldlm_handle2lock(&lockh);
4198                         if (lock) {
4199                                 *bits &=
4200                                       ~(lock->l_policy_data.l_inodebits.bits);
4201                                 LDLM_LOCK_PUT(lock);
4202                         } else {
4203                                 *bits &= ~policy.l_inodebits.bits;
4204                         }
4205                 }
4206         }
4207         RETURN(*bits == 0);
4208 }
4209
4210 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4211                                struct lustre_handle *lockh, __u64 flags,
4212                                enum ldlm_mode mode)
4213 {
4214         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4215         struct lu_fid *fid;
4216         enum ldlm_mode rc;
4217         ENTRY;
4218
4219         fid = &ll_i2info(inode)->lli_fid;
4220         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4221
4222         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4223                            fid, LDLM_IBITS, &policy, mode, lockh);
4224
4225         RETURN(rc);
4226 }
4227
4228 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4229 {
4230         /* Already unlinked. Just update nlink and return success */
4231         if (rc == -ENOENT) {
4232                 clear_nlink(inode);
4233                 /* If it is striped directory, and there is bad stripe
4234                  * Let's revalidate the dentry again, instead of returning
4235                  * error */
4236                 if (S_ISDIR(inode->i_mode) &&
4237                     ll_i2info(inode)->lli_lsm_md != NULL)
4238                         return 0;
4239
4240                 /* This path cannot be hit for regular files unless in
4241                  * case of obscure races, so no need to to validate
4242                  * size. */
4243                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4244                         return 0;
4245         } else if (rc != 0) {
4246                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4247                              "%s: revalidate FID "DFID" error: rc = %d\n",
4248                              ll_get_fsname(inode->i_sb, NULL, 0),
4249                              PFID(ll_inode2fid(inode)), rc);
4250         }
4251
4252         return rc;
4253 }
4254
4255 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4256 {
4257         struct inode *inode = dentry->d_inode;
4258         struct obd_export *exp = ll_i2mdexp(inode);
4259         struct lookup_intent oit = {
4260                 .it_op = op,
4261         };
4262         struct ptlrpc_request *req = NULL;
4263         struct md_op_data *op_data;
4264         int rc = 0;
4265         ENTRY;
4266
4267         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4268                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4269
4270         /* Call getattr by fid, so do not provide name at all. */
4271         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4272                                      LUSTRE_OPC_ANY, NULL);
4273         if (IS_ERR(op_data))
4274                 RETURN(PTR_ERR(op_data));
4275
4276         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4277         ll_finish_md_op_data(op_data);
4278         if (rc < 0) {
4279                 rc = ll_inode_revalidate_fini(inode, rc);
4280                 GOTO(out, rc);
4281         }
4282
4283         rc = ll_revalidate_it_finish(req, &oit, dentry);
4284         if (rc != 0) {
4285                 ll_intent_release(&oit);
4286                 GOTO(out, rc);
4287         }
4288
4289         /* Unlinked? Unhash dentry, so it is not picked up later by
4290          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4291          * here to preserve get_cwd functionality on 2.6.
4292          * Bug 10503 */
4293         if (!dentry->d_inode->i_nlink) {
4294                 ll_lock_dcache(inode);
4295                 d_lustre_invalidate(dentry, 0);
4296                 ll_unlock_dcache(inode);
4297         }
4298
4299         ll_lookup_finish_locks(&oit, dentry);
4300 out:
4301         ptlrpc_req_finished(req);
4302
4303         return rc;
4304 }
4305
4306 static int ll_merge_md_attr(struct inode *inode)
4307 {
4308         struct cl_attr attr = { 0 };
4309         int rc;
4310
4311         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4312         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4313                            &attr, ll_md_blocking_ast);
4314         if (rc != 0)
4315                 RETURN(rc);
4316
4317         set_nlink(inode, attr.cat_nlink);
4318         inode->i_blocks = attr.cat_blocks;
4319         i_size_write(inode, attr.cat_size);
4320
4321         ll_i2info(inode)->lli_atime = attr.cat_atime;
4322         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4323         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4324
4325         RETURN(0);
4326 }
4327
4328 static inline dev_t ll_compat_encode_dev(dev_t dev)
4329 {
4330         /* The compat_sys_*stat*() syscalls will fail unless the
4331          * device majors and minors are both less than 256. Note that
4332          * the value returned here will be passed through
4333          * old_encode_dev() in cp_compat_stat(). And so we are not
4334          * trying to return a valid compat (u16) device number, just
4335          * one that will pass the old_valid_dev() check. */
4336
4337         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4338 }
4339
4340 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4341 int ll_getattr(const struct path *path, struct kstat *stat,
4342                u32 request_mask, unsigned int flags)
4343 {
4344         struct dentry *de = path->dentry;
4345 #else
4346 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4347 {
4348 #endif
4349         struct inode *inode = de->d_inode;
4350         struct ll_sb_info *sbi = ll_i2sbi(inode);
4351         struct ll_inode_info *lli = ll_i2info(inode);
4352         int rc;
4353
4354         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4355
4356         rc = ll_inode_revalidate(de, IT_GETATTR);
4357         if (rc < 0)
4358                 RETURN(rc);
4359
4360         if (S_ISREG(inode->i_mode)) {
4361                 /* In case of restore, the MDT has the right size and has
4362                  * already send it back without granting the layout lock,
4363                  * inode is up-to-date so glimpse is useless.
4364                  * Also to glimpse we need the layout, in case of a running
4365                  * restore the MDT holds the layout lock so the glimpse will
4366                  * block up to the end of restore (getattr will block)
4367                  */
4368                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4369                         rc = ll_glimpse_size(inode);
4370                         if (rc < 0)
4371                                 RETURN(rc);
4372                 }
4373         } else {
4374                 /* If object isn't regular a file then don't validate size. */
4375                 if (S_ISDIR(inode->i_mode) &&
4376                     lli->lli_lsm_md != NULL) {
4377                         rc = ll_merge_md_attr(inode);
4378                         if (rc < 0)
4379                                 RETURN(rc);
4380                 }
4381
4382                 LTIME_S(inode->i_atime) = lli->lli_atime;
4383                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4384                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4385         }
4386
4387         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4388
4389         if (ll_need_32bit_api(sbi)) {
4390                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4391                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4392                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4393         } else {
4394                 stat->ino = inode->i_ino;
4395                 stat->dev = inode->i_sb->s_dev;
4396                 stat->rdev = inode->i_rdev;
4397         }
4398
4399         stat->mode = inode->i_mode;
4400         stat->uid = inode->i_uid;
4401         stat->gid = inode->i_gid;
4402         stat->atime = inode->i_atime;
4403         stat->mtime = inode->i_mtime;
4404         stat->ctime = inode->i_ctime;
4405         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4406
4407         stat->nlink = inode->i_nlink;
4408         stat->size = i_size_read(inode);
4409         stat->blocks = inode->i_blocks;
4410
4411         return 0;
4412 }
4413
4414 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4415                      __u64 start, __u64 len)
4416 {
4417         int             rc;
4418         size_t          num_bytes;
4419         struct fiemap   *fiemap;
4420         unsigned int    extent_count = fieinfo->fi_extents_max;
4421
4422         num_bytes = sizeof(*fiemap) + (extent_count *
4423                                        sizeof(struct fiemap_extent));
4424         OBD_ALLOC_LARGE(fiemap, num_bytes);
4425
4426         if (fiemap == NULL)
4427                 RETURN(-ENOMEM);
4428
4429         fiemap->fm_flags = fieinfo->fi_flags;
4430         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4431         fiemap->fm_start = start;
4432         fiemap->fm_length = len;
4433         if (extent_count > 0 &&
4434             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4435                            sizeof(struct fiemap_extent)) != 0)
4436                 GOTO(out, rc = -EFAULT);
4437
4438         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4439
4440         fieinfo->fi_flags = fiemap->fm_flags;
4441         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4442         if (extent_count > 0 &&
4443             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4444                          fiemap->fm_mapped_extents *
4445                          sizeof(struct fiemap_extent)) != 0)
4446                 GOTO(out, rc = -EFAULT);
4447 out:
4448         OBD_FREE_LARGE(fiemap, num_bytes);
4449         return rc;
4450 }
4451
4452 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4453 {
4454         struct ll_inode_info *lli = ll_i2info(inode);
4455         struct posix_acl *acl = NULL;
4456         ENTRY;
4457
4458         spin_lock(&lli->lli_lock);
4459         /* VFS' acl_permission_check->check_acl will release the refcount */
4460         acl = posix_acl_dup(lli->lli_posix_acl);
4461         spin_unlock(&lli->lli_lock);
4462
4463         RETURN(acl);
4464 }
4465
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * set_acl inode operation: store \a acl as the matching POSIX ACL xattr.
 *
 * \param[in] inode  inode the ACL belongs to
 * \param[in] acl    ACL to store; NULL means no value is written
 * \param[in] type   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * On success the ACL is cached with set_cached_acl(); on failure the cached
 * ACL of that type is forgotten.
 *
 * \retval 0 on success, also for a NULL default ACL on a non-directory
 * \retval -EINVAL for an unknown \a type
 * \retval -EACCES when setting a default ACL on a non-directory
 * \retval -ENOMEM if the xattr buffer cannot be allocated
 * \retval other negative errno from posix_acl_update_mode(),
 *         posix_acl_to_xattr() or __vfs_setxattr()
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        const char *name = NULL;
        char *value = NULL;
        size_t size = 0;
        int rc = 0;
        ENTRY;

        switch (type) {
        case ACL_TYPE_ACCESS:
                if (acl) {
                        /* May update i_mode and reset acl to NULL when
                         * the ACL is fully expressed by the mode bits. */
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
                        if (rc)
                                GOTO(out, rc);
                }
                name = XATTR_NAME_POSIX_ACL_ACCESS;
                break;
        case ACL_TYPE_DEFAULT:
                if (!S_ISDIR(inode->i_mode))
                        GOTO(out, rc = acl ? -EACCES : 0);
                name = XATTR_NAME_POSIX_ACL_DEFAULT;
                break;
        default:
                /* Do not go through "out": the ACL cache helpers are only
                 * defined for ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT, so an
                 * unknown type must never reach forget_cached_acl(). */
                RETURN(-EINVAL);
        }

        if (acl) {
                size = posix_acl_xattr_size(acl->a_count);
                value = kmalloc(size, GFP_NOFS);
                if (value == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
                if (rc < 0)
                        GOTO(out_free, rc);
        }

        /* dentry is only used for *.lov attributes so it's safe to be NULL.
         * No XATTR_CREATE here: setting an ACL must also replace an
         * existing one, and a pure create would fail with -EEXIST.
         * NOTE(review): with acl == NULL this passes a NULL value; what
         * that does is decided by the xattr handler - confirm it removes
         * the ACL. */
        rc = __vfs_setxattr(NULL, inode, name, value, size, 0);
out_free:
        kfree(value);
out:
        if (!rc)
                set_cached_acl(inode, type, acl);
        else
                forget_cached_acl(inode, type);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4518
4519 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4520 static int
4521 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4522 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4523 # else
4524 ll_check_acl(struct inode *inode, int mask)
4525 # endif
4526 {
4527 # ifdef CONFIG_FS_POSIX_ACL
4528         struct posix_acl *acl;
4529         int rc;
4530         ENTRY;
4531
4532 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4533         if (flags & IPERM_FLAG_RCU)
4534                 return -ECHILD;
4535 #  endif
4536         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4537
4538         if (!acl)
4539                 RETURN(-EAGAIN);
4540
4541         rc = posix_acl_permission(inode, acl, mask);
4542         posix_acl_release(acl);
4543
4544         RETURN(rc);
4545 # else /* !CONFIG_FS_POSIX_ACL */
4546         return -EAGAIN;
4547 # endif /* CONFIG_FS_POSIX_ACL */
4548 }
4549 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4550
4551 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4552 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4553 #else
4554 # ifdef HAVE_INODE_PERMISION_2ARGS
4555 int ll_inode_permission(struct inode *inode, int mask)
4556 # else
4557 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4558 # endif
4559 #endif
4560 {
4561         int rc = 0;
4562         struct ll_sb_info *sbi;
4563         struct root_squash_info *squash;
4564         struct cred *cred = NULL;
4565         const struct cred *old_cred = NULL;
4566         cfs_cap_t cap;
4567         bool squash_id = false;
4568         ENTRY;
4569
4570 #ifdef MAY_NOT_BLOCK
4571         if (mask & MAY_NOT_BLOCK)
4572                 return -ECHILD;
4573 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4574         if (flags & IPERM_FLAG_RCU)
4575                 return -ECHILD;
4576 #endif
4577
4578        /* as root inode are NOT getting validated in lookup operation,
4579         * need to do it before permission check. */
4580
4581         if (inode == inode->i_sb->s_root->d_inode) {
4582                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4583                 if (rc)
4584                         RETURN(rc);
4585         }
4586
4587         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4588                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4589
4590         /* squash fsuid/fsgid if needed */
4591         sbi = ll_i2sbi(inode);
4592         squash = &sbi->ll_squash;
4593         if (unlikely(squash->rsi_uid != 0 &&
4594                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4595                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4596                         squash_id = true;
4597         }
4598         if (squash_id) {
4599                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4600                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4601                        squash->rsi_uid, squash->rsi_gid);
4602
4603                 /* update current process's credentials
4604                  * and FS capability */
4605                 cred = prepare_creds();
4606                 if (cred == NULL)
4607                         RETURN(-ENOMEM);
4608
4609                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4610                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4611                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4612                         if ((1 << cap) & CFS_CAP_FS_MASK)
4613                                 cap_lower(cred->cap_effective, cap);
4614                 }
4615                 old_cred = override_creds(cred);
4616         }
4617
4618         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4619         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4620         /* restore current process's credentials and FS capability */
4621         if (squash_id) {
4622                 revert_creds(old_cred);
4623                 put_cred(cred);
4624         }
4625
4626         RETURN(rc);
4627 }
4628
4629 /* -o localflock - only provides locally consistent flock locks */
4630 struct file_operations ll_file_operations = {
4631 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4632 # ifdef HAVE_SYNC_READ_WRITE
4633         .read           = new_sync_read,
4634         .write          = new_sync_write,
4635 # endif
4636         .read_iter      = ll_file_read_iter,
4637         .write_iter     = ll_file_write_iter,
4638 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4639         .read           = ll_file_read,
4640         .aio_read       = ll_file_aio_read,
4641         .write          = ll_file_write,
4642         .aio_write      = ll_file_aio_write,
4643 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4644         .unlocked_ioctl = ll_file_ioctl,
4645         .open           = ll_file_open,
4646         .release        = ll_file_release,
4647         .mmap           = ll_file_mmap,
4648         .llseek         = ll_file_seek,
4649         .splice_read    = ll_file_splice_read,
4650         .fsync          = ll_fsync,
4651         .flush          = ll_flush
4652 };
4653
4654 struct file_operations ll_file_operations_flock = {
4655 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4656 # ifdef HAVE_SYNC_READ_WRITE
4657         .read           = new_sync_read,
4658         .write          = new_sync_write,
4659 # endif /* HAVE_SYNC_READ_WRITE */
4660         .read_iter      = ll_file_read_iter,
4661         .write_iter     = ll_file_write_iter,
4662 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4663         .read           = ll_file_read,
4664         .aio_read       = ll_file_aio_read,
4665         .write          = ll_file_write,
4666         .aio_write      = ll_file_aio_write,
4667 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4668         .unlocked_ioctl = ll_file_ioctl,
4669         .open           = ll_file_open,
4670         .release        = ll_file_release,
4671         .mmap           = ll_file_mmap,
4672         .llseek         = ll_file_seek,
4673         .splice_read    = ll_file_splice_read,
4674         .fsync          = ll_fsync,
4675         .flush          = ll_flush,
4676         .flock          = ll_file_flock,
4677         .lock           = ll_file_flock
4678 };
4679
4680 /* These are for -o noflock - to return ENOSYS on flock calls */
4681 struct file_operations ll_file_operations_noflock = {
4682 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4683 # ifdef HAVE_SYNC_READ_WRITE
4684         .read           = new_sync_read,
4685         .write          = new_sync_write,
4686 # endif /* HAVE_SYNC_READ_WRITE */
4687         .read_iter      = ll_file_read_iter,
4688         .write_iter     = ll_file_write_iter,
4689 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4690         .read           = ll_file_read,
4691         .aio_read       = ll_file_aio_read,
4692         .write          = ll_file_write,
4693         .aio_write      = ll_file_aio_write,
4694 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4695         .unlocked_ioctl = ll_file_ioctl,
4696         .open           = ll_file_open,
4697         .release        = ll_file_release,
4698         .mmap           = ll_file_mmap,
4699         .llseek         = ll_file_seek,
4700         .splice_read    = ll_file_splice_read,
4701         .fsync          = ll_fsync,
4702         .flush          = ll_flush,
4703         .flock          = ll_file_noflock,
4704         .lock           = ll_file_noflock
4705 };
4706
4707 struct inode_operations ll_file_inode_operations = {
4708         .setattr        = ll_setattr,
4709         .getattr        = ll_getattr,
4710         .permission     = ll_inode_permission,
4711 #ifdef HAVE_IOP_XATTR
4712         .setxattr       = ll_setxattr,
4713         .getxattr       = ll_getxattr,
4714         .removexattr    = ll_removexattr,
4715 #endif
4716         .listxattr      = ll_listxattr,
4717         .fiemap         = ll_fiemap,
4718 #ifdef HAVE_IOP_GET_ACL
4719         .get_acl        = ll_get_acl,
4720 #endif
4721 #ifdef HAVE_IOP_SET_ACL
4722         .set_acl        = ll_set_acl,
4723 #endif
4724 };
4725
4726 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4727 {
4728         struct ll_inode_info *lli = ll_i2info(inode);
4729         struct cl_object *obj = lli->lli_clob;
4730         struct lu_env *env;
4731         int rc;
4732         __u16 refcheck;
4733         ENTRY;
4734
4735         if (obj == NULL)
4736                 RETURN(0);
4737
4738         env = cl_env_get(&refcheck);
4739         if (IS_ERR(env))
4740                 RETURN(PTR_ERR(env));
4741
4742         rc = cl_conf_set(env, lli->lli_clob, conf);
4743         if (rc < 0)
4744                 GOTO(out, rc);
4745
4746         if (conf->coc_opc == OBJECT_CONF_SET) {
4747                 struct ldlm_lock *lock = conf->coc_lock;
4748                 struct cl_layout cl = {
4749                         .cl_layout_gen = 0,
4750                 };
4751
4752                 LASSERT(lock != NULL);
4753                 LASSERT(ldlm_has_layout(lock));
4754
4755                 /* it can only be allowed to match after layout is
4756                  * applied to inode otherwise false layout would be
4757                  * seen. Applying layout shoud happen before dropping
4758                  * the intent lock. */
4759                 ldlm_lock_allow_match(lock);
4760
4761                 rc = cl_object_layout_get(env, obj, &cl);
4762                 if (rc < 0)
4763                         GOTO(out, rc);
4764
4765                 CDEBUG(D_VFSTRACE,
4766                        DFID": layout version change: %u -> %u\n",
4767                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4768                        cl.cl_layout_gen);
4769                 ll_layout_version_set(lli, cl.cl_layout_gen);
4770         }
4771
4772 out:
4773         cl_env_put(env, &refcheck);
4774
4775         RETURN(rc);
4776 }
4777
4778 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4779 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4780
4781 {
4782         struct ll_sb_info *sbi = ll_i2sbi(inode);
4783         struct ptlrpc_request *req;
4784         struct mdt_body *body;
4785         void *lvbdata;
4786         void *lmm;
4787         int lmmsize;
4788         int rc;
4789         ENTRY;
4790
4791         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4792                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4793                lock->l_lvb_data, lock->l_lvb_len);
4794
4795         if (lock->l_lvb_data != NULL)
4796                 RETURN(0);
4797
4798         /* if layout lock was granted right away, the layout is returned
4799          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4800          * blocked and then granted via completion ast, we have to fetch
4801          * layout here. Please note that we can't use the LVB buffer in
4802          * completion AST because it doesn't have a large enough buffer */
4803         rc = ll_get_default_mdsize(sbi, &lmmsize);
4804         if (rc == 0)
4805                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4806                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4807                                 lmmsize, 0, &req);
4808         if (rc < 0)
4809                 RETURN(rc);
4810
4811         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4812         if (body == NULL)
4813                 GOTO(out, rc = -EPROTO);
4814
4815         lmmsize = body->mbo_eadatasize;
4816         if (lmmsize == 0) /* empty layout */
4817                 GOTO(out, rc = 0);
4818
4819         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4820         if (lmm == NULL)
4821                 GOTO(out, rc = -EFAULT);
4822
4823         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4824         if (lvbdata == NULL)
4825                 GOTO(out, rc = -ENOMEM);
4826
4827         memcpy(lvbdata, lmm, lmmsize);
4828         lock_res_and_lock(lock);
4829         if (unlikely(lock->l_lvb_data == NULL)) {
4830                 lock->l_lvb_type = LVB_T_LAYOUT;
4831                 lock->l_lvb_data = lvbdata;
4832                 lock->l_lvb_len = lmmsize;
4833                 lvbdata = NULL;
4834         }
4835         unlock_res_and_lock(lock);
4836
4837         if (lvbdata)
4838                 OBD_FREE_LARGE(lvbdata, lmmsize);
4839
4840         EXIT;
4841
4842 out:
4843         ptlrpc_req_finished(req);
4844         return rc;
4845 }
4846
4847 /**
4848  * Apply the layout to the inode. Layout lock is held and will be released
4849  * in this function.
4850  */
4851 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4852                               struct inode *inode)
4853 {
4854         struct ll_inode_info *lli = ll_i2info(inode);
4855         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4856         struct ldlm_lock *lock;
4857         struct cl_object_conf conf;
4858         int rc = 0;
4859         bool lvb_ready;
4860         bool wait_layout = false;
4861         ENTRY;
4862
4863         LASSERT(lustre_handle_is_used(lockh));
4864
4865         lock = ldlm_handle2lock(lockh);
4866         LASSERT(lock != NULL);
4867         LASSERT(ldlm_has_layout(lock));
4868
4869         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4870                    PFID(&lli->lli_fid), inode);
4871
4872         /* in case this is a caching lock and reinstate with new inode */
4873         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4874
4875         lock_res_and_lock(lock);
4876         lvb_ready = ldlm_is_lvb_ready(lock);
4877         unlock_res_and_lock(lock);
4878
4879         /* checking lvb_ready is racy but this is okay. The worst case is
4880          * that multi processes may configure the file on the same time. */
4881         if (lvb_ready)
4882                 GOTO(out, rc = 0);
4883
4884         rc = ll_layout_fetch(inode, lock);
4885         if (rc < 0)
4886                 GOTO(out, rc);
4887
4888         /* for layout lock, lmm is stored in lock's lvb.
4889          * lvb_data is immutable if the lock is held so it's safe to access it
4890          * without res lock.
4891          *
4892          * set layout to file. Unlikely this will fail as old layout was
4893          * surely eliminated */
4894         memset(&conf, 0, sizeof conf);
4895         conf.coc_opc = OBJECT_CONF_SET;
4896         conf.coc_inode = inode;
4897         conf.coc_lock = lock;
4898         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4899         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4900         rc = ll_layout_conf(inode, &conf);
4901
4902         /* refresh layout failed, need to wait */
4903         wait_layout = rc == -EBUSY;
4904         EXIT;
4905 out:
4906         LDLM_LOCK_PUT(lock);
4907         ldlm_lock_decref(lockh, mode);
4908
4909         /* wait for IO to complete if it's still being used. */
4910         if (wait_layout) {
4911                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4912                        ll_get_fsname(inode->i_sb, NULL, 0),
4913                        PFID(&lli->lli_fid), inode);
4914
4915                 memset(&conf, 0, sizeof conf);
4916                 conf.coc_opc = OBJECT_CONF_WAIT;
4917                 conf.coc_inode = inode;
4918                 rc = ll_layout_conf(inode, &conf);
4919                 if (rc == 0)
4920                         rc = -EAGAIN;
4921
4922                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4923                        ll_get_fsname(inode->i_sb, NULL, 0),
4924                        PFID(&lli->lli_fid), rc);
4925         }
4926         RETURN(rc);
4927 }
4928
4929 /**
4930  * Issue layout intent RPC to MDS.
4931  * \param inode [in]    file inode
4932  * \param intent [in]   layout intent
4933  *
4934  * \retval 0    on success
4935  * \retval < 0  error code
4936  */
4937 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4938 {
4939         struct ll_inode_info  *lli = ll_i2info(inode);
4940         struct ll_sb_info     *sbi = ll_i2sbi(inode);
4941         struct md_op_data     *op_data;
4942         struct lookup_intent it;
4943         struct ptlrpc_request *req;
4944         int rc;
4945         ENTRY;
4946
4947         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4948                                      0, 0, LUSTRE_OPC_ANY, NULL);
4949         if (IS_ERR(op_data))
4950                 RETURN(PTR_ERR(op_data));
4951
4952         op_data->op_data = intent;
4953         op_data->op_data_size = sizeof(*intent);
4954
4955         memset(&it, 0, sizeof(it));
4956         it.it_op = IT_LAYOUT;
4957         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4958             intent->li_opc == LAYOUT_INTENT_TRUNC)
4959                 it.it_flags = FMODE_WRITE;
4960
4961         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4962                           ll_get_fsname(inode->i_sb, NULL, 0),
4963                           PFID(&lli->lli_fid), inode);
4964
4965         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4966                             &ll_md_blocking_ast, 0);
4967         if (it.it_request != NULL)
4968                 ptlrpc_req_finished(it.it_request);
4969         it.it_request = NULL;
4970
4971         ll_finish_md_op_data(op_data);
4972
4973         /* set lock data in case this is a new lock */
4974         if (!rc)
4975                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4976
4977         ll_intent_drop_lock(&it);
4978
4979         RETURN(rc);
4980 }
4981
4982 /**
4983  * This function checks if there exists a LAYOUT lock on the client side,
4984  * or enqueues it if it doesn't have one in cache.
4985  *
4986  * This function will not hold layout lock so it may be revoked any time after
4987  * this function returns. Any operations depend on layout should be redone
4988  * in that case.
4989  *
4990  * This function should be called before lov_io_init() to get an uptodate
4991  * layout version, the caller should save the version number and after IO
4992  * is finished, this function should be called again to verify that layout
4993  * is not changed during IO time.
4994  */
4995 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4996 {
4997         struct ll_inode_info    *lli = ll_i2info(inode);
4998         struct ll_sb_info       *sbi = ll_i2sbi(inode);
4999         struct lustre_handle lockh;
5000         struct layout_intent intent = {
5001                 .li_opc = LAYOUT_INTENT_ACCESS,
5002         };
5003         enum ldlm_mode mode;
5004         int rc;
5005         ENTRY;
5006
5007         *gen = ll_layout_version_get(lli);
5008         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5009                 RETURN(0);
5010
5011         /* sanity checks */
5012         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5013         LASSERT(S_ISREG(inode->i_mode));
5014
5015         /* take layout lock mutex to enqueue layout lock exclusively. */
5016         mutex_lock(&lli->lli_layout_mutex);
5017
5018         while (1) {
5019                 /* mostly layout lock is caching on the local side, so try to
5020                  * match it before grabbing layout lock mutex. */
5021                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5022                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5023                 if (mode != 0) { /* hit cached lock */
5024                         rc = ll_layout_lock_set(&lockh, mode, inode);
5025                         if (rc == -EAGAIN)
5026                                 continue;
5027                         break;
5028                 }
5029
5030                 rc = ll_layout_intent(inode, &intent);
5031                 if (rc != 0)
5032                         break;
5033         }
5034
5035         if (rc == 0)
5036                 *gen = ll_layout_version_get(lli);
5037         mutex_unlock(&lli->lli_layout_mutex);
5038
5039         RETURN(rc);
5040 }
5041
5042 /**
5043  * Issue layout intent RPC indicating where in a file an IO is about to write.
5044  *
5045  * \param[in] inode     file inode.
5046  * \param[in] ext       write range with start offset of fille in bytes where
5047  *                      an IO is about to write, and exclusive end offset in
5048  *                      bytes.
5049  *
5050  * \retval 0    on success
5051  * \retval < 0  error code
5052  */
5053 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5054                            struct lu_extent *ext)
5055 {
5056         struct layout_intent intent = {
5057                 .li_opc = opc,
5058                 .li_extent.e_start = ext->e_start,
5059                 .li_extent.e_end = ext->e_end,
5060         };
5061         int rc;
5062         ENTRY;
5063
5064         rc = ll_layout_intent(inode, &intent);
5065
5066         RETURN(rc);
5067 }
5068
5069 /**
5070  *  This function send a restore request to the MDT
5071  */
5072 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5073 {
5074         struct hsm_user_request *hur;
5075         int                      len, rc;
5076         ENTRY;
5077
5078         len = sizeof(struct hsm_user_request) +
5079               sizeof(struct hsm_user_item);
5080         OBD_ALLOC(hur, len);
5081         if (hur == NULL)
5082                 RETURN(-ENOMEM);
5083
5084         hur->hur_request.hr_action = HUA_RESTORE;
5085         hur->hur_request.hr_archive_id = 0;
5086         hur->hur_request.hr_flags = 0;
5087         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5088                sizeof(hur->hur_user_item[0].hui_fid));
5089         hur->hur_user_item[0].hui_extent.offset = offset;
5090         hur->hur_user_item[0].hui_extent.length = length;
5091         hur->hur_request.hr_itemcount = 1;
5092         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5093                            len, hur, NULL);
5094         OBD_FREE(hur, len);
5095         RETURN(rc);
5096 }