lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         op_data->op_handle = och->och_fh;
 108
 109         if (och->och_flags & FMODE_WRITE &&
 110             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 111                 /* For HSM: if inode data has been modified, pack it so that
 112                  * MDT can set data dirty flag in the archive. */
 113                 op_data->op_bias |= MDS_DATA_MODIFIED;
 114
 115         EXIT;
 116 }
 117
 118 /**
 119  * Perform a close, possibly with a bias.
 120  * The meaning of "data" depends on the value of "bias".
 121  *
 122  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 123  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 124  * swap layouts with.
 125  */
 126 static int ll_close_inode_openhandle(struct inode *inode,
 127                                      struct obd_client_handle *och,
 128                                      enum mds_op_bias bias, void *data)
 129 {
 130         struct obd_export *md_exp = ll_i2mdexp(inode);
 131         const struct ll_inode_info *lli = ll_i2info(inode);
 132         struct md_op_data *op_data;
 133         struct ptlrpc_request *req = NULL;
 134         int rc;
 135         ENTRY;
 136
 137         if (class_exp2obd(md_exp) == NULL) {
 138                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 139                        ll_get_fsname(inode->i_sb, NULL, 0),
 140                        PFID(&lli->lli_fid));
 141                 GOTO(out, rc = 0);
 142         }
 143
 144         OBD_ALLOC_PTR(op_data);
 145         /* We leak openhandle and request here on error, but not much to be
 146          * done in OOM case since app won't retry close on error either. */
 147         if (op_data == NULL)
 148                 GOTO(out, rc = -ENOMEM);
 149
 150         ll_prepare_close(inode, op_data, och);
 151         switch (bias) {
 152         case MDS_CLOSE_LAYOUT_MERGE:
 153                 /* merge blocks from the victim inode */
 154                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 155                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 156         case MDS_CLOSE_LAYOUT_SPLIT:
 157         case MDS_CLOSE_LAYOUT_SWAP: {
 158                 struct split_param *sp = data;
 159
 160                 LASSERT(data != NULL);
 161                 op_data->op_bias |= bias;
 162                 op_data->op_data_version = 0;
 163                 op_data->op_lease_handle = och->och_lease_handle;
 164                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 165                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 166                         op_data->op_mirror_id = sp->sp_mirror_id;
 167                 } else {
 168                         op_data->op_fid2 = *ll_inode2fid(data);
 169                 }
 170                 break;
 171         }
 172
 173         case MDS_CLOSE_RESYNC_DONE: {
 174                 struct ll_ioc_lease *ioc = data;
 175
 176                 LASSERT(data != NULL);
 177                 op_data->op_attr_blocks +=
 178                         ioc->lil_count * op_data->op_attr_blocks;
 179                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 180                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 181
 182                 op_data->op_lease_handle = och->och_lease_handle;
 183                 op_data->op_data = &ioc->lil_ids[0];
 184                 op_data->op_data_size =
 185                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 186                 break;
 187         }
 188
 189         case MDS_HSM_RELEASE:
 190                 LASSERT(data != NULL);
 191                 op_data->op_bias |= MDS_HSM_RELEASE;
 192                 op_data->op_data_version = *(__u64 *)data;
 193                 op_data->op_lease_handle = och->och_lease_handle;
 194                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 195                 break;
 196
 197         default:
 198                 LASSERT(data == NULL);
 199                 break;
 200         }
 201
 202         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 203                 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
 204         if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
 205                 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
 206
 207         rc = md_close(md_exp, op_data, och->och_mod, &req);
 208         if (rc != 0 && rc != -EINTR)
 209                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 210                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 211
 212         if (rc == 0 && op_data->op_bias & bias) {
 213                 struct mdt_body *body;
 214
 215                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 216                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 217                         rc = -EBUSY;
 218         }
 219
 220         ll_finish_md_op_data(op_data);
 221         EXIT;
 222 out:
 223
 224         md_clear_open_replay_data(md_exp, och);
 225         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 226         OBD_FREE_PTR(och);
 227
 228         ptlrpc_req_finished(req);       /* This is close request */
 229         return rc;
 230 }
 231
 232 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 233 {
 234         struct ll_inode_info *lli = ll_i2info(inode);
 235         struct obd_client_handle **och_p;
 236         struct obd_client_handle *och;
 237         __u64 *och_usecount;
 238         int rc = 0;
 239         ENTRY;
 240
 241         if (fmode & FMODE_WRITE) {
 242                 och_p = &lli->lli_mds_write_och;
 243                 och_usecount = &lli->lli_open_fd_write_count;
 244         } else if (fmode & FMODE_EXEC) {
 245                 och_p = &lli->lli_mds_exec_och;
 246                 och_usecount = &lli->lli_open_fd_exec_count;
 247         } else {
 248                 LASSERT(fmode & FMODE_READ);
 249                 och_p = &lli->lli_mds_read_och;
 250                 och_usecount = &lli->lli_open_fd_read_count;
 251         }
 252
 253         mutex_lock(&lli->lli_och_mutex);
 254         if (*och_usecount > 0) {
 255                 /* There are still users of this handle, so skip
 256                  * freeing it. */
 257                 mutex_unlock(&lli->lli_och_mutex);
 258                 RETURN(0);
 259         }
 260
 261         och = *och_p;
 262         *och_p = NULL;
 263         mutex_unlock(&lli->lli_och_mutex);
 264
 265         if (och != NULL) {
 266                 /* There might be a race and this handle may already
 267                  * be closed. */
 268                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 269         }
 270
 271         RETURN(rc);
 272 }
 273
 274 static int ll_md_close(struct inode *inode, struct file *file)
 275 {
 276         union ldlm_policy_data policy = {
 277                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 278         };
 279         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 280         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 281         struct ll_inode_info *lli = ll_i2info(inode);
 282         struct lustre_handle lockh;
 283         enum ldlm_mode lockmode;
 284         int rc = 0;
 285         ENTRY;
 286
 287         /* clear group lock, if present */
 288         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 289                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 290
 291         if (fd->fd_lease_och != NULL) {
 292                 bool lease_broken;
 293
 294                 /* Usually the lease is not released when the
 295                  * application crashed, we need to release here. */
 296                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 297                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 298                         PFID(&lli->lli_fid), rc, lease_broken);
 299
 300                 fd->fd_lease_och = NULL;
 301         }
 302
 303         if (fd->fd_och != NULL) {
 304                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 305                 fd->fd_och = NULL;
 306                 GOTO(out, rc);
 307         }
 308
 309         /* Let's see if we have good enough OPEN lock on the file and if
 310            we can skip talking to MDS */
 311         mutex_lock(&lli->lli_och_mutex);
 312         if (fd->fd_omode & FMODE_WRITE) {
 313                 lockmode = LCK_CW;
 314                 LASSERT(lli->lli_open_fd_write_count);
 315                 lli->lli_open_fd_write_count--;
 316         } else if (fd->fd_omode & FMODE_EXEC) {
 317                 lockmode = LCK_PR;
 318                 LASSERT(lli->lli_open_fd_exec_count);
 319                 lli->lli_open_fd_exec_count--;
 320         } else {
 321                 lockmode = LCK_CR;
 322                 LASSERT(lli->lli_open_fd_read_count);
 323                 lli->lli_open_fd_read_count--;
 324         }
 325         mutex_unlock(&lli->lli_och_mutex);
 326
 327         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 328                            LDLM_IBITS, &policy, lockmode, &lockh))
 329                 rc = ll_md_real_close(inode, fd->fd_omode);
 330
 331 out:
 332         LUSTRE_FPRIVATE(file) = NULL;
 333         ll_file_data_put(fd);
 334
 335         RETURN(rc);
 336 }
 337
 338 /* While this returns an error code, fput() the caller does not, so we need
 339  * to make every effort to clean up all of our state here.  Also, applications
 340  * rarely check close errors and even if an error is returned they will not
 341  * re-try the close call.
 342  */
 343 int ll_file_release(struct inode *inode, struct file *file)
 344 {
 345         struct ll_file_data *fd;
 346         struct ll_sb_info *sbi = ll_i2sbi(inode);
 347         struct ll_inode_info *lli = ll_i2info(inode);
 348         int rc;
 349         ENTRY;
 350
 351         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 352                PFID(ll_inode2fid(inode)), inode);
 353
 354         if (inode->i_sb->s_root != file_dentry(file))
 355                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 356         fd = LUSTRE_FPRIVATE(file);
 357         LASSERT(fd != NULL);
 358
 359         /* The last ref on @file, maybe not the the owner pid of statahead,
 360          * because parent and child process can share the same file handle. */
 361         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 362                 ll_deauthorize_statahead(inode, fd);
 363
 364         if (inode->i_sb->s_root == file_dentry(file)) {
 365                 LUSTRE_FPRIVATE(file) = NULL;
 366                 ll_file_data_put(fd);
 367                 RETURN(0);
 368         }
 369
 370         if (!S_ISDIR(inode->i_mode)) {
 371                 if (lli->lli_clob != NULL)
 372                         lov_read_and_clear_async_rc(lli->lli_clob);
 373                 lli->lli_async_rc = 0;
 374         }
 375
 376         rc = ll_md_close(inode, file);
 377
 378         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 379                 libcfs_debug_dumplog();
 380
 381         RETURN(rc);
 382 }
 383
 384 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 385                                 struct lookup_intent *itp)
 386 {
 387         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 388         struct dentry *parent = de->d_parent;
 389         const char *name = NULL;
 390         int len = 0;
 391         struct md_op_data *op_data;
 392         struct ptlrpc_request *req = NULL;
 393         int rc;
 394         ENTRY;
 395
 396         LASSERT(parent != NULL);
 397         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 398
 399         /* if server supports open-by-fid, or file name is invalid, don't pack
 400          * name in open request */
 401         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 402             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 403                 name = de->d_name.name;
 404                 len = de->d_name.len;
 405         }
 406
 407         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 408                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 409         if (IS_ERR(op_data))
 410                 RETURN(PTR_ERR(op_data));
 411         op_data->op_data = lmm;
 412         op_data->op_data_size = lmmsize;
 413
 414         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 415                             &ll_md_blocking_ast, 0);
 416         ll_finish_md_op_data(op_data);
 417         if (rc == -ESTALE) {
 418                 /* reason for keep own exit path - don`t flood log
 419                  * with messages with -ESTALE errors.
 420                  */
 421                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 422                      it_open_error(DISP_OPEN_OPEN, itp))
 423                         GOTO(out, rc);
 424                 ll_release_openhandle(de, itp);
 425                 GOTO(out, rc);
 426         }
 427
 428         if (it_disposition(itp, DISP_LOOKUP_NEG))
 429                 GOTO(out, rc = -ENOENT);
 430
 431         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 432                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 433                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 434                 GOTO(out, rc);
 435         }
 436
 437         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 438         if (!rc && itp->it_lock_mode)
 439                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 440
 441 out:
 442         ptlrpc_req_finished(req);
 443         ll_intent_drop_lock(itp);
 444
 445         /* We did open by fid, but by the time we got to the server,
 446          * the object disappeared. If this is a create, we cannot really
 447          * tell the userspace that the file it was trying to create
 448          * does not exist. Instead let's return -ESTALE, and the VFS will
 449          * retry the create with LOOKUP_REVAL that we are going to catch
 450          * in ll_revalidate_dentry() and use lookup then.
 451          */
 452         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 453                 rc = -ESTALE;
 454
 455         RETURN(rc);
 456 }
 457
 458 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 459                        struct obd_client_handle *och)
 460 {
 461         struct mdt_body *body;
 462
 463         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 464         och->och_fh = body->mbo_handle;
 465         och->och_fid = body->mbo_fid1;
 466         och->och_lease_handle.cookie = it->it_lock_handle;
 467         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 468         och->och_flags = it->it_flags;
 469
 470         return md_set_open_replay_data(md_exp, och, it);
 471 }
 472
 473 static int ll_local_open(struct file *file, struct lookup_intent *it,
 474                          struct ll_file_data *fd, struct obd_client_handle *och)
 475 {
 476         struct inode *inode = file_inode(file);
 477         ENTRY;
 478
 479         LASSERT(!LUSTRE_FPRIVATE(file));
 480
 481         LASSERT(fd != NULL);
 482
 483         if (och) {
 484                 int rc;
 485
 486                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 487                 if (rc != 0)
 488                         RETURN(rc);
 489         }
 490
 491         LUSTRE_FPRIVATE(file) = fd;
 492         ll_readahead_init(inode, &fd->fd_ras);
 493         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 494
 495         /* ll_cl_context initialize */
 496         rwlock_init(&fd->fd_lock);
 497         INIT_LIST_HEAD(&fd->fd_lccs);
 498
 499         RETURN(0);
 500 }
 501
 502 /* Open a file, and (for the very first open) create objects on the OSTs at
 503  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 504  * creation or open until ll_lov_setstripe() ioctl is called.
 505  *
 506  * If we already have the stripe MD locally then we don't request it in
 507  * md_open(), by passing a lmm_size = 0.
 508  *
 509  * It is up to the application to ensure no other processes open this file
 510  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 511  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 512  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 513  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 514  */
 515 int ll_file_open(struct inode *inode, struct file *file)
 516 {
 517         struct ll_inode_info *lli = ll_i2info(inode);
 518         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 519                                           .it_flags = file->f_flags };
 520         struct obd_client_handle **och_p = NULL;
 521         __u64 *och_usecount = NULL;
 522         struct ll_file_data *fd;
 523         int rc = 0;
 524         ENTRY;
 525
 526         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 527                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 528
 529         it = file->private_data; /* XXX: compat macro */
 530         file->private_data = NULL; /* prevent ll_local_open assertion */
 531
 532         fd = ll_file_data_get();
 533         if (fd == NULL)
 534                 GOTO(out_nofiledata, rc = -ENOMEM);
 535
 536         fd->fd_file = file;
 537         if (S_ISDIR(inode->i_mode))
 538                 ll_authorize_statahead(inode, fd);
 539
 540         if (inode->i_sb->s_root == file_dentry(file)) {
 541                 LUSTRE_FPRIVATE(file) = fd;
 542                 RETURN(0);
 543         }
 544
 545         if (!it || !it->it_disposition) {
 546                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 547                  * because everything but O_ACCMODE mask was stripped from
 548                  * there */
 549                 if ((oit.it_flags + 1) & O_ACCMODE)
 550                         oit.it_flags++;
 551                 if (file->f_flags & O_TRUNC)
 552                         oit.it_flags |= FMODE_WRITE;
 553
 554                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 555                  * dentry_open after call to open_namei that checks permissions.
 556                  * Only nfsd_open call dentry_open directly without checking
 557                  * permissions and because of that this code below is safe. */
 558                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 559                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 560
 561                 /* We do not want O_EXCL here, presumably we opened the file
 562                  * already? XXX - NFS implications? */
 563                 oit.it_flags &= ~O_EXCL;
 564
 565                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 566                  * created if necessary, then "IT_CREAT" should be set to keep
 567                  * consistent with it */
 568                 if (oit.it_flags & O_CREAT)
 569                         oit.it_op |= IT_CREAT;
 570
 571                 it = &oit;
 572         }
 573
 574 restart:
 575         /* Let's see if we have file open on MDS already. */
 576         if (it->it_flags & FMODE_WRITE) {
 577                 och_p = &lli->lli_mds_write_och;
 578                 och_usecount = &lli->lli_open_fd_write_count;
 579         } else if (it->it_flags & FMODE_EXEC) {
 580                 och_p = &lli->lli_mds_exec_och;
 581                 och_usecount = &lli->lli_open_fd_exec_count;
 582          } else {
 583                 och_p = &lli->lli_mds_read_och;
 584                 och_usecount = &lli->lli_open_fd_read_count;
 585         }
 586
 587         mutex_lock(&lli->lli_och_mutex);
 588         if (*och_p) { /* Open handle is present */
 589                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 590                         /* Well, there's extra open request that we do not need,
 591                            let's close it somehow. This will decref request. */
 592                         rc = it_open_error(DISP_OPEN_OPEN, it);
 593                         if (rc) {
 594                                 mutex_unlock(&lli->lli_och_mutex);
 595                                 GOTO(out_openerr, rc);
 596                         }
 597
 598                         ll_release_openhandle(file_dentry(file), it);
 599                 }
 600                 (*och_usecount)++;
 601
 602                 rc = ll_local_open(file, it, fd, NULL);
 603                 if (rc) {
 604                         (*och_usecount)--;
 605                         mutex_unlock(&lli->lli_och_mutex);
 606                         GOTO(out_openerr, rc);
 607                 }
 608         } else {
 609                 LASSERT(*och_usecount == 0);
 610                 if (!it->it_disposition) {
 611                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 612                         /* We cannot just request lock handle now, new ELC code
 613                            means that one of other OPEN locks for this file
 614                            could be cancelled, and since blocking ast handler
 615                            would attempt to grab och_mutex as well, that would
 616                            result in a deadlock */
 617                         mutex_unlock(&lli->lli_och_mutex);
 618                         /*
 619                          * Normally called under two situations:
 620                          * 1. NFS export.
 621                          * 2. A race/condition on MDS resulting in no open
 622                          *    handle to be returned from LOOKUP|OPEN request,
 623                          *    for example if the target entry was a symlink.
 624                          *
 625                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 626                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 627                          *  bit so that it's not confusing later callers.
 628                          *
 629                          *  NB; when ldd is NULL, it must have come via normal
 630                          *  lookup path only, since ll_iget_for_nfs always calls
 631                          *  ll_d_init().
 632                          */
 633                         if (ldd && ldd->lld_nfs_dentry) {
 634                                 ldd->lld_nfs_dentry = 0;
 635                                 it->it_flags |= MDS_OPEN_LOCK;
 636                         }
 637
 638                          /*
 639                          * Always specify MDS_OPEN_BY_FID because we don't want
 640                          * to get file with different fid.
 641                          */
 642                         it->it_flags |= MDS_OPEN_BY_FID;
 643                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 644                                                  it);
 645                         if (rc)
 646                                 GOTO(out_openerr, rc);
 647
 648                         goto restart;
 649                 }
 650                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 651                 if (!*och_p)
 652                         GOTO(out_och_free, rc = -ENOMEM);
 653
 654                 (*och_usecount)++;
 655
 656                 /* md_intent_lock() didn't get a request ref if there was an
 657                  * open error, so don't do cleanup on the request here
 658                  * (bug 3430) */
 659                 /* XXX (green): Should not we bail out on any error here, not
 660                  * just open error? */
 661                 rc = it_open_error(DISP_OPEN_OPEN, it);
 662                 if (rc != 0)
 663                         GOTO(out_och_free, rc);
 664
 665                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 666                          "inode %p: disposition %x, status %d\n", inode,
 667                          it_disposition(it, ~0), it->it_status);
 668
 669                 rc = ll_local_open(file, it, fd, *och_p);
 670                 if (rc)
 671                         GOTO(out_och_free, rc);
 672         }
 673         mutex_unlock(&lli->lli_och_mutex);
 674         fd = NULL;
 675
 676         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 677            different kind of OPEN lock for this same inode gets cancelled
 678            by ldlm_cancel_lru */
 679         if (!S_ISREG(inode->i_mode))
 680                 GOTO(out_och_free, rc);
 681
 682         cl_lov_delay_create_clear(&file->f_flags);
 683         GOTO(out_och_free, rc);
 684
 685 out_och_free:
 686         if (rc) {
 687                 if (och_p && *och_p) {
 688                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 689                         *och_p = NULL; /* OBD_FREE writes some magic there */
 690                         (*och_usecount)--;
 691                 }
 692                 mutex_unlock(&lli->lli_och_mutex);
 693
 694 out_openerr:
 695                 if (lli->lli_opendir_key == fd)
 696                         ll_deauthorize_statahead(inode, fd);
 697                 if (fd != NULL)
 698                         ll_file_data_put(fd);
 699         } else {
 700                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 701         }
 702
 703 out_nofiledata:
 704         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 705                 ptlrpc_req_finished(it->it_request);
 706                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 707         }
 708
 709         return rc;
 710 }
 711
 712 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 713                         struct ldlm_lock_desc *desc, void *data, int flag)
 714 {
 715         int rc;
 716         struct lustre_handle lockh;
 717         ENTRY;
 718
 719         switch (flag) {
 720         case LDLM_CB_BLOCKING:
 721                 ldlm_lock2handle(lock, &lockh);
 722                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 723                 if (rc < 0) {
 724                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 725                         RETURN(rc);
 726                 }
 727                 break;
 728         case LDLM_CB_CANCELING:
 729                 /* do nothing */
 730                 break;
 731         }
 732         RETURN(0);
 733 }
 734
 735 /**
 736  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 737  * and save it as fd->fd_och so as to force client to reopen the file even
 738  * if it has an open lock in cache already.
 739  */
 740 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 741                                 struct lustre_handle *old_handle)
 742 {
 743         struct ll_inode_info *lli = ll_i2info(inode);
 744         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 745         struct obd_client_handle **och_p;
 746         __u64 *och_usecount;
 747         int rc = 0;
 748         ENTRY;
 749
 750         /* Get the openhandle of the file */
 751         mutex_lock(&lli->lli_och_mutex);
 752         if (fd->fd_lease_och != NULL)
 753                 GOTO(out_unlock, rc = -EBUSY);
 754
 755         if (fd->fd_och == NULL) {
 756                 if (file->f_mode & FMODE_WRITE) {
 757                         LASSERT(lli->lli_mds_write_och != NULL);
 758                         och_p = &lli->lli_mds_write_och;
 759                         och_usecount = &lli->lli_open_fd_write_count;
 760                 } else {
 761                         LASSERT(lli->lli_mds_read_och != NULL);
 762                         och_p = &lli->lli_mds_read_och;
 763                         och_usecount = &lli->lli_open_fd_read_count;
 764                 }
 765
 766                 if (*och_usecount > 1)
 767                         GOTO(out_unlock, rc = -EBUSY);
 768
 769                 fd->fd_och = *och_p;
 770                 *och_usecount = 0;
 771                 *och_p = NULL;
 772         }
 773
 774         *old_handle = fd->fd_och->och_fh;
 775
 776         EXIT;
 777 out_unlock:
 778         mutex_unlock(&lli->lli_och_mutex);
 779         return rc;
 780 }
 781
 782 /**
 783  * Release ownership on lli_mds_*_och when putting back a file lease.
 784  */
 785 static int ll_lease_och_release(struct inode *inode, struct file *file)
 786 {
 787         struct ll_inode_info *lli = ll_i2info(inode);
 788         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 789         struct obd_client_handle **och_p;
 790         struct obd_client_handle *old_och = NULL;
 791         __u64 *och_usecount;
 792         int rc = 0;
 793         ENTRY;
 794
 795         mutex_lock(&lli->lli_och_mutex);
 796         if (file->f_mode & FMODE_WRITE) {
 797                 och_p = &lli->lli_mds_write_och;
 798                 och_usecount = &lli->lli_open_fd_write_count;
 799         } else {
 800                 och_p = &lli->lli_mds_read_och;
 801                 och_usecount = &lli->lli_open_fd_read_count;
 802         }
 803
 804         /* The file may have been open by another process (broken lease) so
 805          * *och_p is not NULL. In this case we should simply increase usecount
 806          * and close fd_och.
 807          */
 808         if (*och_p != NULL) {
 809                 old_och = fd->fd_och;
 810                 (*och_usecount)++;
 811         } else {
 812                 *och_p = fd->fd_och;
 813                 *och_usecount = 1;
 814         }
 815         fd->fd_och = NULL;
 816         mutex_unlock(&lli->lli_och_mutex);
 817
 818         if (old_och != NULL)
 819                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 820
 821         RETURN(rc);
 822 }
 823
 824 /**
 825  * Acquire a lease and open the file.
 826  */
 827 static struct obd_client_handle *
 828 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 829               __u64 open_flags)
 830 {
 831         struct lookup_intent it = { .it_op = IT_OPEN };
 832         struct ll_sb_info *sbi = ll_i2sbi(inode);
 833         struct md_op_data *op_data;
 834         struct ptlrpc_request *req = NULL;
 835         struct lustre_handle old_handle = { 0 };
 836         struct obd_client_handle *och = NULL;
 837         int rc;
 838         int rc2;
 839         ENTRY;
 840
 841         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 842                 RETURN(ERR_PTR(-EINVAL));
 843
 844         if (file != NULL) {
 845                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 846                         RETURN(ERR_PTR(-EPERM));
 847
 848                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 849                 if (rc)
 850                         RETURN(ERR_PTR(rc));
 851         }
 852
 853         OBD_ALLOC_PTR(och);
 854         if (och == NULL)
 855                 RETURN(ERR_PTR(-ENOMEM));
 856
 857         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 858                                         LUSTRE_OPC_ANY, NULL);
 859         if (IS_ERR(op_data))
 860                 GOTO(out, rc = PTR_ERR(op_data));
 861
 862         /* To tell the MDT this openhandle is from the same owner */
 863         op_data->op_handle = old_handle;
 864
 865         it.it_flags = fmode | open_flags;
 866         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 867         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 868                             &ll_md_blocking_lease_ast,
 869         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 870          * it can be cancelled which may mislead applications that the lease is
 871          * broken;
 872          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 873          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 874          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 875                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 876         ll_finish_md_op_data(op_data);
 877         ptlrpc_req_finished(req);
 878         if (rc < 0)
 879                 GOTO(out_release_it, rc);
 880
 881         if (it_disposition(&it, DISP_LOOKUP_NEG))
 882                 GOTO(out_release_it, rc = -ENOENT);
 883
 884         rc = it_open_error(DISP_OPEN_OPEN, &it);
 885         if (rc)
 886                 GOTO(out_release_it, rc);
 887
 888         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 889         ll_och_fill(sbi->ll_md_exp, &it, och);
 890
 891         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 892                 GOTO(out_close, rc = -EOPNOTSUPP);
 893
 894         /* already get lease, handle lease lock */
 895         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 896         if (it.it_lock_mode == 0 ||
 897             it.it_lock_bits != MDS_INODELOCK_OPEN) {
 898                 /* open lock must return for lease */
 899                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 900                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
 901                         it.it_lock_bits);
 902                 GOTO(out_close, rc = -EPROTO);
 903         }
 904
 905         ll_intent_release(&it);
 906         RETURN(och);
 907
 908 out_close:
 909         /* Cancel open lock */
 910         if (it.it_lock_mode != 0) {
 911                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 912                                             it.it_lock_mode);
 913                 it.it_lock_mode = 0;
 914                 och->och_lease_handle.cookie = 0ULL;
 915         }
 916         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
 917         if (rc2 < 0)
 918                 CERROR("%s: error closing file "DFID": %d\n",
 919                        ll_get_fsname(inode->i_sb, NULL, 0),
 920                        PFID(&ll_i2info(inode)->lli_fid), rc2);
 921         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
 922 out_release_it:
 923         ll_intent_release(&it);
 924 out:
 925         if (och != NULL)
 926                 OBD_FREE_PTR(och);
 927         RETURN(ERR_PTR(rc));
 928 }
 929
 930 /**
 931  * Check whether a layout swap can be done between two inodes.
 932  *
 933  * \param[in] inode1  First inode to check
 934  * \param[in] inode2  Second inode to check
 935  *
 936  * \retval 0 on success, layout swap can be performed between both inodes
 937  * \retval negative error code if requirements are not met
 938  */
 939 static int ll_check_swap_layouts_validity(struct inode *inode1,
 940                                           struct inode *inode2)
 941 {
 942         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 943                 return -EINVAL;
 944
 945         if (inode_permission(inode1, MAY_WRITE) ||
 946             inode_permission(inode2, MAY_WRITE))
 947                 return -EPERM;
 948
 949         if (inode1->i_sb != inode2->i_sb)
 950                 return -EXDEV;
 951
 952         return 0;
 953 }
 954
 955 static int ll_swap_layouts_close(struct obd_client_handle *och,
 956                                  struct inode *inode, struct inode *inode2)
 957 {
 958         const struct lu_fid     *fid1 = ll_inode2fid(inode);
 959         const struct lu_fid     *fid2;
 960         int                      rc;
 961         ENTRY;
 962
 963         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
 964                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
 965
 966         rc = ll_check_swap_layouts_validity(inode, inode2);
 967         if (rc < 0)
 968                 GOTO(out_free_och, rc);
 969
 970         /* We now know that inode2 is a lustre inode */
 971         fid2 = ll_inode2fid(inode2);
 972
 973         rc = lu_fid_cmp(fid1, fid2);
 974         if (rc == 0)
 975                 GOTO(out_free_och, rc = -EINVAL);
 976
 977         /* Close the file and {swap,merge} layouts between inode & inode2.
 978          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
 979          * because we still need it to pack l_remote_handle to MDT. */
 980         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
 981                                        inode2);
 982
 983         och = NULL; /* freed in ll_close_inode_openhandle() */
 984
 985 out_free_och:
 986         if (och != NULL)
 987                 OBD_FREE_PTR(och);
 988
 989         RETURN(rc);
 990 }
 991
 992 /**
 993  * Release lease and close the file.
 994  * It will check if the lease has ever broken.
 995  */
 996 static int ll_lease_close_intent(struct obd_client_handle *och,
 997                                  struct inode *inode,
 998                                  bool *lease_broken, enum mds_op_bias bias,
 999                                  void *data)
1000 {
1001         struct ldlm_lock *lock;
1002         bool cancelled = true;
1003         int rc;
1004         ENTRY;
1005
1006         lock = ldlm_handle2lock(&och->och_lease_handle);
1007         if (lock != NULL) {
1008                 lock_res_and_lock(lock);
1009                 cancelled = ldlm_is_cancel(lock);
1010                 unlock_res_and_lock(lock);
1011                 LDLM_LOCK_PUT(lock);
1012         }
1013
1014         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1015                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1016
1017         if (lease_broken != NULL)
1018                 *lease_broken = cancelled;
1019
1020         if (!cancelled && !bias)
1021                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1022
1023         if (cancelled) { /* no need to excute intent */
1024                 bias = 0;
1025                 data = NULL;
1026         }
1027
1028         rc = ll_close_inode_openhandle(inode, och, bias, data);
1029         RETURN(rc);
1030 }
1031
1032 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1033                           bool *lease_broken)
1034 {
1035         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1036 }
1037
1038 /**
1039  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1040  */
1041 static int ll_lease_file_resync(struct obd_client_handle *och,
1042                                 struct inode *inode)
1043 {
1044         struct ll_sb_info *sbi = ll_i2sbi(inode);
1045         struct md_op_data *op_data;
1046         __u64 data_version_unused;
1047         int rc;
1048         ENTRY;
1049
1050         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1051                                      LUSTRE_OPC_ANY, NULL);
1052         if (IS_ERR(op_data))
1053                 RETURN(PTR_ERR(op_data));
1054
1055         /* before starting file resync, it's necessary to clean up page cache
1056          * in client memory, otherwise once the layout version is increased,
1057          * writing back cached data will be denied the OSTs. */
1058         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1059         if (rc)
1060                 GOTO(out, rc);
1061
1062         op_data->op_handle = och->och_lease_handle;
1063         rc = md_file_resync(sbi->ll_md_exp, op_data);
1064         if (rc)
1065                 GOTO(out, rc);
1066
1067         EXIT;
1068 out:
1069         ll_finish_md_op_data(op_data);
1070         return rc;
1071 }
1072
1073 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1074 {
1075         struct ll_inode_info *lli = ll_i2info(inode);
1076         struct cl_object *obj = lli->lli_clob;
1077         struct cl_attr *attr = vvp_env_thread_attr(env);
1078         s64 atime;
1079         s64 mtime;
1080         s64 ctime;
1081         int rc = 0;
1082
1083         ENTRY;
1084
1085         ll_inode_size_lock(inode);
1086
1087         /* Merge timestamps the most recently obtained from MDS with
1088          * timestamps obtained from OSTs.
1089          *
1090          * Do not overwrite atime of inode because it may be refreshed
1091          * by file_accessed() function. If the read was served by cache
1092          * data, there is no RPC to be sent so that atime may not be
1093          * transferred to OSTs at all. MDT only updates atime at close time
1094          * if it's at least 'mdd.*.atime_diff' older.
1095          * All in all, the atime in Lustre does not strictly comply with
1096          * POSIX. Solving this problem needs to send an RPC to MDT for each
1097          * read, this will hurt performance. */
1098         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1099                 LTIME_S(inode->i_atime) = lli->lli_atime;
1100                 lli->lli_update_atime = 0;
1101         }
1102         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1103         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1104
1105         atime = LTIME_S(inode->i_atime);
1106         mtime = LTIME_S(inode->i_mtime);
1107         ctime = LTIME_S(inode->i_ctime);
1108
1109         cl_object_attr_lock(obj);
1110         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1111                 rc = -EINVAL;
1112         else
1113                 rc = cl_object_attr_get(env, obj, attr);
1114         cl_object_attr_unlock(obj);
1115
1116         if (rc != 0)
1117                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1118
1119         if (atime < attr->cat_atime)
1120                 atime = attr->cat_atime;
1121
1122         if (ctime < attr->cat_ctime)
1123                 ctime = attr->cat_ctime;
1124
1125         if (mtime < attr->cat_mtime)
1126                 mtime = attr->cat_mtime;
1127
1128         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1129                PFID(&lli->lli_fid), attr->cat_size);
1130
1131         i_size_write(inode, attr->cat_size);
1132         inode->i_blocks = attr->cat_blocks;
1133
1134         LTIME_S(inode->i_atime) = atime;
1135         LTIME_S(inode->i_mtime) = mtime;
1136         LTIME_S(inode->i_ctime) = ctime;
1137
1138 out_size_unlock:
1139         ll_inode_size_unlock(inode);
1140
1141         RETURN(rc);
1142 }
1143
1144 /**
1145  * Set designated mirror for I/O.
1146  *
1147  * So far only read, write, and truncated can support to issue I/O to
1148  * designated mirror.
1149  */
1150 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1151 {
1152         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1153
1154         /* clear layout version for generic(non-resync) I/O in case it carries
1155          * stale layout version due to I/O restart */
1156         io->ci_layout_version = 0;
1157
1158         /* FLR: disable non-delay for designated mirror I/O because obviously
1159          * only one mirror is available */
1160         if (fd->fd_designated_mirror > 0) {
1161                 io->ci_ndelay = 0;
1162                 io->ci_designated_mirror = fd->fd_designated_mirror;
1163                 io->ci_layout_version = fd->fd_layout_version;
1164                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1165                                  * io to ptasks */
1166         }
1167
1168         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1169                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1170 }
1171
1172 static bool file_is_noatime(const struct file *file)
1173 {
1174         const struct vfsmount *mnt = file->f_path.mnt;
1175         const struct inode *inode = file_inode((struct file *)file);
1176
1177         /* Adapted from file_accessed() and touch_atime().*/
1178         if (file->f_flags & O_NOATIME)
1179                 return true;
1180
1181         if (inode->i_flags & S_NOATIME)
1182                 return true;
1183
1184         if (IS_NOATIME(inode))
1185                 return true;
1186
1187         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1188                 return true;
1189
1190         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1191                 return true;
1192
1193         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1194                 return true;
1195
1196         return false;
1197 }
1198
1199 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1200
1201 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1202 {
1203         struct inode *inode = file_inode(file);
1204         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1205
1206         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1207         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1208         io->u.ci_rw.rw_file = file;
1209         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1210         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1211         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1212
1213         if (iot == CIT_WRITE) {
1214                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1215                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1216                                            file->f_flags & O_DIRECT ||
1217                                            IS_SYNC(inode));
1218         }
1219         io->ci_obj = ll_i2info(inode)->lli_clob;
1220         io->ci_lockreq = CILR_MAYBE;
1221         if (ll_file_nolock(file)) {
1222                 io->ci_lockreq = CILR_NEVER;
1223                 io->ci_no_srvlock = 1;
1224         } else if (file->f_flags & O_APPEND) {
1225                 io->ci_lockreq = CILR_MANDATORY;
1226         }
1227         io->ci_noatime = file_is_noatime(file);
1228         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1229                 io->ci_pio = !io->u.ci_rw.rw_append;
1230         else
1231                 io->ci_pio = 0;
1232
1233         /* FLR: only use non-delay I/O for read as there is only one
1234          * avaliable mirror for write. */
1235         io->ci_ndelay = !(iot == CIT_WRITE);
1236
1237         ll_io_set_mirror(io, file);
1238 }
1239
1240 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1241 {
1242         struct cl_io_pt *pt = ptask->pt_cbdata;
1243         struct file *file = pt->cip_file;
1244         struct lu_env *env;
1245         struct cl_io *io;
1246         loff_t pos = pt->cip_pos;
1247         int rc;
1248         __u16 refcheck;
1249         ENTRY;
1250
1251         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1252                 file_dentry(file)->d_name.name,
1253                 pt->cip_iot == CIT_READ ? "read" : "write",
1254                 pos, pos + pt->cip_count);
1255
1256         env = cl_env_get(&refcheck);
1257         if (IS_ERR(env))
1258                 RETURN(PTR_ERR(env));
1259
1260         io = vvp_env_thread_io(env);
1261         ll_io_init(io, file, pt->cip_iot);
1262         io->u.ci_rw.rw_iter = pt->cip_iter;
1263         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1264         io->ci_pio = 0; /* It's already in parallel task */
1265
1266         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1267                            pt->cip_count - pt->cip_result);
1268         if (!rc) {
1269                 struct vvp_io *vio = vvp_env_io(env);
1270
1271                 vio->vui_io_subtype = IO_NORMAL;
1272                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1273
1274                 ll_cl_add(file, env, io, LCC_RW);
1275                 rc = cl_io_loop(env, io);
1276                 ll_cl_remove(file, env);
1277         } else {
1278                 /* cl_io_rw_init() handled IO */
1279                 rc = io->ci_result;
1280         }
1281
1282         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1283                 if (io->ci_nob > 0)
1284                         io->ci_nob /= 2;
1285                 rc = -EIO;
1286         }
1287
1288         if (io->ci_nob > 0) {
1289                 pt->cip_result += io->ci_nob;
1290                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1291                 pos += io->ci_nob;
1292                 pt->cip_iocb.ki_pos = pos;
1293 #ifdef HAVE_KIOCB_KI_LEFT
1294                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1295 #elif defined(HAVE_KI_NBYTES)
1296                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1297 #endif
1298         }
1299
1300         cl_io_fini(env, io);
1301         cl_env_put(env, &refcheck);
1302
1303         pt->cip_need_restart = io->ci_need_restart;
1304
1305         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1306                 file_dentry(file)->d_name.name,
1307                 pt->cip_iot == CIT_READ ? "read" : "write",
1308                 pt->cip_result, rc);
1309
1310         RETURN(pt->cip_result > 0 ? 0 : rc);
1311 }
1312
1313 static ssize_t
1314 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1315                    struct file *file, enum cl_io_type iot,
1316                    loff_t *ppos, size_t count)
1317 {
1318         struct range_lock       range;
1319         struct vvp_io           *vio = vvp_env_io(env);
1320         struct inode            *inode = file_inode(file);
1321         struct ll_inode_info    *lli = ll_i2info(inode);
1322         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1323         struct cl_io            *io;
1324         loff_t                  pos = *ppos;
1325         ssize_t                 result = 0;
1326         int                     rc = 0;
1327         unsigned                retried = 0;
1328         bool                    restarted = false;
1329
1330         ENTRY;
1331
1332         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1333                 file_dentry(file)->d_name.name,
1334                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1335
1336 restart:
1337         io = vvp_env_thread_io(env);
1338         ll_io_init(io, file, iot);
1339         if (args->via_io_subtype == IO_NORMAL) {
1340                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1341                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1342         }
1343         if (args->via_io_subtype != IO_NORMAL || restarted)
1344                 io->ci_pio = 0;
1345         io->ci_ndelay_tried = retried;
1346
1347         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1348                 bool range_locked = false;
1349
1350                 if (file->f_flags & O_APPEND)
1351                         range_lock_init(&range, 0, LUSTRE_EOF);
1352                 else
1353                         range_lock_init(&range, pos, pos + count - 1);
1354
1355                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1356                 vio->vui_io_subtype = args->via_io_subtype;
1357
1358                 switch (vio->vui_io_subtype) {
1359                 case IO_NORMAL:
1360                         /* Direct IO reads must also take range lock,
1361                          * or multiple reads will try to work on the same pages
1362                          * See LU-6227 for details. */
1363                         if (((iot == CIT_WRITE) ||
1364                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1365                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1366                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1367                                        RL_PARA(&range));
1368                                 rc = range_lock(&lli->lli_write_tree, &range);
1369                                 if (rc < 0)
1370                                         GOTO(out, rc);
1371
1372                                 range_locked = true;
1373                         }
1374                         break;
1375                 case IO_SPLICE:
1376                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1377                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1378                         break;
1379                 default:
1380                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1381                         LBUG();
1382                 }
1383
1384                 ll_cl_add(file, env, io, LCC_RW);
1385                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1386                     !lli->lli_inode_locked) {
1387                         inode_lock(inode);
1388                         lli->lli_inode_locked = 1;
1389                 }
1390                 rc = cl_io_loop(env, io);
1391                 if (lli->lli_inode_locked) {
1392                         lli->lli_inode_locked = 0;
1393                         inode_unlock(inode);
1394                 }
1395                 ll_cl_remove(file, env);
1396
1397                 if (range_locked) {
1398                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1399                                RL_PARA(&range));
1400                         range_unlock(&lli->lli_write_tree, &range);
1401                 }
1402         } else {
1403                 /* cl_io_rw_init() handled IO */
1404                 rc = io->ci_result;
1405         }
1406
1407         if (io->ci_nob > 0) {
1408                 result += io->ci_nob;
1409                 count  -= io->ci_nob;
1410
1411                 if (args->via_io_subtype == IO_NORMAL) {
1412                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1413
1414                         /* CLIO is too complicated. See LU-11069. */
1415                         if (cl_io_is_append(io))
1416                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1417                         else
1418                                 pos += io->ci_nob;
1419
1420                         args->u.normal.via_iocb->ki_pos = pos;
1421 #ifdef HAVE_KIOCB_KI_LEFT
1422                         args->u.normal.via_iocb->ki_left = count;
1423 #elif defined(HAVE_KI_NBYTES)
1424                         args->u.normal.via_iocb->ki_nbytes = count;
1425 #endif
1426                 } else {
1427                         /* for splice */
1428                         pos = io->u.ci_rw.rw_range.cir_pos;
1429                 }
1430         }
1431 out:
1432         cl_io_fini(env, io);
1433
1434         CDEBUG(D_VFSTRACE,
1435                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1436                file->f_path.dentry->d_name.name,
1437                iot, rc, result, io->ci_need_restart);
1438
1439         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1440                 CDEBUG(D_VFSTRACE,
1441                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1442                         file_dentry(file)->d_name.name,
1443                         iot == CIT_READ ? "read" : "write",
1444                         pos, pos + count, result, rc);
1445                 /* preserve the tried count for FLR */
1446                 retried = io->ci_ndelay_tried;
1447                 restarted = true;
1448                 goto restart;
1449         }
1450
1451         if (iot == CIT_READ) {
1452                 if (result > 0)
1453                         ll_stats_ops_tally(ll_i2sbi(inode),
1454                                            LPROC_LL_READ_BYTES, result);
1455         } else if (iot == CIT_WRITE) {
1456                 if (result > 0) {
1457                         ll_stats_ops_tally(ll_i2sbi(inode),
1458                                            LPROC_LL_WRITE_BYTES, result);
1459                         fd->fd_write_failed = false;
1460                 } else if (result == 0 && rc == 0) {
1461                         rc = io->ci_result;
1462                         if (rc < 0)
1463                                 fd->fd_write_failed = true;
1464                         else
1465                                 fd->fd_write_failed = false;
1466                 } else if (rc != -ERESTARTSYS) {
1467                         fd->fd_write_failed = true;
1468                 }
1469         }
1470
1471         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1472                 file_dentry(file)->d_name.name,
1473                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1474
1475         *ppos = pos;
1476
1477         RETURN(result > 0 ? result : rc);
1478 }
1479
1480 /**
1481  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1482  * especially for small I/O.
1483  *
1484  * To serve a read request, CLIO has to create and initialize a cl_io and
1485  * then request DLM lock. This has turned out to have siginificant overhead
1486  * and affects the performance of small I/O dramatically.
1487  *
1488  * It's not necessary to create a cl_io for each I/O. Under the help of read
1489  * ahead, most of the pages being read are already in memory cache and we can
1490  * read those pages directly because if the pages exist, the corresponding DLM
1491  * lock must exist so that page content must be valid.
1492  *
1493  * In fast read implementation, the llite speculatively finds and reads pages
1494  * in memory cache. There are three scenarios for fast read:
1495  *   - If the page exists and is uptodate, kernel VM will provide the data and
1496  *     CLIO won't be intervened;
1497  *   - If the page was brought into memory by read ahead, it will be exported
1498  *     and read ahead parameters will be updated;
1499  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1500  *     it will go back and invoke normal read, i.e., a cl_io will be created
1501  *     and DLM lock will be requested.
1502  *
1503  * POSIX compliance: posix standard states that read is intended to be atomic.
1504  * Lustre read implementation is in line with Linux kernel read implementation
1505  * and neither of them complies with POSIX standard in this matter. Fast read
1506  * doesn't make the situation worse on single node but it may interleave write
1507  * results from multiple nodes due to short read handling in ll_file_aio_read().
1508  *
1509  * \param env - lu_env
1510  * \param iocb - kiocb from kernel
1511  * \param iter - user space buffers where the data will be copied
1512  *
1513  * \retval - number of bytes have been read, or error code if error occurred.
1514  */
1515 static ssize_t
1516 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1517 {
1518         ssize_t result;
1519
1520         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1521                 return 0;
1522
1523         /* NB: we can't do direct IO for fast read because it will need a lock
1524          * to make IO engine happy. */
1525         if (iocb->ki_filp->f_flags & O_DIRECT)
1526                 return 0;
1527
1528         result = generic_file_read_iter(iocb, iter);
1529
1530         /* If the first page is not in cache, generic_file_aio_read() will be
1531          * returned with -ENODATA.
1532          * See corresponding code in ll_readpage(). */
1533         if (result == -ENODATA)
1534                 result = 0;
1535
1536         if (result > 0)
1537                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1538                                 LPROC_LL_READ_BYTES, result);
1539
1540         return result;
1541 }
1542
1543 /*
1544  * Read from a file (through the page cache).
1545  */
1546 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1547 {
1548         struct lu_env *env;
1549         struct vvp_io_args *args;
1550         ssize_t result;
1551         ssize_t rc2;
1552         __u16 refcheck;
1553
1554         result = ll_do_fast_read(iocb, to);
1555         if (result < 0 || iov_iter_count(to) == 0)
1556                 GOTO(out, result);
1557
1558         env = cl_env_get(&refcheck);
1559         if (IS_ERR(env))
1560                 return PTR_ERR(env);
1561
1562         args = ll_env_args(env, IO_NORMAL);
1563         args->u.normal.via_iter = to;
1564         args->u.normal.via_iocb = iocb;
1565
1566         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1567                                  &iocb->ki_pos, iov_iter_count(to));
1568         if (rc2 > 0)
1569                 result += rc2;
1570         else if (result == 0)
1571                 result = rc2;
1572
1573         cl_env_put(env, &refcheck);
1574 out:
1575         return result;
1576 }
1577
1578 /**
1579  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1580  * If a page is already in the page cache and dirty (and some other things -
1581  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1582  * write to it without doing a full I/O, because Lustre already knows about it
1583  * and will write it out.  This saves a lot of processing time.
1584  *
1585  * All writes here are within one page, so exclusion is handled by the page
1586  * lock on the vm page.  We do not do tiny writes for writes which touch
1587  * multiple pages because it's very unlikely multiple sequential pages are
1588  * are already dirty.
1589  *
1590  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1591  * and are unlikely to be to already dirty pages.
1592  *
1593  * Attribute updates are important here, we do them in ll_tiny_write_end.
1594  */
1595 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1596 {
1597         ssize_t count = iov_iter_count(iter);
1598         struct file *file = iocb->ki_filp;
1599         struct inode *inode = file_inode(file);
1600         ssize_t result = 0;
1601
1602         ENTRY;
1603
1604         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1605          * of function for why.
1606          */
1607         if (count >= PAGE_SIZE ||
1608             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1609                 RETURN(0);
1610
1611         result = __generic_file_write_iter(iocb, iter);
1612
1613         /* If the page is not already dirty, ll_tiny_write_begin returns
1614          * -ENODATA.  We continue on to normal write.
1615          */
1616         if (result == -ENODATA)
1617                 result = 0;
1618
1619         if (result > 0) {
1620                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1621                                    result);
1622                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1623         }
1624
1625         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1626
1627         RETURN(result);
1628 }
1629
1630 /*
1631  * Write to a file (through the page cache).
1632  */
1633 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1634 {
1635         struct vvp_io_args *args;
1636         struct lu_env *env;
1637         ssize_t rc_tiny = 0, rc_normal;
1638         __u16 refcheck;
1639
1640         ENTRY;
1641
1642         /* NB: we can't do direct IO for tiny writes because they use the page
1643          * cache, we can't do sync writes because tiny writes can't flush
1644          * pages, and we can't do append writes because we can't guarantee the
1645          * required DLM locks are held to protect file size.
1646          */
1647         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1648             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1649                 rc_tiny = ll_do_tiny_write(iocb, from);
1650
1651         /* In case of error, go on and try normal write - Only stop if tiny
1652          * write completed I/O.
1653          */
1654         if (iov_iter_count(from) == 0)
1655                 GOTO(out, rc_normal = rc_tiny);
1656
1657         env = cl_env_get(&refcheck);
1658         if (IS_ERR(env))
1659                 return PTR_ERR(env);
1660
1661         args = ll_env_args(env, IO_NORMAL);
1662         args->u.normal.via_iter = from;
1663         args->u.normal.via_iocb = iocb;
1664
1665         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1666                                     &iocb->ki_pos, iov_iter_count(from));
1667
1668         /* On success, combine bytes written. */
1669         if (rc_tiny >= 0 && rc_normal > 0)
1670                 rc_normal += rc_tiny;
1671         /* On error, only return error from normal write if tiny write did not
1672          * write any bytes.  Otherwise return bytes written by tiny write.
1673          */
1674         else if (rc_tiny > 0)
1675                 rc_normal = rc_tiny;
1676
1677         cl_env_put(env, &refcheck);
1678 out:
1679         RETURN(rc_normal);
1680 }
1681
1682 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1683 /*
1684  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1685  */
1686 static int ll_file_get_iov_count(const struct iovec *iov,
1687                                  unsigned long *nr_segs, size_t *count)
1688 {
1689         size_t cnt = 0;
1690         unsigned long seg;
1691
1692         for (seg = 0; seg < *nr_segs; seg++) {
1693                 const struct iovec *iv = &iov[seg];
1694
1695                 /*
1696                  * If any segment has a negative length, or the cumulative
1697                  * length ever wraps negative then return -EINVAL.
1698                  */
1699                 cnt += iv->iov_len;
1700                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1701                         return -EINVAL;
1702                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1703                         continue;
1704                 if (seg == 0)
1705                         return -EFAULT;
1706                 *nr_segs = seg;
1707                 cnt -= iv->iov_len;     /* This segment is no good */
1708                 break;
1709         }
1710         *count = cnt;
1711         return 0;
1712 }
1713
1714 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1715                                 unsigned long nr_segs, loff_t pos)
1716 {
1717         struct iov_iter to;
1718         size_t iov_count;
1719         ssize_t result;
1720         ENTRY;
1721
1722         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1723         if (result)
1724                 RETURN(result);
1725
1726 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1727         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1728 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1729         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1730 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1731
1732         result = ll_file_read_iter(iocb, &to);
1733
1734         RETURN(result);
1735 }
1736
1737 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1738                             loff_t *ppos)
1739 {
1740         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1741         struct kiocb   kiocb;
1742         ssize_t        result;
1743         ENTRY;
1744
1745         init_sync_kiocb(&kiocb, file);
1746         kiocb.ki_pos = *ppos;
1747 #ifdef HAVE_KIOCB_KI_LEFT
1748         kiocb.ki_left = count;
1749 #elif defined(HAVE_KI_NBYTES)
1750         kiocb.i_nbytes = count;
1751 #endif
1752
1753         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1754         *ppos = kiocb.ki_pos;
1755
1756         RETURN(result);
1757 }
1758
1759 /*
1760  * Write to a file (through the page cache).
1761  * AIO stuff
1762  */
1763 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1764                                  unsigned long nr_segs, loff_t pos)
1765 {
1766         struct iov_iter from;
1767         size_t iov_count;
1768         ssize_t result;
1769         ENTRY;
1770
1771         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1772         if (result)
1773                 RETURN(result);
1774
1775 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1776         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1777 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1778         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1779 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1780
1781         result = ll_file_write_iter(iocb, &from);
1782
1783         RETURN(result);
1784 }
1785
1786 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1787                              size_t count, loff_t *ppos)
1788 {
1789         struct iovec   iov = { .iov_base = (void __user *)buf,
1790                                .iov_len = count };
1791         struct kiocb   kiocb;
1792         ssize_t        result;
1793
1794         ENTRY;
1795
1796         init_sync_kiocb(&kiocb, file);
1797         kiocb.ki_pos = *ppos;
1798 #ifdef HAVE_KIOCB_KI_LEFT
1799         kiocb.ki_left = count;
1800 #elif defined(HAVE_KI_NBYTES)
1801         kiocb.ki_nbytes = count;
1802 #endif
1803
1804         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1805         *ppos = kiocb.ki_pos;
1806
1807         RETURN(result);
1808 }
1809 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1810
1811 /*
1812  * Send file content (through pagecache) somewhere with helper
1813  */
1814 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1815                                    struct pipe_inode_info *pipe, size_t count,
1816                                    unsigned int flags)
1817 {
1818         struct lu_env      *env;
1819         struct vvp_io_args *args;
1820         ssize_t             result;
1821         __u16               refcheck;
1822         ENTRY;
1823
1824         env = cl_env_get(&refcheck);
1825         if (IS_ERR(env))
1826                 RETURN(PTR_ERR(env));
1827
1828         args = ll_env_args(env, IO_SPLICE);
1829         args->u.splice.via_pipe = pipe;
1830         args->u.splice.via_flags = flags;
1831
1832         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1833         cl_env_put(env, &refcheck);
1834         RETURN(result);
1835 }
1836
1837 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1838                              __u64 flags, struct lov_user_md *lum, int lum_size)
1839 {
1840         struct lookup_intent oit = {
1841                 .it_op = IT_OPEN,
1842                 .it_flags = flags | MDS_OPEN_BY_FID,
1843         };
1844         int rc;
1845         ENTRY;
1846
1847         ll_inode_size_lock(inode);
1848         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1849         if (rc < 0)
1850                 GOTO(out_unlock, rc);
1851
1852         ll_release_openhandle(dentry, &oit);
1853
1854 out_unlock:
1855         ll_inode_size_unlock(inode);
1856         ll_intent_release(&oit);
1857
1858         RETURN(rc);
1859 }
1860
1861 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1862                              struct lov_mds_md **lmmp, int *lmm_size,
1863                              struct ptlrpc_request **request)
1864 {
1865         struct ll_sb_info *sbi = ll_i2sbi(inode);
1866         struct mdt_body  *body;
1867         struct lov_mds_md *lmm = NULL;
1868         struct ptlrpc_request *req = NULL;
1869         struct md_op_data *op_data;
1870         int rc, lmmsize;
1871
1872         rc = ll_get_default_mdsize(sbi, &lmmsize);
1873         if (rc)
1874                 RETURN(rc);
1875
1876         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1877                                      strlen(filename), lmmsize,
1878                                      LUSTRE_OPC_ANY, NULL);
1879         if (IS_ERR(op_data))
1880                 RETURN(PTR_ERR(op_data));
1881
1882         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1883         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1884         ll_finish_md_op_data(op_data);
1885         if (rc < 0) {
1886                 CDEBUG(D_INFO, "md_getattr_name failed "
1887                        "on %s: rc %d\n", filename, rc);
1888                 GOTO(out, rc);
1889         }
1890
1891         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1892         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1893
1894         lmmsize = body->mbo_eadatasize;
1895
1896         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1897                         lmmsize == 0) {
1898                 GOTO(out, rc = -ENODATA);
1899         }
1900
1901         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1902         LASSERT(lmm != NULL);
1903
1904         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1905             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1906             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1907                 GOTO(out, rc = -EPROTO);
1908
1909         /*
1910          * This is coming from the MDS, so is probably in
1911          * little endian.  We convert it to host endian before
1912          * passing it to userspace.
1913          */
1914         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1915                 int stripe_count;
1916
1917                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1918                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1919                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1920                         if (le32_to_cpu(lmm->lmm_pattern) &
1921                             LOV_PATTERN_F_RELEASED)
1922                                 stripe_count = 0;
1923                 }
1924
1925                 /* if function called for directory - we should
1926                  * avoid swab not existent lsm objects */
1927                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1928                         lustre_swab_lov_user_md_v1(
1929                                         (struct lov_user_md_v1 *)lmm);
1930                         if (S_ISREG(body->mbo_mode))
1931                                 lustre_swab_lov_user_md_objects(
1932                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1933                                     stripe_count);
1934                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1935                         lustre_swab_lov_user_md_v3(
1936                                         (struct lov_user_md_v3 *)lmm);
1937                         if (S_ISREG(body->mbo_mode))
1938                                 lustre_swab_lov_user_md_objects(
1939                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1940                                     stripe_count);
1941                 } else if (lmm->lmm_magic ==
1942                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1943                         lustre_swab_lov_comp_md_v1(
1944                                         (struct lov_comp_md_v1 *)lmm);
1945                 }
1946         }
1947
1948 out:
1949         *lmmp = lmm;
1950         *lmm_size = lmmsize;
1951         *request = req;
1952         return rc;
1953 }
1954
1955 static int ll_lov_setea(struct inode *inode, struct file *file,
1956                         void __user *arg)
1957 {
1958         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1959         struct lov_user_md      *lump;
1960         int                      lum_size = sizeof(struct lov_user_md) +
1961                                             sizeof(struct lov_user_ost_data);
1962         int                      rc;
1963         ENTRY;
1964
1965         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1966                 RETURN(-EPERM);
1967
1968         OBD_ALLOC_LARGE(lump, lum_size);
1969         if (lump == NULL)
1970                 RETURN(-ENOMEM);
1971
1972         if (copy_from_user(lump, arg, lum_size))
1973                 GOTO(out_lump, rc = -EFAULT);
1974
1975         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1976                                       lum_size);
1977         cl_lov_delay_create_clear(&file->f_flags);
1978
1979 out_lump:
1980         OBD_FREE_LARGE(lump, lum_size);
1981         RETURN(rc);
1982 }
1983
1984 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1985 {
1986         struct lu_env   *env;
1987         __u16           refcheck;
1988         int             rc;
1989         ENTRY;
1990
1991         env = cl_env_get(&refcheck);
1992         if (IS_ERR(env))
1993                 RETURN(PTR_ERR(env));
1994
1995         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
1996         cl_env_put(env, &refcheck);
1997         RETURN(rc);
1998 }
1999
2000 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2001                             void __user *arg)
2002 {
2003         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2004         struct lov_user_md        *klum;
2005         int                        lum_size, rc;
2006         __u64                      flags = FMODE_WRITE;
2007         ENTRY;
2008
2009         rc = ll_copy_user_md(lum, &klum);
2010         if (rc < 0)
2011                 RETURN(rc);
2012
2013         lum_size = rc;
2014         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2015                                       lum_size);
2016         if (!rc) {
2017                 __u32 gen;
2018
2019                 rc = put_user(0, &lum->lmm_stripe_count);
2020                 if (rc)
2021                         GOTO(out, rc);
2022
2023                 rc = ll_layout_refresh(inode, &gen);
2024                 if (rc)
2025                         GOTO(out, rc);
2026
2027                 rc = ll_file_getstripe(inode, arg, lum_size);
2028         }
2029         cl_lov_delay_create_clear(&file->f_flags);
2030
2031 out:
2032         OBD_FREE(klum, lum_size);
2033         RETURN(rc);
2034 }
2035
2036 static int
2037 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2038 {
2039         struct ll_inode_info *lli = ll_i2info(inode);
2040         struct cl_object *obj = lli->lli_clob;
2041         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2042         struct ll_grouplock grouplock;
2043         int rc;
2044         ENTRY;
2045
2046         if (arg == 0) {
2047                 CWARN("group id for group lock must not be 0\n");
2048                 RETURN(-EINVAL);
2049         }
2050
2051         if (ll_file_nolock(file))
2052                 RETURN(-EOPNOTSUPP);
2053
2054         spin_lock(&lli->lli_lock);
2055         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2056                 CWARN("group lock already existed with gid %lu\n",
2057                       fd->fd_grouplock.lg_gid);
2058                 spin_unlock(&lli->lli_lock);
2059                 RETURN(-EINVAL);
2060         }
2061         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2062         spin_unlock(&lli->lli_lock);
2063
2064         /**
2065          * XXX: group lock needs to protect all OST objects while PFL
2066          * can add new OST objects during the IO, so we'd instantiate
2067          * all OST objects before getting its group lock.
2068          */
2069         if (obj) {
2070                 struct lu_env *env;
2071                 __u16 refcheck;
2072                 struct cl_layout cl = {
2073                         .cl_is_composite = false,
2074                 };
2075                 struct lu_extent ext = {
2076                         .e_start = 0,
2077                         .e_end = OBD_OBJECT_EOF,
2078                 };
2079
2080                 env = cl_env_get(&refcheck);
2081                 if (IS_ERR(env))
2082                         RETURN(PTR_ERR(env));
2083
2084                 rc = cl_object_layout_get(env, obj, &cl);
2085                 if (!rc && cl.cl_is_composite)
2086                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2087                                                     &ext);
2088
2089                 cl_env_put(env, &refcheck);
2090                 if (rc)
2091                         RETURN(rc);
2092         }
2093
2094         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2095                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2096         if (rc)
2097                 RETURN(rc);
2098
2099         spin_lock(&lli->lli_lock);
2100         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2101                 spin_unlock(&lli->lli_lock);
2102                 CERROR("another thread just won the race\n");
2103                 cl_put_grouplock(&grouplock);
2104                 RETURN(-EINVAL);
2105         }
2106
2107         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2108         fd->fd_grouplock = grouplock;
2109         spin_unlock(&lli->lli_lock);
2110
2111         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2112         RETURN(0);
2113 }
2114
2115 static int ll_put_grouplock(struct inode *inode, struct file *file,
2116                             unsigned long arg)
2117 {
2118         struct ll_inode_info   *lli = ll_i2info(inode);
2119         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2120         struct ll_grouplock     grouplock;
2121         ENTRY;
2122
2123         spin_lock(&lli->lli_lock);
2124         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2125                 spin_unlock(&lli->lli_lock);
2126                 CWARN("no group lock held\n");
2127                 RETURN(-EINVAL);
2128         }
2129
2130         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2131
2132         if (fd->fd_grouplock.lg_gid != arg) {
2133                 CWARN("group lock %lu doesn't match current id %lu\n",
2134                       arg, fd->fd_grouplock.lg_gid);
2135                 spin_unlock(&lli->lli_lock);
2136                 RETURN(-EINVAL);
2137         }
2138
2139         grouplock = fd->fd_grouplock;
2140         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2141         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2142         spin_unlock(&lli->lli_lock);
2143
2144         cl_put_grouplock(&grouplock);
2145         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2146         RETURN(0);
2147 }
2148
2149 /**
2150  * Close inode open handle
2151  *
2152  * \param dentry [in]     dentry which contains the inode
2153  * \param it     [in,out] intent which contains open info and result
2154  *
2155  * \retval 0     success
2156  * \retval <0    failure
2157  */
2158 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2159 {
2160         struct inode *inode = dentry->d_inode;
2161         struct obd_client_handle *och;
2162         int rc;
2163         ENTRY;
2164
2165         LASSERT(inode);
2166
2167         /* Root ? Do nothing. */
2168         if (dentry->d_inode->i_sb->s_root == dentry)
2169                 RETURN(0);
2170
2171         /* No open handle to close? Move away */
2172         if (!it_disposition(it, DISP_OPEN_OPEN))
2173                 RETURN(0);
2174
2175         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2176
2177         OBD_ALLOC(och, sizeof(*och));
2178         if (!och)
2179                 GOTO(out, rc = -ENOMEM);
2180
2181         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2182
2183         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2184 out:
2185         /* this one is in place of ll_file_open */
2186         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2187                 ptlrpc_req_finished(it->it_request);
2188                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2189         }
2190         RETURN(rc);
2191 }
2192
2193 /**
2194  * Get size for inode for which FIEMAP mapping is requested.
2195  * Make the FIEMAP get_info call and returns the result.
2196  * \param fiemap        kernel buffer to hold extens
2197  * \param num_bytes     kernel buffer size
2198  */
2199 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2200                         size_t num_bytes)
2201 {
2202         struct lu_env                   *env;
2203         __u16                           refcheck;
2204         int                             rc = 0;
2205         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2206         ENTRY;
2207
2208         /* Checks for fiemap flags */
2209         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2210                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2211                 return -EBADR;
2212         }
2213
2214         /* Check for FIEMAP_FLAG_SYNC */
2215         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2216                 rc = filemap_fdatawrite(inode->i_mapping);
2217                 if (rc)
2218                         return rc;
2219         }
2220
2221         env = cl_env_get(&refcheck);
2222         if (IS_ERR(env))
2223                 RETURN(PTR_ERR(env));
2224
2225         if (i_size_read(inode) == 0) {
2226                 rc = ll_glimpse_size(inode);
2227                 if (rc)
2228                         GOTO(out, rc);
2229         }
2230
2231         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2232         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2233         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2234
2235         /* If filesize is 0, then there would be no objects for mapping */
2236         if (fmkey.lfik_oa.o_size == 0) {
2237                 fiemap->fm_mapped_extents = 0;
2238                 GOTO(out, rc = 0);
2239         }
2240
2241         fmkey.lfik_fiemap = *fiemap;
2242
2243         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2244                               &fmkey, fiemap, &num_bytes);
2245 out:
2246         cl_env_put(env, &refcheck);
2247         RETURN(rc);
2248 }
2249
2250 int ll_fid2path(struct inode *inode, void __user *arg)
2251 {
2252         struct obd_export       *exp = ll_i2mdexp(inode);
2253         const struct getinfo_fid2path __user *gfin = arg;
2254         __u32                    pathlen;
2255         struct getinfo_fid2path *gfout;
2256         size_t                   outsize;
2257         int                      rc;
2258
2259         ENTRY;
2260
2261         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2262             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2263                 RETURN(-EPERM);
2264
2265         /* Only need to get the buflen */
2266         if (get_user(pathlen, &gfin->gf_pathlen))
2267                 RETURN(-EFAULT);
2268
2269         if (pathlen > PATH_MAX)
2270                 RETURN(-EINVAL);
2271
2272         outsize = sizeof(*gfout) + pathlen;
2273         OBD_ALLOC(gfout, outsize);
2274         if (gfout == NULL)
2275                 RETURN(-ENOMEM);
2276
2277         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2278                 GOTO(gf_free, rc = -EFAULT);
2279         /* append root FID after gfout to let MDT know the root FID so that it
2280          * can lookup the correct path, this is mainly for fileset.
2281          * old server without fileset mount support will ignore this. */
2282         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2283
2284         /* Call mdc_iocontrol */
2285         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2286         if (rc != 0)
2287                 GOTO(gf_free, rc);
2288
2289         if (copy_to_user(arg, gfout, outsize))
2290                 rc = -EFAULT;
2291
2292 gf_free:
2293         OBD_FREE(gfout, outsize);
2294         RETURN(rc);
2295 }
2296
2297 static int
2298 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2299 {
2300         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2301         struct lu_env *env;
2302         struct cl_io *io;
2303         __u16  refcheck;
2304         int result;
2305
2306         ENTRY;
2307
2308         ioc->idv_version = 0;
2309         ioc->idv_layout_version = UINT_MAX;
2310
2311         /* If no file object initialized, we consider its version is 0. */
2312         if (obj == NULL)
2313                 RETURN(0);
2314
2315         env = cl_env_get(&refcheck);
2316         if (IS_ERR(env))
2317                 RETURN(PTR_ERR(env));
2318
2319         io = vvp_env_thread_io(env);
2320         io->ci_obj = obj;
2321         io->u.ci_data_version.dv_data_version = 0;
2322         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2323         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2324
2325 restart:
2326         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2327                 result = cl_io_loop(env, io);
2328         else
2329                 result = io->ci_result;
2330
2331         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2332         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2333
2334         cl_io_fini(env, io);
2335
2336         if (unlikely(io->ci_need_restart))
2337                 goto restart;
2338
2339         cl_env_put(env, &refcheck);
2340
2341         RETURN(result);
2342 }
2343
2344 /*
2345  * Read the data_version for inode.
2346  *
2347  * This value is computed using stripe object version on OST.
2348  * Version is computed using server side locking.
2349  *
2350  * @param flags if do sync on the OST side;
2351  *              0: no sync
2352  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2353  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2354  */
2355 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2356 {
2357         struct ioc_data_version ioc = { .idv_flags = flags };
2358         int rc;
2359
2360         rc = ll_ioc_data_version(inode, &ioc);
2361         if (!rc)
2362                 *data_version = ioc.idv_version;
2363
2364         return rc;
2365 }
2366
2367 /*
2368  * Trigger a HSM release request for the provided inode.
2369  */
2370 int ll_hsm_release(struct inode *inode)
2371 {
2372         struct lu_env *env;
2373         struct obd_client_handle *och = NULL;
2374         __u64 data_version = 0;
2375         int rc;
2376         __u16 refcheck;
2377         ENTRY;
2378
2379         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2380                ll_get_fsname(inode->i_sb, NULL, 0),
2381                PFID(&ll_i2info(inode)->lli_fid));
2382
2383         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2384         if (IS_ERR(och))
2385                 GOTO(out, rc = PTR_ERR(och));
2386
2387         /* Grab latest data_version and [am]time values */
2388         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2389         if (rc != 0)
2390                 GOTO(out, rc);
2391
2392         env = cl_env_get(&refcheck);
2393         if (IS_ERR(env))
2394                 GOTO(out, rc = PTR_ERR(env));
2395
2396         rc = ll_merge_attr(env, inode);
2397         cl_env_put(env, &refcheck);
2398
2399         /* If error happen, we have the wrong size for a file.
2400          * Don't release it.
2401          */
2402         if (rc != 0)
2403                 GOTO(out, rc);
2404
2405         /* Release the file.
2406          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2407          * we still need it to pack l_remote_handle to MDT. */
2408         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2409                                        &data_version);
2410         och = NULL;
2411
2412         EXIT;
2413 out:
2414         if (och != NULL && !IS_ERR(och)) /* close the file */
2415                 ll_lease_close(och, inode, NULL);
2416
2417         return rc;
2418 }
2419
2420 struct ll_swap_stack {
2421         __u64                    dv1;
2422         __u64                    dv2;
2423         struct inode            *inode1;
2424         struct inode            *inode2;
2425         bool                     check_dv1;
2426         bool                     check_dv2;
2427 };
2428
2429 static int ll_swap_layouts(struct file *file1, struct file *file2,
2430                            struct lustre_swap_layouts *lsl)
2431 {
2432         struct mdc_swap_layouts  msl;
2433         struct md_op_data       *op_data;
2434         __u32                    gid;
2435         __u64                    dv;
2436         struct ll_swap_stack    *llss = NULL;
2437         int                      rc;
2438
2439         OBD_ALLOC_PTR(llss);
2440         if (llss == NULL)
2441                 RETURN(-ENOMEM);
2442
2443         llss->inode1 = file_inode(file1);
2444         llss->inode2 = file_inode(file2);
2445
2446         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2447         if (rc < 0)
2448                 GOTO(free, rc);
2449
2450         /* we use 2 bool because it is easier to swap than 2 bits */
2451         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2452                 llss->check_dv1 = true;
2453
2454         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2455                 llss->check_dv2 = true;
2456
2457         /* we cannot use lsl->sl_dvX directly because we may swap them */
2458         llss->dv1 = lsl->sl_dv1;
2459         llss->dv2 = lsl->sl_dv2;
2460
2461         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2462         if (rc == 0) /* same file, done! */
2463                 GOTO(free, rc);
2464
2465         if (rc < 0) { /* sequentialize it */
2466                 swap(llss->inode1, llss->inode2);
2467                 swap(file1, file2);
2468                 swap(llss->dv1, llss->dv2);
2469                 swap(llss->check_dv1, llss->check_dv2);
2470         }
2471
2472         gid = lsl->sl_gid;
2473         if (gid != 0) { /* application asks to flush dirty cache */
2474                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2475                 if (rc < 0)
2476                         GOTO(free, rc);
2477
2478                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2479                 if (rc < 0) {
2480                         ll_put_grouplock(llss->inode1, file1, gid);
2481                         GOTO(free, rc);
2482                 }
2483         }
2484
2485         /* ultimate check, before swaping the layouts we check if
2486          * dataversion has changed (if requested) */
2487         if (llss->check_dv1) {
2488                 rc = ll_data_version(llss->inode1, &dv, 0);
2489                 if (rc)
2490                         GOTO(putgl, rc);
2491                 if (dv != llss->dv1)
2492                         GOTO(putgl, rc = -EAGAIN);
2493         }
2494
2495         if (llss->check_dv2) {
2496                 rc = ll_data_version(llss->inode2, &dv, 0);
2497                 if (rc)
2498                         GOTO(putgl, rc);
2499                 if (dv != llss->dv2)
2500                         GOTO(putgl, rc = -EAGAIN);
2501         }
2502
2503         /* struct md_op_data is used to send the swap args to the mdt
2504          * only flags is missing, so we use struct mdc_swap_layouts
2505          * through the md_op_data->op_data */
2506         /* flags from user space have to be converted before they are send to
2507          * server, no flag is sent today, they are only used on the client */
2508         msl.msl_flags = 0;
2509         rc = -ENOMEM;
2510         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2511                                      0, LUSTRE_OPC_ANY, &msl);
2512         if (IS_ERR(op_data))
2513                 GOTO(free, rc = PTR_ERR(op_data));
2514
2515         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2516                            sizeof(*op_data), op_data, NULL);
2517         ll_finish_md_op_data(op_data);
2518
2519         if (rc < 0)
2520                 GOTO(putgl, rc);
2521
2522 putgl:
2523         if (gid != 0) {
2524                 ll_put_grouplock(llss->inode2, file2, gid);
2525                 ll_put_grouplock(llss->inode1, file1, gid);
2526         }
2527
2528 free:
2529         if (llss != NULL)
2530                 OBD_FREE_PTR(llss);
2531
2532         RETURN(rc);
2533 }
2534
2535 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2536 {
2537         struct md_op_data       *op_data;
2538         int                      rc;
2539         ENTRY;
2540
2541         /* Detect out-of range masks */
2542         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2543                 RETURN(-EINVAL);
2544
2545         /* Non-root users are forbidden to set or clear flags which are
2546          * NOT defined in HSM_USER_MASK. */
2547         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2548             !cfs_capable(CFS_CAP_SYS_ADMIN))
2549                 RETURN(-EPERM);
2550
2551         /* Detect out-of range archive id */
2552         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2553             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2554                 RETURN(-EINVAL);
2555
2556         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2557                                      LUSTRE_OPC_ANY, hss);
2558         if (IS_ERR(op_data))
2559                 RETURN(PTR_ERR(op_data));
2560
2561         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2562                            sizeof(*op_data), op_data, NULL);
2563
2564         ll_finish_md_op_data(op_data);
2565
2566         RETURN(rc);
2567 }
2568
2569 static int ll_hsm_import(struct inode *inode, struct file *file,
2570                          struct hsm_user_import *hui)
2571 {
2572         struct hsm_state_set    *hss = NULL;
2573         struct iattr            *attr = NULL;
2574         int                      rc;
2575         ENTRY;
2576
2577         if (!S_ISREG(inode->i_mode))
2578                 RETURN(-EINVAL);
2579
2580         /* set HSM flags */
2581         OBD_ALLOC_PTR(hss);
2582         if (hss == NULL)
2583                 GOTO(out, rc = -ENOMEM);
2584
2585         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2586         hss->hss_archive_id = hui->hui_archive_id;
2587         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2588         rc = ll_hsm_state_set(inode, hss);
2589         if (rc != 0)
2590                 GOTO(out, rc);
2591
2592         OBD_ALLOC_PTR(attr);
2593         if (attr == NULL)
2594                 GOTO(out, rc = -ENOMEM);
2595
2596         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2597         attr->ia_mode |= S_IFREG;
2598         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2599         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2600         attr->ia_size = hui->hui_size;
2601         attr->ia_mtime.tv_sec = hui->hui_mtime;
2602         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2603         attr->ia_atime.tv_sec = hui->hui_atime;
2604         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2605
2606         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2607                          ATTR_UID | ATTR_GID |
2608                          ATTR_MTIME | ATTR_MTIME_SET |
2609                          ATTR_ATIME | ATTR_ATIME_SET;
2610
2611         inode_lock(inode);
2612
2613         rc = ll_setattr_raw(file_dentry(file), attr, true);
2614         if (rc == -ENODATA)
2615                 rc = 0;
2616
2617         inode_unlock(inode);
2618
2619 out:
2620         if (hss != NULL)
2621                 OBD_FREE_PTR(hss);
2622
2623         if (attr != NULL)
2624                 OBD_FREE_PTR(attr);
2625
2626         RETURN(rc);
2627 }
2628
2629 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2630 {
2631         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2632                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2633 }
2634
2635 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2636 {
2637         struct inode *inode = file_inode(file);
2638         struct iattr ia = {
2639                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2640                             ATTR_MTIME | ATTR_MTIME_SET |
2641                             ATTR_CTIME | ATTR_CTIME_SET,
2642                 .ia_atime = {
2643                         .tv_sec = lfu->lfu_atime_sec,
2644                         .tv_nsec = lfu->lfu_atime_nsec,
2645                 },
2646                 .ia_mtime = {
2647                         .tv_sec = lfu->lfu_mtime_sec,
2648                         .tv_nsec = lfu->lfu_mtime_nsec,
2649                 },
2650                 .ia_ctime = {
2651                         .tv_sec = lfu->lfu_ctime_sec,
2652                         .tv_nsec = lfu->lfu_ctime_nsec,
2653                 },
2654         };
2655         int rc;
2656         ENTRY;
2657
2658         if (!capable(CAP_SYS_ADMIN))
2659                 RETURN(-EPERM);
2660
2661         if (!S_ISREG(inode->i_mode))
2662                 RETURN(-EINVAL);
2663
2664         inode_lock(inode);
2665         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2666         inode_unlock(inode);
2667
2668         RETURN(rc);
2669 }
2670
2671 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2672 {
2673         switch (mode) {
2674         case MODE_READ_USER:
2675                 return CLM_READ;
2676         case MODE_WRITE_USER:
2677                 return CLM_WRITE;
2678         default:
2679                 return -EINVAL;
2680         }
2681 }
2682
2683 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2684
2685 /* Used to allow the upper layers of the client to request an LDLM lock
2686  * without doing an actual read or write.
2687  *
2688  * Used for ladvise lockahead to manually request specific locks.
2689  *
2690  * \param[in] file      file this ladvise lock request is on
2691  * \param[in] ladvise   ladvise struct describing this lock request
2692  *
2693  * \retval 0            success, no detailed result available (sync requests
2694  *                      and requests sent to the server [not handled locally]
2695  *                      cannot return detailed results)
2696  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2697  *                                       see definitions for details.
2698  * \retval negative     negative errno on error
2699  */
2700 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2701 {
2702         struct lu_env *env = NULL;
2703         struct cl_io *io  = NULL;
2704         struct cl_lock *lock = NULL;
2705         struct cl_lock_descr *descr = NULL;
2706         struct dentry *dentry = file->f_path.dentry;
2707         struct inode *inode = dentry->d_inode;
2708         enum cl_lock_mode cl_mode;
2709         off_t start = ladvise->lla_start;
2710         off_t end = ladvise->lla_end;
2711         int result;
2712         __u16 refcheck;
2713
2714         ENTRY;
2715
2716         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2717                "start=%llu, end=%llu\n", dentry->d_name.len,
2718                dentry->d_name.name, dentry->d_inode,
2719                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2720                (__u64) end);
2721
2722         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2723         if (cl_mode < 0)
2724                 GOTO(out, result = cl_mode);
2725
2726         /* Get IO environment */
2727         result = cl_io_get(inode, &env, &io, &refcheck);
2728         if (result <= 0)
2729                 GOTO(out, result);
2730
2731         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2732         if (result > 0) {
2733                 /*
2734                  * nothing to do for this io. This currently happens when
2735                  * stripe sub-object's are not yet created.
2736                  */
2737                 result = io->ci_result;
2738         } else if (result == 0) {
2739                 lock = vvp_env_lock(env);
2740                 descr = &lock->cll_descr;
2741
2742                 descr->cld_obj   = io->ci_obj;
2743                 /* Convert byte offsets to pages */
2744                 descr->cld_start = cl_index(io->ci_obj, start);
2745                 descr->cld_end   = cl_index(io->ci_obj, end);
2746                 descr->cld_mode  = cl_mode;
2747                 /* CEF_MUST is used because we do not want to convert a
2748                  * lockahead request to a lockless lock */
2749                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2750                                        CEF_NONBLOCK;
2751
2752                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2753                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2754
2755                 result = cl_lock_request(env, io, lock);
2756
2757                 /* On success, we need to release the lock */
2758                 if (result >= 0)
2759                         cl_lock_release(env, lock);
2760         }
2761         cl_io_fini(env, io);
2762         cl_env_put(env, &refcheck);
2763
2764         /* -ECANCELED indicates a matching lock with a different extent
2765          * was already present, and -EEXIST indicates a matching lock
2766          * on exactly the same extent was already present.
2767          * We convert them to positive values for userspace to make
2768          * recognizing true errors easier.
2769          * Note we can only return these detailed results on async requests,
2770          * as sync requests look the same as i/o requests for locking. */
2771         if (result == -ECANCELED)
2772                 result = LLA_RESULT_DIFFERENT;
2773         else if (result == -EEXIST)
2774                 result = LLA_RESULT_SAME;
2775
2776 out:
2777         RETURN(result);
2778 }
2779 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2780
2781 static int ll_ladvise_sanity(struct inode *inode,
2782                              struct llapi_lu_ladvise *ladvise)
2783 {
2784         enum lu_ladvise_type advice = ladvise->lla_advice;
2785         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2786          * be in the first 32 bits of enum ladvise_flags */
2787         __u32 flags = ladvise->lla_peradvice_flags;
2788         /* 3 lines at 80 characters per line, should be plenty */
2789         int rc = 0;
2790
2791         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2792                 rc = -EINVAL;
2793                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2794                        "last supported advice is %s (value '%d'): rc = %d\n",
2795                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2796                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2797                 GOTO(out, rc);
2798         }
2799
2800         /* Per-advice checks */
2801         switch (advice) {
2802         case LU_LADVISE_LOCKNOEXPAND:
2803                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2804                         rc = -EINVAL;
2805                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2806                                "rc = %d\n",
2807                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2808                                ladvise_names[advice], rc);
2809                         GOTO(out, rc);
2810                 }
2811                 break;
2812         case LU_LADVISE_LOCKAHEAD:
2813                 /* Currently only READ and WRITE modes can be requested */
2814                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2815                     ladvise->lla_lockahead_mode == 0) {
2816                         rc = -EINVAL;
2817                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2818                                "rc = %d\n",
2819                                ll_get_fsname(inode->i_sb, NULL, 0),
2820                                ladvise->lla_lockahead_mode,
2821                                ladvise_names[advice], rc);
2822                         GOTO(out, rc);
2823                 }
2824         case LU_LADVISE_WILLREAD:
2825         case LU_LADVISE_DONTNEED:
2826         default:
2827                 /* Note fall through above - These checks apply to all advices
2828                  * except LOCKNOEXPAND */
2829                 if (flags & ~LF_DEFAULT_MASK) {
2830                         rc = -EINVAL;
2831                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2832                                "rc = %d\n",
2833                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2834                                ladvise_names[advice], rc);
2835                         GOTO(out, rc);
2836                 }
2837                 if (ladvise->lla_start >= ladvise->lla_end) {
2838                         rc = -EINVAL;
2839                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2840                                "for %s: rc = %d\n",
2841                                ll_get_fsname(inode->i_sb, NULL, 0),
2842                                ladvise->lla_start, ladvise->lla_end,
2843                                ladvise_names[advice], rc);
2844                         GOTO(out, rc);
2845                 }
2846                 break;
2847         }
2848
2849 out:
2850         return rc;
2851 }
2852 #undef ERRSIZE
2853
2854 /*
2855  * Give file access advices
2856  *
2857  * The ladvise interface is similar to Linux fadvise() system call, except it
2858  * forwards the advices directly from Lustre client to server. The server side
2859  * codes will apply appropriate read-ahead and caching techniques for the
2860  * corresponding files.
2861  *
2862  * A typical workload for ladvise is e.g. a bunch of different clients are
2863  * doing small random reads of a file, so prefetching pages into OSS cache
2864  * with big linear reads before the random IO is a net benefit. Fetching
2865  * all that data into each client cache with fadvise() may not be, due to
2866  * much more data being sent to the client.
2867  */
2868 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2869                       struct llapi_lu_ladvise *ladvise)
2870 {
2871         struct lu_env *env;
2872         struct cl_io *io;
2873         struct cl_ladvise_io *lio;
2874         int rc;
2875         __u16 refcheck;
2876         ENTRY;
2877
2878         env = cl_env_get(&refcheck);
2879         if (IS_ERR(env))
2880                 RETURN(PTR_ERR(env));
2881
2882         io = vvp_env_thread_io(env);
2883         io->ci_obj = ll_i2info(inode)->lli_clob;
2884
2885         /* initialize parameters for ladvise */
2886         lio = &io->u.ci_ladvise;
2887         lio->li_start = ladvise->lla_start;
2888         lio->li_end = ladvise->lla_end;
2889         lio->li_fid = ll_inode2fid(inode);
2890         lio->li_advice = ladvise->lla_advice;
2891         lio->li_flags = flags;
2892
2893         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2894                 rc = cl_io_loop(env, io);
2895         else
2896                 rc = io->ci_result;
2897
2898         cl_io_fini(env, io);
2899         cl_env_put(env, &refcheck);
2900         RETURN(rc);
2901 }
2902
2903 static int ll_lock_noexpand(struct file *file, int flags)
2904 {
2905         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2906
2907         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2908
2909         return 0;
2910 }
2911
2912 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2913                         unsigned long arg)
2914 {
2915         struct fsxattr fsxattr;
2916
2917         if (copy_from_user(&fsxattr,
2918                            (const struct fsxattr __user *)arg,
2919                            sizeof(fsxattr)))
2920                 RETURN(-EFAULT);
2921
2922         fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2923         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2924         if (copy_to_user((struct fsxattr __user *)arg,
2925                          &fsxattr, sizeof(fsxattr)))
2926                 RETURN(-EFAULT);
2927
2928         RETURN(0);
2929 }
2930
2931 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2932                         unsigned long arg)
2933 {
2934
2935         struct md_op_data *op_data;
2936         struct ptlrpc_request *req = NULL;
2937         int rc = 0;
2938         struct fsxattr fsxattr;
2939         struct cl_object *obj;
2940
2941         /* only root could change project ID */
2942         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2943                 RETURN(-EPERM);
2944
2945         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2946                                      LUSTRE_OPC_ANY, NULL);
2947         if (IS_ERR(op_data))
2948                 RETURN(PTR_ERR(op_data));
2949
2950         if (copy_from_user(&fsxattr,
2951                            (const struct fsxattr __user *)arg,
2952                            sizeof(fsxattr)))
2953                 GOTO(out_fsxattr1, rc = -EFAULT);
2954
2955         op_data->op_attr_flags = fsxattr.fsx_xflags;
2956         op_data->op_projid = fsxattr.fsx_projid;
2957         op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2958         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2959                         0, &req);
2960         ptlrpc_req_finished(req);
2961
2962         obj = ll_i2info(inode)->lli_clob;
2963         if (obj) {
2964                 struct iattr *attr;
2965
2966                 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2967                 OBD_ALLOC_PTR(attr);
2968                 if (attr == NULL)
2969                         GOTO(out_fsxattr1, rc = -ENOMEM);
2970                 attr->ia_valid = ATTR_ATTR_FLAG;
2971                 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2972
2973                 OBD_FREE_PTR(attr);
2974         }
2975 out_fsxattr1:
2976         ll_finish_md_op_data(op_data);
2977         RETURN(rc);
2978 }
2979
2980 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
2981                                  unsigned long arg)
2982 {
2983         struct inode            *inode = file_inode(file);
2984         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
2985         struct ll_inode_info    *lli = ll_i2info(inode);
2986         struct obd_client_handle *och = NULL;
2987         struct split_param sp;
2988         bool lease_broken;
2989         fmode_t fmode = 0;
2990         enum mds_op_bias bias = 0;
2991         struct file *layout_file = NULL;
2992         void *data = NULL;
2993         size_t data_size = 0;
2994         long rc;
2995         ENTRY;
2996
2997         mutex_lock(&lli->lli_och_mutex);
2998         if (fd->fd_lease_och != NULL) {
2999                 och = fd->fd_lease_och;
3000                 fd->fd_lease_och = NULL;
3001         }
3002         mutex_unlock(&lli->lli_och_mutex);
3003
3004         if (och == NULL)
3005                 GOTO(out, rc = -ENOLCK);
3006
3007         fmode = och->och_flags;
3008
3009         switch (ioc->lil_flags) {
3010         case LL_LEASE_RESYNC_DONE:
3011                 if (ioc->lil_count > IOC_IDS_MAX)
3012                         GOTO(out, rc = -EINVAL);
3013
3014                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3015                 OBD_ALLOC(data, data_size);
3016                 if (!data)
3017                         GOTO(out, rc = -ENOMEM);
3018
3019                 if (copy_from_user(data, (void __user *)arg, data_size))
3020                         GOTO(out, rc = -EFAULT);
3021
3022                 bias = MDS_CLOSE_RESYNC_DONE;
3023                 break;
3024         case LL_LEASE_LAYOUT_MERGE: {
3025                 int fd;
3026
3027                 if (ioc->lil_count != 1)
3028                         GOTO(out, rc = -EINVAL);
3029
3030                 arg += sizeof(*ioc);
3031                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3032                         GOTO(out, rc = -EFAULT);
3033
3034                 layout_file = fget(fd);
3035                 if (!layout_file)
3036                         GOTO(out, rc = -EBADF);
3037
3038                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3039                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3040                         GOTO(out, rc = -EPERM);
3041
3042                 data = file_inode(layout_file);
3043                 bias = MDS_CLOSE_LAYOUT_MERGE;
3044                 break;
3045         }
3046         case LL_LEASE_LAYOUT_SPLIT: {
3047                 int fdv;
3048                 int mirror_id;
3049
3050                 if (ioc->lil_count != 2)
3051                         GOTO(out, rc = -EINVAL);
3052
3053                 arg += sizeof(*ioc);
3054                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3055                         GOTO(out, rc = -EFAULT);
3056
3057                 arg += sizeof(__u32);
3058                 if (copy_from_user(&mirror_id, (void __user *)arg,
3059                                    sizeof(__u32)))
3060                         GOTO(out, rc = -EFAULT);
3061
3062                 layout_file = fget(fdv);
3063                 if (!layout_file)
3064                         GOTO(out, rc = -EBADF);
3065
3066                 sp.sp_inode = file_inode(layout_file);
3067                 sp.sp_mirror_id = (__u16)mirror_id;
3068                 data = &sp;
3069                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3070                 break;
3071         }
3072         default:
3073                 /* without close intent */
3074                 break;
3075         }
3076
3077         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3078         if (rc < 0)
3079                 GOTO(out, rc);
3080
3081         rc = ll_lease_och_release(inode, file);
3082         if (rc < 0)
3083                 GOTO(out, rc);
3084
3085         if (lease_broken)
3086                 fmode = 0;
3087         EXIT;
3088
3089 out:
3090         switch (ioc->lil_flags) {
3091         case LL_LEASE_RESYNC_DONE:
3092                 if (data)
3093                         OBD_FREE(data, data_size);
3094                 break;
3095         case LL_LEASE_LAYOUT_MERGE:
3096         case LL_LEASE_LAYOUT_SPLIT:
3097                 if (layout_file)
3098                         fput(layout_file);
3099                 break;
3100         }
3101
3102         if (!rc)
3103                 rc = ll_lease_type_from_fmode(fmode);
3104         RETURN(rc);
3105 }
3106
3107 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3108                               unsigned long arg)
3109 {
3110         struct inode *inode = file_inode(file);
3111         struct ll_inode_info *lli = ll_i2info(inode);
3112         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3113         struct obd_client_handle *och = NULL;
3114         __u64 open_flags = 0;
3115         bool lease_broken;
3116         fmode_t fmode;
3117         long rc;
3118         ENTRY;
3119
3120         switch (ioc->lil_mode) {
3121         case LL_LEASE_WRLCK:
3122                 if (!(file->f_mode & FMODE_WRITE))
3123                         RETURN(-EPERM);
3124                 fmode = FMODE_WRITE;
3125                 break;
3126         case LL_LEASE_RDLCK:
3127                 if (!(file->f_mode & FMODE_READ))
3128                         RETURN(-EPERM);
3129                 fmode = FMODE_READ;
3130                 break;
3131         case LL_LEASE_UNLCK:
3132                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3133         default:
3134                 RETURN(-EINVAL);
3135         }
3136
3137         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3138
3139         /* apply for lease */
3140         if (ioc->lil_flags & LL_LEASE_RESYNC)
3141                 open_flags = MDS_OPEN_RESYNC;
3142         och = ll_lease_open(inode, file, fmode, open_flags);
3143         if (IS_ERR(och))
3144                 RETURN(PTR_ERR(och));
3145
3146         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3147                 rc = ll_lease_file_resync(och, inode);
3148                 if (rc) {
3149                         ll_lease_close(och, inode, NULL);
3150                         RETURN(rc);
3151                 }
3152                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3153                 if (rc) {
3154                         ll_lease_close(och, inode, NULL);
3155                         RETURN(rc);
3156                 }
3157         }
3158
3159         rc = 0;
3160         mutex_lock(&lli->lli_och_mutex);
3161         if (fd->fd_lease_och == NULL) {
3162                 fd->fd_lease_och = och;
3163                 och = NULL;
3164         }
3165         mutex_unlock(&lli->lli_och_mutex);
3166         if (och != NULL) {
3167                 /* impossible now that only excl is supported for now */
3168                 ll_lease_close(och, inode, &lease_broken);
3169                 rc = -EBUSY;
3170         }
3171         RETURN(rc);
3172 }
3173
3174 static long
3175 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3176 {
3177         struct inode            *inode = file_inode(file);
3178         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3179         int                      flags, rc;
3180         ENTRY;
3181
3182         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3183                PFID(ll_inode2fid(inode)), inode, cmd);
3184         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3185
3186         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3187         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3188                 RETURN(-ENOTTY);
3189
3190         switch (cmd) {
3191         case LL_IOC_GETFLAGS:
3192                 /* Get the current value of the file flags */
3193                 return put_user(fd->fd_flags, (int __user *)arg);
3194         case LL_IOC_SETFLAGS:
3195         case LL_IOC_CLRFLAGS:
3196                 /* Set or clear specific file flags */
3197                 /* XXX This probably needs checks to ensure the flags are
3198                  *     not abused, and to handle any flag side effects.
3199                  */
3200                 if (get_user(flags, (int __user *) arg))
3201                         RETURN(-EFAULT);
3202
3203                 if (cmd == LL_IOC_SETFLAGS) {
3204                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3205                             !(file->f_flags & O_DIRECT)) {
3206                                 CERROR("%s: unable to disable locking on "
3207                                        "non-O_DIRECT file\n", current->comm);
3208                                 RETURN(-EINVAL);
3209                         }
3210
3211                         fd->fd_flags |= flags;
3212                 } else {
3213                         fd->fd_flags &= ~flags;
3214                 }
3215                 RETURN(0);
3216         case LL_IOC_LOV_SETSTRIPE:
3217         case LL_IOC_LOV_SETSTRIPE_NEW:
3218                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3219         case LL_IOC_LOV_SETEA:
3220                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3221         case LL_IOC_LOV_SWAP_LAYOUTS: {
3222                 struct file *file2;
3223                 struct lustre_swap_layouts lsl;
3224
3225                 if (copy_from_user(&lsl, (char __user *)arg,
3226                                    sizeof(struct lustre_swap_layouts)))
3227                         RETURN(-EFAULT);
3228
3229                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3230                         RETURN(-EPERM);
3231
3232                 file2 = fget(lsl.sl_fd);
3233                 if (file2 == NULL)
3234                         RETURN(-EBADF);
3235
3236                 /* O_WRONLY or O_RDWR */
3237                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3238                         GOTO(out, rc = -EPERM);
3239
3240                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3241                         struct inode                    *inode2;
3242                         struct ll_inode_info            *lli;
3243                         struct obd_client_handle        *och = NULL;
3244
3245                         lli = ll_i2info(inode);
3246                         mutex_lock(&lli->lli_och_mutex);
3247                         if (fd->fd_lease_och != NULL) {
3248                                 och = fd->fd_lease_och;
3249                                 fd->fd_lease_och = NULL;
3250                         }
3251                         mutex_unlock(&lli->lli_och_mutex);
3252                         if (och == NULL)
3253                                 GOTO(out, rc = -ENOLCK);
3254                         inode2 = file_inode(file2);
3255                         rc = ll_swap_layouts_close(och, inode, inode2);
3256                 } else {
3257                         rc = ll_swap_layouts(file, file2, &lsl);
3258                 }
3259 out:
3260                 fput(file2);
3261                 RETURN(rc);
3262         }
3263         case LL_IOC_LOV_GETSTRIPE:
3264         case LL_IOC_LOV_GETSTRIPE_NEW:
3265                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3266         case FS_IOC_GETFLAGS:
3267         case FS_IOC_SETFLAGS:
3268                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3269         case FSFILT_IOC_GETVERSION:
3270         case FS_IOC_GETVERSION:
3271                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3272         /* We need to special case any other ioctls we want to handle,
3273          * to send them to the MDS/OST as appropriate and to properly
3274          * network encode the arg field. */
3275         case FS_IOC_SETVERSION:
3276                 RETURN(-ENOTSUPP);
3277
3278         case LL_IOC_GROUP_LOCK:
3279                 RETURN(ll_get_grouplock(inode, file, arg));
3280         case LL_IOC_GROUP_UNLOCK:
3281                 RETURN(ll_put_grouplock(inode, file, arg));
3282         case IOC_OBD_STATFS:
3283                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3284
3285         case LL_IOC_FLUSHCTX:
3286                 RETURN(ll_flush_ctx(inode));
3287         case LL_IOC_PATH2FID: {
3288                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3289                                  sizeof(struct lu_fid)))
3290                         RETURN(-EFAULT);
3291
3292                 RETURN(0);
3293         }
3294         case LL_IOC_GETPARENT:
3295                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3296
3297         case OBD_IOC_FID2PATH:
3298                 RETURN(ll_fid2path(inode, (void __user *)arg));
3299         case LL_IOC_DATA_VERSION: {
3300                 struct ioc_data_version idv;
3301                 int rc;
3302
3303                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3304                         RETURN(-EFAULT);
3305
3306                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3307                 rc = ll_ioc_data_version(inode, &idv);
3308
3309                 if (rc == 0 &&
3310                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3311                         RETURN(-EFAULT);
3312
3313                 RETURN(rc);
3314         }
3315
3316         case LL_IOC_GET_MDTIDX: {
3317                 int mdtidx;
3318
3319                 mdtidx = ll_get_mdt_idx(inode);
3320                 if (mdtidx < 0)
3321                         RETURN(mdtidx);
3322
3323                 if (put_user((int)mdtidx, (int __user *)arg))
3324                         RETURN(-EFAULT);
3325
3326                 RETURN(0);
3327         }
3328         case OBD_IOC_GETDTNAME:
3329         case OBD_IOC_GETMDNAME:
3330                 RETURN(ll_get_obd_name(inode, cmd, arg));
3331         case LL_IOC_HSM_STATE_GET: {
3332                 struct md_op_data       *op_data;
3333                 struct hsm_user_state   *hus;
3334                 int                      rc;
3335
3336                 OBD_ALLOC_PTR(hus);
3337                 if (hus == NULL)
3338                         RETURN(-ENOMEM);
3339
3340                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3341                                              LUSTRE_OPC_ANY, hus);
3342                 if (IS_ERR(op_data)) {
3343                         OBD_FREE_PTR(hus);
3344                         RETURN(PTR_ERR(op_data));
3345                 }
3346
3347                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3348                                    op_data, NULL);
3349
3350                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3351                         rc = -EFAULT;
3352
3353                 ll_finish_md_op_data(op_data);
3354                 OBD_FREE_PTR(hus);
3355                 RETURN(rc);
3356         }
3357         case LL_IOC_HSM_STATE_SET: {
3358                 struct hsm_state_set    *hss;
3359                 int                      rc;
3360
3361                 OBD_ALLOC_PTR(hss);
3362                 if (hss == NULL)
3363                         RETURN(-ENOMEM);
3364
3365                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3366                         OBD_FREE_PTR(hss);
3367                         RETURN(-EFAULT);
3368                 }
3369
3370                 rc = ll_hsm_state_set(inode, hss);
3371
3372                 OBD_FREE_PTR(hss);
3373                 RETURN(rc);
3374         }
3375         case LL_IOC_HSM_ACTION: {
3376                 struct md_op_data               *op_data;
3377                 struct hsm_current_action       *hca;
3378                 int                              rc;
3379
3380                 OBD_ALLOC_PTR(hca);
3381                 if (hca == NULL)
3382                         RETURN(-ENOMEM);
3383
3384                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3385                                              LUSTRE_OPC_ANY, hca);
3386                 if (IS_ERR(op_data)) {
3387                         OBD_FREE_PTR(hca);
3388                         RETURN(PTR_ERR(op_data));
3389                 }
3390
3391                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3392                                    op_data, NULL);
3393
3394                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3395                         rc = -EFAULT;
3396
3397                 ll_finish_md_op_data(op_data);
3398                 OBD_FREE_PTR(hca);
3399                 RETURN(rc);
3400         }
3401         case LL_IOC_SET_LEASE_OLD: {
3402                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3403
3404                 RETURN(ll_file_set_lease(file, &ioc, 0));
3405         }
3406         case LL_IOC_SET_LEASE: {
3407                 struct ll_ioc_lease ioc;
3408
3409                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3410                         RETURN(-EFAULT);
3411
3412                 RETURN(ll_file_set_lease(file, &ioc, arg));
3413         }
3414         case LL_IOC_GET_LEASE: {
3415                 struct ll_inode_info *lli = ll_i2info(inode);
3416                 struct ldlm_lock *lock = NULL;
3417                 fmode_t fmode = 0;
3418
3419                 mutex_lock(&lli->lli_och_mutex);
3420                 if (fd->fd_lease_och != NULL) {
3421                         struct obd_client_handle *och = fd->fd_lease_och;
3422
3423                         lock = ldlm_handle2lock(&och->och_lease_handle);
3424                         if (lock != NULL) {
3425                                 lock_res_and_lock(lock);
3426                                 if (!ldlm_is_cancel(lock))
3427                                         fmode = och->och_flags;
3428
3429                                 unlock_res_and_lock(lock);
3430                                 LDLM_LOCK_PUT(lock);
3431                         }
3432                 }
3433                 mutex_unlock(&lli->lli_och_mutex);
3434
3435                 RETURN(ll_lease_type_from_fmode(fmode));
3436         }
3437         case LL_IOC_HSM_IMPORT: {
3438                 struct hsm_user_import *hui;
3439
3440                 OBD_ALLOC_PTR(hui);
3441                 if (hui == NULL)
3442                         RETURN(-ENOMEM);
3443
3444                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3445                         OBD_FREE_PTR(hui);
3446                         RETURN(-EFAULT);
3447                 }
3448
3449                 rc = ll_hsm_import(inode, file, hui);
3450
3451                 OBD_FREE_PTR(hui);
3452                 RETURN(rc);
3453         }
3454         case LL_IOC_FUTIMES_3: {
3455                 struct ll_futimes_3 lfu;
3456
3457                 if (copy_from_user(&lfu,
3458                                    (const struct ll_futimes_3 __user *)arg,
3459                                    sizeof(lfu)))
3460                         RETURN(-EFAULT);
3461
3462                 RETURN(ll_file_futimes_3(file, &lfu));
3463         }
3464         case LL_IOC_LADVISE: {
3465                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3466                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3467                 int i;
3468                 int num_advise;
3469                 int alloc_size = sizeof(*k_ladvise_hdr);
3470
3471                 rc = 0;
3472                 u_ladvise_hdr = (void __user *)arg;
3473                 OBD_ALLOC_PTR(k_ladvise_hdr);
3474                 if (k_ladvise_hdr == NULL)
3475                         RETURN(-ENOMEM);
3476
3477                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3478                         GOTO(out_ladvise, rc = -EFAULT);
3479
3480                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3481                     k_ladvise_hdr->lah_count < 1)
3482                         GOTO(out_ladvise, rc = -EINVAL);
3483
3484                 num_advise = k_ladvise_hdr->lah_count;
3485                 if (num_advise >= LAH_COUNT_MAX)
3486                         GOTO(out_ladvise, rc = -EFBIG);
3487
3488                 OBD_FREE_PTR(k_ladvise_hdr);
3489                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3490                                       lah_advise[num_advise]);
3491                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3492                 if (k_ladvise_hdr == NULL)
3493                         RETURN(-ENOMEM);
3494
3495                 /*
3496                  * TODO: submit multiple advices to one server in a single RPC
3497                  */
3498                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3499                         GOTO(out_ladvise, rc = -EFAULT);
3500
3501                 for (i = 0; i < num_advise; i++) {
3502                         struct llapi_lu_ladvise *k_ladvise =
3503                                         &k_ladvise_hdr->lah_advise[i];
3504                         struct llapi_lu_ladvise __user *u_ladvise =
3505                                         &u_ladvise_hdr->lah_advise[i];
3506
3507                         rc = ll_ladvise_sanity(inode, k_ladvise);
3508                         if (rc)
3509                                 GOTO(out_ladvise, rc);
3510
3511                         switch (k_ladvise->lla_advice) {
3512                         case LU_LADVISE_LOCKNOEXPAND:
3513                                 rc = ll_lock_noexpand(file,
3514                                                k_ladvise->lla_peradvice_flags);
3515                                 GOTO(out_ladvise, rc);
3516                         case LU_LADVISE_LOCKAHEAD:
3517
3518                                 rc = ll_file_lock_ahead(file, k_ladvise);
3519
3520                                 if (rc < 0)
3521                                         GOTO(out_ladvise, rc);
3522
3523                                 if (put_user(rc,
3524                                              &u_ladvise->lla_lockahead_result))
3525                                         GOTO(out_ladvise, rc = -EFAULT);
3526                                 break;
3527                         default:
3528                                 rc = ll_ladvise(inode, file,
3529                                                 k_ladvise_hdr->lah_flags,
3530                                                 k_ladvise);
3531                                 if (rc)
3532                                         GOTO(out_ladvise, rc);
3533                                 break;
3534                         }
3535
3536                 }
3537
3538 out_ladvise:
3539                 OBD_FREE(k_ladvise_hdr, alloc_size);
3540                 RETURN(rc);
3541         }
3542         case LL_IOC_FLR_SET_MIRROR: {
3543                 /* mirror I/O must be direct to avoid polluting page cache
3544                  * by stale data. */
3545                 if (!(file->f_flags & O_DIRECT))
3546                         RETURN(-EINVAL);
3547
3548                 fd->fd_designated_mirror = (__u32)arg;
3549                 RETURN(0);
3550         }
3551         case LL_IOC_FSGETXATTR:
3552                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3553         case LL_IOC_FSSETXATTR:
3554                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3555         case BLKSSZGET:
3556                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3557         default:
3558                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3559                                      (void __user *)arg));
3560         }
3561 }
3562
3563 #ifndef HAVE_FILE_LLSEEK_SIZE
3564 static inline loff_t
3565 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3566 {
3567         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3568                 return -EINVAL;
3569         if (offset > maxsize)
3570                 return -EINVAL;
3571
3572         if (offset != file->f_pos) {
3573                 file->f_pos = offset;
3574                 file->f_version = 0;
3575         }
3576         return offset;
3577 }
3578
3579 static loff_t
3580 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3581                 loff_t maxsize, loff_t eof)
3582 {
3583         struct inode *inode = file_inode(file);
3584
3585         switch (origin) {
3586         case SEEK_END:
3587                 offset += eof;
3588                 break;
3589         case SEEK_CUR:
3590                 /*
3591                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3592                  * position-querying operation.  Avoid rewriting the "same"
3593                  * f_pos value back to the file because a concurrent read(),
3594                  * write() or lseek() might have altered it
3595                  */
3596                 if (offset == 0)
3597                         return file->f_pos;
3598                 /*
3599                  * f_lock protects against read/modify/write race with other
3600                  * SEEK_CURs. Note that parallel writes and reads behave
3601                  * like SEEK_SET.
3602                  */
3603                 inode_lock(inode);
3604                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3605                 inode_unlock(inode);
3606                 return offset;
3607         case SEEK_DATA:
3608                 /*
3609                  * In the generic case the entire file is data, so as long as
3610                  * offset isn't at the end of the file then the offset is data.
3611                  */
3612                 if (offset >= eof)
3613                         return -ENXIO;
3614                 break;
3615         case SEEK_HOLE:
3616                 /*
3617                  * There is a virtual hole at the end of the file, so as long as
3618                  * offset isn't i_size or larger, return i_size.
3619                  */
3620                 if (offset >= eof)
3621                         return -ENXIO;
3622                 offset = eof;
3623                 break;
3624         }
3625
3626         return llseek_execute(file, offset, maxsize);
3627 }
3628 #endif
3629
3630 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3631 {
3632         struct inode *inode = file_inode(file);
3633         loff_t retval, eof = 0;
3634
3635         ENTRY;
3636         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3637                            (origin == SEEK_CUR) ? file->f_pos : 0);
3638         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3639                PFID(ll_inode2fid(inode)), inode, retval, retval,
3640                origin);
3641         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3642
3643         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3644                 retval = ll_glimpse_size(inode);
3645                 if (retval != 0)
3646                         RETURN(retval);
3647                 eof = i_size_read(inode);
3648         }
3649
3650         retval = ll_generic_file_llseek_size(file, offset, origin,
3651                                           ll_file_maxbytes(inode), eof);
3652         RETURN(retval);
3653 }
3654
3655 static int ll_flush(struct file *file, fl_owner_t id)
3656 {
3657         struct inode *inode = file_inode(file);
3658         struct ll_inode_info *lli = ll_i2info(inode);
3659         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3660         int rc, err;
3661
3662         LASSERT(!S_ISDIR(inode->i_mode));
3663
3664         /* catch async errors that were recorded back when async writeback
3665          * failed for pages in this mapping. */
3666         rc = lli->lli_async_rc;
3667         lli->lli_async_rc = 0;
3668         if (lli->lli_clob != NULL) {
3669                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3670                 if (rc == 0)
3671                         rc = err;
3672         }
3673
3674         /* The application has been told write failure already.
3675          * Do not report failure again. */
3676         if (fd->fd_write_failed)
3677                 return 0;
3678         return rc ? -EIO : 0;
3679 }
3680
3681 /**
3682  * Called to make sure a portion of file has been written out.
3683  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3684  *
3685  * Return how many pages have been written.
3686  */
3687 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3688                        enum cl_fsync_mode mode, int ignore_layout)
3689 {
3690         struct lu_env *env;
3691         struct cl_io *io;
3692         struct cl_fsync_io *fio;
3693         int result;
3694         __u16 refcheck;
3695         ENTRY;
3696
3697         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3698             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3699                 RETURN(-EINVAL);
3700
3701         env = cl_env_get(&refcheck);
3702         if (IS_ERR(env))
3703                 RETURN(PTR_ERR(env));
3704
3705         io = vvp_env_thread_io(env);
3706         io->ci_obj = ll_i2info(inode)->lli_clob;
3707         io->ci_ignore_layout = ignore_layout;
3708
3709         /* initialize parameters for sync */
3710         fio = &io->u.ci_fsync;
3711         fio->fi_start = start;
3712         fio->fi_end = end;
3713         fio->fi_fid = ll_inode2fid(inode);
3714         fio->fi_mode = mode;
3715         fio->fi_nr_written = 0;
3716
3717         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3718                 result = cl_io_loop(env, io);
3719         else
3720                 result = io->ci_result;
3721         if (result == 0)
3722                 result = fio->fi_nr_written;
3723         cl_io_fini(env, io);
3724         cl_env_put(env, &refcheck);
3725
3726         RETURN(result);
3727 }
3728
3729 /*
3730  * When dentry is provided (the 'else' case), file_dentry() may be
3731  * null and dentry must be used directly rather than pulled from
3732  * file_dentry() as is done otherwise.
3733  */
3734
3735 #ifdef HAVE_FILE_FSYNC_4ARGS
3736 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3737 {
3738         struct dentry *dentry = file_dentry(file);
3739         bool lock_inode;
3740 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3741 int ll_fsync(struct file *file, int datasync)
3742 {
3743         struct dentry *dentry = file_dentry(file);
3744         loff_t start = 0;
3745         loff_t end = LLONG_MAX;
3746 #else
3747 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3748 {
3749         loff_t start = 0;
3750         loff_t end = LLONG_MAX;
3751 #endif
3752         struct inode *inode = dentry->d_inode;
3753         struct ll_inode_info *lli = ll_i2info(inode);
3754         struct ptlrpc_request *req;
3755         int rc, err;
3756         ENTRY;
3757
3758         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3759                PFID(ll_inode2fid(inode)), inode);
3760         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3761
3762 #ifdef HAVE_FILE_FSYNC_4ARGS
3763         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3764         lock_inode = !lli->lli_inode_locked;
3765         if (lock_inode)
3766                 inode_lock(inode);
3767 #else
3768         /* fsync's caller has already called _fdata{sync,write}, we want
3769          * that IO to finish before calling the osc and mdc sync methods */
3770         rc = filemap_fdatawait(inode->i_mapping);
3771 #endif
3772
3773         /* catch async errors that were recorded back when async writeback
3774          * failed for pages in this mapping. */
3775         if (!S_ISDIR(inode->i_mode)) {
3776                 err = lli->lli_async_rc;
3777                 lli->lli_async_rc = 0;
3778                 if (rc == 0)
3779                         rc = err;
3780                 if (lli->lli_clob != NULL) {
3781                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3782                         if (rc == 0)
3783                                 rc = err;
3784                 }
3785         }
3786
3787         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3788         if (!rc)
3789                 rc = err;
3790         if (!err)
3791                 ptlrpc_req_finished(req);
3792
3793         if (S_ISREG(inode->i_mode)) {
3794                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3795
3796                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3797                 if (rc == 0 && err < 0)
3798                         rc = err;
3799                 if (rc < 0)
3800                         fd->fd_write_failed = true;
3801                 else
3802                         fd->fd_write_failed = false;
3803         }
3804
3805 #ifdef HAVE_FILE_FSYNC_4ARGS
3806         if (lock_inode)
3807                 inode_unlock(inode);
3808 #endif
3809         RETURN(rc);
3810 }
3811
3812 static int
3813 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3814 {
3815         struct inode *inode = file_inode(file);
3816         struct ll_sb_info *sbi = ll_i2sbi(inode);
3817         struct ldlm_enqueue_info einfo = {
3818                 .ei_type        = LDLM_FLOCK,
3819                 .ei_cb_cp       = ldlm_flock_completion_ast,
3820                 .ei_cbdata      = file_lock,
3821         };
3822         struct md_op_data *op_data;
3823         struct lustre_handle lockh = { 0 };
3824         union ldlm_policy_data flock = { { 0 } };
3825         int fl_type = file_lock->fl_type;
3826         __u64 flags = 0;
3827         int rc;
3828         int rc2 = 0;
3829         ENTRY;
3830
3831         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3832                PFID(ll_inode2fid(inode)), file_lock);
3833
3834         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3835
3836         if (file_lock->fl_flags & FL_FLOCK) {
3837                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3838                 /* flocks are whole-file locks */
3839                 flock.l_flock.end = OFFSET_MAX;
3840                 /* For flocks owner is determined by the local file desctiptor*/
3841                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3842         } else if (file_lock->fl_flags & FL_POSIX) {
3843                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3844                 flock.l_flock.start = file_lock->fl_start;
3845                 flock.l_flock.end = file_lock->fl_end;
3846         } else {
3847                 RETURN(-EINVAL);
3848         }
3849         flock.l_flock.pid = file_lock->fl_pid;
3850
3851         /* Somewhat ugly workaround for svc lockd.
3852          * lockd installs custom fl_lmops->lm_compare_owner that checks
3853          * for the fl_owner to be the same (which it always is on local node
3854          * I guess between lockd processes) and then compares pid.
3855          * As such we assign pid to the owner field to make it all work,
3856          * conflict with normal locks is unlikely since pid space and
3857          * pointer space for current->files are not intersecting */
3858         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3859                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3860
3861         switch (fl_type) {
3862         case F_RDLCK:
3863                 einfo.ei_mode = LCK_PR;
3864                 break;
3865         case F_UNLCK:
3866                 /* An unlock request may or may not have any relation to
3867                  * existing locks so we may not be able to pass a lock handle
3868                  * via a normal ldlm_lock_cancel() request. The request may even
3869                  * unlock a byte range in the middle of an existing lock. In
3870                  * order to process an unlock request we need all of the same
3871                  * information that is given with a normal read or write record
3872                  * lock request. To avoid creating another ldlm unlock (cancel)
3873                  * message we'll treat a LCK_NL flock request as an unlock. */
3874                 einfo.ei_mode = LCK_NL;
3875                 break;
3876         case F_WRLCK:
3877                 einfo.ei_mode = LCK_PW;
3878                 break;
3879         default:
3880                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3881                 RETURN (-ENOTSUPP);
3882         }
3883
3884         switch (cmd) {
3885         case F_SETLKW:
3886 #ifdef F_SETLKW64
3887         case F_SETLKW64:
3888 #endif
3889                 flags = 0;
3890                 break;
3891         case F_SETLK:
3892 #ifdef F_SETLK64
3893         case F_SETLK64:
3894 #endif
3895                 flags = LDLM_FL_BLOCK_NOWAIT;
3896                 break;
3897         case F_GETLK:
3898 #ifdef F_GETLK64
3899         case F_GETLK64:
3900 #endif
3901                 flags = LDLM_FL_TEST_LOCK;
3902                 break;
3903         default:
3904                 CERROR("unknown fcntl lock command: %d\n", cmd);
3905                 RETURN (-EINVAL);
3906         }
3907
3908         /* Save the old mode so that if the mode in the lock changes we
3909          * can decrement the appropriate reader or writer refcount. */
3910         file_lock->fl_type = einfo.ei_mode;
3911
3912         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3913                                      LUSTRE_OPC_ANY, NULL);
3914         if (IS_ERR(op_data))
3915                 RETURN(PTR_ERR(op_data));
3916
3917         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3918                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3919                flock.l_flock.pid, flags, einfo.ei_mode,
3920                flock.l_flock.start, flock.l_flock.end);
3921
3922         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3923                         flags);
3924
3925         /* Restore the file lock type if not TEST lock. */
3926         if (!(flags & LDLM_FL_TEST_LOCK))
3927                 file_lock->fl_type = fl_type;
3928
3929 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3930         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3931             !(flags & LDLM_FL_TEST_LOCK))
3932                 rc2  = locks_lock_file_wait(file, file_lock);
3933 #else
3934         if ((file_lock->fl_flags & FL_FLOCK) &&
3935             (rc == 0 || file_lock->fl_type == F_UNLCK))
3936                 rc2  = flock_lock_file_wait(file, file_lock);
3937         if ((file_lock->fl_flags & FL_POSIX) &&
3938             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3939             !(flags & LDLM_FL_TEST_LOCK))
3940                 rc2  = posix_lock_file_wait(file, file_lock);
3941 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3942
3943         if (rc2 && file_lock->fl_type != F_UNLCK) {
3944                 einfo.ei_mode = LCK_NL;
3945                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3946                            &lockh, flags);
3947                 rc = rc2;
3948         }
3949
3950         ll_finish_md_op_data(op_data);
3951
3952         RETURN(rc);
3953 }
3954
3955 int ll_get_fid_by_name(struct inode *parent, const char *name,
3956                        int namelen, struct lu_fid *fid,
3957                        struct inode **inode)
3958 {
3959         struct md_op_data       *op_data = NULL;
3960         struct mdt_body         *body;
3961         struct ptlrpc_request   *req;
3962         int                     rc;
3963         ENTRY;
3964
3965         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3966                                      LUSTRE_OPC_ANY, NULL);
3967         if (IS_ERR(op_data))
3968                 RETURN(PTR_ERR(op_data));
3969
3970         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3971         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3972         ll_finish_md_op_data(op_data);
3973         if (rc < 0)
3974                 RETURN(rc);
3975
3976         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
3977         if (body == NULL)
3978                 GOTO(out_req, rc = -EFAULT);
3979         if (fid != NULL)
3980                 *fid = body->mbo_fid1;
3981
3982         if (inode != NULL)
3983                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
3984 out_req:
3985         ptlrpc_req_finished(req);
3986         RETURN(rc);
3987 }
3988
3989 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
3990                const char *name, int namelen)
3991 {
3992         struct dentry         *dchild = NULL;
3993         struct inode          *child_inode = NULL;
3994         struct md_op_data     *op_data;
3995         struct ptlrpc_request *request = NULL;
3996         struct obd_client_handle *och = NULL;
3997         struct qstr           qstr;
3998         struct mdt_body         *body;
3999         int                    rc;
4000         __u64                   data_version = 0;
4001         ENTRY;
4002
4003         CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
4004                name, PFID(ll_inode2fid(parent)), mdtidx);
4005
4006         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4007                                      0, LUSTRE_OPC_ANY, NULL);
4008         if (IS_ERR(op_data))
4009                 RETURN(PTR_ERR(op_data));
4010
4011         /* Get child FID first */
4012         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4013         qstr.name = name;
4014         qstr.len = namelen;
4015         dchild = d_lookup(file_dentry(file), &qstr);
4016         if (dchild != NULL) {
4017                 if (dchild->d_inode != NULL)
4018                         child_inode = igrab(dchild->d_inode);
4019                 dput(dchild);
4020         }
4021
4022         if (child_inode == NULL) {
4023                 rc = ll_get_fid_by_name(parent, name, namelen,
4024                                         &op_data->op_fid3, &child_inode);
4025                 if (rc != 0)
4026                         GOTO(out_free, rc);
4027         }
4028
4029         if (child_inode == NULL)
4030                 GOTO(out_free, rc = -EINVAL);
4031
4032         /*
4033          * lfs migrate command needs to be blocked on the client
4034          * by checking the migrate FID against the FID of the
4035          * filesystem root.
4036          */
4037         if (child_inode == parent->i_sb->s_root->d_inode)
4038                 GOTO(out_iput, rc = -EINVAL);
4039
4040         inode_lock(child_inode);
4041         op_data->op_fid3 = *ll_inode2fid(child_inode);
4042         if (!fid_is_sane(&op_data->op_fid3)) {
4043                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4044                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4045                        PFID(&op_data->op_fid3));
4046                 GOTO(out_unlock, rc = -EINVAL);
4047         }
4048
4049         rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4050         if (rc < 0)
4051                 GOTO(out_unlock, rc);
4052
4053         if (rc == mdtidx) {
4054                 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4055                        PFID(&op_data->op_fid3), mdtidx);
4056                 GOTO(out_unlock, rc = 0);
4057         }
4058 again:
4059         if (S_ISREG(child_inode->i_mode)) {
4060                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4061                 if (IS_ERR(och)) {
4062                         rc = PTR_ERR(och);
4063                         och = NULL;
4064                         GOTO(out_unlock, rc);
4065                 }
4066
4067                 rc = ll_data_version(child_inode, &data_version,
4068                                      LL_DV_WR_FLUSH);
4069                 if (rc != 0)
4070                         GOTO(out_close, rc);
4071
4072                 op_data->op_handle = och->och_fh;
4073                 op_data->op_data = och->och_mod;
4074                 op_data->op_data_version = data_version;
4075                 op_data->op_lease_handle = och->och_lease_handle;
4076                 op_data->op_bias |= MDS_RENAME_MIGRATE;
4077         }
4078
4079         op_data->op_mds = mdtidx;
4080         op_data->op_cli_flags = CLI_MIGRATE;
4081         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4082                        namelen, name, namelen, &request);
4083         if (rc == 0) {
4084                 LASSERT(request != NULL);
4085                 ll_update_times(request, parent);
4086
4087                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4088                 LASSERT(body != NULL);
4089
4090                 /* If the server does release layout lock, then we cleanup
4091                  * the client och here, otherwise release it in out_close: */
4092                 if (och != NULL &&
4093                     body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4094                         obd_mod_put(och->och_mod);
4095                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4096                                                   och);
4097                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4098                         OBD_FREE_PTR(och);
4099                         och = NULL;
4100                 }
4101         }
4102
4103         if (request != NULL) {
4104                 ptlrpc_req_finished(request);
4105                 request = NULL;
4106         }
4107
4108         /* Try again if the file layout has changed. */
4109         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4110                 goto again;
4111
4112 out_close:
4113         if (och != NULL) /* close the file */
4114                 ll_lease_close(och, child_inode, NULL);
4115         if (rc == 0)
4116                 clear_nlink(child_inode);
4117 out_unlock:
4118         inode_unlock(child_inode);
4119 out_iput:
4120         iput(child_inode);
4121 out_free:
4122         ll_finish_md_op_data(op_data);
4123         RETURN(rc);
4124 }
4125
4126 static int
4127 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4128 {
4129         ENTRY;
4130
4131         RETURN(-ENOSYS);
4132 }
4133
4134 /**
4135  * test if some locks matching bits and l_req_mode are acquired
4136  * - bits can be in different locks
4137  * - if found clear the common lock bits in *bits
4138  * - the bits not found, are kept in *bits
4139  * \param inode [IN]
4140  * \param bits [IN] searched lock bits [IN]
4141  * \param l_req_mode [IN] searched lock mode
4142  * \retval boolean, true iff all bits are found
4143  */
4144 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4145 {
4146         struct lustre_handle lockh;
4147         union ldlm_policy_data policy;
4148         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4149                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4150         struct lu_fid *fid;
4151         __u64 flags;
4152         int i;
4153         ENTRY;
4154
4155         if (!inode)
4156                RETURN(0);
4157
4158         fid = &ll_i2info(inode)->lli_fid;
4159         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4160                ldlm_lockname[mode]);
4161
4162         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4163         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4164                 policy.l_inodebits.bits = *bits & (1 << i);
4165                 if (policy.l_inodebits.bits == 0)
4166                         continue;
4167
4168                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4169                                   &policy, mode, &lockh)) {
4170                         struct ldlm_lock *lock;
4171
4172                         lock = ldlm_handle2lock(&lockh);
4173                         if (lock) {
4174                                 *bits &=
4175                                       ~(lock->l_policy_data.l_inodebits.bits);
4176                                 LDLM_LOCK_PUT(lock);
4177                         } else {
4178                                 *bits &= ~policy.l_inodebits.bits;
4179                         }
4180                 }
4181         }
4182         RETURN(*bits == 0);
4183 }
4184
4185 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4186                                struct lustre_handle *lockh, __u64 flags,
4187                                enum ldlm_mode mode)
4188 {
4189         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4190         struct lu_fid *fid;
4191         enum ldlm_mode rc;
4192         ENTRY;
4193
4194         fid = &ll_i2info(inode)->lli_fid;
4195         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4196
4197         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4198                            fid, LDLM_IBITS, &policy, mode, lockh);
4199
4200         RETURN(rc);
4201 }
4202
4203 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4204 {
4205         /* Already unlinked. Just update nlink and return success */
4206         if (rc == -ENOENT) {
4207                 clear_nlink(inode);
4208                 /* If it is striped directory, and there is bad stripe
4209                  * Let's revalidate the dentry again, instead of returning
4210                  * error */
4211                 if (S_ISDIR(inode->i_mode) &&
4212                     ll_i2info(inode)->lli_lsm_md != NULL)
4213                         return 0;
4214
4215                 /* This path cannot be hit for regular files unless in
4216                  * case of obscure races, so no need to to validate
4217                  * size. */
4218                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4219                         return 0;
4220         } else if (rc != 0) {
4221                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4222                              "%s: revalidate FID "DFID" error: rc = %d\n",
4223                              ll_get_fsname(inode->i_sb, NULL, 0),
4224                              PFID(ll_inode2fid(inode)), rc);
4225         }
4226
4227         return rc;
4228 }
4229
4230 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4231 {
4232         struct inode *inode = dentry->d_inode;
4233         struct obd_export *exp = ll_i2mdexp(inode);
4234         struct lookup_intent oit = {
4235                 .it_op = op,
4236         };
4237         struct ptlrpc_request *req = NULL;
4238         struct md_op_data *op_data;
4239         int rc = 0;
4240         ENTRY;
4241
4242         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4243                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4244
4245         /* Call getattr by fid, so do not provide name at all. */
4246         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4247                                      LUSTRE_OPC_ANY, NULL);
4248         if (IS_ERR(op_data))
4249                 RETURN(PTR_ERR(op_data));
4250
4251         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4252         ll_finish_md_op_data(op_data);
4253         if (rc < 0) {
4254                 rc = ll_inode_revalidate_fini(inode, rc);
4255                 GOTO(out, rc);
4256         }
4257
4258         rc = ll_revalidate_it_finish(req, &oit, dentry);
4259         if (rc != 0) {
4260                 ll_intent_release(&oit);
4261                 GOTO(out, rc);
4262         }
4263
4264         /* Unlinked? Unhash dentry, so it is not picked up later by
4265          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4266          * here to preserve get_cwd functionality on 2.6.
4267          * Bug 10503 */
4268         if (!dentry->d_inode->i_nlink) {
4269                 ll_lock_dcache(inode);
4270                 d_lustre_invalidate(dentry, 0);
4271                 ll_unlock_dcache(inode);
4272         }
4273
4274         ll_lookup_finish_locks(&oit, dentry);
4275 out:
4276         ptlrpc_req_finished(req);
4277
4278         return rc;
4279 }
4280
4281 static int ll_merge_md_attr(struct inode *inode)
4282 {
4283         struct cl_attr attr = { 0 };
4284         int rc;
4285
4286         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4287         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4288                            &attr, ll_md_blocking_ast);
4289         if (rc != 0)
4290                 RETURN(rc);
4291
4292         set_nlink(inode, attr.cat_nlink);
4293         inode->i_blocks = attr.cat_blocks;
4294         i_size_write(inode, attr.cat_size);
4295
4296         ll_i2info(inode)->lli_atime = attr.cat_atime;
4297         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4298         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4299
4300         RETURN(0);
4301 }
4302
4303 static inline dev_t ll_compat_encode_dev(dev_t dev)
4304 {
4305         /* The compat_sys_*stat*() syscalls will fail unless the
4306          * device majors and minors are both less than 256. Note that
4307          * the value returned here will be passed through
4308          * old_encode_dev() in cp_compat_stat(). And so we are not
4309          * trying to return a valid compat (u16) device number, just
4310          * one that will pass the old_valid_dev() check. */
4311
4312         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4313 }
4314
4315 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4316 int ll_getattr(const struct path *path, struct kstat *stat,
4317                u32 request_mask, unsigned int flags)
4318 {
4319         struct dentry *de = path->dentry;
4320 #else
4321 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4322 {
4323 #endif
4324         struct inode *inode = de->d_inode;
4325         struct ll_sb_info *sbi = ll_i2sbi(inode);
4326         struct ll_inode_info *lli = ll_i2info(inode);
4327         int rc;
4328
4329         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4330
4331         rc = ll_inode_revalidate(de, IT_GETATTR);
4332         if (rc < 0)
4333                 RETURN(rc);
4334
4335         if (S_ISREG(inode->i_mode)) {
4336                 /* In case of restore, the MDT has the right size and has
4337                  * already send it back without granting the layout lock,
4338                  * inode is up-to-date so glimpse is useless.
4339                  * Also to glimpse we need the layout, in case of a running
4340                  * restore the MDT holds the layout lock so the glimpse will
4341                  * block up to the end of restore (getattr will block)
4342                  */
4343                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4344                         rc = ll_glimpse_size(inode);
4345                         if (rc < 0)
4346                                 RETURN(rc);
4347                 }
4348         } else {
4349                 /* If object isn't regular a file then don't validate size. */
4350                 if (S_ISDIR(inode->i_mode) &&
4351                     lli->lli_lsm_md != NULL) {
4352                         rc = ll_merge_md_attr(inode);
4353                         if (rc < 0)
4354                                 RETURN(rc);
4355                 }
4356
4357                 LTIME_S(inode->i_atime) = lli->lli_atime;
4358                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4359                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4360         }
4361
4362         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4363
4364         if (ll_need_32bit_api(sbi)) {
4365                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4366                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4367                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4368         } else {
4369                 stat->ino = inode->i_ino;
4370                 stat->dev = inode->i_sb->s_dev;
4371                 stat->rdev = inode->i_rdev;
4372         }
4373
4374         stat->mode = inode->i_mode;
4375         stat->uid = inode->i_uid;
4376         stat->gid = inode->i_gid;
4377         stat->atime = inode->i_atime;
4378         stat->mtime = inode->i_mtime;
4379         stat->ctime = inode->i_ctime;
4380         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4381
4382         stat->nlink = inode->i_nlink;
4383         stat->size = i_size_read(inode);
4384         stat->blocks = inode->i_blocks;
4385
4386         return 0;
4387 }
4388
4389 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4390                      __u64 start, __u64 len)
4391 {
4392         int             rc;
4393         size_t          num_bytes;
4394         struct fiemap   *fiemap;
4395         unsigned int    extent_count = fieinfo->fi_extents_max;
4396
4397         num_bytes = sizeof(*fiemap) + (extent_count *
4398                                        sizeof(struct fiemap_extent));
4399         OBD_ALLOC_LARGE(fiemap, num_bytes);
4400
4401         if (fiemap == NULL)
4402                 RETURN(-ENOMEM);
4403
4404         fiemap->fm_flags = fieinfo->fi_flags;
4405         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4406         fiemap->fm_start = start;
4407         fiemap->fm_length = len;
4408         if (extent_count > 0 &&
4409             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4410                            sizeof(struct fiemap_extent)) != 0)
4411                 GOTO(out, rc = -EFAULT);
4412
4413         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4414
4415         fieinfo->fi_flags = fiemap->fm_flags;
4416         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4417         if (extent_count > 0 &&
4418             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4419                          fiemap->fm_mapped_extents *
4420                          sizeof(struct fiemap_extent)) != 0)
4421                 GOTO(out, rc = -EFAULT);
4422 out:
4423         OBD_FREE_LARGE(fiemap, num_bytes);
4424         return rc;
4425 }
4426
4427 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4428 {
4429         struct ll_inode_info *lli = ll_i2info(inode);
4430         struct posix_acl *acl = NULL;
4431         ENTRY;
4432
4433         spin_lock(&lli->lli_lock);
4434         /* VFS' acl_permission_check->check_acl will release the refcount */
4435         acl = posix_acl_dup(lli->lli_posix_acl);
4436         spin_unlock(&lli->lli_lock);
4437
4438         RETURN(acl);
4439 }
4440
4441 #ifdef HAVE_IOP_SET_ACL
4442 #ifdef CONFIG_FS_POSIX_ACL
4443 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4444 {
4445         struct ll_sb_info *sbi = ll_i2sbi(inode);
4446         struct ptlrpc_request *req = NULL;
4447         const char *name = NULL;
4448         char *value = NULL;
4449         size_t value_size = 0;
4450         int rc = 0;
4451         ENTRY;
4452
4453         switch (type) {
4454         case ACL_TYPE_ACCESS:
4455                 name = XATTR_NAME_POSIX_ACL_ACCESS;
4456                 if (acl)
4457                         rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4458                 break;
4459
4460         case ACL_TYPE_DEFAULT:
4461                 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4462                 if (!S_ISDIR(inode->i_mode))
4463                         rc = acl ? -EACCES : 0;
4464                 break;
4465
4466         default:
4467                 rc = -EINVAL;
4468                 break;
4469         }
4470         if (rc)
4471                 return rc;
4472
4473         if (acl) {
4474                 value_size = posix_acl_xattr_size(acl->a_count);
4475                 value = kmalloc(value_size, GFP_NOFS);
4476                 if (value == NULL)
4477                         GOTO(out, rc = -ENOMEM);
4478
4479                 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4480                 if (rc < 0)
4481                         GOTO(out_value, rc);
4482         }
4483
4484         rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4485                          value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4486                          name, value, value_size, 0, 0, &req);
4487
4488         ptlrpc_req_finished(req);
4489 out_value:
4490         kfree(value);
4491 out:
4492         if (rc)
4493                 forget_cached_acl(inode, type);
4494         else
4495                 set_cached_acl(inode, type, acl);
4496         RETURN(rc);
4497 }
4498 #endif /* CONFIG_FS_POSIX_ACL */
4499 #endif /* HAVE_IOP_SET_ACL */
4500
4501 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4502 static int
4503 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4504 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4505 # else
4506 ll_check_acl(struct inode *inode, int mask)
4507 # endif
4508 {
4509 # ifdef CONFIG_FS_POSIX_ACL
4510         struct posix_acl *acl;
4511         int rc;
4512         ENTRY;
4513
4514 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4515         if (flags & IPERM_FLAG_RCU)
4516                 return -ECHILD;
4517 #  endif
4518         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4519
4520         if (!acl)
4521                 RETURN(-EAGAIN);
4522
4523         rc = posix_acl_permission(inode, acl, mask);
4524         posix_acl_release(acl);
4525
4526         RETURN(rc);
4527 # else /* !CONFIG_FS_POSIX_ACL */
4528         return -EAGAIN;
4529 # endif /* CONFIG_FS_POSIX_ACL */
4530 }
4531 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4532
4533 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4534 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4535 #else
4536 # ifdef HAVE_INODE_PERMISION_2ARGS
4537 int ll_inode_permission(struct inode *inode, int mask)
4538 # else
4539 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4540 # endif
4541 #endif
4542 {
4543         int rc = 0;
4544         struct ll_sb_info *sbi;
4545         struct root_squash_info *squash;
4546         struct cred *cred = NULL;
4547         const struct cred *old_cred = NULL;
4548         cfs_cap_t cap;
4549         bool squash_id = false;
4550         ENTRY;
4551
4552 #ifdef MAY_NOT_BLOCK
4553         if (mask & MAY_NOT_BLOCK)
4554                 return -ECHILD;
4555 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4556         if (flags & IPERM_FLAG_RCU)
4557                 return -ECHILD;
4558 #endif
4559
4560        /* as root inode are NOT getting validated in lookup operation,
4561         * need to do it before permission check. */
4562
4563         if (inode == inode->i_sb->s_root->d_inode) {
4564                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4565                 if (rc)
4566                         RETURN(rc);
4567         }
4568
4569         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4570                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4571
4572         /* squash fsuid/fsgid if needed */
4573         sbi = ll_i2sbi(inode);
4574         squash = &sbi->ll_squash;
4575         if (unlikely(squash->rsi_uid != 0 &&
4576                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4577                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4578                         squash_id = true;
4579         }
4580         if (squash_id) {
4581                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4582                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4583                        squash->rsi_uid, squash->rsi_gid);
4584
4585                 /* update current process's credentials
4586                  * and FS capability */
4587                 cred = prepare_creds();
4588                 if (cred == NULL)
4589                         RETURN(-ENOMEM);
4590
4591                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4592                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4593                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4594                         if ((1 << cap) & CFS_CAP_FS_MASK)
4595                                 cap_lower(cred->cap_effective, cap);
4596                 }
4597                 old_cred = override_creds(cred);
4598         }
4599
4600         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4601         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4602         /* restore current process's credentials and FS capability */
4603         if (squash_id) {
4604                 revert_creds(old_cred);
4605                 put_cred(cred);
4606         }
4607
4608         RETURN(rc);
4609 }
4610
4611 /* -o localflock - only provides locally consistent flock locks */
4612 struct file_operations ll_file_operations = {
4613 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4614 # ifdef HAVE_SYNC_READ_WRITE
4615         .read           = new_sync_read,
4616         .write          = new_sync_write,
4617 # endif
4618         .read_iter      = ll_file_read_iter,
4619         .write_iter     = ll_file_write_iter,
4620 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4621         .read           = ll_file_read,
4622         .aio_read       = ll_file_aio_read,
4623         .write          = ll_file_write,
4624         .aio_write      = ll_file_aio_write,
4625 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4626         .unlocked_ioctl = ll_file_ioctl,
4627         .open           = ll_file_open,
4628         .release        = ll_file_release,
4629         .mmap           = ll_file_mmap,
4630         .llseek         = ll_file_seek,
4631         .splice_read    = ll_file_splice_read,
4632         .fsync          = ll_fsync,
4633         .flush          = ll_flush
4634 };
4635
4636 struct file_operations ll_file_operations_flock = {
4637 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4638 # ifdef HAVE_SYNC_READ_WRITE
4639         .read           = new_sync_read,
4640         .write          = new_sync_write,
4641 # endif /* HAVE_SYNC_READ_WRITE */
4642         .read_iter      = ll_file_read_iter,
4643         .write_iter     = ll_file_write_iter,
4644 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4645         .read           = ll_file_read,
4646         .aio_read       = ll_file_aio_read,
4647         .write          = ll_file_write,
4648         .aio_write      = ll_file_aio_write,
4649 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4650         .unlocked_ioctl = ll_file_ioctl,
4651         .open           = ll_file_open,
4652         .release        = ll_file_release,
4653         .mmap           = ll_file_mmap,
4654         .llseek         = ll_file_seek,
4655         .splice_read    = ll_file_splice_read,
4656         .fsync          = ll_fsync,
4657         .flush          = ll_flush,
4658         .flock          = ll_file_flock,
4659         .lock           = ll_file_flock
4660 };
4661
4662 /* These are for -o noflock - to return ENOSYS on flock calls */
4663 struct file_operations ll_file_operations_noflock = {
4664 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4665 # ifdef HAVE_SYNC_READ_WRITE
4666         .read           = new_sync_read,
4667         .write          = new_sync_write,
4668 # endif /* HAVE_SYNC_READ_WRITE */
4669         .read_iter      = ll_file_read_iter,
4670         .write_iter     = ll_file_write_iter,
4671 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4672         .read           = ll_file_read,
4673         .aio_read       = ll_file_aio_read,
4674         .write          = ll_file_write,
4675         .aio_write      = ll_file_aio_write,
4676 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4677         .unlocked_ioctl = ll_file_ioctl,
4678         .open           = ll_file_open,
4679         .release        = ll_file_release,
4680         .mmap           = ll_file_mmap,
4681         .llseek         = ll_file_seek,
4682         .splice_read    = ll_file_splice_read,
4683         .fsync          = ll_fsync,
4684         .flush          = ll_flush,
4685         .flock          = ll_file_noflock,
4686         .lock           = ll_file_noflock
4687 };
4688
4689 struct inode_operations ll_file_inode_operations = {
4690         .setattr        = ll_setattr,
4691         .getattr        = ll_getattr,
4692         .permission     = ll_inode_permission,
4693 #ifdef HAVE_IOP_XATTR
4694         .setxattr       = ll_setxattr,
4695         .getxattr       = ll_getxattr,
4696         .removexattr    = ll_removexattr,
4697 #endif
4698         .listxattr      = ll_listxattr,
4699         .fiemap         = ll_fiemap,
4700 #ifdef HAVE_IOP_GET_ACL
4701         .get_acl        = ll_get_acl,
4702 #endif
4703 #ifdef HAVE_IOP_SET_ACL
4704         .set_acl        = ll_set_acl,
4705 #endif
4706 };
4707
4708 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4709 {
4710         struct ll_inode_info *lli = ll_i2info(inode);
4711         struct cl_object *obj = lli->lli_clob;
4712         struct lu_env *env;
4713         int rc;
4714         __u16 refcheck;
4715         ENTRY;
4716
4717         if (obj == NULL)
4718                 RETURN(0);
4719
4720         env = cl_env_get(&refcheck);
4721         if (IS_ERR(env))
4722                 RETURN(PTR_ERR(env));
4723
4724         rc = cl_conf_set(env, lli->lli_clob, conf);
4725         if (rc < 0)
4726                 GOTO(out, rc);
4727
4728         if (conf->coc_opc == OBJECT_CONF_SET) {
4729                 struct ldlm_lock *lock = conf->coc_lock;
4730                 struct cl_layout cl = {
4731                         .cl_layout_gen = 0,
4732                 };
4733
4734                 LASSERT(lock != NULL);
4735                 LASSERT(ldlm_has_layout(lock));
4736
4737                 /* it can only be allowed to match after layout is
4738                  * applied to inode otherwise false layout would be
4739                  * seen. Applying layout shoud happen before dropping
4740                  * the intent lock. */
4741                 ldlm_lock_allow_match(lock);
4742
4743                 rc = cl_object_layout_get(env, obj, &cl);
4744                 if (rc < 0)
4745                         GOTO(out, rc);
4746
4747                 CDEBUG(D_VFSTRACE,
4748                        DFID": layout version change: %u -> %u\n",
4749                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4750                        cl.cl_layout_gen);
4751                 ll_layout_version_set(lli, cl.cl_layout_gen);
4752         }
4753
4754 out:
4755         cl_env_put(env, &refcheck);
4756
4757         RETURN(rc);
4758 }
4759
4760 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4761 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4762
4763 {
4764         struct ll_sb_info *sbi = ll_i2sbi(inode);
4765         struct ptlrpc_request *req;
4766         struct mdt_body *body;
4767         void *lvbdata;
4768         void *lmm;
4769         int lmmsize;
4770         int rc;
4771         ENTRY;
4772
4773         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4774                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4775                lock->l_lvb_data, lock->l_lvb_len);
4776
4777         if (lock->l_lvb_data != NULL)
4778                 RETURN(0);
4779
4780         /* if layout lock was granted right away, the layout is returned
4781          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4782          * blocked and then granted via completion ast, we have to fetch
4783          * layout here. Please note that we can't use the LVB buffer in
4784          * completion AST because it doesn't have a large enough buffer */
4785         rc = ll_get_default_mdsize(sbi, &lmmsize);
4786         if (rc == 0)
4787                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4788                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4789         if (rc < 0)
4790                 RETURN(rc);
4791
4792         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4793         if (body == NULL)
4794                 GOTO(out, rc = -EPROTO);
4795
4796         lmmsize = body->mbo_eadatasize;
4797         if (lmmsize == 0) /* empty layout */
4798                 GOTO(out, rc = 0);
4799
4800         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4801         if (lmm == NULL)
4802                 GOTO(out, rc = -EFAULT);
4803
4804         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4805         if (lvbdata == NULL)
4806                 GOTO(out, rc = -ENOMEM);
4807
4808         memcpy(lvbdata, lmm, lmmsize);
4809         lock_res_and_lock(lock);
4810         if (unlikely(lock->l_lvb_data == NULL)) {
4811                 lock->l_lvb_type = LVB_T_LAYOUT;
4812                 lock->l_lvb_data = lvbdata;
4813                 lock->l_lvb_len = lmmsize;
4814                 lvbdata = NULL;
4815         }
4816         unlock_res_and_lock(lock);
4817
4818         if (lvbdata)
4819                 OBD_FREE_LARGE(lvbdata, lmmsize);
4820
4821         EXIT;
4822
4823 out:
4824         ptlrpc_req_finished(req);
4825         return rc;
4826 }
4827
4828 /**
4829  * Apply the layout to the inode. Layout lock is held and will be released
4830  * in this function.
4831  */
4832 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4833                               struct inode *inode)
4834 {
4835         struct ll_inode_info *lli = ll_i2info(inode);
4836         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4837         struct ldlm_lock *lock;
4838         struct cl_object_conf conf;
4839         int rc = 0;
4840         bool lvb_ready;
4841         bool wait_layout = false;
4842         ENTRY;
4843
4844         LASSERT(lustre_handle_is_used(lockh));
4845
4846         lock = ldlm_handle2lock(lockh);
4847         LASSERT(lock != NULL);
4848         LASSERT(ldlm_has_layout(lock));
4849
4850         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4851                    PFID(&lli->lli_fid), inode);
4852
4853         /* in case this is a caching lock and reinstate with new inode */
4854         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4855
4856         lock_res_and_lock(lock);
4857         lvb_ready = ldlm_is_lvb_ready(lock);
4858         unlock_res_and_lock(lock);
4859
4860         /* checking lvb_ready is racy but this is okay. The worst case is
4861          * that multi processes may configure the file on the same time. */
4862         if (lvb_ready)
4863                 GOTO(out, rc = 0);
4864
4865         rc = ll_layout_fetch(inode, lock);
4866         if (rc < 0)
4867                 GOTO(out, rc);
4868
4869         /* for layout lock, lmm is stored in lock's lvb.
4870          * lvb_data is immutable if the lock is held so it's safe to access it
4871          * without res lock.
4872          *
4873          * set layout to file. Unlikely this will fail as old layout was
4874          * surely eliminated */
4875         memset(&conf, 0, sizeof conf);
4876         conf.coc_opc = OBJECT_CONF_SET;
4877         conf.coc_inode = inode;
4878         conf.coc_lock = lock;
4879         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4880         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4881         rc = ll_layout_conf(inode, &conf);
4882
4883         /* refresh layout failed, need to wait */
4884         wait_layout = rc == -EBUSY;
4885         EXIT;
4886 out:
4887         LDLM_LOCK_PUT(lock);
4888         ldlm_lock_decref(lockh, mode);
4889
4890         /* wait for IO to complete if it's still being used. */
4891         if (wait_layout) {
4892                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4893                        ll_get_fsname(inode->i_sb, NULL, 0),
4894                        PFID(&lli->lli_fid), inode);
4895
4896                 memset(&conf, 0, sizeof conf);
4897                 conf.coc_opc = OBJECT_CONF_WAIT;
4898                 conf.coc_inode = inode;
4899                 rc = ll_layout_conf(inode, &conf);
4900                 if (rc == 0)
4901                         rc = -EAGAIN;
4902
4903                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4904                        ll_get_fsname(inode->i_sb, NULL, 0),
4905                        PFID(&lli->lli_fid), rc);
4906         }
4907         RETURN(rc);
4908 }
4909
4910 /**
4911  * Issue layout intent RPC to MDS.
4912  * \param inode [in]    file inode
4913  * \param intent [in]   layout intent
4914  *
4915  * \retval 0    on success
4916  * \retval < 0  error code
4917  */
4918 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4919 {
4920         struct ll_inode_info  *lli = ll_i2info(inode);
4921         struct ll_sb_info     *sbi = ll_i2sbi(inode);
4922         struct md_op_data     *op_data;
4923         struct lookup_intent it;
4924         struct ptlrpc_request *req;
4925         int rc;
4926         ENTRY;
4927
4928         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4929                                      0, 0, LUSTRE_OPC_ANY, NULL);
4930         if (IS_ERR(op_data))
4931                 RETURN(PTR_ERR(op_data));
4932
4933         op_data->op_data = intent;
4934         op_data->op_data_size = sizeof(*intent);
4935
4936         memset(&it, 0, sizeof(it));
4937         it.it_op = IT_LAYOUT;
4938         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4939             intent->li_opc == LAYOUT_INTENT_TRUNC)
4940                 it.it_flags = FMODE_WRITE;
4941
4942         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4943                           ll_get_fsname(inode->i_sb, NULL, 0),
4944                           PFID(&lli->lli_fid), inode);
4945
4946         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4947                             &ll_md_blocking_ast, 0);
4948         if (it.it_request != NULL)
4949                 ptlrpc_req_finished(it.it_request);
4950         it.it_request = NULL;
4951
4952         ll_finish_md_op_data(op_data);
4953
4954         /* set lock data in case this is a new lock */
4955         if (!rc)
4956                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4957
4958         ll_intent_drop_lock(&it);
4959
4960         RETURN(rc);
4961 }
4962
4963 /**
4964  * This function checks if there exists a LAYOUT lock on the client side,
4965  * or enqueues it if it doesn't have one in cache.
4966  *
4967  * This function will not hold layout lock so it may be revoked any time after
4968  * this function returns. Any operations depend on layout should be redone
4969  * in that case.
4970  *
4971  * This function should be called before lov_io_init() to get an uptodate
4972  * layout version, the caller should save the version number and after IO
4973  * is finished, this function should be called again to verify that layout
4974  * is not changed during IO time.
4975  */
4976 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4977 {
4978         struct ll_inode_info    *lli = ll_i2info(inode);
4979         struct ll_sb_info       *sbi = ll_i2sbi(inode);
4980         struct lustre_handle lockh;
4981         struct layout_intent intent = {
4982                 .li_opc = LAYOUT_INTENT_ACCESS,
4983         };
4984         enum ldlm_mode mode;
4985         int rc;
4986         ENTRY;
4987
4988         *gen = ll_layout_version_get(lli);
4989         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
4990                 RETURN(0);
4991
4992         /* sanity checks */
4993         LASSERT(fid_is_sane(ll_inode2fid(inode)));
4994         LASSERT(S_ISREG(inode->i_mode));
4995
4996         /* take layout lock mutex to enqueue layout lock exclusively. */
4997         mutex_lock(&lli->lli_layout_mutex);
4998
4999         while (1) {
5000                 /* mostly layout lock is caching on the local side, so try to
5001                  * match it before grabbing layout lock mutex. */
5002                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5003                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5004                 if (mode != 0) { /* hit cached lock */
5005                         rc = ll_layout_lock_set(&lockh, mode, inode);
5006                         if (rc == -EAGAIN)
5007                                 continue;
5008                         break;
5009                 }
5010
5011                 rc = ll_layout_intent(inode, &intent);
5012                 if (rc != 0)
5013                         break;
5014         }
5015
5016         if (rc == 0)
5017                 *gen = ll_layout_version_get(lli);
5018         mutex_unlock(&lli->lli_layout_mutex);
5019
5020         RETURN(rc);
5021 }
5022
5023 /**
5024  * Issue layout intent RPC indicating where in a file an IO is about to write.
5025  *
5026  * \param[in] inode     file inode.
5027  * \param[in] ext       write range with start offset of fille in bytes where
5028  *                      an IO is about to write, and exclusive end offset in
5029  *                      bytes.
5030  *
5031  * \retval 0    on success
5032  * \retval < 0  error code
5033  */
5034 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5035                            struct lu_extent *ext)
5036 {
5037         struct layout_intent intent = {
5038                 .li_opc = opc,
5039                 .li_extent.e_start = ext->e_start,
5040                 .li_extent.e_end = ext->e_end,
5041         };
5042         int rc;
5043         ENTRY;
5044
5045         rc = ll_layout_intent(inode, &intent);
5046
5047         RETURN(rc);
5048 }
5049
5050 /**
5051  *  This function send a restore request to the MDT
5052  */
5053 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5054 {
5055         struct hsm_user_request *hur;
5056         int                      len, rc;
5057         ENTRY;
5058
5059         len = sizeof(struct hsm_user_request) +
5060               sizeof(struct hsm_user_item);
5061         OBD_ALLOC(hur, len);
5062         if (hur == NULL)
5063                 RETURN(-ENOMEM);
5064
5065         hur->hur_request.hr_action = HUA_RESTORE;
5066         hur->hur_request.hr_archive_id = 0;
5067         hur->hur_request.hr_flags = 0;
5068         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5069                sizeof(hur->hur_user_item[0].hui_fid));
5070         hur->hur_user_item[0].hui_extent.offset = offset;
5071         hur->hur_user_item[0].hui_extent.length = length;
5072         hur->hur_request.hr_itemcount = 1;
5073         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5074                            len, hur, NULL);
5075         OBD_FREE(hur, len);
5076         RETURN(rc);
5077 }