lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         op_data->op_handle = och->och_fh;
 108
 109         if (och->och_flags & FMODE_WRITE &&
 110             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 111                 /* For HSM: if inode data has been modified, pack it so that
 112                  * MDT can set data dirty flag in the archive. */
 113                 op_data->op_bias |= MDS_DATA_MODIFIED;
 114
 115         EXIT;
 116 }
 117
 118 /**
 119  * Perform a close, possibly with a bias.
 120  * The meaning of "data" depends on the value of "bias".
 121  *
 122  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 123  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 124  * swap layouts with.
 125  */
 126 static int ll_close_inode_openhandle(struct inode *inode,
 127                                      struct obd_client_handle *och,
 128                                      enum mds_op_bias bias, void *data)
 129 {
 130         struct obd_export *md_exp = ll_i2mdexp(inode);
 131         const struct ll_inode_info *lli = ll_i2info(inode);
 132         struct md_op_data *op_data;
 133         struct ptlrpc_request *req = NULL;
 134         int rc;
 135         ENTRY;
 136
 137         if (class_exp2obd(md_exp) == NULL) {
 138                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 139                        ll_get_fsname(inode->i_sb, NULL, 0),
 140                        PFID(&lli->lli_fid));
 141                 GOTO(out, rc = 0);
 142         }
 143
 144         OBD_ALLOC_PTR(op_data);
 145         /* We leak openhandle and request here on error, but not much to be
 146          * done in OOM case since app won't retry close on error either. */
 147         if (op_data == NULL)
 148                 GOTO(out, rc = -ENOMEM);
 149
 150         ll_prepare_close(inode, op_data, och);
 151         switch (bias) {
 152         case MDS_CLOSE_LAYOUT_MERGE:
 153                 /* merge blocks from the victim inode */
 154                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 155                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 156         case MDS_CLOSE_LAYOUT_SPLIT:
 157         case MDS_CLOSE_LAYOUT_SWAP: {
 158                 struct split_param *sp = data;
 159
 160                 LASSERT(data != NULL);
 161                 op_data->op_bias |= bias;
 162                 op_data->op_data_version = 0;
 163                 op_data->op_lease_handle = och->och_lease_handle;
 164                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 165                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 166                         op_data->op_mirror_id = sp->sp_mirror_id;
 167                 } else {
 168                         op_data->op_fid2 = *ll_inode2fid(data);
 169                 }
 170                 break;
 171         }
 172
 173         case MDS_CLOSE_RESYNC_DONE: {
 174                 struct ll_ioc_lease *ioc = data;
 175
 176                 LASSERT(data != NULL);
 177                 op_data->op_attr_blocks +=
 178                         ioc->lil_count * op_data->op_attr_blocks;
 179                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 180                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 181
 182                 op_data->op_lease_handle = och->och_lease_handle;
 183                 op_data->op_data = &ioc->lil_ids[0];
 184                 op_data->op_data_size =
 185                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 186                 break;
 187         }
 188
 189         case MDS_HSM_RELEASE:
 190                 LASSERT(data != NULL);
 191                 op_data->op_bias |= MDS_HSM_RELEASE;
 192                 op_data->op_data_version = *(__u64 *)data;
 193                 op_data->op_lease_handle = och->och_lease_handle;
 194                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 195                 break;
 196
 197         default:
 198                 LASSERT(data == NULL);
 199                 break;
 200         }
 201
 202         rc = md_close(md_exp, op_data, och->och_mod, &req);
 203         if (rc != 0 && rc != -EINTR)
 204                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 205                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 206
 207         if (rc == 0 && op_data->op_bias & bias) {
 208                 struct mdt_body *body;
 209
 210                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 211                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 212                         rc = -EBUSY;
 213         }
 214
 215         ll_finish_md_op_data(op_data);
 216         EXIT;
 217 out:
 218
 219         md_clear_open_replay_data(md_exp, och);
 220         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 221         OBD_FREE_PTR(och);
 222
 223         ptlrpc_req_finished(req);       /* This is close request */
 224         return rc;
 225 }
 226
 227 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 228 {
 229         struct ll_inode_info *lli = ll_i2info(inode);
 230         struct obd_client_handle **och_p;
 231         struct obd_client_handle *och;
 232         __u64 *och_usecount;
 233         int rc = 0;
 234         ENTRY;
 235
 236         if (fmode & FMODE_WRITE) {
 237                 och_p = &lli->lli_mds_write_och;
 238                 och_usecount = &lli->lli_open_fd_write_count;
 239         } else if (fmode & FMODE_EXEC) {
 240                 och_p = &lli->lli_mds_exec_och;
 241                 och_usecount = &lli->lli_open_fd_exec_count;
 242         } else {
 243                 LASSERT(fmode & FMODE_READ);
 244                 och_p = &lli->lli_mds_read_och;
 245                 och_usecount = &lli->lli_open_fd_read_count;
 246         }
 247
 248         mutex_lock(&lli->lli_och_mutex);
 249         if (*och_usecount > 0) {
 250                 /* There are still users of this handle, so skip
 251                  * freeing it. */
 252                 mutex_unlock(&lli->lli_och_mutex);
 253                 RETURN(0);
 254         }
 255
 256         och = *och_p;
 257         *och_p = NULL;
 258         mutex_unlock(&lli->lli_och_mutex);
 259
 260         if (och != NULL) {
 261                 /* There might be a race and this handle may already
 262                  * be closed. */
 263                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 264         }
 265
 266         RETURN(rc);
 267 }
 268
 269 static int ll_md_close(struct inode *inode, struct file *file)
 270 {
 271         union ldlm_policy_data policy = {
 272                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 273         };
 274         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 275         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 276         struct ll_inode_info *lli = ll_i2info(inode);
 277         struct lustre_handle lockh;
 278         enum ldlm_mode lockmode;
 279         int rc = 0;
 280         ENTRY;
 281
 282         /* clear group lock, if present */
 283         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 284                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 285
 286         if (fd->fd_lease_och != NULL) {
 287                 bool lease_broken;
 288
 289                 /* Usually the lease is not released when the
 290                  * application crashed, we need to release here. */
 291                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 292                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 293                         PFID(&lli->lli_fid), rc, lease_broken);
 294
 295                 fd->fd_lease_och = NULL;
 296         }
 297
 298         if (fd->fd_och != NULL) {
 299                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 300                 fd->fd_och = NULL;
 301                 GOTO(out, rc);
 302         }
 303
 304         /* Let's see if we have good enough OPEN lock on the file and if
 305            we can skip talking to MDS */
 306         mutex_lock(&lli->lli_och_mutex);
 307         if (fd->fd_omode & FMODE_WRITE) {
 308                 lockmode = LCK_CW;
 309                 LASSERT(lli->lli_open_fd_write_count);
 310                 lli->lli_open_fd_write_count--;
 311         } else if (fd->fd_omode & FMODE_EXEC) {
 312                 lockmode = LCK_PR;
 313                 LASSERT(lli->lli_open_fd_exec_count);
 314                 lli->lli_open_fd_exec_count--;
 315         } else {
 316                 lockmode = LCK_CR;
 317                 LASSERT(lli->lli_open_fd_read_count);
 318                 lli->lli_open_fd_read_count--;
 319         }
 320         mutex_unlock(&lli->lli_och_mutex);
 321
 322         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 323                            LDLM_IBITS, &policy, lockmode, &lockh))
 324                 rc = ll_md_real_close(inode, fd->fd_omode);
 325
 326 out:
 327         LUSTRE_FPRIVATE(file) = NULL;
 328         ll_file_data_put(fd);
 329
 330         RETURN(rc);
 331 }
 332
 333 /* While this returns an error code, fput() the caller does not, so we need
 334  * to make every effort to clean up all of our state here.  Also, applications
 335  * rarely check close errors and even if an error is returned they will not
 336  * re-try the close call.
 337  */
 338 int ll_file_release(struct inode *inode, struct file *file)
 339 {
 340         struct ll_file_data *fd;
 341         struct ll_sb_info *sbi = ll_i2sbi(inode);
 342         struct ll_inode_info *lli = ll_i2info(inode);
 343         int rc;
 344         ENTRY;
 345
 346         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 347                PFID(ll_inode2fid(inode)), inode);
 348
 349         if (inode->i_sb->s_root != file_dentry(file))
 350                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 351         fd = LUSTRE_FPRIVATE(file);
 352         LASSERT(fd != NULL);
 353
 354         /* The last ref on @file, maybe not the the owner pid of statahead,
 355          * because parent and child process can share the same file handle. */
 356         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 357                 ll_deauthorize_statahead(inode, fd);
 358
 359         if (inode->i_sb->s_root == file_dentry(file)) {
 360                 LUSTRE_FPRIVATE(file) = NULL;
 361                 ll_file_data_put(fd);
 362                 RETURN(0);
 363         }
 364
 365         if (!S_ISDIR(inode->i_mode)) {
 366                 if (lli->lli_clob != NULL)
 367                         lov_read_and_clear_async_rc(lli->lli_clob);
 368                 lli->lli_async_rc = 0;
 369         }
 370
 371         rc = ll_md_close(inode, file);
 372
 373         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 374                 libcfs_debug_dumplog();
 375
 376         RETURN(rc);
 377 }
 378
 379 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 380                                 struct lookup_intent *itp)
 381 {
 382         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 383         struct dentry *parent = de->d_parent;
 384         const char *name = NULL;
 385         int len = 0;
 386         struct md_op_data *op_data;
 387         struct ptlrpc_request *req = NULL;
 388         int rc;
 389         ENTRY;
 390
 391         LASSERT(parent != NULL);
 392         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 393
 394         /* if server supports open-by-fid, or file name is invalid, don't pack
 395          * name in open request */
 396         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 397             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 398                 name = de->d_name.name;
 399                 len = de->d_name.len;
 400         }
 401
 402         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 403                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 404         if (IS_ERR(op_data))
 405                 RETURN(PTR_ERR(op_data));
 406         op_data->op_data = lmm;
 407         op_data->op_data_size = lmmsize;
 408
 409         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 410                             &ll_md_blocking_ast, 0);
 411         ll_finish_md_op_data(op_data);
 412         if (rc == -ESTALE) {
 413                 /* reason for keep own exit path - don`t flood log
 414                  * with messages with -ESTALE errors.
 415                  */
 416                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 417                      it_open_error(DISP_OPEN_OPEN, itp))
 418                         GOTO(out, rc);
 419                 ll_release_openhandle(de, itp);
 420                 GOTO(out, rc);
 421         }
 422
 423         if (it_disposition(itp, DISP_LOOKUP_NEG))
 424                 GOTO(out, rc = -ENOENT);
 425
 426         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 427                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 428                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 429                 GOTO(out, rc);
 430         }
 431
 432         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 433         if (!rc && itp->it_lock_mode)
 434                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 435
 436 out:
 437         ptlrpc_req_finished(req);
 438         ll_intent_drop_lock(itp);
 439
 440         /* We did open by fid, but by the time we got to the server,
 441          * the object disappeared. If this is a create, we cannot really
 442          * tell the userspace that the file it was trying to create
 443          * does not exist. Instead let's return -ESTALE, and the VFS will
 444          * retry the create with LOOKUP_REVAL that we are going to catch
 445          * in ll_revalidate_dentry() and use lookup then.
 446          */
 447         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 448                 rc = -ESTALE;
 449
 450         RETURN(rc);
 451 }
 452
 453 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 454                        struct obd_client_handle *och)
 455 {
 456         struct mdt_body *body;
 457
 458         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 459         och->och_fh = body->mbo_handle;
 460         och->och_fid = body->mbo_fid1;
 461         och->och_lease_handle.cookie = it->it_lock_handle;
 462         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 463         och->och_flags = it->it_flags;
 464
 465         return md_set_open_replay_data(md_exp, och, it);
 466 }
 467
 468 static int ll_local_open(struct file *file, struct lookup_intent *it,
 469                          struct ll_file_data *fd, struct obd_client_handle *och)
 470 {
 471         struct inode *inode = file_inode(file);
 472         ENTRY;
 473
 474         LASSERT(!LUSTRE_FPRIVATE(file));
 475
 476         LASSERT(fd != NULL);
 477
 478         if (och) {
 479                 int rc;
 480
 481                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 482                 if (rc != 0)
 483                         RETURN(rc);
 484         }
 485
 486         LUSTRE_FPRIVATE(file) = fd;
 487         ll_readahead_init(inode, &fd->fd_ras);
 488         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 489
 490         /* ll_cl_context initialize */
 491         rwlock_init(&fd->fd_lock);
 492         INIT_LIST_HEAD(&fd->fd_lccs);
 493
 494         RETURN(0);
 495 }
 496
 497 /* Open a file, and (for the very first open) create objects on the OSTs at
 498  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 499  * creation or open until ll_lov_setstripe() ioctl is called.
 500  *
 501  * If we already have the stripe MD locally then we don't request it in
 502  * md_open(), by passing a lmm_size = 0.
 503  *
 504  * It is up to the application to ensure no other processes open this file
 505  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 506  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 507  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 508  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 509  */
 510 int ll_file_open(struct inode *inode, struct file *file)
 511 {
 512         struct ll_inode_info *lli = ll_i2info(inode);
 513         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 514                                           .it_flags = file->f_flags };
 515         struct obd_client_handle **och_p = NULL;
 516         __u64 *och_usecount = NULL;
 517         struct ll_file_data *fd;
 518         int rc = 0;
 519         ENTRY;
 520
 521         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 522                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 523
 524         it = file->private_data; /* XXX: compat macro */
 525         file->private_data = NULL; /* prevent ll_local_open assertion */
 526
 527         fd = ll_file_data_get();
 528         if (fd == NULL)
 529                 GOTO(out_openerr, rc = -ENOMEM);
 530
 531         fd->fd_file = file;
 532         if (S_ISDIR(inode->i_mode))
 533                 ll_authorize_statahead(inode, fd);
 534
 535         if (inode->i_sb->s_root == file_dentry(file)) {
 536                 LUSTRE_FPRIVATE(file) = fd;
 537                 RETURN(0);
 538         }
 539
 540         if (!it || !it->it_disposition) {
 541                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 542                  * because everything but O_ACCMODE mask was stripped from
 543                  * there */
 544                 if ((oit.it_flags + 1) & O_ACCMODE)
 545                         oit.it_flags++;
 546                 if (file->f_flags & O_TRUNC)
 547                         oit.it_flags |= FMODE_WRITE;
 548
 549                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 550                  * dentry_open after call to open_namei that checks permissions.
 551                  * Only nfsd_open call dentry_open directly without checking
 552                  * permissions and because of that this code below is safe. */
 553                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 554                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 555
 556                 /* We do not want O_EXCL here, presumably we opened the file
 557                  * already? XXX - NFS implications? */
 558                 oit.it_flags &= ~O_EXCL;
 559
 560                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 561                  * created if necessary, then "IT_CREAT" should be set to keep
 562                  * consistent with it */
 563                 if (oit.it_flags & O_CREAT)
 564                         oit.it_op |= IT_CREAT;
 565
 566                 it = &oit;
 567         }
 568
 569 restart:
 570         /* Let's see if we have file open on MDS already. */
 571         if (it->it_flags & FMODE_WRITE) {
 572                 och_p = &lli->lli_mds_write_och;
 573                 och_usecount = &lli->lli_open_fd_write_count;
 574         } else if (it->it_flags & FMODE_EXEC) {
 575                 och_p = &lli->lli_mds_exec_och;
 576                 och_usecount = &lli->lli_open_fd_exec_count;
 577          } else {
 578                 och_p = &lli->lli_mds_read_och;
 579                 och_usecount = &lli->lli_open_fd_read_count;
 580         }
 581
 582         mutex_lock(&lli->lli_och_mutex);
 583         if (*och_p) { /* Open handle is present */
 584                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 585                         /* Well, there's extra open request that we do not need,
 586                            let's close it somehow. This will decref request. */
 587                         rc = it_open_error(DISP_OPEN_OPEN, it);
 588                         if (rc) {
 589                                 mutex_unlock(&lli->lli_och_mutex);
 590                                 GOTO(out_openerr, rc);
 591                         }
 592
 593                         ll_release_openhandle(file_dentry(file), it);
 594                 }
 595                 (*och_usecount)++;
 596
 597                 rc = ll_local_open(file, it, fd, NULL);
 598                 if (rc) {
 599                         (*och_usecount)--;
 600                         mutex_unlock(&lli->lli_och_mutex);
 601                         GOTO(out_openerr, rc);
 602                 }
 603         } else {
 604                 LASSERT(*och_usecount == 0);
 605                 if (!it->it_disposition) {
 606                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 607                         /* We cannot just request lock handle now, new ELC code
 608                            means that one of other OPEN locks for this file
 609                            could be cancelled, and since blocking ast handler
 610                            would attempt to grab och_mutex as well, that would
 611                            result in a deadlock */
 612                         mutex_unlock(&lli->lli_och_mutex);
 613                         /*
 614                          * Normally called under two situations:
 615                          * 1. NFS export.
 616                          * 2. A race/condition on MDS resulting in no open
 617                          *    handle to be returned from LOOKUP|OPEN request,
 618                          *    for example if the target entry was a symlink.
 619                          *
 620                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 621                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 622                          *  bit so that it's not confusing later callers.
 623                          *
 624                          *  NB; when ldd is NULL, it must have come via normal
 625                          *  lookup path only, since ll_iget_for_nfs always calls
 626                          *  ll_d_init().
 627                          */
 628                         if (ldd && ldd->lld_nfs_dentry) {
 629                                 ldd->lld_nfs_dentry = 0;
 630                                 it->it_flags |= MDS_OPEN_LOCK;
 631                         }
 632
 633                          /*
 634                          * Always specify MDS_OPEN_BY_FID because we don't want
 635                          * to get file with different fid.
 636                          */
 637                         it->it_flags |= MDS_OPEN_BY_FID;
 638                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 639                                                  it);
 640                         if (rc)
 641                                 GOTO(out_openerr, rc);
 642
 643                         goto restart;
 644                 }
 645                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 646                 if (!*och_p)
 647                         GOTO(out_och_free, rc = -ENOMEM);
 648
 649                 (*och_usecount)++;
 650
 651                 /* md_intent_lock() didn't get a request ref if there was an
 652                  * open error, so don't do cleanup on the request here
 653                  * (bug 3430) */
 654                 /* XXX (green): Should not we bail out on any error here, not
 655                  * just open error? */
 656                 rc = it_open_error(DISP_OPEN_OPEN, it);
 657                 if (rc != 0)
 658                         GOTO(out_och_free, rc);
 659
 660                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 661                          "inode %p: disposition %x, status %d\n", inode,
 662                          it_disposition(it, ~0), it->it_status);
 663
 664                 rc = ll_local_open(file, it, fd, *och_p);
 665                 if (rc)
 666                         GOTO(out_och_free, rc);
 667         }
 668         mutex_unlock(&lli->lli_och_mutex);
 669         fd = NULL;
 670
 671         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 672            different kind of OPEN lock for this same inode gets cancelled
 673            by ldlm_cancel_lru */
 674         if (!S_ISREG(inode->i_mode))
 675                 GOTO(out_och_free, rc);
 676
 677         cl_lov_delay_create_clear(&file->f_flags);
 678         GOTO(out_och_free, rc);
 679
 680 out_och_free:
 681         if (rc) {
 682                 if (och_p && *och_p) {
 683                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 684                         *och_p = NULL; /* OBD_FREE writes some magic there */
 685                         (*och_usecount)--;
 686                 }
 687                 mutex_unlock(&lli->lli_och_mutex);
 688
 689 out_openerr:
 690                 if (lli->lli_opendir_key == fd)
 691                         ll_deauthorize_statahead(inode, fd);
 692                 if (fd != NULL)
 693                         ll_file_data_put(fd);
 694         } else {
 695                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 696         }
 697
 698         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 699                 ptlrpc_req_finished(it->it_request);
 700                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 701         }
 702
 703         return rc;
 704 }
 705
 706 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 707                         struct ldlm_lock_desc *desc, void *data, int flag)
 708 {
 709         int rc;
 710         struct lustre_handle lockh;
 711         ENTRY;
 712
 713         switch (flag) {
 714         case LDLM_CB_BLOCKING:
 715                 ldlm_lock2handle(lock, &lockh);
 716                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 717                 if (rc < 0) {
 718                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 719                         RETURN(rc);
 720                 }
 721                 break;
 722         case LDLM_CB_CANCELING:
 723                 /* do nothing */
 724                 break;
 725         }
 726         RETURN(0);
 727 }
 728
 729 /**
 730  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 731  * and save it as fd->fd_och so as to force client to reopen the file even
 732  * if it has an open lock in cache already.
 733  */
 734 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 735                                 struct lustre_handle *old_handle)
 736 {
 737         struct ll_inode_info *lli = ll_i2info(inode);
 738         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 739         struct obd_client_handle **och_p;
 740         __u64 *och_usecount;
 741         int rc = 0;
 742         ENTRY;
 743
 744         /* Get the openhandle of the file */
 745         mutex_lock(&lli->lli_och_mutex);
 746         if (fd->fd_lease_och != NULL)
 747                 GOTO(out_unlock, rc = -EBUSY);
 748
 749         if (fd->fd_och == NULL) {
 750                 if (file->f_mode & FMODE_WRITE) {
 751                         LASSERT(lli->lli_mds_write_och != NULL);
 752                         och_p = &lli->lli_mds_write_och;
 753                         och_usecount = &lli->lli_open_fd_write_count;
 754                 } else {
 755                         LASSERT(lli->lli_mds_read_och != NULL);
 756                         och_p = &lli->lli_mds_read_och;
 757                         och_usecount = &lli->lli_open_fd_read_count;
 758                 }
 759
 760                 if (*och_usecount > 1)
 761                         GOTO(out_unlock, rc = -EBUSY);
 762
 763                 fd->fd_och = *och_p;
 764                 *och_usecount = 0;
 765                 *och_p = NULL;
 766         }
 767
 768         *old_handle = fd->fd_och->och_fh;
 769
 770         EXIT;
 771 out_unlock:
 772         mutex_unlock(&lli->lli_och_mutex);
 773         return rc;
 774 }
 775
 776 /**
 777  * Release ownership on lli_mds_*_och when putting back a file lease.
 778  */
 779 static int ll_lease_och_release(struct inode *inode, struct file *file)
 780 {
 781         struct ll_inode_info *lli = ll_i2info(inode);
 782         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 783         struct obd_client_handle **och_p;
 784         struct obd_client_handle *old_och = NULL;
 785         __u64 *och_usecount;
 786         int rc = 0;
 787         ENTRY;
 788
 789         mutex_lock(&lli->lli_och_mutex);
 790         if (file->f_mode & FMODE_WRITE) {
 791                 och_p = &lli->lli_mds_write_och;
 792                 och_usecount = &lli->lli_open_fd_write_count;
 793         } else {
 794                 och_p = &lli->lli_mds_read_och;
 795                 och_usecount = &lli->lli_open_fd_read_count;
 796         }
 797
 798         /* The file may have been open by another process (broken lease) so
 799          * *och_p is not NULL. In this case we should simply increase usecount
 800          * and close fd_och.
 801          */
 802         if (*och_p != NULL) {
 803                 old_och = fd->fd_och;
 804                 (*och_usecount)++;
 805         } else {
 806                 *och_p = fd->fd_och;
 807                 *och_usecount = 1;
 808         }
 809         fd->fd_och = NULL;
 810         mutex_unlock(&lli->lli_och_mutex);
 811
 812         if (old_och != NULL)
 813                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 814
 815         RETURN(rc);
 816 }
 817
 818 /**
 819  * Acquire a lease and open the file.
 820  */
 821 static struct obd_client_handle *
 822 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 823               __u64 open_flags)
 824 {
 825         struct lookup_intent it = { .it_op = IT_OPEN };
 826         struct ll_sb_info *sbi = ll_i2sbi(inode);
 827         struct md_op_data *op_data;
 828         struct ptlrpc_request *req = NULL;
 829         struct lustre_handle old_handle = { 0 };
 830         struct obd_client_handle *och = NULL;
 831         int rc;
 832         int rc2;
 833         ENTRY;
 834
 835         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 836                 RETURN(ERR_PTR(-EINVAL));
 837
 838         if (file != NULL) {
 839                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 840                         RETURN(ERR_PTR(-EPERM));
 841
 842                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 843                 if (rc)
 844                         RETURN(ERR_PTR(rc));
 845         }
 846
 847         OBD_ALLOC_PTR(och);
 848         if (och == NULL)
 849                 RETURN(ERR_PTR(-ENOMEM));
 850
 851         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 852                                         LUSTRE_OPC_ANY, NULL);
 853         if (IS_ERR(op_data))
 854                 GOTO(out, rc = PTR_ERR(op_data));
 855
 856         /* To tell the MDT this openhandle is from the same owner */
 857         op_data->op_handle = old_handle;
 858
 859         it.it_flags = fmode | open_flags;
 860         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 861         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 862                             &ll_md_blocking_lease_ast,
 863         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 864          * it can be cancelled which may mislead applications that the lease is
 865          * broken;
 866          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 867          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 868          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 869                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 870         ll_finish_md_op_data(op_data);
 871         ptlrpc_req_finished(req);
 872         if (rc < 0)
 873                 GOTO(out_release_it, rc);
 874
 875         if (it_disposition(&it, DISP_LOOKUP_NEG))
 876                 GOTO(out_release_it, rc = -ENOENT);
 877
 878         rc = it_open_error(DISP_OPEN_OPEN, &it);
 879         if (rc)
 880                 GOTO(out_release_it, rc);
 881
 882         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 883         ll_och_fill(sbi->ll_md_exp, &it, och);
 884
 885         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 886                 GOTO(out_close, rc = -EOPNOTSUPP);
 887
 888         /* already get lease, handle lease lock */
 889         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 890         if (it.it_lock_mode == 0 ||
 891             it.it_lock_bits != MDS_INODELOCK_OPEN) {
 892                 /* open lock must return for lease */
 893                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 894                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
 895                         it.it_lock_bits);
 896                 GOTO(out_close, rc = -EPROTO);
 897         }
 898
 899         ll_intent_release(&it);
 900         RETURN(och);
 901
 902 out_close:
 903         /* Cancel open lock */
 904         if (it.it_lock_mode != 0) {
 905                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
 906                                             it.it_lock_mode);
 907                 it.it_lock_mode = 0;
 908                 och->och_lease_handle.cookie = 0ULL;
 909         }
 910         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
 911         if (rc2 < 0)
 912                 CERROR("%s: error closing file "DFID": %d\n",
 913                        ll_get_fsname(inode->i_sb, NULL, 0),
 914                        PFID(&ll_i2info(inode)->lli_fid), rc2);
 915         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
 916 out_release_it:
 917         ll_intent_release(&it);
 918 out:
 919         if (och != NULL)
 920                 OBD_FREE_PTR(och);
 921         RETURN(ERR_PTR(rc));
 922 }
 923
 924 /**
 925  * Check whether a layout swap can be done between two inodes.
 926  *
 927  * \param[in] inode1  First inode to check
 928  * \param[in] inode2  Second inode to check
 929  *
 930  * \retval 0 on success, layout swap can be performed between both inodes
 931  * \retval negative error code if requirements are not met
 932  */
 933 static int ll_check_swap_layouts_validity(struct inode *inode1,
 934                                           struct inode *inode2)
 935 {
 936         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
 937                 return -EINVAL;
 938
 939         if (inode_permission(inode1, MAY_WRITE) ||
 940             inode_permission(inode2, MAY_WRITE))
 941                 return -EPERM;
 942
 943         if (inode1->i_sb != inode2->i_sb)
 944                 return -EXDEV;
 945
 946         return 0;
 947 }
 948
 949 static int ll_swap_layouts_close(struct obd_client_handle *och,
 950                                  struct inode *inode, struct inode *inode2)
 951 {
 952         const struct lu_fid     *fid1 = ll_inode2fid(inode);
 953         const struct lu_fid     *fid2;
 954         int                      rc;
 955         ENTRY;
 956
 957         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
 958                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
 959
 960         rc = ll_check_swap_layouts_validity(inode, inode2);
 961         if (rc < 0)
 962                 GOTO(out_free_och, rc);
 963
 964         /* We now know that inode2 is a lustre inode */
 965         fid2 = ll_inode2fid(inode2);
 966
 967         rc = lu_fid_cmp(fid1, fid2);
 968         if (rc == 0)
 969                 GOTO(out_free_och, rc = -EINVAL);
 970
 971         /* Close the file and {swap,merge} layouts between inode & inode2.
 972          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
 973          * because we still need it to pack l_remote_handle to MDT. */
 974         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
 975                                        inode2);
 976
 977         och = NULL; /* freed in ll_close_inode_openhandle() */
 978
 979 out_free_och:
 980         if (och != NULL)
 981                 OBD_FREE_PTR(och);
 982
 983         RETURN(rc);
 984 }
 985
 986 /**
 987  * Release lease and close the file.
 988  * It will check if the lease has ever broken.
 989  */
 990 static int ll_lease_close_intent(struct obd_client_handle *och,
 991                                  struct inode *inode,
 992                                  bool *lease_broken, enum mds_op_bias bias,
 993                                  void *data)
 994 {
 995         struct ldlm_lock *lock;
 996         bool cancelled = true;
 997         int rc;
 998         ENTRY;
 999
1000         lock = ldlm_handle2lock(&och->och_lease_handle);
1001         if (lock != NULL) {
1002                 lock_res_and_lock(lock);
1003                 cancelled = ldlm_is_cancel(lock);
1004                 unlock_res_and_lock(lock);
1005                 LDLM_LOCK_PUT(lock);
1006         }
1007
1008         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1009                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1010
1011         if (lease_broken != NULL)
1012                 *lease_broken = cancelled;
1013
1014         if (!cancelled && !bias)
1015                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1016
1017         if (cancelled) { /* no need to excute intent */
1018                 bias = 0;
1019                 data = NULL;
1020         }
1021
1022         rc = ll_close_inode_openhandle(inode, och, bias, data);
1023         RETURN(rc);
1024 }
1025
1026 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1027                           bool *lease_broken)
1028 {
1029         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1030 }
1031
1032 /**
1033  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1034  */
1035 static int ll_lease_file_resync(struct obd_client_handle *och,
1036                                 struct inode *inode)
1037 {
1038         struct ll_sb_info *sbi = ll_i2sbi(inode);
1039         struct md_op_data *op_data;
1040         __u64 data_version_unused;
1041         int rc;
1042         ENTRY;
1043
1044         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1045                                      LUSTRE_OPC_ANY, NULL);
1046         if (IS_ERR(op_data))
1047                 RETURN(PTR_ERR(op_data));
1048
1049         /* before starting file resync, it's necessary to clean up page cache
1050          * in client memory, otherwise once the layout version is increased,
1051          * writing back cached data will be denied the OSTs. */
1052         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1053         if (rc)
1054                 GOTO(out, rc);
1055
1056         op_data->op_handle = och->och_lease_handle;
1057         rc = md_file_resync(sbi->ll_md_exp, op_data);
1058         if (rc)
1059                 GOTO(out, rc);
1060
1061         EXIT;
1062 out:
1063         ll_finish_md_op_data(op_data);
1064         return rc;
1065 }
1066
1067 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1068 {
1069         struct ll_inode_info *lli = ll_i2info(inode);
1070         struct cl_object *obj = lli->lli_clob;
1071         struct cl_attr *attr = vvp_env_thread_attr(env);
1072         s64 atime;
1073         s64 mtime;
1074         s64 ctime;
1075         int rc = 0;
1076
1077         ENTRY;
1078
1079         ll_inode_size_lock(inode);
1080
1081         /* Merge timestamps the most recently obtained from MDS with
1082          * timestamps obtained from OSTs.
1083          *
1084          * Do not overwrite atime of inode because it may be refreshed
1085          * by file_accessed() function. If the read was served by cache
1086          * data, there is no RPC to be sent so that atime may not be
1087          * transferred to OSTs at all. MDT only updates atime at close time
1088          * if it's at least 'mdd.*.atime_diff' older.
1089          * All in all, the atime in Lustre does not strictly comply with
1090          * POSIX. Solving this problem needs to send an RPC to MDT for each
1091          * read, this will hurt performance. */
1092         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1093                 LTIME_S(inode->i_atime) = lli->lli_atime;
1094                 lli->lli_update_atime = 0;
1095         }
1096         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1097         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1098
1099         atime = LTIME_S(inode->i_atime);
1100         mtime = LTIME_S(inode->i_mtime);
1101         ctime = LTIME_S(inode->i_ctime);
1102
1103         cl_object_attr_lock(obj);
1104         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1105                 rc = -EINVAL;
1106         else
1107                 rc = cl_object_attr_get(env, obj, attr);
1108         cl_object_attr_unlock(obj);
1109
1110         if (rc != 0)
1111                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1112
1113         if (atime < attr->cat_atime)
1114                 atime = attr->cat_atime;
1115
1116         if (ctime < attr->cat_ctime)
1117                 ctime = attr->cat_ctime;
1118
1119         if (mtime < attr->cat_mtime)
1120                 mtime = attr->cat_mtime;
1121
1122         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1123                PFID(&lli->lli_fid), attr->cat_size);
1124
1125         i_size_write(inode, attr->cat_size);
1126         inode->i_blocks = attr->cat_blocks;
1127
1128         LTIME_S(inode->i_atime) = atime;
1129         LTIME_S(inode->i_mtime) = mtime;
1130         LTIME_S(inode->i_ctime) = ctime;
1131
1132 out_size_unlock:
1133         ll_inode_size_unlock(inode);
1134
1135         RETURN(rc);
1136 }
1137
1138 /**
1139  * Set designated mirror for I/O.
1140  *
1141  * So far only read, write, and truncated can support to issue I/O to
1142  * designated mirror.
1143  */
1144 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1145 {
1146         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1147
1148         /* clear layout version for generic(non-resync) I/O in case it carries
1149          * stale layout version due to I/O restart */
1150         io->ci_layout_version = 0;
1151
1152         /* FLR: disable non-delay for designated mirror I/O because obviously
1153          * only one mirror is available */
1154         if (fd->fd_designated_mirror > 0) {
1155                 io->ci_ndelay = 0;
1156                 io->ci_designated_mirror = fd->fd_designated_mirror;
1157                 io->ci_layout_version = fd->fd_layout_version;
1158                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1159                                  * io to ptasks */
1160         }
1161
1162         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1163                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1164 }
1165
1166 static bool file_is_noatime(const struct file *file)
1167 {
1168         const struct vfsmount *mnt = file->f_path.mnt;
1169         const struct inode *inode = file_inode((struct file *)file);
1170
1171         /* Adapted from file_accessed() and touch_atime().*/
1172         if (file->f_flags & O_NOATIME)
1173                 return true;
1174
1175         if (inode->i_flags & S_NOATIME)
1176                 return true;
1177
1178         if (IS_NOATIME(inode))
1179                 return true;
1180
1181         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1182                 return true;
1183
1184         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1185                 return true;
1186
1187         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1188                 return true;
1189
1190         return false;
1191 }
1192
1193 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1194
1195 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1196 {
1197         struct inode *inode = file_inode(file);
1198         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1199
1200         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1201         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1202         io->u.ci_rw.rw_file = file;
1203         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1204         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1205         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1206
1207         if (iot == CIT_WRITE) {
1208                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1209                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1210                                            file->f_flags & O_DIRECT ||
1211                                            IS_SYNC(inode));
1212         }
1213         io->ci_obj = ll_i2info(inode)->lli_clob;
1214         io->ci_lockreq = CILR_MAYBE;
1215         if (ll_file_nolock(file)) {
1216                 io->ci_lockreq = CILR_NEVER;
1217                 io->ci_no_srvlock = 1;
1218         } else if (file->f_flags & O_APPEND) {
1219                 io->ci_lockreq = CILR_MANDATORY;
1220         }
1221         io->ci_noatime = file_is_noatime(file);
1222         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1223                 io->ci_pio = !io->u.ci_rw.rw_append;
1224         else
1225                 io->ci_pio = 0;
1226
1227         /* FLR: only use non-delay I/O for read as there is only one
1228          * avaliable mirror for write. */
1229         io->ci_ndelay = !(iot == CIT_WRITE);
1230
1231         ll_io_set_mirror(io, file);
1232 }
1233
1234 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1235 {
1236         struct cl_io_pt *pt = ptask->pt_cbdata;
1237         struct file *file = pt->cip_file;
1238         struct lu_env *env;
1239         struct cl_io *io;
1240         loff_t pos = pt->cip_pos;
1241         int rc;
1242         __u16 refcheck;
1243         ENTRY;
1244
1245         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1246                 file_dentry(file)->d_name.name,
1247                 pt->cip_iot == CIT_READ ? "read" : "write",
1248                 pos, pos + pt->cip_count);
1249
1250         env = cl_env_get(&refcheck);
1251         if (IS_ERR(env))
1252                 RETURN(PTR_ERR(env));
1253
1254         io = vvp_env_thread_io(env);
1255         ll_io_init(io, file, pt->cip_iot);
1256         io->u.ci_rw.rw_iter = pt->cip_iter;
1257         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1258         io->ci_pio = 0; /* It's already in parallel task */
1259
1260         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1261                            pt->cip_count - pt->cip_result);
1262         if (!rc) {
1263                 struct vvp_io *vio = vvp_env_io(env);
1264
1265                 vio->vui_io_subtype = IO_NORMAL;
1266                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1267
1268                 ll_cl_add(file, env, io, LCC_RW);
1269                 rc = cl_io_loop(env, io);
1270                 ll_cl_remove(file, env);
1271         } else {
1272                 /* cl_io_rw_init() handled IO */
1273                 rc = io->ci_result;
1274         }
1275
1276         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1277                 if (io->ci_nob > 0)
1278                         io->ci_nob /= 2;
1279                 rc = -EIO;
1280         }
1281
1282         if (io->ci_nob > 0) {
1283                 pt->cip_result += io->ci_nob;
1284                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1285                 pos += io->ci_nob;
1286                 pt->cip_iocb.ki_pos = pos;
1287 #ifdef HAVE_KIOCB_KI_LEFT
1288                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1289 #elif defined(HAVE_KI_NBYTES)
1290                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1291 #endif
1292         }
1293
1294         cl_io_fini(env, io);
1295         cl_env_put(env, &refcheck);
1296
1297         pt->cip_need_restart = io->ci_need_restart;
1298
1299         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1300                 file_dentry(file)->d_name.name,
1301                 pt->cip_iot == CIT_READ ? "read" : "write",
1302                 pt->cip_result, rc);
1303
1304         RETURN(pt->cip_result > 0 ? 0 : rc);
1305 }
1306
1307 static ssize_t
1308 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1309                    struct file *file, enum cl_io_type iot,
1310                    loff_t *ppos, size_t count)
1311 {
1312         struct range_lock       range;
1313         struct vvp_io           *vio = vvp_env_io(env);
1314         struct inode            *inode = file_inode(file);
1315         struct ll_inode_info    *lli = ll_i2info(inode);
1316         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1317         struct cl_io            *io;
1318         loff_t                  pos = *ppos;
1319         ssize_t                 result = 0;
1320         int                     rc = 0;
1321         unsigned                retried = 0;
1322         bool                    restarted = false;
1323
1324         ENTRY;
1325
1326         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1327                 file_dentry(file)->d_name.name,
1328                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1329
1330 restart:
1331         io = vvp_env_thread_io(env);
1332         ll_io_init(io, file, iot);
1333         if (args->via_io_subtype == IO_NORMAL) {
1334                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1335                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1336         }
1337         if (args->via_io_subtype != IO_NORMAL || restarted)
1338                 io->ci_pio = 0;
1339         io->ci_ndelay_tried = retried;
1340
1341         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1342                 bool range_locked = false;
1343
1344                 if (file->f_flags & O_APPEND)
1345                         range_lock_init(&range, 0, LUSTRE_EOF);
1346                 else
1347                         range_lock_init(&range, pos, pos + count - 1);
1348
1349                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1350                 vio->vui_io_subtype = args->via_io_subtype;
1351
1352                 switch (vio->vui_io_subtype) {
1353                 case IO_NORMAL:
1354                         /* Direct IO reads must also take range lock,
1355                          * or multiple reads will try to work on the same pages
1356                          * See LU-6227 for details. */
1357                         if (((iot == CIT_WRITE) ||
1358                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1359                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1360                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1361                                        RL_PARA(&range));
1362                                 rc = range_lock(&lli->lli_write_tree, &range);
1363                                 if (rc < 0)
1364                                         GOTO(out, rc);
1365
1366                                 range_locked = true;
1367                         }
1368                         break;
1369                 case IO_SPLICE:
1370                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1371                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1372                         break;
1373                 default:
1374                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1375                         LBUG();
1376                 }
1377
1378                 ll_cl_add(file, env, io, LCC_RW);
1379                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1380                     !lli->lli_inode_locked) {
1381                         inode_lock(inode);
1382                         lli->lli_inode_locked = 1;
1383                 }
1384                 rc = cl_io_loop(env, io);
1385                 if (lli->lli_inode_locked) {
1386                         lli->lli_inode_locked = 0;
1387                         inode_unlock(inode);
1388                 }
1389                 ll_cl_remove(file, env);
1390
1391                 if (range_locked) {
1392                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1393                                RL_PARA(&range));
1394                         range_unlock(&lli->lli_write_tree, &range);
1395                 }
1396         } else {
1397                 /* cl_io_rw_init() handled IO */
1398                 rc = io->ci_result;
1399         }
1400
1401         if (io->ci_nob > 0) {
1402                 result += io->ci_nob;
1403                 count  -= io->ci_nob;
1404
1405                 if (args->via_io_subtype == IO_NORMAL) {
1406                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1407                         pos += io->ci_nob;
1408                         args->u.normal.via_iocb->ki_pos = pos;
1409 #ifdef HAVE_KIOCB_KI_LEFT
1410                         args->u.normal.via_iocb->ki_left = count;
1411 #elif defined(HAVE_KI_NBYTES)
1412                         args->u.normal.via_iocb->ki_nbytes = count;
1413 #endif
1414                 } else {
1415                         /* for splice */
1416                         pos = io->u.ci_rw.rw_range.cir_pos;
1417                 }
1418         }
1419 out:
1420         cl_io_fini(env, io);
1421
1422         CDEBUG(D_VFSTRACE,
1423                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1424                file->f_path.dentry->d_name.name,
1425                iot, rc, result, io->ci_need_restart);
1426
1427         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1428                 CDEBUG(D_VFSTRACE,
1429                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1430                         file_dentry(file)->d_name.name,
1431                         iot == CIT_READ ? "read" : "write",
1432                         pos, pos + count, result, rc);
1433                 /* preserve the tried count for FLR */
1434                 retried = io->ci_ndelay_tried;
1435                 restarted = true;
1436                 goto restart;
1437         }
1438
1439         if (iot == CIT_READ) {
1440                 if (result > 0)
1441                         ll_stats_ops_tally(ll_i2sbi(inode),
1442                                            LPROC_LL_READ_BYTES, result);
1443         } else if (iot == CIT_WRITE) {
1444                 if (result > 0) {
1445                         ll_stats_ops_tally(ll_i2sbi(inode),
1446                                            LPROC_LL_WRITE_BYTES, result);
1447                         fd->fd_write_failed = false;
1448                 } else if (result == 0 && rc == 0) {
1449                         rc = io->ci_result;
1450                         if (rc < 0)
1451                                 fd->fd_write_failed = true;
1452                         else
1453                                 fd->fd_write_failed = false;
1454                 } else if (rc != -ERESTARTSYS) {
1455                         fd->fd_write_failed = true;
1456                 }
1457         }
1458
1459         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1460                 file_dentry(file)->d_name.name,
1461                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1462
1463         *ppos = pos;
1464
1465         RETURN(result > 0 ? result : rc);
1466 }
1467
1468 /**
1469  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1470  * especially for small I/O.
1471  *
1472  * To serve a read request, CLIO has to create and initialize a cl_io and
1473  * then request DLM lock. This has turned out to have siginificant overhead
1474  * and affects the performance of small I/O dramatically.
1475  *
1476  * It's not necessary to create a cl_io for each I/O. Under the help of read
1477  * ahead, most of the pages being read are already in memory cache and we can
1478  * read those pages directly because if the pages exist, the corresponding DLM
1479  * lock must exist so that page content must be valid.
1480  *
1481  * In fast read implementation, the llite speculatively finds and reads pages
1482  * in memory cache. There are three scenarios for fast read:
1483  *   - If the page exists and is uptodate, kernel VM will provide the data and
1484  *     CLIO won't be intervened;
1485  *   - If the page was brought into memory by read ahead, it will be exported
1486  *     and read ahead parameters will be updated;
1487  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1488  *     it will go back and invoke normal read, i.e., a cl_io will be created
1489  *     and DLM lock will be requested.
1490  *
1491  * POSIX compliance: posix standard states that read is intended to be atomic.
1492  * Lustre read implementation is in line with Linux kernel read implementation
1493  * and neither of them complies with POSIX standard in this matter. Fast read
1494  * doesn't make the situation worse on single node but it may interleave write
1495  * results from multiple nodes due to short read handling in ll_file_aio_read().
1496  *
1497  * \param env - lu_env
1498  * \param iocb - kiocb from kernel
1499  * \param iter - user space buffers where the data will be copied
1500  *
1501  * \retval - number of bytes have been read, or error code if error occurred.
1502  */
1503 static ssize_t
1504 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1505 {
1506         ssize_t result;
1507
1508         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1509                 return 0;
1510
1511         /* NB: we can't do direct IO for fast read because it will need a lock
1512          * to make IO engine happy. */
1513         if (iocb->ki_filp->f_flags & O_DIRECT)
1514                 return 0;
1515
1516         result = generic_file_read_iter(iocb, iter);
1517
1518         /* If the first page is not in cache, generic_file_aio_read() will be
1519          * returned with -ENODATA.
1520          * See corresponding code in ll_readpage(). */
1521         if (result == -ENODATA)
1522                 result = 0;
1523
1524         if (result > 0)
1525                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1526                                 LPROC_LL_READ_BYTES, result);
1527
1528         return result;
1529 }
1530
1531 /*
1532  * Read from a file (through the page cache).
1533  */
1534 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1535 {
1536         struct lu_env *env;
1537         struct vvp_io_args *args;
1538         ssize_t result;
1539         ssize_t rc2;
1540         __u16 refcheck;
1541
1542         result = ll_do_fast_read(iocb, to);
1543         if (result < 0 || iov_iter_count(to) == 0)
1544                 GOTO(out, result);
1545
1546         env = cl_env_get(&refcheck);
1547         if (IS_ERR(env))
1548                 return PTR_ERR(env);
1549
1550         args = ll_env_args(env, IO_NORMAL);
1551         args->u.normal.via_iter = to;
1552         args->u.normal.via_iocb = iocb;
1553
1554         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1555                                  &iocb->ki_pos, iov_iter_count(to));
1556         if (rc2 > 0)
1557                 result += rc2;
1558         else if (result == 0)
1559                 result = rc2;
1560
1561         cl_env_put(env, &refcheck);
1562 out:
1563         return result;
1564 }
1565
1566 /**
1567  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1568  * If a page is already in the page cache and dirty (and some other things -
1569  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1570  * write to it without doing a full I/O, because Lustre already knows about it
1571  * and will write it out.  This saves a lot of processing time.
1572  *
1573  * All writes here are within one page, so exclusion is handled by the page
1574  * lock on the vm page.  Exception is appending, which requires locking the
1575  * full file to handle size issues.  We do not do tiny writes for writes which
1576  * touch multiple pages because it's very unlikely multiple sequential pages
1577  * are already dirty.
1578  *
1579  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1580  * and are unlikely to be to already dirty pages.
1581  *
1582  * Attribute updates are important here, we do it in ll_tiny_write_end.
1583  */
1584 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1585 {
1586         ssize_t count = iov_iter_count(iter);
1587         struct file *file = iocb->ki_filp;
1588         struct inode *inode = file_inode(file);
1589         struct ll_inode_info *lli = ll_i2info(inode);
1590         struct range_lock range;
1591         ssize_t result = 0;
1592         bool append = false;
1593
1594         ENTRY;
1595
1596         /* NB: we can't do direct IO for tiny writes because they use the page
1597          * cache, and we can't do sync writes because tiny writes can't flush
1598          * pages.
1599          */
1600         if (file->f_flags & (O_DIRECT | O_SYNC))
1601                 RETURN(0);
1602
1603         /* It is relatively unlikely we will overwrite a full dirty page, so
1604          * limit tiny writes to < PAGE_SIZE
1605          */
1606         if (count >= PAGE_SIZE)
1607                 RETURN(0);
1608
1609         /* For append writes, we must take the range lock to protect size
1610          * and also move pos to current size before writing.
1611          */
1612         if (file->f_flags & O_APPEND) {
1613                 struct lu_env *env;
1614                 __u16 refcheck;
1615
1616                 append = true;
1617                 range_lock_init(&range, 0, LUSTRE_EOF);
1618                 result = range_lock(&lli->lli_write_tree, &range);
1619                 if (result)
1620                         RETURN(result);
1621                 env = cl_env_get(&refcheck);
1622                 if (IS_ERR(env))
1623                         GOTO(out, result = PTR_ERR(env));
1624                 ll_merge_attr(env, inode);
1625                 cl_env_put(env, &refcheck);
1626                 iocb->ki_pos = i_size_read(inode);
1627         }
1628
1629         /* Does this write touch multiple pages?
1630          *
1631          * This partly duplicates the PAGE_SIZE check above, but must come
1632          * after range locking for append writes because it depends on the
1633          * write position (ki_pos).
1634          */
1635         if ((iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1636                 goto out;
1637
1638         result = __generic_file_write_iter(iocb, iter);
1639
1640         /* If the page is not already dirty, ll_tiny_write_begin returns
1641          * -ENODATA.  We continue on to normal write.
1642          */
1643         if (result == -ENODATA)
1644                 result = 0;
1645
1646         if (result > 0) {
1647                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1648                                    result);
1649                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1650         }
1651
1652 out:
1653         if (append)
1654                 range_unlock(&lli->lli_write_tree, &range);
1655
1656         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1657
1658         RETURN(result);
1659 }
1660
1661 /*
1662  * Write to a file (through the page cache).
1663  */
1664 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1665 {
1666         struct vvp_io_args *args;
1667         struct lu_env *env;
1668         ssize_t rc_tiny, rc_normal;
1669         __u16 refcheck;
1670
1671         ENTRY;
1672
1673         rc_tiny = ll_do_tiny_write(iocb, from);
1674
1675         /* In case of error, go on and try normal write - Only stop if tiny
1676          * write completed I/O.
1677          */
1678         if (iov_iter_count(from) == 0)
1679                 GOTO(out, rc_normal = rc_tiny);
1680
1681         env = cl_env_get(&refcheck);
1682         if (IS_ERR(env))
1683                 return PTR_ERR(env);
1684
1685         args = ll_env_args(env, IO_NORMAL);
1686         args->u.normal.via_iter = from;
1687         args->u.normal.via_iocb = iocb;
1688
1689         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1690                                     &iocb->ki_pos, iov_iter_count(from));
1691
1692         /* On success, combine bytes written. */
1693         if (rc_tiny >= 0 && rc_normal > 0)
1694                 rc_normal += rc_tiny;
1695         /* On error, only return error from normal write if tiny write did not
1696          * write any bytes.  Otherwise return bytes written by tiny write.
1697          */
1698         else if (rc_tiny > 0)
1699                 rc_normal = rc_tiny;
1700
1701         cl_env_put(env, &refcheck);
1702 out:
1703         RETURN(rc_normal);
1704 }
1705
1706 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1707 /*
1708  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1709  */
1710 static int ll_file_get_iov_count(const struct iovec *iov,
1711                                  unsigned long *nr_segs, size_t *count)
1712 {
1713         size_t cnt = 0;
1714         unsigned long seg;
1715
1716         for (seg = 0; seg < *nr_segs; seg++) {
1717                 const struct iovec *iv = &iov[seg];
1718
1719                 /*
1720                  * If any segment has a negative length, or the cumulative
1721                  * length ever wraps negative then return -EINVAL.
1722                  */
1723                 cnt += iv->iov_len;
1724                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1725                         return -EINVAL;
1726                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1727                         continue;
1728                 if (seg == 0)
1729                         return -EFAULT;
1730                 *nr_segs = seg;
1731                 cnt -= iv->iov_len;     /* This segment is no good */
1732                 break;
1733         }
1734         *count = cnt;
1735         return 0;
1736 }
1737
1738 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1739                                 unsigned long nr_segs, loff_t pos)
1740 {
1741         struct iov_iter to;
1742         size_t iov_count;
1743         ssize_t result;
1744         ENTRY;
1745
1746         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1747         if (result)
1748                 RETURN(result);
1749
1750 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1751         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1752 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1753         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1754 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1755
1756         result = ll_file_read_iter(iocb, &to);
1757
1758         RETURN(result);
1759 }
1760
1761 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1762                             loff_t *ppos)
1763 {
1764         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1765         struct kiocb   kiocb;
1766         ssize_t        result;
1767         ENTRY;
1768
1769         init_sync_kiocb(&kiocb, file);
1770         kiocb.ki_pos = *ppos;
1771 #ifdef HAVE_KIOCB_KI_LEFT
1772         kiocb.ki_left = count;
1773 #elif defined(HAVE_KI_NBYTES)
1774         kiocb.i_nbytes = count;
1775 #endif
1776
1777         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1778         *ppos = kiocb.ki_pos;
1779
1780         RETURN(result);
1781 }
1782
1783 /*
1784  * Write to a file (through the page cache).
1785  * AIO stuff
1786  */
1787 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1788                                  unsigned long nr_segs, loff_t pos)
1789 {
1790         struct iov_iter from;
1791         size_t iov_count;
1792         ssize_t result;
1793         ENTRY;
1794
1795         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1796         if (result)
1797                 RETURN(result);
1798
1799 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1800         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1801 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1802         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1803 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1804
1805         result = ll_file_write_iter(iocb, &from);
1806
1807         RETURN(result);
1808 }
1809
1810 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1811                              size_t count, loff_t *ppos)
1812 {
1813         struct iovec   iov = { .iov_base = (void __user *)buf,
1814                                .iov_len = count };
1815         struct kiocb   kiocb;
1816         ssize_t        result;
1817
1818         ENTRY;
1819
1820         init_sync_kiocb(&kiocb, file);
1821         kiocb.ki_pos = *ppos;
1822 #ifdef HAVE_KIOCB_KI_LEFT
1823         kiocb.ki_left = count;
1824 #elif defined(HAVE_KI_NBYTES)
1825         kiocb.ki_nbytes = count;
1826 #endif
1827
1828         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1829         *ppos = kiocb.ki_pos;
1830
1831         RETURN(result);
1832 }
1833 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1834
1835 /*
1836  * Send file content (through pagecache) somewhere with helper
1837  */
1838 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1839                                    struct pipe_inode_info *pipe, size_t count,
1840                                    unsigned int flags)
1841 {
1842         struct lu_env      *env;
1843         struct vvp_io_args *args;
1844         ssize_t             result;
1845         __u16               refcheck;
1846         ENTRY;
1847
1848         env = cl_env_get(&refcheck);
1849         if (IS_ERR(env))
1850                 RETURN(PTR_ERR(env));
1851
1852         args = ll_env_args(env, IO_SPLICE);
1853         args->u.splice.via_pipe = pipe;
1854         args->u.splice.via_flags = flags;
1855
1856         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1857         cl_env_put(env, &refcheck);
1858         RETURN(result);
1859 }
1860
1861 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1862                              __u64 flags, struct lov_user_md *lum, int lum_size)
1863 {
1864         struct lookup_intent oit = {
1865                 .it_op = IT_OPEN,
1866                 .it_flags = flags | MDS_OPEN_BY_FID,
1867         };
1868         int rc;
1869         ENTRY;
1870
1871         ll_inode_size_lock(inode);
1872         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1873         if (rc < 0)
1874                 GOTO(out_unlock, rc);
1875
1876         ll_release_openhandle(dentry, &oit);
1877
1878 out_unlock:
1879         ll_inode_size_unlock(inode);
1880         ll_intent_release(&oit);
1881
1882         RETURN(rc);
1883 }
1884
/*
 * Fetch the LOV EA of \a filename, looked up by name under \a inode, with
 * md_getattr_name().
 *
 * \param inode     inode passed to ll_prep_md_op_data() for the lookup
 * \param filename  name to look up
 * \param lmmp      [out] EA inside the reply buffer, NULL when none was
 *                  retrieved
 * \param lmm_size  [out] EA size as reported in the reply body (the default
 *                  MD size if the request itself failed)
 * \param request   [out] the request holding the reply; \a lmmp points into
 *                  it.  NOTE(review): presumably the caller has to release
 *                  it - confirm against the callers.
 *
 * The three output parameters are written on every path that reaches the
 * "out" label, including error paths.
 *
 * \retval 0         success
 * \retval -ENODATA  the reply carries no EA
 * \retval -EPROTO   the EA magic is not V1, V3 or COMP_V1
 * \retval <0        other failure
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body  *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct md_op_data *op_data;
        int rc, lmmsize;

        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc)
                RETURN(rc);

        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
                                     strlen(filename), lmmsize,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
        ll_finish_md_op_data(op_data);
        if (rc < 0) {
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);
                GOTO(out, rc);
        }

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */

        /* from here on lmmsize is the EA size reported by the reply */
        lmmsize = body->mbo_eadatasize;

        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
                        lmmsize == 0) {
                GOTO(out, rc = -ENODATA);
        }

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);

        if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
                GOTO(out, rc = -EPROTO);

        /*
         * This is coming from the MDS, so is probably in
         * little endian.  We convert it to host endian before
         * passing it to userspace.
         */
        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
                /* only assigned, and only used, for V1/V3 layouts */
                int stripe_count;

                /* The stripe count has to be read before the header is
                 * swabbed in place below. */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
                    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
                        if (le32_to_cpu(lmm->lmm_pattern) &
                            LOV_PATTERN_F_RELEASED)
                                stripe_count = 0;
                }

                /* if function called for directory - we should
                 * avoid swab not existent lsm objects */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
                        lustre_swab_lov_user_md_v1(
                                        (struct lov_user_md_v1 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                                    stripe_count);
                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        lustre_swab_lov_user_md_v3(
                                        (struct lov_user_md_v3 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                    ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                                    stripe_count);
                } else if (lmm->lmm_magic ==
                           cpu_to_le32(LOV_MAGIC_COMP_V1)) {
                        lustre_swab_lov_comp_md_v1(
                                        (struct lov_comp_md_v1 *)lmm);
                }
        }

out:
        /* written on every path that gets here; lmm and req may be NULL */
        *lmmp = lmm;
        *lmm_size = lmmsize;
        *request = req;
        return rc;
}
1978
1979 static int ll_lov_setea(struct inode *inode, struct file *file,
1980                         void __user *arg)
1981 {
1982         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1983         struct lov_user_md      *lump;
1984         int                      lum_size = sizeof(struct lov_user_md) +
1985                                             sizeof(struct lov_user_ost_data);
1986         int                      rc;
1987         ENTRY;
1988
1989         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1990                 RETURN(-EPERM);
1991
1992         OBD_ALLOC_LARGE(lump, lum_size);
1993         if (lump == NULL)
1994                 RETURN(-ENOMEM);
1995
1996         if (copy_from_user(lump, arg, lum_size))
1997                 GOTO(out_lump, rc = -EFAULT);
1998
1999         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2000                                       lum_size);
2001         cl_lov_delay_create_clear(&file->f_flags);
2002
2003 out_lump:
2004         OBD_FREE_LARGE(lump, lum_size);
2005         RETURN(rc);
2006 }
2007
2008 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2009 {
2010         struct lu_env   *env;
2011         __u16           refcheck;
2012         int             rc;
2013         ENTRY;
2014
2015         env = cl_env_get(&refcheck);
2016         if (IS_ERR(env))
2017                 RETURN(PTR_ERR(env));
2018
2019         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2020         cl_env_put(env, &refcheck);
2021         RETURN(rc);
2022 }
2023
2024 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2025                             void __user *arg)
2026 {
2027         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2028         struct lov_user_md        *klum;
2029         int                        lum_size, rc;
2030         __u64                      flags = FMODE_WRITE;
2031         ENTRY;
2032
2033         rc = ll_copy_user_md(lum, &klum);
2034         if (rc < 0)
2035                 RETURN(rc);
2036
2037         lum_size = rc;
2038         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2039                                       lum_size);
2040         if (!rc) {
2041                 __u32 gen;
2042
2043                 rc = put_user(0, &lum->lmm_stripe_count);
2044                 if (rc)
2045                         GOTO(out, rc);
2046
2047                 rc = ll_layout_refresh(inode, &gen);
2048                 if (rc)
2049                         GOTO(out, rc);
2050
2051                 rc = ll_file_getstripe(inode, arg, lum_size);
2052         }
2053         cl_lov_delay_create_clear(&file->f_flags);
2054
2055 out:
2056         OBD_FREE(klum, lum_size);
2057         RETURN(rc);
2058 }
2059
2060 static int
2061 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2062 {
2063         struct ll_inode_info *lli = ll_i2info(inode);
2064         struct cl_object *obj = lli->lli_clob;
2065         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2066         struct ll_grouplock grouplock;
2067         int rc;
2068         ENTRY;
2069
2070         if (arg == 0) {
2071                 CWARN("group id for group lock must not be 0\n");
2072                 RETURN(-EINVAL);
2073         }
2074
2075         if (ll_file_nolock(file))
2076                 RETURN(-EOPNOTSUPP);
2077
2078         spin_lock(&lli->lli_lock);
2079         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2080                 CWARN("group lock already existed with gid %lu\n",
2081                       fd->fd_grouplock.lg_gid);
2082                 spin_unlock(&lli->lli_lock);
2083                 RETURN(-EINVAL);
2084         }
2085         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2086         spin_unlock(&lli->lli_lock);
2087
2088         /**
2089          * XXX: group lock needs to protect all OST objects while PFL
2090          * can add new OST objects during the IO, so we'd instantiate
2091          * all OST objects before getting its group lock.
2092          */
2093         if (obj) {
2094                 struct lu_env *env;
2095                 __u16 refcheck;
2096                 struct cl_layout cl = {
2097                         .cl_is_composite = false,
2098                 };
2099                 struct lu_extent ext = {
2100                         .e_start = 0,
2101                         .e_end = OBD_OBJECT_EOF,
2102                 };
2103
2104                 env = cl_env_get(&refcheck);
2105                 if (IS_ERR(env))
2106                         RETURN(PTR_ERR(env));
2107
2108                 rc = cl_object_layout_get(env, obj, &cl);
2109                 if (!rc && cl.cl_is_composite)
2110                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2111                                                     &ext);
2112
2113                 cl_env_put(env, &refcheck);
2114                 if (rc)
2115                         RETURN(rc);
2116         }
2117
2118         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2119                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2120         if (rc)
2121                 RETURN(rc);
2122
2123         spin_lock(&lli->lli_lock);
2124         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2125                 spin_unlock(&lli->lli_lock);
2126                 CERROR("another thread just won the race\n");
2127                 cl_put_grouplock(&grouplock);
2128                 RETURN(-EINVAL);
2129         }
2130
2131         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2132         fd->fd_grouplock = grouplock;
2133         spin_unlock(&lli->lli_lock);
2134
2135         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2136         RETURN(0);
2137 }
2138
2139 static int ll_put_grouplock(struct inode *inode, struct file *file,
2140                             unsigned long arg)
2141 {
2142         struct ll_inode_info   *lli = ll_i2info(inode);
2143         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2144         struct ll_grouplock     grouplock;
2145         ENTRY;
2146
2147         spin_lock(&lli->lli_lock);
2148         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2149                 spin_unlock(&lli->lli_lock);
2150                 CWARN("no group lock held\n");
2151                 RETURN(-EINVAL);
2152         }
2153
2154         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2155
2156         if (fd->fd_grouplock.lg_gid != arg) {
2157                 CWARN("group lock %lu doesn't match current id %lu\n",
2158                       arg, fd->fd_grouplock.lg_gid);
2159                 spin_unlock(&lli->lli_lock);
2160                 RETURN(-EINVAL);
2161         }
2162
2163         grouplock = fd->fd_grouplock;
2164         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2165         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2166         spin_unlock(&lli->lli_lock);
2167
2168         cl_put_grouplock(&grouplock);
2169         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2170         RETURN(0);
2171 }
2172
2173 /**
2174  * Close inode open handle
2175  *
2176  * \param dentry [in]     dentry which contains the inode
2177  * \param it     [in,out] intent which contains open info and result
2178  *
2179  * \retval 0     success
2180  * \retval <0    failure
2181  */
2182 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2183 {
2184         struct inode *inode = dentry->d_inode;
2185         struct obd_client_handle *och;
2186         int rc;
2187         ENTRY;
2188
2189         LASSERT(inode);
2190
2191         /* Root ? Do nothing. */
2192         if (dentry->d_inode->i_sb->s_root == dentry)
2193                 RETURN(0);
2194
2195         /* No open handle to close? Move away */
2196         if (!it_disposition(it, DISP_OPEN_OPEN))
2197                 RETURN(0);
2198
2199         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2200
2201         OBD_ALLOC(och, sizeof(*och));
2202         if (!och)
2203                 GOTO(out, rc = -ENOMEM);
2204
2205         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2206
2207         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2208 out:
2209         /* this one is in place of ll_file_open */
2210         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2211                 ptlrpc_req_finished(it->it_request);
2212                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2213         }
2214         RETURN(rc);
2215 }
2216
2217 /**
2218  * Get size for inode for which FIEMAP mapping is requested.
2219  * Make the FIEMAP get_info call and returns the result.
2220  * \param fiemap        kernel buffer to hold extens
2221  * \param num_bytes     kernel buffer size
2222  */
2223 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2224                         size_t num_bytes)
2225 {
2226         struct lu_env                   *env;
2227         __u16                           refcheck;
2228         int                             rc = 0;
2229         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2230         ENTRY;
2231
2232         /* Checks for fiemap flags */
2233         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2234                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2235                 return -EBADR;
2236         }
2237
2238         /* Check for FIEMAP_FLAG_SYNC */
2239         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2240                 rc = filemap_fdatawrite(inode->i_mapping);
2241                 if (rc)
2242                         return rc;
2243         }
2244
2245         env = cl_env_get(&refcheck);
2246         if (IS_ERR(env))
2247                 RETURN(PTR_ERR(env));
2248
2249         if (i_size_read(inode) == 0) {
2250                 rc = ll_glimpse_size(inode);
2251                 if (rc)
2252                         GOTO(out, rc);
2253         }
2254
2255         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2256         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2257         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2258
2259         /* If filesize is 0, then there would be no objects for mapping */
2260         if (fmkey.lfik_oa.o_size == 0) {
2261                 fiemap->fm_mapped_extents = 0;
2262                 GOTO(out, rc = 0);
2263         }
2264
2265         fmkey.lfik_fiemap = *fiemap;
2266
2267         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2268                               &fmkey, fiemap, &num_bytes);
2269 out:
2270         cl_env_put(env, &refcheck);
2271         RETURN(rc);
2272 }
2273
2274 int ll_fid2path(struct inode *inode, void __user *arg)
2275 {
2276         struct obd_export       *exp = ll_i2mdexp(inode);
2277         const struct getinfo_fid2path __user *gfin = arg;
2278         __u32                    pathlen;
2279         struct getinfo_fid2path *gfout;
2280         size_t                   outsize;
2281         int                      rc;
2282
2283         ENTRY;
2284
2285         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2286             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2287                 RETURN(-EPERM);
2288
2289         /* Only need to get the buflen */
2290         if (get_user(pathlen, &gfin->gf_pathlen))
2291                 RETURN(-EFAULT);
2292
2293         if (pathlen > PATH_MAX)
2294                 RETURN(-EINVAL);
2295
2296         outsize = sizeof(*gfout) + pathlen;
2297         OBD_ALLOC(gfout, outsize);
2298         if (gfout == NULL)
2299                 RETURN(-ENOMEM);
2300
2301         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2302                 GOTO(gf_free, rc = -EFAULT);
2303         /* append root FID after gfout to let MDT know the root FID so that it
2304          * can lookup the correct path, this is mainly for fileset.
2305          * old server without fileset mount support will ignore this. */
2306         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2307
2308         /* Call mdc_iocontrol */
2309         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2310         if (rc != 0)
2311                 GOTO(gf_free, rc);
2312
2313         if (copy_to_user(arg, gfout, outsize))
2314                 rc = -EFAULT;
2315
2316 gf_free:
2317         OBD_FREE(gfout, outsize);
2318         RETURN(rc);
2319 }
2320
2321 static int
2322 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2323 {
2324         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2325         struct lu_env *env;
2326         struct cl_io *io;
2327         __u16  refcheck;
2328         int result;
2329
2330         ENTRY;
2331
2332         ioc->idv_version = 0;
2333         ioc->idv_layout_version = UINT_MAX;
2334
2335         /* If no file object initialized, we consider its version is 0. */
2336         if (obj == NULL)
2337                 RETURN(0);
2338
2339         env = cl_env_get(&refcheck);
2340         if (IS_ERR(env))
2341                 RETURN(PTR_ERR(env));
2342
2343         io = vvp_env_thread_io(env);
2344         io->ci_obj = obj;
2345         io->u.ci_data_version.dv_data_version = 0;
2346         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2347         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2348
2349 restart:
2350         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2351                 result = cl_io_loop(env, io);
2352         else
2353                 result = io->ci_result;
2354
2355         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2356         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2357
2358         cl_io_fini(env, io);
2359
2360         if (unlikely(io->ci_need_restart))
2361                 goto restart;
2362
2363         cl_env_put(env, &refcheck);
2364
2365         RETURN(result);
2366 }
2367
2368 /*
2369  * Read the data_version for inode.
2370  *
2371  * This value is computed using stripe object version on OST.
2372  * Version is computed using server side locking.
2373  *
2374  * @param flags if do sync on the OST side;
2375  *              0: no sync
2376  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2377  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2378  */
2379 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2380 {
2381         struct ioc_data_version ioc = { .idv_flags = flags };
2382         int rc;
2383
2384         rc = ll_ioc_data_version(inode, &ioc);
2385         if (!rc)
2386                 *data_version = ioc.idv_version;
2387
2388         return rc;
2389 }
2390
2391 /*
2392  * Trigger a HSM release request for the provided inode.
2393  */
2394 int ll_hsm_release(struct inode *inode)
2395 {
2396         struct lu_env *env;
2397         struct obd_client_handle *och = NULL;
2398         __u64 data_version = 0;
2399         int rc;
2400         __u16 refcheck;
2401         ENTRY;
2402
2403         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2404                ll_get_fsname(inode->i_sb, NULL, 0),
2405                PFID(&ll_i2info(inode)->lli_fid));
2406
2407         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2408         if (IS_ERR(och))
2409                 GOTO(out, rc = PTR_ERR(och));
2410
2411         /* Grab latest data_version and [am]time values */
2412         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2413         if (rc != 0)
2414                 GOTO(out, rc);
2415
2416         env = cl_env_get(&refcheck);
2417         if (IS_ERR(env))
2418                 GOTO(out, rc = PTR_ERR(env));
2419
2420         rc = ll_merge_attr(env, inode);
2421         cl_env_put(env, &refcheck);
2422
2423         /* If error happen, we have the wrong size for a file.
2424          * Don't release it.
2425          */
2426         if (rc != 0)
2427                 GOTO(out, rc);
2428
2429         /* Release the file.
2430          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2431          * we still need it to pack l_remote_handle to MDT. */
2432         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2433                                        &data_version);
2434         och = NULL;
2435
2436         EXIT;
2437 out:
2438         if (och != NULL && !IS_ERR(och)) /* close the file */
2439                 ll_lease_close(och, inode, NULL);
2440
2441         return rc;
2442 }
2443
2444 struct ll_swap_stack {
2445         __u64                    dv1;
2446         __u64                    dv2;
2447         struct inode            *inode1;
2448         struct inode            *inode2;
2449         bool                     check_dv1;
2450         bool                     check_dv2;
2451 };
2452
2453 static int ll_swap_layouts(struct file *file1, struct file *file2,
2454                            struct lustre_swap_layouts *lsl)
2455 {
2456         struct mdc_swap_layouts  msl;
2457         struct md_op_data       *op_data;
2458         __u32                    gid;
2459         __u64                    dv;
2460         struct ll_swap_stack    *llss = NULL;
2461         int                      rc;
2462
2463         OBD_ALLOC_PTR(llss);
2464         if (llss == NULL)
2465                 RETURN(-ENOMEM);
2466
2467         llss->inode1 = file_inode(file1);
2468         llss->inode2 = file_inode(file2);
2469
2470         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2471         if (rc < 0)
2472                 GOTO(free, rc);
2473
2474         /* we use 2 bool because it is easier to swap than 2 bits */
2475         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2476                 llss->check_dv1 = true;
2477
2478         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2479                 llss->check_dv2 = true;
2480
2481         /* we cannot use lsl->sl_dvX directly because we may swap them */
2482         llss->dv1 = lsl->sl_dv1;
2483         llss->dv2 = lsl->sl_dv2;
2484
2485         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2486         if (rc == 0) /* same file, done! */
2487                 GOTO(free, rc);
2488
2489         if (rc < 0) { /* sequentialize it */
2490                 swap(llss->inode1, llss->inode2);
2491                 swap(file1, file2);
2492                 swap(llss->dv1, llss->dv2);
2493                 swap(llss->check_dv1, llss->check_dv2);
2494         }
2495
2496         gid = lsl->sl_gid;
2497         if (gid != 0) { /* application asks to flush dirty cache */
2498                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2499                 if (rc < 0)
2500                         GOTO(free, rc);
2501
2502                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2503                 if (rc < 0) {
2504                         ll_put_grouplock(llss->inode1, file1, gid);
2505                         GOTO(free, rc);
2506                 }
2507         }
2508
2509         /* ultimate check, before swaping the layouts we check if
2510          * dataversion has changed (if requested) */
2511         if (llss->check_dv1) {
2512                 rc = ll_data_version(llss->inode1, &dv, 0);
2513                 if (rc)
2514                         GOTO(putgl, rc);
2515                 if (dv != llss->dv1)
2516                         GOTO(putgl, rc = -EAGAIN);
2517         }
2518
2519         if (llss->check_dv2) {
2520                 rc = ll_data_version(llss->inode2, &dv, 0);
2521                 if (rc)
2522                         GOTO(putgl, rc);
2523                 if (dv != llss->dv2)
2524                         GOTO(putgl, rc = -EAGAIN);
2525         }
2526
2527         /* struct md_op_data is used to send the swap args to the mdt
2528          * only flags is missing, so we use struct mdc_swap_layouts
2529          * through the md_op_data->op_data */
2530         /* flags from user space have to be converted before they are send to
2531          * server, no flag is sent today, they are only used on the client */
2532         msl.msl_flags = 0;
2533         rc = -ENOMEM;
2534         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2535                                      0, LUSTRE_OPC_ANY, &msl);
2536         if (IS_ERR(op_data))
2537                 GOTO(free, rc = PTR_ERR(op_data));
2538
2539         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2540                            sizeof(*op_data), op_data, NULL);
2541         ll_finish_md_op_data(op_data);
2542
2543         if (rc < 0)
2544                 GOTO(putgl, rc);
2545
2546 putgl:
2547         if (gid != 0) {
2548                 ll_put_grouplock(llss->inode2, file2, gid);
2549                 ll_put_grouplock(llss->inode1, file1, gid);
2550         }
2551
2552 free:
2553         if (llss != NULL)
2554                 OBD_FREE_PTR(llss);
2555
2556         RETURN(rc);
2557 }
2558
2559 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2560 {
2561         struct md_op_data       *op_data;
2562         int                      rc;
2563         ENTRY;
2564
2565         /* Detect out-of range masks */
2566         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2567                 RETURN(-EINVAL);
2568
2569         /* Non-root users are forbidden to set or clear flags which are
2570          * NOT defined in HSM_USER_MASK. */
2571         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2572             !cfs_capable(CFS_CAP_SYS_ADMIN))
2573                 RETURN(-EPERM);
2574
2575         /* Detect out-of range archive id */
2576         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2577             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2578                 RETURN(-EINVAL);
2579
2580         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2581                                      LUSTRE_OPC_ANY, hss);
2582         if (IS_ERR(op_data))
2583                 RETURN(PTR_ERR(op_data));
2584
2585         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2586                            sizeof(*op_data), op_data, NULL);
2587
2588         ll_finish_md_op_data(op_data);
2589
2590         RETURN(rc);
2591 }
2592
2593 static int ll_hsm_import(struct inode *inode, struct file *file,
2594                          struct hsm_user_import *hui)
2595 {
2596         struct hsm_state_set    *hss = NULL;
2597         struct iattr            *attr = NULL;
2598         int                      rc;
2599         ENTRY;
2600
2601         if (!S_ISREG(inode->i_mode))
2602                 RETURN(-EINVAL);
2603
2604         /* set HSM flags */
2605         OBD_ALLOC_PTR(hss);
2606         if (hss == NULL)
2607                 GOTO(out, rc = -ENOMEM);
2608
2609         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2610         hss->hss_archive_id = hui->hui_archive_id;
2611         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2612         rc = ll_hsm_state_set(inode, hss);
2613         if (rc != 0)
2614                 GOTO(out, rc);
2615
2616         OBD_ALLOC_PTR(attr);
2617         if (attr == NULL)
2618                 GOTO(out, rc = -ENOMEM);
2619
2620         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2621         attr->ia_mode |= S_IFREG;
2622         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2623         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2624         attr->ia_size = hui->hui_size;
2625         attr->ia_mtime.tv_sec = hui->hui_mtime;
2626         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2627         attr->ia_atime.tv_sec = hui->hui_atime;
2628         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2629
2630         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2631                          ATTR_UID | ATTR_GID |
2632                          ATTR_MTIME | ATTR_MTIME_SET |
2633                          ATTR_ATIME | ATTR_ATIME_SET;
2634
2635         inode_lock(inode);
2636
2637         rc = ll_setattr_raw(file_dentry(file), attr, true);
2638         if (rc == -ENODATA)
2639                 rc = 0;
2640
2641         inode_unlock(inode);
2642
2643 out:
2644         if (hss != NULL)
2645                 OBD_FREE_PTR(hss);
2646
2647         if (attr != NULL)
2648                 OBD_FREE_PTR(attr);
2649
2650         RETURN(rc);
2651 }
2652
2653 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2654 {
2655         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2656                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2657 }
2658
2659 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2660 {
2661         struct inode *inode = file_inode(file);
2662         struct iattr ia = {
2663                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2664                             ATTR_MTIME | ATTR_MTIME_SET |
2665                             ATTR_CTIME | ATTR_CTIME_SET,
2666                 .ia_atime = {
2667                         .tv_sec = lfu->lfu_atime_sec,
2668                         .tv_nsec = lfu->lfu_atime_nsec,
2669                 },
2670                 .ia_mtime = {
2671                         .tv_sec = lfu->lfu_mtime_sec,
2672                         .tv_nsec = lfu->lfu_mtime_nsec,
2673                 },
2674                 .ia_ctime = {
2675                         .tv_sec = lfu->lfu_ctime_sec,
2676                         .tv_nsec = lfu->lfu_ctime_nsec,
2677                 },
2678         };
2679         int rc;
2680         ENTRY;
2681
2682         if (!capable(CAP_SYS_ADMIN))
2683                 RETURN(-EPERM);
2684
2685         if (!S_ISREG(inode->i_mode))
2686                 RETURN(-EINVAL);
2687
2688         inode_lock(inode);
2689         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2690         inode_unlock(inode);
2691
2692         RETURN(rc);
2693 }
2694
2695 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2696 {
2697         switch (mode) {
2698         case MODE_READ_USER:
2699                 return CLM_READ;
2700         case MODE_WRITE_USER:
2701                 return CLM_WRITE;
2702         default:
2703                 return -EINVAL;
2704         }
2705 }
2706
2707 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2708
2709 /* Used to allow the upper layers of the client to request an LDLM lock
2710  * without doing an actual read or write.
2711  *
2712  * Used for ladvise lockahead to manually request specific locks.
2713  *
2714  * \param[in] file      file this ladvise lock request is on
2715  * \param[in] ladvise   ladvise struct describing this lock request
2716  *
2717  * \retval 0            success, no detailed result available (sync requests
2718  *                      and requests sent to the server [not handled locally]
2719  *                      cannot return detailed results)
2720  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2721  *                                       see definitions for details.
2722  * \retval negative     negative errno on error
2723  */
2724 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2725 {
2726         struct lu_env *env = NULL;
2727         struct cl_io *io  = NULL;
2728         struct cl_lock *lock = NULL;
2729         struct cl_lock_descr *descr = NULL;
2730         struct dentry *dentry = file->f_path.dentry;
2731         struct inode *inode = dentry->d_inode;
2732         enum cl_lock_mode cl_mode;
2733         off_t start = ladvise->lla_start;
2734         off_t end = ladvise->lla_end;
2735         int result;
2736         __u16 refcheck;
2737
2738         ENTRY;
2739
2740         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2741                "start=%llu, end=%llu\n", dentry->d_name.len,
2742                dentry->d_name.name, dentry->d_inode,
2743                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2744                (__u64) end);
2745
2746         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2747         if (cl_mode < 0)
2748                 GOTO(out, result = cl_mode);
2749
2750         /* Get IO environment */
2751         result = cl_io_get(inode, &env, &io, &refcheck);
2752         if (result <= 0)
2753                 GOTO(out, result);
2754
2755         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2756         if (result > 0) {
2757                 /*
2758                  * nothing to do for this io. This currently happens when
2759                  * stripe sub-object's are not yet created.
2760                  */
2761                 result = io->ci_result;
2762         } else if (result == 0) {
2763                 lock = vvp_env_lock(env);
2764                 descr = &lock->cll_descr;
2765
2766                 descr->cld_obj   = io->ci_obj;
2767                 /* Convert byte offsets to pages */
2768                 descr->cld_start = cl_index(io->ci_obj, start);
2769                 descr->cld_end   = cl_index(io->ci_obj, end);
2770                 descr->cld_mode  = cl_mode;
2771                 /* CEF_MUST is used because we do not want to convert a
2772                  * lockahead request to a lockless lock */
2773                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2774                                        CEF_NONBLOCK;
2775
2776                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2777                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2778
2779                 result = cl_lock_request(env, io, lock);
2780
2781                 /* On success, we need to release the lock */
2782                 if (result >= 0)
2783                         cl_lock_release(env, lock);
2784         }
2785         cl_io_fini(env, io);
2786         cl_env_put(env, &refcheck);
2787
2788         /* -ECANCELED indicates a matching lock with a different extent
2789          * was already present, and -EEXIST indicates a matching lock
2790          * on exactly the same extent was already present.
2791          * We convert them to positive values for userspace to make
2792          * recognizing true errors easier.
2793          * Note we can only return these detailed results on async requests,
2794          * as sync requests look the same as i/o requests for locking. */
2795         if (result == -ECANCELED)
2796                 result = LLA_RESULT_DIFFERENT;
2797         else if (result == -EEXIST)
2798                 result = LLA_RESULT_SAME;
2799
2800 out:
2801         RETURN(result);
2802 }
2803 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2804
2805 static int ll_ladvise_sanity(struct inode *inode,
2806                              struct llapi_lu_ladvise *ladvise)
2807 {
2808         enum lu_ladvise_type advice = ladvise->lla_advice;
2809         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2810          * be in the first 32 bits of enum ladvise_flags */
2811         __u32 flags = ladvise->lla_peradvice_flags;
2812         /* 3 lines at 80 characters per line, should be plenty */
2813         int rc = 0;
2814
2815         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2816                 rc = -EINVAL;
2817                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2818                        "last supported advice is %s (value '%d'): rc = %d\n",
2819                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2820                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2821                 GOTO(out, rc);
2822         }
2823
2824         /* Per-advice checks */
2825         switch (advice) {
2826         case LU_LADVISE_LOCKNOEXPAND:
2827                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2828                         rc = -EINVAL;
2829                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2830                                "rc = %d\n",
2831                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2832                                ladvise_names[advice], rc);
2833                         GOTO(out, rc);
2834                 }
2835                 break;
2836         case LU_LADVISE_LOCKAHEAD:
2837                 /* Currently only READ and WRITE modes can be requested */
2838                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2839                     ladvise->lla_lockahead_mode == 0) {
2840                         rc = -EINVAL;
2841                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2842                                "rc = %d\n",
2843                                ll_get_fsname(inode->i_sb, NULL, 0),
2844                                ladvise->lla_lockahead_mode,
2845                                ladvise_names[advice], rc);
2846                         GOTO(out, rc);
2847                 }
2848         case LU_LADVISE_WILLREAD:
2849         case LU_LADVISE_DONTNEED:
2850         default:
2851                 /* Note fall through above - These checks apply to all advices
2852                  * except LOCKNOEXPAND */
2853                 if (flags & ~LF_DEFAULT_MASK) {
2854                         rc = -EINVAL;
2855                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2856                                "rc = %d\n",
2857                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2858                                ladvise_names[advice], rc);
2859                         GOTO(out, rc);
2860                 }
2861                 if (ladvise->lla_start >= ladvise->lla_end) {
2862                         rc = -EINVAL;
2863                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2864                                "for %s: rc = %d\n",
2865                                ll_get_fsname(inode->i_sb, NULL, 0),
2866                                ladvise->lla_start, ladvise->lla_end,
2867                                ladvise_names[advice], rc);
2868                         GOTO(out, rc);
2869                 }
2870                 break;
2871         }
2872
2873 out:
2874         return rc;
2875 }
2876 #undef ERRSIZE
2877
2878 /*
2879  * Give file access advices
2880  *
2881  * The ladvise interface is similar to Linux fadvise() system call, except it
2882  * forwards the advices directly from Lustre client to server. The server side
2883  * codes will apply appropriate read-ahead and caching techniques for the
2884  * corresponding files.
2885  *
2886  * A typical workload for ladvise is e.g. a bunch of different clients are
2887  * doing small random reads of a file, so prefetching pages into OSS cache
2888  * with big linear reads before the random IO is a net benefit. Fetching
2889  * all that data into each client cache with fadvise() may not be, due to
2890  * much more data being sent to the client.
2891  */
2892 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2893                       struct llapi_lu_ladvise *ladvise)
2894 {
2895         struct lu_env *env;
2896         struct cl_io *io;
2897         struct cl_ladvise_io *lio;
2898         int rc;
2899         __u16 refcheck;
2900         ENTRY;
2901
2902         env = cl_env_get(&refcheck);
2903         if (IS_ERR(env))
2904                 RETURN(PTR_ERR(env));
2905
2906         io = vvp_env_thread_io(env);
2907         io->ci_obj = ll_i2info(inode)->lli_clob;
2908
2909         /* initialize parameters for ladvise */
2910         lio = &io->u.ci_ladvise;
2911         lio->li_start = ladvise->lla_start;
2912         lio->li_end = ladvise->lla_end;
2913         lio->li_fid = ll_inode2fid(inode);
2914         lio->li_advice = ladvise->lla_advice;
2915         lio->li_flags = flags;
2916
2917         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2918                 rc = cl_io_loop(env, io);
2919         else
2920                 rc = io->ci_result;
2921
2922         cl_io_fini(env, io);
2923         cl_env_put(env, &refcheck);
2924         RETURN(rc);
2925 }
2926
2927 static int ll_lock_noexpand(struct file *file, int flags)
2928 {
2929         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2930
2931         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2932
2933         return 0;
2934 }
2935
2936 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2937                         unsigned long arg)
2938 {
2939         struct fsxattr fsxattr;
2940
2941         if (copy_from_user(&fsxattr,
2942                            (const struct fsxattr __user *)arg,
2943                            sizeof(fsxattr)))
2944                 RETURN(-EFAULT);
2945
2946         fsxattr.fsx_xflags = ll_inode_to_ext_flags(inode->i_flags);
2947         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2948         if (copy_to_user((struct fsxattr __user *)arg,
2949                          &fsxattr, sizeof(fsxattr)))
2950                 RETURN(-EFAULT);
2951
2952         RETURN(0);
2953 }
2954
2955 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2956                         unsigned long arg)
2957 {
2958
2959         struct md_op_data *op_data;
2960         struct ptlrpc_request *req = NULL;
2961         int rc = 0;
2962         struct fsxattr fsxattr;
2963         struct cl_object *obj;
2964
2965         /* only root could change project ID */
2966         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2967                 RETURN(-EPERM);
2968
2969         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2970                                      LUSTRE_OPC_ANY, NULL);
2971         if (IS_ERR(op_data))
2972                 RETURN(PTR_ERR(op_data));
2973
2974         if (copy_from_user(&fsxattr,
2975                            (const struct fsxattr __user *)arg,
2976                            sizeof(fsxattr)))
2977                 GOTO(out_fsxattr1, rc = -EFAULT);
2978
2979         op_data->op_attr_flags = fsxattr.fsx_xflags;
2980         op_data->op_projid = fsxattr.fsx_projid;
2981         op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
2982         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2983                         0, &req);
2984         ptlrpc_req_finished(req);
2985
2986         obj = ll_i2info(inode)->lli_clob;
2987         if (obj) {
2988                 struct iattr *attr;
2989
2990                 inode->i_flags = ll_ext_to_inode_flags(fsxattr.fsx_xflags);
2991                 OBD_ALLOC_PTR(attr);
2992                 if (attr == NULL)
2993                         GOTO(out_fsxattr1, rc = -ENOMEM);
2994                 attr->ia_valid = ATTR_ATTR_FLAG;
2995                 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
2996
2997                 OBD_FREE_PTR(attr);
2998         }
2999 out_fsxattr1:
3000         ll_finish_md_op_data(op_data);
3001         RETURN(rc);
3002 }
3003
3004 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3005                                  unsigned long arg)
3006 {
3007         struct inode            *inode = file_inode(file);
3008         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3009         struct ll_inode_info    *lli = ll_i2info(inode);
3010         struct obd_client_handle *och = NULL;
3011         struct split_param sp;
3012         bool lease_broken;
3013         fmode_t fmode = 0;
3014         enum mds_op_bias bias = 0;
3015         struct file *layout_file = NULL;
3016         void *data = NULL;
3017         size_t data_size = 0;
3018         long rc;
3019         ENTRY;
3020
3021         mutex_lock(&lli->lli_och_mutex);
3022         if (fd->fd_lease_och != NULL) {
3023                 och = fd->fd_lease_och;
3024                 fd->fd_lease_och = NULL;
3025         }
3026         mutex_unlock(&lli->lli_och_mutex);
3027
3028         if (och == NULL)
3029                 GOTO(out, rc = -ENOLCK);
3030
3031         fmode = och->och_flags;
3032
3033         switch (ioc->lil_flags) {
3034         case LL_LEASE_RESYNC_DONE:
3035                 if (ioc->lil_count > IOC_IDS_MAX)
3036                         GOTO(out, rc = -EINVAL);
3037
3038                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3039                 OBD_ALLOC(data, data_size);
3040                 if (!data)
3041                         GOTO(out, rc = -ENOMEM);
3042
3043                 if (copy_from_user(data, (void __user *)arg, data_size))
3044                         GOTO(out, rc = -EFAULT);
3045
3046                 bias = MDS_CLOSE_RESYNC_DONE;
3047                 break;
3048         case LL_LEASE_LAYOUT_MERGE: {
3049                 int fd;
3050
3051                 if (ioc->lil_count != 1)
3052                         GOTO(out, rc = -EINVAL);
3053
3054                 arg += sizeof(*ioc);
3055                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3056                         GOTO(out, rc = -EFAULT);
3057
3058                 layout_file = fget(fd);
3059                 if (!layout_file)
3060                         GOTO(out, rc = -EBADF);
3061
3062                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3063                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3064                         GOTO(out, rc = -EPERM);
3065
3066                 data = file_inode(layout_file);
3067                 bias = MDS_CLOSE_LAYOUT_MERGE;
3068                 break;
3069         }
3070         case LL_LEASE_LAYOUT_SPLIT: {
3071                 int fdv;
3072                 int mirror_id;
3073
3074                 if (ioc->lil_count != 2)
3075                         GOTO(out, rc = -EINVAL);
3076
3077                 arg += sizeof(*ioc);
3078                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3079                         GOTO(out, rc = -EFAULT);
3080
3081                 arg += sizeof(__u32);
3082                 if (copy_from_user(&mirror_id, (void __user *)arg,
3083                                    sizeof(__u32)))
3084                         GOTO(out, rc = -EFAULT);
3085
3086                 layout_file = fget(fdv);
3087                 if (!layout_file)
3088                         GOTO(out, rc = -EBADF);
3089
3090                 sp.sp_inode = file_inode(layout_file);
3091                 sp.sp_mirror_id = (__u16)mirror_id;
3092                 data = &sp;
3093                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3094                 break;
3095         }
3096         default:
3097                 /* without close intent */
3098                 break;
3099         }
3100
3101         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3102         if (rc < 0)
3103                 GOTO(out, rc);
3104
3105         rc = ll_lease_och_release(inode, file);
3106         if (rc < 0)
3107                 GOTO(out, rc);
3108
3109         if (lease_broken)
3110                 fmode = 0;
3111         EXIT;
3112
3113 out:
3114         switch (ioc->lil_flags) {
3115         case LL_LEASE_RESYNC_DONE:
3116                 if (data)
3117                         OBD_FREE(data, data_size);
3118                 break;
3119         case LL_LEASE_LAYOUT_MERGE:
3120         case LL_LEASE_LAYOUT_SPLIT:
3121                 if (layout_file)
3122                         fput(layout_file);
3123                 break;
3124         }
3125
3126         if (!rc)
3127                 rc = ll_lease_type_from_fmode(fmode);
3128         RETURN(rc);
3129 }
3130
3131 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3132                               unsigned long arg)
3133 {
3134         struct inode *inode = file_inode(file);
3135         struct ll_inode_info *lli = ll_i2info(inode);
3136         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3137         struct obd_client_handle *och = NULL;
3138         __u64 open_flags = 0;
3139         bool lease_broken;
3140         fmode_t fmode;
3141         long rc;
3142         ENTRY;
3143
3144         switch (ioc->lil_mode) {
3145         case LL_LEASE_WRLCK:
3146                 if (!(file->f_mode & FMODE_WRITE))
3147                         RETURN(-EPERM);
3148                 fmode = FMODE_WRITE;
3149                 break;
3150         case LL_LEASE_RDLCK:
3151                 if (!(file->f_mode & FMODE_READ))
3152                         RETURN(-EPERM);
3153                 fmode = FMODE_READ;
3154                 break;
3155         case LL_LEASE_UNLCK:
3156                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3157         default:
3158                 RETURN(-EINVAL);
3159         }
3160
3161         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3162
3163         /* apply for lease */
3164         if (ioc->lil_flags & LL_LEASE_RESYNC)
3165                 open_flags = MDS_OPEN_RESYNC;
3166         och = ll_lease_open(inode, file, fmode, open_flags);
3167         if (IS_ERR(och))
3168                 RETURN(PTR_ERR(och));
3169
3170         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3171                 rc = ll_lease_file_resync(och, inode);
3172                 if (rc) {
3173                         ll_lease_close(och, inode, NULL);
3174                         RETURN(rc);
3175                 }
3176                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3177                 if (rc) {
3178                         ll_lease_close(och, inode, NULL);
3179                         RETURN(rc);
3180                 }
3181         }
3182
3183         rc = 0;
3184         mutex_lock(&lli->lli_och_mutex);
3185         if (fd->fd_lease_och == NULL) {
3186                 fd->fd_lease_och = och;
3187                 och = NULL;
3188         }
3189         mutex_unlock(&lli->lli_och_mutex);
3190         if (och != NULL) {
3191                 /* impossible now that only excl is supported for now */
3192                 ll_lease_close(och, inode, &lease_broken);
3193                 rc = -EBUSY;
3194         }
3195         RETURN(rc);
3196 }
3197
3198 static long
3199 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3200 {
3201         struct inode            *inode = file_inode(file);
3202         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3203         int                      flags, rc;
3204         ENTRY;
3205
3206         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3207                PFID(ll_inode2fid(inode)), inode, cmd);
3208         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3209
3210         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3211         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3212                 RETURN(-ENOTTY);
3213
3214         switch(cmd) {
3215         case LL_IOC_GETFLAGS:
3216                 /* Get the current value of the file flags */
3217                 return put_user(fd->fd_flags, (int __user *)arg);
3218         case LL_IOC_SETFLAGS:
3219         case LL_IOC_CLRFLAGS:
3220                 /* Set or clear specific file flags */
3221                 /* XXX This probably needs checks to ensure the flags are
3222                  *     not abused, and to handle any flag side effects.
3223                  */
3224                 if (get_user(flags, (int __user *) arg))
3225                         RETURN(-EFAULT);
3226
3227                 if (cmd == LL_IOC_SETFLAGS) {
3228                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3229                             !(file->f_flags & O_DIRECT)) {
3230                                 CERROR("%s: unable to disable locking on "
3231                                        "non-O_DIRECT file\n", current->comm);
3232                                 RETURN(-EINVAL);
3233                         }
3234
3235                         fd->fd_flags |= flags;
3236                 } else {
3237                         fd->fd_flags &= ~flags;
3238                 }
3239                 RETURN(0);
3240         case LL_IOC_LOV_SETSTRIPE:
3241         case LL_IOC_LOV_SETSTRIPE_NEW:
3242                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3243         case LL_IOC_LOV_SETEA:
3244                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3245         case LL_IOC_LOV_SWAP_LAYOUTS: {
3246                 struct file *file2;
3247                 struct lustre_swap_layouts lsl;
3248
3249                 if (copy_from_user(&lsl, (char __user *)arg,
3250                                    sizeof(struct lustre_swap_layouts)))
3251                         RETURN(-EFAULT);
3252
3253                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3254                         RETURN(-EPERM);
3255
3256                 file2 = fget(lsl.sl_fd);
3257                 if (file2 == NULL)
3258                         RETURN(-EBADF);
3259
3260                 /* O_WRONLY or O_RDWR */
3261                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3262                         GOTO(out, rc = -EPERM);
3263
3264                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3265                         struct inode                    *inode2;
3266                         struct ll_inode_info            *lli;
3267                         struct obd_client_handle        *och = NULL;
3268
3269                         lli = ll_i2info(inode);
3270                         mutex_lock(&lli->lli_och_mutex);
3271                         if (fd->fd_lease_och != NULL) {
3272                                 och = fd->fd_lease_och;
3273                                 fd->fd_lease_och = NULL;
3274                         }
3275                         mutex_unlock(&lli->lli_och_mutex);
3276                         if (och == NULL)
3277                                 GOTO(out, rc = -ENOLCK);
3278                         inode2 = file_inode(file2);
3279                         rc = ll_swap_layouts_close(och, inode, inode2);
3280                 } else {
3281                         rc = ll_swap_layouts(file, file2, &lsl);
3282                 }
3283 out:
3284                 fput(file2);
3285                 RETURN(rc);
3286         }
3287         case LL_IOC_LOV_GETSTRIPE:
3288         case LL_IOC_LOV_GETSTRIPE_NEW:
3289                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3290         case FSFILT_IOC_GETFLAGS:
3291         case FSFILT_IOC_SETFLAGS:
3292                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3293         case FSFILT_IOC_GETVERSION_OLD:
3294         case FSFILT_IOC_GETVERSION:
3295                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3296         case LL_IOC_GROUP_LOCK:
3297                 RETURN(ll_get_grouplock(inode, file, arg));
3298         case LL_IOC_GROUP_UNLOCK:
3299                 RETURN(ll_put_grouplock(inode, file, arg));
3300         case IOC_OBD_STATFS:
3301                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3302
3303         /* We need to special case any other ioctls we want to handle,
3304          * to send them to the MDS/OST as appropriate and to properly
3305          * network encode the arg field.
3306         case FSFILT_IOC_SETVERSION_OLD:
3307         case FSFILT_IOC_SETVERSION:
3308         */
3309         case LL_IOC_FLUSHCTX:
3310                 RETURN(ll_flush_ctx(inode));
3311         case LL_IOC_PATH2FID: {
3312                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3313                                  sizeof(struct lu_fid)))
3314                         RETURN(-EFAULT);
3315
3316                 RETURN(0);
3317         }
3318         case LL_IOC_GETPARENT:
3319                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3320
3321         case OBD_IOC_FID2PATH:
3322                 RETURN(ll_fid2path(inode, (void __user *)arg));
3323         case LL_IOC_DATA_VERSION: {
3324                 struct ioc_data_version idv;
3325                 int rc;
3326
3327                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3328                         RETURN(-EFAULT);
3329
3330                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3331                 rc = ll_ioc_data_version(inode, &idv);
3332
3333                 if (rc == 0 &&
3334                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3335                         RETURN(-EFAULT);
3336
3337                 RETURN(rc);
3338         }
3339
3340         case LL_IOC_GET_MDTIDX: {
3341                 int mdtidx;
3342
3343                 mdtidx = ll_get_mdt_idx(inode);
3344                 if (mdtidx < 0)
3345                         RETURN(mdtidx);
3346
3347                 if (put_user((int)mdtidx, (int __user *)arg))
3348                         RETURN(-EFAULT);
3349
3350                 RETURN(0);
3351         }
3352         case OBD_IOC_GETDTNAME:
3353         case OBD_IOC_GETMDNAME:
3354                 RETURN(ll_get_obd_name(inode, cmd, arg));
3355         case LL_IOC_HSM_STATE_GET: {
3356                 struct md_op_data       *op_data;
3357                 struct hsm_user_state   *hus;
3358                 int                      rc;
3359
3360                 OBD_ALLOC_PTR(hus);
3361                 if (hus == NULL)
3362                         RETURN(-ENOMEM);
3363
3364                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3365                                              LUSTRE_OPC_ANY, hus);
3366                 if (IS_ERR(op_data)) {
3367                         OBD_FREE_PTR(hus);
3368                         RETURN(PTR_ERR(op_data));
3369                 }
3370
3371                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3372                                    op_data, NULL);
3373
3374                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3375                         rc = -EFAULT;
3376
3377                 ll_finish_md_op_data(op_data);
3378                 OBD_FREE_PTR(hus);
3379                 RETURN(rc);
3380         }
3381         case LL_IOC_HSM_STATE_SET: {
3382                 struct hsm_state_set    *hss;
3383                 int                      rc;
3384
3385                 OBD_ALLOC_PTR(hss);
3386                 if (hss == NULL)
3387                         RETURN(-ENOMEM);
3388
3389                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3390                         OBD_FREE_PTR(hss);
3391                         RETURN(-EFAULT);
3392                 }
3393
3394                 rc = ll_hsm_state_set(inode, hss);
3395
3396                 OBD_FREE_PTR(hss);
3397                 RETURN(rc);
3398         }
3399         case LL_IOC_HSM_ACTION: {
3400                 struct md_op_data               *op_data;
3401                 struct hsm_current_action       *hca;
3402                 int                              rc;
3403
3404                 OBD_ALLOC_PTR(hca);
3405                 if (hca == NULL)
3406                         RETURN(-ENOMEM);
3407
3408                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3409                                              LUSTRE_OPC_ANY, hca);
3410                 if (IS_ERR(op_data)) {
3411                         OBD_FREE_PTR(hca);
3412                         RETURN(PTR_ERR(op_data));
3413                 }
3414
3415                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3416                                    op_data, NULL);
3417
3418                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3419                         rc = -EFAULT;
3420
3421                 ll_finish_md_op_data(op_data);
3422                 OBD_FREE_PTR(hca);
3423                 RETURN(rc);
3424         }
3425         case LL_IOC_SET_LEASE_OLD: {
3426                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3427
3428                 RETURN(ll_file_set_lease(file, &ioc, 0));
3429         }
3430         case LL_IOC_SET_LEASE: {
3431                 struct ll_ioc_lease ioc;
3432
3433                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3434                         RETURN(-EFAULT);
3435
3436                 RETURN(ll_file_set_lease(file, &ioc, arg));
3437         }
3438         case LL_IOC_GET_LEASE: {
3439                 struct ll_inode_info *lli = ll_i2info(inode);
3440                 struct ldlm_lock *lock = NULL;
3441                 fmode_t fmode = 0;
3442
3443                 mutex_lock(&lli->lli_och_mutex);
3444                 if (fd->fd_lease_och != NULL) {
3445                         struct obd_client_handle *och = fd->fd_lease_och;
3446
3447                         lock = ldlm_handle2lock(&och->och_lease_handle);
3448                         if (lock != NULL) {
3449                                 lock_res_and_lock(lock);
3450                                 if (!ldlm_is_cancel(lock))
3451                                         fmode = och->och_flags;
3452
3453                                 unlock_res_and_lock(lock);
3454                                 LDLM_LOCK_PUT(lock);
3455                         }
3456                 }
3457                 mutex_unlock(&lli->lli_och_mutex);
3458
3459                 RETURN(ll_lease_type_from_fmode(fmode));
3460         }
3461         case LL_IOC_HSM_IMPORT: {
3462                 struct hsm_user_import *hui;
3463
3464                 OBD_ALLOC_PTR(hui);
3465                 if (hui == NULL)
3466                         RETURN(-ENOMEM);
3467
3468                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3469                         OBD_FREE_PTR(hui);
3470                         RETURN(-EFAULT);
3471                 }
3472
3473                 rc = ll_hsm_import(inode, file, hui);
3474
3475                 OBD_FREE_PTR(hui);
3476                 RETURN(rc);
3477         }
3478         case LL_IOC_FUTIMES_3: {
3479                 struct ll_futimes_3 lfu;
3480
3481                 if (copy_from_user(&lfu,
3482                                    (const struct ll_futimes_3 __user *)arg,
3483                                    sizeof(lfu)))
3484                         RETURN(-EFAULT);
3485
3486                 RETURN(ll_file_futimes_3(file, &lfu));
3487         }
3488         case LL_IOC_LADVISE: {
3489                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3490                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3491                 int i;
3492                 int num_advise;
3493                 int alloc_size = sizeof(*k_ladvise_hdr);
3494
3495                 rc = 0;
3496                 u_ladvise_hdr = (void __user *)arg;
3497                 OBD_ALLOC_PTR(k_ladvise_hdr);
3498                 if (k_ladvise_hdr == NULL)
3499                         RETURN(-ENOMEM);
3500
3501                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3502                         GOTO(out_ladvise, rc = -EFAULT);
3503
3504                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3505                     k_ladvise_hdr->lah_count < 1)
3506                         GOTO(out_ladvise, rc = -EINVAL);
3507
3508                 num_advise = k_ladvise_hdr->lah_count;
3509                 if (num_advise >= LAH_COUNT_MAX)
3510                         GOTO(out_ladvise, rc = -EFBIG);
3511
3512                 OBD_FREE_PTR(k_ladvise_hdr);
3513                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3514                                       lah_advise[num_advise]);
3515                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3516                 if (k_ladvise_hdr == NULL)
3517                         RETURN(-ENOMEM);
3518
3519                 /*
3520                  * TODO: submit multiple advices to one server in a single RPC
3521                  */
3522                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3523                         GOTO(out_ladvise, rc = -EFAULT);
3524
3525                 for (i = 0; i < num_advise; i++) {
3526                         struct llapi_lu_ladvise *k_ladvise =
3527                                         &k_ladvise_hdr->lah_advise[i];
3528                         struct llapi_lu_ladvise __user *u_ladvise =
3529                                         &u_ladvise_hdr->lah_advise[i];
3530
3531                         rc = ll_ladvise_sanity(inode, k_ladvise);
3532                         if (rc)
3533                                 GOTO(out_ladvise, rc);
3534
3535                         switch (k_ladvise->lla_advice) {
3536                         case LU_LADVISE_LOCKNOEXPAND:
3537                                 rc = ll_lock_noexpand(file,
3538                                                k_ladvise->lla_peradvice_flags);
3539                                 GOTO(out_ladvise, rc);
3540                         case LU_LADVISE_LOCKAHEAD:
3541
3542                                 rc = ll_file_lock_ahead(file, k_ladvise);
3543
3544                                 if (rc < 0)
3545                                         GOTO(out_ladvise, rc);
3546
3547                                 if (put_user(rc,
3548                                              &u_ladvise->lla_lockahead_result))
3549                                         GOTO(out_ladvise, rc = -EFAULT);
3550                                 break;
3551                         default:
3552                                 rc = ll_ladvise(inode, file,
3553                                                 k_ladvise_hdr->lah_flags,
3554                                                 k_ladvise);
3555                                 if (rc)
3556                                         GOTO(out_ladvise, rc);
3557                                 break;
3558                         }
3559
3560                 }
3561
3562 out_ladvise:
3563                 OBD_FREE(k_ladvise_hdr, alloc_size);
3564                 RETURN(rc);
3565         }
3566         case LL_IOC_FLR_SET_MIRROR: {
3567                 /* mirror I/O must be direct to avoid polluting page cache
3568                  * by stale data. */
3569                 if (!(file->f_flags & O_DIRECT))
3570                         RETURN(-EINVAL);
3571
3572                 fd->fd_designated_mirror = (__u32)arg;
3573                 RETURN(0);
3574         }
3575         case LL_IOC_FSGETXATTR:
3576                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3577         case LL_IOC_FSSETXATTR:
3578                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3579         case BLKSSZGET:
3580                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3581         default:
3582                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3583                                      (void __user *)arg));
3584         }
3585 }
3586
3587 #ifndef HAVE_FILE_LLSEEK_SIZE
3588 static inline loff_t
3589 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3590 {
3591         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3592                 return -EINVAL;
3593         if (offset > maxsize)
3594                 return -EINVAL;
3595
3596         if (offset != file->f_pos) {
3597                 file->f_pos = offset;
3598                 file->f_version = 0;
3599         }
3600         return offset;
3601 }
3602
3603 static loff_t
3604 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3605                 loff_t maxsize, loff_t eof)
3606 {
3607         struct inode *inode = file_inode(file);
3608
3609         switch (origin) {
3610         case SEEK_END:
3611                 offset += eof;
3612                 break;
3613         case SEEK_CUR:
3614                 /*
3615                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3616                  * position-querying operation.  Avoid rewriting the "same"
3617                  * f_pos value back to the file because a concurrent read(),
3618                  * write() or lseek() might have altered it
3619                  */
3620                 if (offset == 0)
3621                         return file->f_pos;
3622                 /*
3623                  * f_lock protects against read/modify/write race with other
3624                  * SEEK_CURs. Note that parallel writes and reads behave
3625                  * like SEEK_SET.
3626                  */
3627                 inode_lock(inode);
3628                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3629                 inode_unlock(inode);
3630                 return offset;
3631         case SEEK_DATA:
3632                 /*
3633                  * In the generic case the entire file is data, so as long as
3634                  * offset isn't at the end of the file then the offset is data.
3635                  */
3636                 if (offset >= eof)
3637                         return -ENXIO;
3638                 break;
3639         case SEEK_HOLE:
3640                 /*
3641                  * There is a virtual hole at the end of the file, so as long as
3642                  * offset isn't i_size or larger, return i_size.
3643                  */
3644                 if (offset >= eof)
3645                         return -ENXIO;
3646                 offset = eof;
3647                 break;
3648         }
3649
3650         return llseek_execute(file, offset, maxsize);
3651 }
3652 #endif
3653
3654 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3655 {
3656         struct inode *inode = file_inode(file);
3657         loff_t retval, eof = 0;
3658
3659         ENTRY;
3660         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3661                            (origin == SEEK_CUR) ? file->f_pos : 0);
3662         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3663                PFID(ll_inode2fid(inode)), inode, retval, retval,
3664                origin);
3665         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3666
3667         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3668                 retval = ll_glimpse_size(inode);
3669                 if (retval != 0)
3670                         RETURN(retval);
3671                 eof = i_size_read(inode);
3672         }
3673
3674         retval = ll_generic_file_llseek_size(file, offset, origin,
3675                                           ll_file_maxbytes(inode), eof);
3676         RETURN(retval);
3677 }
3678
3679 static int ll_flush(struct file *file, fl_owner_t id)
3680 {
3681         struct inode *inode = file_inode(file);
3682         struct ll_inode_info *lli = ll_i2info(inode);
3683         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3684         int rc, err;
3685
3686         LASSERT(!S_ISDIR(inode->i_mode));
3687
3688         /* catch async errors that were recorded back when async writeback
3689          * failed for pages in this mapping. */
3690         rc = lli->lli_async_rc;
3691         lli->lli_async_rc = 0;
3692         if (lli->lli_clob != NULL) {
3693                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3694                 if (rc == 0)
3695                         rc = err;
3696         }
3697
3698         /* The application has been told write failure already.
3699          * Do not report failure again. */
3700         if (fd->fd_write_failed)
3701                 return 0;
3702         return rc ? -EIO : 0;
3703 }
3704
3705 /**
3706  * Called to make sure a portion of file has been written out.
3707  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3708  *
3709  * Return how many pages have been written.
3710  */
3711 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3712                        enum cl_fsync_mode mode, int ignore_layout)
3713 {
3714         struct lu_env *env;
3715         struct cl_io *io;
3716         struct cl_fsync_io *fio;
3717         int result;
3718         __u16 refcheck;
3719         ENTRY;
3720
3721         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3722             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3723                 RETURN(-EINVAL);
3724
3725         env = cl_env_get(&refcheck);
3726         if (IS_ERR(env))
3727                 RETURN(PTR_ERR(env));
3728
3729         io = vvp_env_thread_io(env);
3730         io->ci_obj = ll_i2info(inode)->lli_clob;
3731         io->ci_ignore_layout = ignore_layout;
3732
3733         /* initialize parameters for sync */
3734         fio = &io->u.ci_fsync;
3735         fio->fi_start = start;
3736         fio->fi_end = end;
3737         fio->fi_fid = ll_inode2fid(inode);
3738         fio->fi_mode = mode;
3739         fio->fi_nr_written = 0;
3740
3741         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3742                 result = cl_io_loop(env, io);
3743         else
3744                 result = io->ci_result;
3745         if (result == 0)
3746                 result = fio->fi_nr_written;
3747         cl_io_fini(env, io);
3748         cl_env_put(env, &refcheck);
3749
3750         RETURN(result);
3751 }
3752
3753 /*
3754  * When dentry is provided (the 'else' case), file_dentry() may be
3755  * null and dentry must be used directly rather than pulled from
3756  * file_dentry() as is done otherwise.
3757  */
3758
3759 #ifdef HAVE_FILE_FSYNC_4ARGS
3760 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3761 {
3762         struct dentry *dentry = file_dentry(file);
3763         bool lock_inode;
3764 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3765 int ll_fsync(struct file *file, int datasync)
3766 {
3767         struct dentry *dentry = file_dentry(file);
3768         loff_t start = 0;
3769         loff_t end = LLONG_MAX;
3770 #else
3771 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3772 {
3773         loff_t start = 0;
3774         loff_t end = LLONG_MAX;
3775 #endif
3776         struct inode *inode = dentry->d_inode;
3777         struct ll_inode_info *lli = ll_i2info(inode);
3778         struct ptlrpc_request *req;
3779         int rc, err;
3780         ENTRY;
3781
3782         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3783                PFID(ll_inode2fid(inode)), inode);
3784         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3785
3786 #ifdef HAVE_FILE_FSYNC_4ARGS
3787         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3788         lock_inode = !lli->lli_inode_locked;
3789         if (lock_inode)
3790                 inode_lock(inode);
3791 #else
3792         /* fsync's caller has already called _fdata{sync,write}, we want
3793          * that IO to finish before calling the osc and mdc sync methods */
3794         rc = filemap_fdatawait(inode->i_mapping);
3795 #endif
3796
3797         /* catch async errors that were recorded back when async writeback
3798          * failed for pages in this mapping. */
3799         if (!S_ISDIR(inode->i_mode)) {
3800                 err = lli->lli_async_rc;
3801                 lli->lli_async_rc = 0;
3802                 if (rc == 0)
3803                         rc = err;
3804                 if (lli->lli_clob != NULL) {
3805                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3806                         if (rc == 0)
3807                                 rc = err;
3808                 }
3809         }
3810
3811         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3812         if (!rc)
3813                 rc = err;
3814         if (!err)
3815                 ptlrpc_req_finished(req);
3816
3817         if (S_ISREG(inode->i_mode)) {
3818                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3819
3820                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3821                 if (rc == 0 && err < 0)
3822                         rc = err;
3823                 if (rc < 0)
3824                         fd->fd_write_failed = true;
3825                 else
3826                         fd->fd_write_failed = false;
3827         }
3828
3829 #ifdef HAVE_FILE_FSYNC_4ARGS
3830         if (lock_inode)
3831                 inode_unlock(inode);
3832 #endif
3833         RETURN(rc);
3834 }
3835
3836 static int
3837 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3838 {
3839         struct inode *inode = file_inode(file);
3840         struct ll_sb_info *sbi = ll_i2sbi(inode);
3841         struct ldlm_enqueue_info einfo = {
3842                 .ei_type        = LDLM_FLOCK,
3843                 .ei_cb_cp       = ldlm_flock_completion_ast,
3844                 .ei_cbdata      = file_lock,
3845         };
3846         struct md_op_data *op_data;
3847         struct lustre_handle lockh = { 0 };
3848         union ldlm_policy_data flock = { { 0 } };
3849         int fl_type = file_lock->fl_type;
3850         __u64 flags = 0;
3851         int rc;
3852         int rc2 = 0;
3853         ENTRY;
3854
3855         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3856                PFID(ll_inode2fid(inode)), file_lock);
3857
3858         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3859
3860         if (file_lock->fl_flags & FL_FLOCK) {
3861                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3862                 /* flocks are whole-file locks */
3863                 flock.l_flock.end = OFFSET_MAX;
3864                 /* For flocks owner is determined by the local file desctiptor*/
3865                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3866         } else if (file_lock->fl_flags & FL_POSIX) {
3867                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3868                 flock.l_flock.start = file_lock->fl_start;
3869                 flock.l_flock.end = file_lock->fl_end;
3870         } else {
3871                 RETURN(-EINVAL);
3872         }
3873         flock.l_flock.pid = file_lock->fl_pid;
3874
3875         /* Somewhat ugly workaround for svc lockd.
3876          * lockd installs custom fl_lmops->lm_compare_owner that checks
3877          * for the fl_owner to be the same (which it always is on local node
3878          * I guess between lockd processes) and then compares pid.
3879          * As such we assign pid to the owner field to make it all work,
3880          * conflict with normal locks is unlikely since pid space and
3881          * pointer space for current->files are not intersecting */
3882         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3883                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3884
3885         switch (fl_type) {
3886         case F_RDLCK:
3887                 einfo.ei_mode = LCK_PR;
3888                 break;
3889         case F_UNLCK:
3890                 /* An unlock request may or may not have any relation to
3891                  * existing locks so we may not be able to pass a lock handle
3892                  * via a normal ldlm_lock_cancel() request. The request may even
3893                  * unlock a byte range in the middle of an existing lock. In
3894                  * order to process an unlock request we need all of the same
3895                  * information that is given with a normal read or write record
3896                  * lock request. To avoid creating another ldlm unlock (cancel)
3897                  * message we'll treat a LCK_NL flock request as an unlock. */
3898                 einfo.ei_mode = LCK_NL;
3899                 break;
3900         case F_WRLCK:
3901                 einfo.ei_mode = LCK_PW;
3902                 break;
3903         default:
3904                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3905                 RETURN (-ENOTSUPP);
3906         }
3907
3908         switch (cmd) {
3909         case F_SETLKW:
3910 #ifdef F_SETLKW64
3911         case F_SETLKW64:
3912 #endif
3913                 flags = 0;
3914                 break;
3915         case F_SETLK:
3916 #ifdef F_SETLK64
3917         case F_SETLK64:
3918 #endif
3919                 flags = LDLM_FL_BLOCK_NOWAIT;
3920                 break;
3921         case F_GETLK:
3922 #ifdef F_GETLK64
3923         case F_GETLK64:
3924 #endif
3925                 flags = LDLM_FL_TEST_LOCK;
3926                 break;
3927         default:
3928                 CERROR("unknown fcntl lock command: %d\n", cmd);
3929                 RETURN (-EINVAL);
3930         }
3931
3932         /* Save the old mode so that if the mode in the lock changes we
3933          * can decrement the appropriate reader or writer refcount. */
3934         file_lock->fl_type = einfo.ei_mode;
3935
3936         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3937                                      LUSTRE_OPC_ANY, NULL);
3938         if (IS_ERR(op_data))
3939                 RETURN(PTR_ERR(op_data));
3940
3941         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3942                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3943                flock.l_flock.pid, flags, einfo.ei_mode,
3944                flock.l_flock.start, flock.l_flock.end);
3945
3946         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3947                         flags);
3948
3949         /* Restore the file lock type if not TEST lock. */
3950         if (!(flags & LDLM_FL_TEST_LOCK))
3951                 file_lock->fl_type = fl_type;
3952
3953 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3954         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3955             !(flags & LDLM_FL_TEST_LOCK))
3956                 rc2  = locks_lock_file_wait(file, file_lock);
3957 #else
3958         if ((file_lock->fl_flags & FL_FLOCK) &&
3959             (rc == 0 || file_lock->fl_type == F_UNLCK))
3960                 rc2  = flock_lock_file_wait(file, file_lock);
3961         if ((file_lock->fl_flags & FL_POSIX) &&
3962             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3963             !(flags & LDLM_FL_TEST_LOCK))
3964                 rc2  = posix_lock_file_wait(file, file_lock);
3965 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3966
3967         if (rc2 && file_lock->fl_type != F_UNLCK) {
3968                 einfo.ei_mode = LCK_NL;
3969                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3970                            &lockh, flags);
3971                 rc = rc2;
3972         }
3973
3974         ll_finish_md_op_data(op_data);
3975
3976         RETURN(rc);
3977 }
3978
3979 int ll_get_fid_by_name(struct inode *parent, const char *name,
3980                        int namelen, struct lu_fid *fid,
3981                        struct inode **inode)
3982 {
3983         struct md_op_data       *op_data = NULL;
3984         struct mdt_body         *body;
3985         struct ptlrpc_request   *req;
3986         int                     rc;
3987         ENTRY;
3988
3989         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
3990                                      LUSTRE_OPC_ANY, NULL);
3991         if (IS_ERR(op_data))
3992                 RETURN(PTR_ERR(op_data));
3993
3994         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
3995         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
3996         ll_finish_md_op_data(op_data);
3997         if (rc < 0)
3998                 RETURN(rc);
3999
4000         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4001         if (body == NULL)
4002                 GOTO(out_req, rc = -EFAULT);
4003         if (fid != NULL)
4004                 *fid = body->mbo_fid1;
4005
4006         if (inode != NULL)
4007                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4008 out_req:
4009         ptlrpc_req_finished(req);
4010         RETURN(rc);
4011 }
4012
4013 int ll_migrate(struct inode *parent, struct file *file, int mdtidx,
4014                const char *name, int namelen)
4015 {
4016         struct dentry         *dchild = NULL;
4017         struct inode          *child_inode = NULL;
4018         struct md_op_data     *op_data;
4019         struct ptlrpc_request *request = NULL;
4020         struct obd_client_handle *och = NULL;
4021         struct qstr           qstr;
4022         struct mdt_body         *body;
4023         int                    rc;
4024         __u64                   data_version = 0;
4025         ENTRY;
4026
4027         CDEBUG(D_VFSTRACE, "migrate %s under "DFID" to MDT%04x\n",
4028                name, PFID(ll_inode2fid(parent)), mdtidx);
4029
4030         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4031                                      0, LUSTRE_OPC_ANY, NULL);
4032         if (IS_ERR(op_data))
4033                 RETURN(PTR_ERR(op_data));
4034
4035         /* Get child FID first */
4036         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4037         qstr.name = name;
4038         qstr.len = namelen;
4039         dchild = d_lookup(file_dentry(file), &qstr);
4040         if (dchild != NULL) {
4041                 if (dchild->d_inode != NULL)
4042                         child_inode = igrab(dchild->d_inode);
4043                 dput(dchild);
4044         }
4045
4046         if (child_inode == NULL) {
4047                 rc = ll_get_fid_by_name(parent, name, namelen,
4048                                         &op_data->op_fid3, &child_inode);
4049                 if (rc != 0)
4050                         GOTO(out_free, rc);
4051         }
4052
4053         if (child_inode == NULL)
4054                 GOTO(out_free, rc = -EINVAL);
4055
4056         /*
4057          * lfs migrate command needs to be blocked on the client
4058          * by checking the migrate FID against the FID of the
4059          * filesystem root.
4060          */
4061         if (child_inode == parent->i_sb->s_root->d_inode)
4062                 GOTO(out_iput, rc = -EINVAL);
4063
4064         inode_lock(child_inode);
4065         op_data->op_fid3 = *ll_inode2fid(child_inode);
4066         if (!fid_is_sane(&op_data->op_fid3)) {
4067                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4068                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4069                        PFID(&op_data->op_fid3));
4070                 GOTO(out_unlock, rc = -EINVAL);
4071         }
4072
4073         rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3);
4074         if (rc < 0)
4075                 GOTO(out_unlock, rc);
4076
4077         if (rc == mdtidx) {
4078                 CDEBUG(D_INFO, "%s: "DFID" is already on MDT%04x\n", name,
4079                        PFID(&op_data->op_fid3), mdtidx);
4080                 GOTO(out_unlock, rc = 0);
4081         }
4082 again:
4083         if (S_ISREG(child_inode->i_mode)) {
4084                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4085                 if (IS_ERR(och)) {
4086                         rc = PTR_ERR(och);
4087                         och = NULL;
4088                         GOTO(out_unlock, rc);
4089                 }
4090
4091                 rc = ll_data_version(child_inode, &data_version,
4092                                      LL_DV_WR_FLUSH);
4093                 if (rc != 0)
4094                         GOTO(out_close, rc);
4095
4096                 op_data->op_handle = och->och_fh;
4097                 op_data->op_data = och->och_mod;
4098                 op_data->op_data_version = data_version;
4099                 op_data->op_lease_handle = och->och_lease_handle;
4100                 op_data->op_bias |= MDS_RENAME_MIGRATE;
4101         }
4102
4103         op_data->op_mds = mdtidx;
4104         op_data->op_cli_flags = CLI_MIGRATE;
4105         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name,
4106                        namelen, name, namelen, &request);
4107         if (rc == 0) {
4108                 LASSERT(request != NULL);
4109                 ll_update_times(request, parent);
4110
4111                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4112                 LASSERT(body != NULL);
4113
4114                 /* If the server does release layout lock, then we cleanup
4115                  * the client och here, otherwise release it in out_close: */
4116                 if (och != NULL &&
4117                     body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4118                         obd_mod_put(och->och_mod);
4119                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4120                                                   och);
4121                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4122                         OBD_FREE_PTR(och);
4123                         och = NULL;
4124                 }
4125         }
4126
4127         if (request != NULL) {
4128                 ptlrpc_req_finished(request);
4129                 request = NULL;
4130         }
4131
4132         /* Try again if the file layout has changed. */
4133         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4134                 goto again;
4135
4136 out_close:
4137         if (och != NULL) /* close the file */
4138                 ll_lease_close(och, child_inode, NULL);
4139         if (rc == 0)
4140                 clear_nlink(child_inode);
4141 out_unlock:
4142         inode_unlock(child_inode);
4143 out_iput:
4144         iput(child_inode);
4145 out_free:
4146         ll_finish_md_op_data(op_data);
4147         RETURN(rc);
4148 }
4149
4150 static int
4151 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4152 {
4153         ENTRY;
4154
4155         RETURN(-ENOSYS);
4156 }
4157
4158 /**
4159  * test if some locks matching bits and l_req_mode are acquired
4160  * - bits can be in different locks
4161  * - if found clear the common lock bits in *bits
4162  * - the bits not found, are kept in *bits
4163  * \param inode [IN]
4164  * \param bits [IN] searched lock bits [IN]
4165  * \param l_req_mode [IN] searched lock mode
4166  * \retval boolean, true iff all bits are found
4167  */
4168 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4169 {
4170         struct lustre_handle lockh;
4171         union ldlm_policy_data policy;
4172         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4173                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4174         struct lu_fid *fid;
4175         __u64 flags;
4176         int i;
4177         ENTRY;
4178
4179         if (!inode)
4180                RETURN(0);
4181
4182         fid = &ll_i2info(inode)->lli_fid;
4183         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4184                ldlm_lockname[mode]);
4185
4186         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4187         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4188                 policy.l_inodebits.bits = *bits & (1 << i);
4189                 if (policy.l_inodebits.bits == 0)
4190                         continue;
4191
4192                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4193                                   &policy, mode, &lockh)) {
4194                         struct ldlm_lock *lock;
4195
4196                         lock = ldlm_handle2lock(&lockh);
4197                         if (lock) {
4198                                 *bits &=
4199                                       ~(lock->l_policy_data.l_inodebits.bits);
4200                                 LDLM_LOCK_PUT(lock);
4201                         } else {
4202                                 *bits &= ~policy.l_inodebits.bits;
4203                         }
4204                 }
4205         }
4206         RETURN(*bits == 0);
4207 }
4208
4209 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4210                                struct lustre_handle *lockh, __u64 flags,
4211                                enum ldlm_mode mode)
4212 {
4213         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4214         struct lu_fid *fid;
4215         enum ldlm_mode rc;
4216         ENTRY;
4217
4218         fid = &ll_i2info(inode)->lli_fid;
4219         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4220
4221         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4222                            fid, LDLM_IBITS, &policy, mode, lockh);
4223
4224         RETURN(rc);
4225 }
4226
4227 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4228 {
4229         /* Already unlinked. Just update nlink and return success */
4230         if (rc == -ENOENT) {
4231                 clear_nlink(inode);
4232                 /* If it is striped directory, and there is bad stripe
4233                  * Let's revalidate the dentry again, instead of returning
4234                  * error */
4235                 if (S_ISDIR(inode->i_mode) &&
4236                     ll_i2info(inode)->lli_lsm_md != NULL)
4237                         return 0;
4238
4239                 /* This path cannot be hit for regular files unless in
4240                  * case of obscure races, so no need to to validate
4241                  * size. */
4242                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4243                         return 0;
4244         } else if (rc != 0) {
4245                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4246                              "%s: revalidate FID "DFID" error: rc = %d\n",
4247                              ll_get_fsname(inode->i_sb, NULL, 0),
4248                              PFID(ll_inode2fid(inode)), rc);
4249         }
4250
4251         return rc;
4252 }
4253
4254 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4255 {
4256         struct inode *inode = dentry->d_inode;
4257         struct obd_export *exp = ll_i2mdexp(inode);
4258         struct lookup_intent oit = {
4259                 .it_op = op,
4260         };
4261         struct ptlrpc_request *req = NULL;
4262         struct md_op_data *op_data;
4263         int rc = 0;
4264         ENTRY;
4265
4266         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4267                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4268
4269         /* Call getattr by fid, so do not provide name at all. */
4270         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4271                                      LUSTRE_OPC_ANY, NULL);
4272         if (IS_ERR(op_data))
4273                 RETURN(PTR_ERR(op_data));
4274
4275         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4276         ll_finish_md_op_data(op_data);
4277         if (rc < 0) {
4278                 rc = ll_inode_revalidate_fini(inode, rc);
4279                 GOTO(out, rc);
4280         }
4281
4282         rc = ll_revalidate_it_finish(req, &oit, dentry);
4283         if (rc != 0) {
4284                 ll_intent_release(&oit);
4285                 GOTO(out, rc);
4286         }
4287
4288         /* Unlinked? Unhash dentry, so it is not picked up later by
4289          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4290          * here to preserve get_cwd functionality on 2.6.
4291          * Bug 10503 */
4292         if (!dentry->d_inode->i_nlink) {
4293                 ll_lock_dcache(inode);
4294                 d_lustre_invalidate(dentry, 0);
4295                 ll_unlock_dcache(inode);
4296         }
4297
4298         ll_lookup_finish_locks(&oit, dentry);
4299 out:
4300         ptlrpc_req_finished(req);
4301
4302         return rc;
4303 }
4304
4305 static int ll_merge_md_attr(struct inode *inode)
4306 {
4307         struct cl_attr attr = { 0 };
4308         int rc;
4309
4310         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4311         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4312                            &attr, ll_md_blocking_ast);
4313         if (rc != 0)
4314                 RETURN(rc);
4315
4316         set_nlink(inode, attr.cat_nlink);
4317         inode->i_blocks = attr.cat_blocks;
4318         i_size_write(inode, attr.cat_size);
4319
4320         ll_i2info(inode)->lli_atime = attr.cat_atime;
4321         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4322         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4323
4324         RETURN(0);
4325 }
4326
4327 static inline dev_t ll_compat_encode_dev(dev_t dev)
4328 {
4329         /* The compat_sys_*stat*() syscalls will fail unless the
4330          * device majors and minors are both less than 256. Note that
4331          * the value returned here will be passed through
4332          * old_encode_dev() in cp_compat_stat(). And so we are not
4333          * trying to return a valid compat (u16) device number, just
4334          * one that will pass the old_valid_dev() check. */
4335
4336         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4337 }
4338
4339 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4340 int ll_getattr(const struct path *path, struct kstat *stat,
4341                u32 request_mask, unsigned int flags)
4342 {
4343         struct dentry *de = path->dentry;
4344 #else
4345 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4346 {
4347 #endif
4348         struct inode *inode = de->d_inode;
4349         struct ll_sb_info *sbi = ll_i2sbi(inode);
4350         struct ll_inode_info *lli = ll_i2info(inode);
4351         int rc;
4352
4353         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4354
4355         rc = ll_inode_revalidate(de, IT_GETATTR);
4356         if (rc < 0)
4357                 RETURN(rc);
4358
4359         if (S_ISREG(inode->i_mode)) {
4360                 /* In case of restore, the MDT has the right size and has
4361                  * already send it back without granting the layout lock,
4362                  * inode is up-to-date so glimpse is useless.
4363                  * Also to glimpse we need the layout, in case of a running
4364                  * restore the MDT holds the layout lock so the glimpse will
4365                  * block up to the end of restore (getattr will block)
4366                  */
4367                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4368                         rc = ll_glimpse_size(inode);
4369                         if (rc < 0)
4370                                 RETURN(rc);
4371                 }
4372         } else {
4373                 /* If object isn't regular a file then don't validate size. */
4374                 if (S_ISDIR(inode->i_mode) &&
4375                     lli->lli_lsm_md != NULL) {
4376                         rc = ll_merge_md_attr(inode);
4377                         if (rc < 0)
4378                                 RETURN(rc);
4379                 }
4380
4381                 LTIME_S(inode->i_atime) = lli->lli_atime;
4382                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4383                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4384         }
4385
4386         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4387
4388         if (ll_need_32bit_api(sbi)) {
4389                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4390                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4391                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4392         } else {
4393                 stat->ino = inode->i_ino;
4394                 stat->dev = inode->i_sb->s_dev;
4395                 stat->rdev = inode->i_rdev;
4396         }
4397
4398         stat->mode = inode->i_mode;
4399         stat->uid = inode->i_uid;
4400         stat->gid = inode->i_gid;
4401         stat->atime = inode->i_atime;
4402         stat->mtime = inode->i_mtime;
4403         stat->ctime = inode->i_ctime;
4404         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4405
4406         stat->nlink = inode->i_nlink;
4407         stat->size = i_size_read(inode);
4408         stat->blocks = inode->i_blocks;
4409
4410         return 0;
4411 }
4412
4413 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4414                      __u64 start, __u64 len)
4415 {
4416         int             rc;
4417         size_t          num_bytes;
4418         struct fiemap   *fiemap;
4419         unsigned int    extent_count = fieinfo->fi_extents_max;
4420
4421         num_bytes = sizeof(*fiemap) + (extent_count *
4422                                        sizeof(struct fiemap_extent));
4423         OBD_ALLOC_LARGE(fiemap, num_bytes);
4424
4425         if (fiemap == NULL)
4426                 RETURN(-ENOMEM);
4427
4428         fiemap->fm_flags = fieinfo->fi_flags;
4429         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4430         fiemap->fm_start = start;
4431         fiemap->fm_length = len;
4432         if (extent_count > 0 &&
4433             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4434                            sizeof(struct fiemap_extent)) != 0)
4435                 GOTO(out, rc = -EFAULT);
4436
4437         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4438
4439         fieinfo->fi_flags = fiemap->fm_flags;
4440         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4441         if (extent_count > 0 &&
4442             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4443                          fiemap->fm_mapped_extents *
4444                          sizeof(struct fiemap_extent)) != 0)
4445                 GOTO(out, rc = -EFAULT);
4446 out:
4447         OBD_FREE_LARGE(fiemap, num_bytes);
4448         return rc;
4449 }
4450
4451 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4452 {
4453         struct ll_inode_info *lli = ll_i2info(inode);
4454         struct posix_acl *acl = NULL;
4455         ENTRY;
4456
4457         spin_lock(&lli->lli_lock);
4458         /* VFS' acl_permission_check->check_acl will release the refcount */
4459         acl = posix_acl_dup(lli->lli_posix_acl);
4460         spin_unlock(&lli->lli_lock);
4461
4462         RETURN(acl);
4463 }
4464
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * set_acl inode operation: encode \a acl as an xattr value and pass it to
 * __vfs_setxattr() under the xattr name that belongs to \a type, then
 * update the cached ACL of the inode.
 *
 * \param inode [IN] inode the ACL is set on
 * \param acl   [IN] ACL to store; with NULL a NULL value of size 0 is
 *                   passed to __vfs_setxattr()
 * \param type  [IN] ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0 on success (also for a NULL default ACL on a non-directory)
 * \retval -EINVAL for an unknown \a type
 * \retval -EACCES for a default ACL on a non-directory
 * \retval -ENOMEM if the xattr buffer cannot be allocated
 * \retval other negative errno from the helpers called
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
	const char *name = NULL;
	char *value = NULL;
	size_t size = 0;
	int rc = 0;
	ENTRY;

	switch (type) {
	case ACL_TYPE_ACCESS:
		if (acl) {
			/* may change i_mode and replace \a acl */
			rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
			if (rc)
				GOTO(out, rc);
		}
		name = XATTR_NAME_POSIX_ACL_ACCESS;
		break;
	case ACL_TYPE_DEFAULT:
		if (!S_ISDIR(inode->i_mode))
			GOTO(out, rc = acl ? -EACCES : 0);
		name = XATTR_NAME_POSIX_ACL_DEFAULT;
		break;
	default:
		/* Return directly instead of going through the cache update
		 * at "out": the kernel's ACL cache helpers are only defined
		 * for ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT (acl_by_type()
		 * hits BUG() for any other type). */
		RETURN(-EINVAL);
	}

	if (acl) {
		size = posix_acl_xattr_size(acl->a_count);
		value = kmalloc(size, GFP_NOFS);
		if (value == NULL)
			GOTO(out, rc = -ENOMEM);

		rc = posix_acl_to_xattr(&init_user_ns, acl, value, size);
		if (rc < 0)
			GOTO(out_free, rc);
	}

	/* dentry is only used for *.lov attributes so it's safe to be NULL */
	/* NOTE(review): XATTR_CREATE presumably makes replacing an existing
	 * ACL fail with -EEXIST; confirm against the xattr handler whether
	 * 0 (create or replace) is what is wanted here. */
	rc = __vfs_setxattr(NULL, inode, name, value, size, XATTR_CREATE);
out_free:
	kfree(value);
out:
	/* keep the cached ACL in step with the outcome */
	if (!rc)
		set_cached_acl(inode, type, acl);
	else
		forget_cached_acl(inode, type);
	RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4517
4518 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4519 static int
4520 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4521 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4522 # else
4523 ll_check_acl(struct inode *inode, int mask)
4524 # endif
4525 {
4526 # ifdef CONFIG_FS_POSIX_ACL
4527         struct posix_acl *acl;
4528         int rc;
4529         ENTRY;
4530
4531 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4532         if (flags & IPERM_FLAG_RCU)
4533                 return -ECHILD;
4534 #  endif
4535         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4536
4537         if (!acl)
4538                 RETURN(-EAGAIN);
4539
4540         rc = posix_acl_permission(inode, acl, mask);
4541         posix_acl_release(acl);
4542
4543         RETURN(rc);
4544 # else /* !CONFIG_FS_POSIX_ACL */
4545         return -EAGAIN;
4546 # endif /* CONFIG_FS_POSIX_ACL */
4547 }
4548 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4549
4550 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4551 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4552 #else
4553 # ifdef HAVE_INODE_PERMISION_2ARGS
4554 int ll_inode_permission(struct inode *inode, int mask)
4555 # else
4556 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4557 # endif
4558 #endif
4559 {
4560         int rc = 0;
4561         struct ll_sb_info *sbi;
4562         struct root_squash_info *squash;
4563         struct cred *cred = NULL;
4564         const struct cred *old_cred = NULL;
4565         cfs_cap_t cap;
4566         bool squash_id = false;
4567         ENTRY;
4568
4569 #ifdef MAY_NOT_BLOCK
4570         if (mask & MAY_NOT_BLOCK)
4571                 return -ECHILD;
4572 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4573         if (flags & IPERM_FLAG_RCU)
4574                 return -ECHILD;
4575 #endif
4576
4577        /* as root inode are NOT getting validated in lookup operation,
4578         * need to do it before permission check. */
4579
4580         if (inode == inode->i_sb->s_root->d_inode) {
4581                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4582                 if (rc)
4583                         RETURN(rc);
4584         }
4585
4586         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4587                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4588
4589         /* squash fsuid/fsgid if needed */
4590         sbi = ll_i2sbi(inode);
4591         squash = &sbi->ll_squash;
4592         if (unlikely(squash->rsi_uid != 0 &&
4593                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4594                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4595                         squash_id = true;
4596         }
4597         if (squash_id) {
4598                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4599                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4600                        squash->rsi_uid, squash->rsi_gid);
4601
4602                 /* update current process's credentials
4603                  * and FS capability */
4604                 cred = prepare_creds();
4605                 if (cred == NULL)
4606                         RETURN(-ENOMEM);
4607
4608                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4609                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4610                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4611                         if ((1 << cap) & CFS_CAP_FS_MASK)
4612                                 cap_lower(cred->cap_effective, cap);
4613                 }
4614                 old_cred = override_creds(cred);
4615         }
4616
4617         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4618         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4619         /* restore current process's credentials and FS capability */
4620         if (squash_id) {
4621                 revert_creds(old_cred);
4622                 put_cred(cred);
4623         }
4624
4625         RETURN(rc);
4626 }
4627
4628 /* -o localflock - only provides locally consistent flock locks */
4629 struct file_operations ll_file_operations = {
4630 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4631 # ifdef HAVE_SYNC_READ_WRITE
4632         .read           = new_sync_read,
4633         .write          = new_sync_write,
4634 # endif
4635         .read_iter      = ll_file_read_iter,
4636         .write_iter     = ll_file_write_iter,
4637 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4638         .read           = ll_file_read,
4639         .aio_read       = ll_file_aio_read,
4640         .write          = ll_file_write,
4641         .aio_write      = ll_file_aio_write,
4642 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4643         .unlocked_ioctl = ll_file_ioctl,
4644         .open           = ll_file_open,
4645         .release        = ll_file_release,
4646         .mmap           = ll_file_mmap,
4647         .llseek         = ll_file_seek,
4648         .splice_read    = ll_file_splice_read,
4649         .fsync          = ll_fsync,
4650         .flush          = ll_flush
4651 };
4652
4653 struct file_operations ll_file_operations_flock = {
4654 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4655 # ifdef HAVE_SYNC_READ_WRITE
4656         .read           = new_sync_read,
4657         .write          = new_sync_write,
4658 # endif /* HAVE_SYNC_READ_WRITE */
4659         .read_iter      = ll_file_read_iter,
4660         .write_iter     = ll_file_write_iter,
4661 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4662         .read           = ll_file_read,
4663         .aio_read       = ll_file_aio_read,
4664         .write          = ll_file_write,
4665         .aio_write      = ll_file_aio_write,
4666 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4667         .unlocked_ioctl = ll_file_ioctl,
4668         .open           = ll_file_open,
4669         .release        = ll_file_release,
4670         .mmap           = ll_file_mmap,
4671         .llseek         = ll_file_seek,
4672         .splice_read    = ll_file_splice_read,
4673         .fsync          = ll_fsync,
4674         .flush          = ll_flush,
4675         .flock          = ll_file_flock,
4676         .lock           = ll_file_flock
4677 };
4678
4679 /* These are for -o noflock - to return ENOSYS on flock calls */
4680 struct file_operations ll_file_operations_noflock = {
4681 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4682 # ifdef HAVE_SYNC_READ_WRITE
4683         .read           = new_sync_read,
4684         .write          = new_sync_write,
4685 # endif /* HAVE_SYNC_READ_WRITE */
4686         .read_iter      = ll_file_read_iter,
4687         .write_iter     = ll_file_write_iter,
4688 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4689         .read           = ll_file_read,
4690         .aio_read       = ll_file_aio_read,
4691         .write          = ll_file_write,
4692         .aio_write      = ll_file_aio_write,
4693 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4694         .unlocked_ioctl = ll_file_ioctl,
4695         .open           = ll_file_open,
4696         .release        = ll_file_release,
4697         .mmap           = ll_file_mmap,
4698         .llseek         = ll_file_seek,
4699         .splice_read    = ll_file_splice_read,
4700         .fsync          = ll_fsync,
4701         .flush          = ll_flush,
4702         .flock          = ll_file_noflock,
4703         .lock           = ll_file_noflock
4704 };
4705
4706 struct inode_operations ll_file_inode_operations = {
4707         .setattr        = ll_setattr,
4708         .getattr        = ll_getattr,
4709         .permission     = ll_inode_permission,
4710 #ifdef HAVE_IOP_XATTR
4711         .setxattr       = ll_setxattr,
4712         .getxattr       = ll_getxattr,
4713         .removexattr    = ll_removexattr,
4714 #endif
4715         .listxattr      = ll_listxattr,
4716         .fiemap         = ll_fiemap,
4717 #ifdef HAVE_IOP_GET_ACL
4718         .get_acl        = ll_get_acl,
4719 #endif
4720 #ifdef HAVE_IOP_SET_ACL
4721         .set_acl        = ll_set_acl,
4722 #endif
4723 };
4724
4725 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4726 {
4727         struct ll_inode_info *lli = ll_i2info(inode);
4728         struct cl_object *obj = lli->lli_clob;
4729         struct lu_env *env;
4730         int rc;
4731         __u16 refcheck;
4732         ENTRY;
4733
4734         if (obj == NULL)
4735                 RETURN(0);
4736
4737         env = cl_env_get(&refcheck);
4738         if (IS_ERR(env))
4739                 RETURN(PTR_ERR(env));
4740
4741         rc = cl_conf_set(env, lli->lli_clob, conf);
4742         if (rc < 0)
4743                 GOTO(out, rc);
4744
4745         if (conf->coc_opc == OBJECT_CONF_SET) {
4746                 struct ldlm_lock *lock = conf->coc_lock;
4747                 struct cl_layout cl = {
4748                         .cl_layout_gen = 0,
4749                 };
4750
4751                 LASSERT(lock != NULL);
4752                 LASSERT(ldlm_has_layout(lock));
4753
4754                 /* it can only be allowed to match after layout is
4755                  * applied to inode otherwise false layout would be
4756                  * seen. Applying layout shoud happen before dropping
4757                  * the intent lock. */
4758                 ldlm_lock_allow_match(lock);
4759
4760                 rc = cl_object_layout_get(env, obj, &cl);
4761                 if (rc < 0)
4762                         GOTO(out, rc);
4763
4764                 CDEBUG(D_VFSTRACE,
4765                        DFID": layout version change: %u -> %u\n",
4766                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4767                        cl.cl_layout_gen);
4768                 ll_layout_version_set(lli, cl.cl_layout_gen);
4769         }
4770
4771 out:
4772         cl_env_put(env, &refcheck);
4773
4774         RETURN(rc);
4775 }
4776
4777 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4778 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4779
4780 {
4781         struct ll_sb_info *sbi = ll_i2sbi(inode);
4782         struct ptlrpc_request *req;
4783         struct mdt_body *body;
4784         void *lvbdata;
4785         void *lmm;
4786         int lmmsize;
4787         int rc;
4788         ENTRY;
4789
4790         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4791                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4792                lock->l_lvb_data, lock->l_lvb_len);
4793
4794         if (lock->l_lvb_data != NULL)
4795                 RETURN(0);
4796
4797         /* if layout lock was granted right away, the layout is returned
4798          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4799          * blocked and then granted via completion ast, we have to fetch
4800          * layout here. Please note that we can't use the LVB buffer in
4801          * completion AST because it doesn't have a large enough buffer */
4802         rc = ll_get_default_mdsize(sbi, &lmmsize);
4803         if (rc == 0)
4804                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4805                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, NULL, 0,
4806                                 lmmsize, 0, &req);
4807         if (rc < 0)
4808                 RETURN(rc);
4809
4810         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4811         if (body == NULL)
4812                 GOTO(out, rc = -EPROTO);
4813
4814         lmmsize = body->mbo_eadatasize;
4815         if (lmmsize == 0) /* empty layout */
4816                 GOTO(out, rc = 0);
4817
4818         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4819         if (lmm == NULL)
4820                 GOTO(out, rc = -EFAULT);
4821
4822         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4823         if (lvbdata == NULL)
4824                 GOTO(out, rc = -ENOMEM);
4825
4826         memcpy(lvbdata, lmm, lmmsize);
4827         lock_res_and_lock(lock);
4828         if (unlikely(lock->l_lvb_data == NULL)) {
4829                 lock->l_lvb_type = LVB_T_LAYOUT;
4830                 lock->l_lvb_data = lvbdata;
4831                 lock->l_lvb_len = lmmsize;
4832                 lvbdata = NULL;
4833         }
4834         unlock_res_and_lock(lock);
4835
4836         if (lvbdata)
4837                 OBD_FREE_LARGE(lvbdata, lmmsize);
4838
4839         EXIT;
4840
4841 out:
4842         ptlrpc_req_finished(req);
4843         return rc;
4844 }
4845
4846 /**
4847  * Apply the layout to the inode. Layout lock is held and will be released
4848  * in this function.
4849  */
4850 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4851                               struct inode *inode)
4852 {
4853         struct ll_inode_info *lli = ll_i2info(inode);
4854         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4855         struct ldlm_lock *lock;
4856         struct cl_object_conf conf;
4857         int rc = 0;
4858         bool lvb_ready;
4859         bool wait_layout = false;
4860         ENTRY;
4861
4862         LASSERT(lustre_handle_is_used(lockh));
4863
4864         lock = ldlm_handle2lock(lockh);
4865         LASSERT(lock != NULL);
4866         LASSERT(ldlm_has_layout(lock));
4867
4868         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4869                    PFID(&lli->lli_fid), inode);
4870
4871         /* in case this is a caching lock and reinstate with new inode */
4872         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4873
4874         lock_res_and_lock(lock);
4875         lvb_ready = ldlm_is_lvb_ready(lock);
4876         unlock_res_and_lock(lock);
4877
4878         /* checking lvb_ready is racy but this is okay. The worst case is
4879          * that multi processes may configure the file on the same time. */
4880         if (lvb_ready)
4881                 GOTO(out, rc = 0);
4882
4883         rc = ll_layout_fetch(inode, lock);
4884         if (rc < 0)
4885                 GOTO(out, rc);
4886
4887         /* for layout lock, lmm is stored in lock's lvb.
4888          * lvb_data is immutable if the lock is held so it's safe to access it
4889          * without res lock.
4890          *
4891          * set layout to file. Unlikely this will fail as old layout was
4892          * surely eliminated */
4893         memset(&conf, 0, sizeof conf);
4894         conf.coc_opc = OBJECT_CONF_SET;
4895         conf.coc_inode = inode;
4896         conf.coc_lock = lock;
4897         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4898         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4899         rc = ll_layout_conf(inode, &conf);
4900
4901         /* refresh layout failed, need to wait */
4902         wait_layout = rc == -EBUSY;
4903         EXIT;
4904 out:
4905         LDLM_LOCK_PUT(lock);
4906         ldlm_lock_decref(lockh, mode);
4907
4908         /* wait for IO to complete if it's still being used. */
4909         if (wait_layout) {
4910                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4911                        ll_get_fsname(inode->i_sb, NULL, 0),
4912                        PFID(&lli->lli_fid), inode);
4913
4914                 memset(&conf, 0, sizeof conf);
4915                 conf.coc_opc = OBJECT_CONF_WAIT;
4916                 conf.coc_inode = inode;
4917                 rc = ll_layout_conf(inode, &conf);
4918                 if (rc == 0)
4919                         rc = -EAGAIN;
4920
4921                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4922                        ll_get_fsname(inode->i_sb, NULL, 0),
4923                        PFID(&lli->lli_fid), rc);
4924         }
4925         RETURN(rc);
4926 }
4927
4928 /**
4929  * Issue layout intent RPC to MDS.
4930  * \param inode [in]    file inode
4931  * \param intent [in]   layout intent
4932  *
4933  * \retval 0    on success
4934  * \retval < 0  error code
4935  */
4936 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4937 {
4938         struct ll_inode_info  *lli = ll_i2info(inode);
4939         struct ll_sb_info     *sbi = ll_i2sbi(inode);
4940         struct md_op_data     *op_data;
4941         struct lookup_intent it;
4942         struct ptlrpc_request *req;
4943         int rc;
4944         ENTRY;
4945
4946         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4947                                      0, 0, LUSTRE_OPC_ANY, NULL);
4948         if (IS_ERR(op_data))
4949                 RETURN(PTR_ERR(op_data));
4950
4951         op_data->op_data = intent;
4952         op_data->op_data_size = sizeof(*intent);
4953
4954         memset(&it, 0, sizeof(it));
4955         it.it_op = IT_LAYOUT;
4956         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4957             intent->li_opc == LAYOUT_INTENT_TRUNC)
4958                 it.it_flags = FMODE_WRITE;
4959
4960         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4961                           ll_get_fsname(inode->i_sb, NULL, 0),
4962                           PFID(&lli->lli_fid), inode);
4963
4964         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4965                             &ll_md_blocking_ast, 0);
4966         if (it.it_request != NULL)
4967                 ptlrpc_req_finished(it.it_request);
4968         it.it_request = NULL;
4969
4970         ll_finish_md_op_data(op_data);
4971
4972         /* set lock data in case this is a new lock */
4973         if (!rc)
4974                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
4975
4976         ll_intent_drop_lock(&it);
4977
4978         RETURN(rc);
4979 }
4980
4981 /**
4982  * This function checks if there exists a LAYOUT lock on the client side,
4983  * or enqueues it if it doesn't have one in cache.
4984  *
4985  * This function will not hold layout lock so it may be revoked any time after
4986  * this function returns. Any operations depend on layout should be redone
4987  * in that case.
4988  *
4989  * This function should be called before lov_io_init() to get an uptodate
4990  * layout version, the caller should save the version number and after IO
4991  * is finished, this function should be called again to verify that layout
4992  * is not changed during IO time.
4993  */
4994 int ll_layout_refresh(struct inode *inode, __u32 *gen)
4995 {
4996         struct ll_inode_info    *lli = ll_i2info(inode);
4997         struct ll_sb_info       *sbi = ll_i2sbi(inode);
4998         struct lustre_handle lockh;
4999         struct layout_intent intent = {
5000                 .li_opc = LAYOUT_INTENT_ACCESS,
5001         };
5002         enum ldlm_mode mode;
5003         int rc;
5004         ENTRY;
5005
5006         *gen = ll_layout_version_get(lli);
5007         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5008                 RETURN(0);
5009
5010         /* sanity checks */
5011         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5012         LASSERT(S_ISREG(inode->i_mode));
5013
5014         /* take layout lock mutex to enqueue layout lock exclusively. */
5015         mutex_lock(&lli->lli_layout_mutex);
5016
5017         while (1) {
5018                 /* mostly layout lock is caching on the local side, so try to
5019                  * match it before grabbing layout lock mutex. */
5020                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5021                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5022                 if (mode != 0) { /* hit cached lock */
5023                         rc = ll_layout_lock_set(&lockh, mode, inode);
5024                         if (rc == -EAGAIN)
5025                                 continue;
5026                         break;
5027                 }
5028
5029                 rc = ll_layout_intent(inode, &intent);
5030                 if (rc != 0)
5031                         break;
5032         }
5033
5034         if (rc == 0)
5035                 *gen = ll_layout_version_get(lli);
5036         mutex_unlock(&lli->lli_layout_mutex);
5037
5038         RETURN(rc);
5039 }
5040
5041 /**
5042  * Issue layout intent RPC indicating where in a file an IO is about to write.
5043  *
5044  * \param[in] inode     file inode.
5045  * \param[in] ext       write range with start offset of fille in bytes where
5046  *                      an IO is about to write, and exclusive end offset in
5047  *                      bytes.
5048  *
5049  * \retval 0    on success
5050  * \retval < 0  error code
5051  */
5052 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5053                            struct lu_extent *ext)
5054 {
5055         struct layout_intent intent = {
5056                 .li_opc = opc,
5057                 .li_extent.e_start = ext->e_start,
5058                 .li_extent.e_end = ext->e_end,
5059         };
5060         int rc;
5061         ENTRY;
5062
5063         rc = ll_layout_intent(inode, &intent);
5064
5065         RETURN(rc);
5066 }
5067
5068 /**
5069  *  This function send a restore request to the MDT
5070  */
5071 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5072 {
5073         struct hsm_user_request *hur;
5074         int                      len, rc;
5075         ENTRY;
5076
5077         len = sizeof(struct hsm_user_request) +
5078               sizeof(struct hsm_user_item);
5079         OBD_ALLOC(hur, len);
5080         if (hur == NULL)
5081                 RETURN(-ENOMEM);
5082
5083         hur->hur_request.hr_action = HUA_RESTORE;
5084         hur->hur_request.hr_archive_id = 0;
5085         hur->hur_request.hr_flags = 0;
5086         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5087                sizeof(hur->hur_user_item[0].hui_fid));
5088         hur->hur_user_item[0].hui_extent.offset = offset;
5089         hur->hur_user_item[0].hui_extent.length = length;
5090         hur->hur_request.hr_itemcount = 1;
5091         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5092                            len, hur, NULL);
5093         OBD_FREE(hur, len);
5094         RETURN(rc);
5095 }