lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_get_fsname(inode->i_sb, NULL, 0),
 143                        PFID(&lli->lli_fid));
 144                 GOTO(out, rc = 0);
 145         }
 146
 147         OBD_ALLOC_PTR(op_data);
 148         /* We leak openhandle and request here on error, but not much to be
 149          * done in OOM case since app won't retry close on error either. */
 150         if (op_data == NULL)
 151                 GOTO(out, rc = -ENOMEM);
 152
 153         ll_prepare_close(inode, op_data, och);
 154         switch (bias) {
 155         case MDS_CLOSE_LAYOUT_MERGE:
 156                 /* merge blocks from the victim inode */
 157                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 158                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 159                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 160         case MDS_CLOSE_LAYOUT_SPLIT:
 161         case MDS_CLOSE_LAYOUT_SWAP: {
 162                 struct split_param *sp = data;
 163
 164                 LASSERT(data != NULL);
 165                 op_data->op_bias |= bias;
 166                 op_data->op_data_version = 0;
 167                 op_data->op_lease_handle = och->och_lease_handle;
 168                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 169                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 170                         op_data->op_mirror_id = sp->sp_mirror_id;
 171                 } else {
 172                         op_data->op_fid2 = *ll_inode2fid(data);
 173                 }
 174                 break;
 175         }
 176
 177         case MDS_CLOSE_RESYNC_DONE: {
 178                 struct ll_ioc_lease *ioc = data;
 179
 180                 LASSERT(data != NULL);
 181                 op_data->op_attr_blocks +=
 182                         ioc->lil_count * op_data->op_attr_blocks;
 183                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 184                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 185                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 186
 187                 op_data->op_lease_handle = och->och_lease_handle;
 188                 op_data->op_data = &ioc->lil_ids[0];
 189                 op_data->op_data_size =
 190                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 191                 break;
 192         }
 193
 194         case MDS_HSM_RELEASE:
 195                 LASSERT(data != NULL);
 196                 op_data->op_bias |= MDS_HSM_RELEASE;
 197                 op_data->op_data_version = *(__u64 *)data;
 198                 op_data->op_lease_handle = och->och_lease_handle;
 199                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 200                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 201                 break;
 202
 203         default:
 204                 LASSERT(data == NULL);
 205                 break;
 206         }
 207
 208         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 209                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 210         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 211                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 212
 213         rc = md_close(md_exp, op_data, och->och_mod, &req);
 214         if (rc != 0 && rc != -EINTR)
 215                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 216                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 217
 218         if (rc == 0 && op_data->op_bias & bias) {
 219                 struct mdt_body *body;
 220
 221                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 222                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 223                         rc = -EBUSY;
 224         }
 225
 226         ll_finish_md_op_data(op_data);
 227         EXIT;
 228 out:
 229
 230         md_clear_open_replay_data(md_exp, och);
 231         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 232         OBD_FREE_PTR(och);
 233
 234         ptlrpc_req_finished(req);       /* This is close request */
 235         return rc;
 236 }
 237
 238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 239 {
 240         struct ll_inode_info *lli = ll_i2info(inode);
 241         struct obd_client_handle **och_p;
 242         struct obd_client_handle *och;
 243         __u64 *och_usecount;
 244         int rc = 0;
 245         ENTRY;
 246
 247         if (fmode & FMODE_WRITE) {
 248                 och_p = &lli->lli_mds_write_och;
 249                 och_usecount = &lli->lli_open_fd_write_count;
 250         } else if (fmode & FMODE_EXEC) {
 251                 och_p = &lli->lli_mds_exec_och;
 252                 och_usecount = &lli->lli_open_fd_exec_count;
 253         } else {
 254                 LASSERT(fmode & FMODE_READ);
 255                 och_p = &lli->lli_mds_read_och;
 256                 och_usecount = &lli->lli_open_fd_read_count;
 257         }
 258
 259         mutex_lock(&lli->lli_och_mutex);
 260         if (*och_usecount > 0) {
 261                 /* There are still users of this handle, so skip
 262                  * freeing it. */
 263                 mutex_unlock(&lli->lli_och_mutex);
 264                 RETURN(0);
 265         }
 266
 267         och = *och_p;
 268         *och_p = NULL;
 269         mutex_unlock(&lli->lli_och_mutex);
 270
 271         if (och != NULL) {
 272                 /* There might be a race and this handle may already
 273                  * be closed. */
 274                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 275         }
 276
 277         RETURN(rc);
 278 }
 279
 280 static int ll_md_close(struct inode *inode, struct file *file)
 281 {
 282         union ldlm_policy_data policy = {
 283                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 284         };
 285         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 286         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 287         struct ll_inode_info *lli = ll_i2info(inode);
 288         struct lustre_handle lockh;
 289         enum ldlm_mode lockmode;
 290         int rc = 0;
 291         ENTRY;
 292
 293         /* clear group lock, if present */
 294         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 295                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 296
 297         if (fd->fd_lease_och != NULL) {
 298                 bool lease_broken;
 299
 300                 /* Usually the lease is not released when the
 301                  * application crashed, we need to release here. */
 302                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 303                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 304                         PFID(&lli->lli_fid), rc, lease_broken);
 305
 306                 fd->fd_lease_och = NULL;
 307         }
 308
 309         if (fd->fd_och != NULL) {
 310                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 311                 fd->fd_och = NULL;
 312                 GOTO(out, rc);
 313         }
 314
 315         /* Let's see if we have good enough OPEN lock on the file and if
 316            we can skip talking to MDS */
 317         mutex_lock(&lli->lli_och_mutex);
 318         if (fd->fd_omode & FMODE_WRITE) {
 319                 lockmode = LCK_CW;
 320                 LASSERT(lli->lli_open_fd_write_count);
 321                 lli->lli_open_fd_write_count--;
 322         } else if (fd->fd_omode & FMODE_EXEC) {
 323                 lockmode = LCK_PR;
 324                 LASSERT(lli->lli_open_fd_exec_count);
 325                 lli->lli_open_fd_exec_count--;
 326         } else {
 327                 lockmode = LCK_CR;
 328                 LASSERT(lli->lli_open_fd_read_count);
 329                 lli->lli_open_fd_read_count--;
 330         }
 331         mutex_unlock(&lli->lli_och_mutex);
 332
 333         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 334                            LDLM_IBITS, &policy, lockmode, &lockh))
 335                 rc = ll_md_real_close(inode, fd->fd_omode);
 336
 337 out:
 338         LUSTRE_FPRIVATE(file) = NULL;
 339         ll_file_data_put(fd);
 340
 341         RETURN(rc);
 342 }
 343
 344 /* While this returns an error code, fput() the caller does not, so we need
 345  * to make every effort to clean up all of our state here.  Also, applications
 346  * rarely check close errors and even if an error is returned they will not
 347  * re-try the close call.
 348  */
 349 int ll_file_release(struct inode *inode, struct file *file)
 350 {
 351         struct ll_file_data *fd;
 352         struct ll_sb_info *sbi = ll_i2sbi(inode);
 353         struct ll_inode_info *lli = ll_i2info(inode);
 354         int rc;
 355         ENTRY;
 356
 357         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 358                PFID(ll_inode2fid(inode)), inode);
 359
 360         if (inode->i_sb->s_root != file_dentry(file))
 361                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 362         fd = LUSTRE_FPRIVATE(file);
 363         LASSERT(fd != NULL);
 364
 365         /* The last ref on @file, maybe not the the owner pid of statahead,
 366          * because parent and child process can share the same file handle. */
 367         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 368                 ll_deauthorize_statahead(inode, fd);
 369
 370         if (inode->i_sb->s_root == file_dentry(file)) {
 371                 LUSTRE_FPRIVATE(file) = NULL;
 372                 ll_file_data_put(fd);
 373                 RETURN(0);
 374         }
 375
 376         if (!S_ISDIR(inode->i_mode)) {
 377                 if (lli->lli_clob != NULL)
 378                         lov_read_and_clear_async_rc(lli->lli_clob);
 379                 lli->lli_async_rc = 0;
 380         }
 381
 382         rc = ll_md_close(inode, file);
 383
 384         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 385                 libcfs_debug_dumplog();
 386
 387         RETURN(rc);
 388 }
 389
 390 static inline int ll_dom_readpage(void *data, struct page *page)
 391 {
 392         struct niobuf_local *lnb = data;
 393         void *kaddr;
 394
 395         kaddr = ll_kmap_atomic(page, KM_USER0);
 396         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 397         if (lnb->lnb_len < PAGE_SIZE)
 398                 memset(kaddr + lnb->lnb_len, 0,
 399                        PAGE_SIZE - lnb->lnb_len);
 400         flush_dcache_page(page);
 401         SetPageUptodate(page);
 402         ll_kunmap_atomic(kaddr, KM_USER0);
 403         unlock_page(page);
 404
 405         return 0;
 406 }
 407
 408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 409                         struct lookup_intent *it)
 410 {
 411         struct ll_inode_info *lli = ll_i2info(inode);
 412         struct cl_object *obj = lli->lli_clob;
 413         struct address_space *mapping = inode->i_mapping;
 414         struct page *vmpage;
 415         struct niobuf_remote *rnb;
 416         char *data;
 417         struct lustre_handle lockh;
 418         struct ldlm_lock *lock;
 419         unsigned long index, start;
 420         struct niobuf_local lnb;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
 431                 if (lock != NULL)
 432                         dom_lock = ldlm_has_dom(lock);
 433                 LDLM_LOCK_PUT(lock);
 434         }
 435         if (!dom_lock)
 436                 RETURN_EXIT;
 437
 438         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 439                                    RCL_SERVER))
 440                 RETURN_EXIT;
 441
 442         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 443         if (rnb == NULL || rnb->rnb_len == 0)
 444                 RETURN_EXIT;
 445
 446         /* LU-11595: Server may return whole file and that is OK always or
 447          * it may return just file tail and its offset must be aligned with
 448          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 449          * smaller then offset may be not aligned and that data is just ignored.
 450          */
 451         if (rnb->rnb_offset % PAGE_SIZE)
 452                 RETURN_EXIT;
 453
 454         /* Server returns whole file or just file tail if it fills in
 455          * reply buffer, in both cases total size should be inode size.
 456          */
 457         if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
 458                 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
 459                        ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
 460                        rnb->rnb_len, i_size_read(inode));
 461                 RETURN_EXIT;
 462         }
 463
 464         CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
 465                rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
 466
 467         data = (char *)rnb + sizeof(*rnb);
 468
 469         lnb.lnb_file_offset = rnb->rnb_offset;
 470         start = lnb.lnb_file_offset / PAGE_SIZE;
 471         index = 0;
 472         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 473         lnb.lnb_page_offset = 0;
 474         do {
 475                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 476                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 477                 if (lnb.lnb_len > PAGE_SIZE)
 478                         lnb.lnb_len = PAGE_SIZE;
 479
 480                 vmpage = read_cache_page(mapping, index + start,
 481                                          ll_dom_readpage, &lnb);
 482                 if (IS_ERR(vmpage)) {
 483                         CWARN("%s: cannot fill page %lu for "DFID
 484                               " with data: rc = %li\n",
 485                               ll_get_fsname(inode->i_sb, NULL, 0),
 486                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 487                               PTR_ERR(vmpage));
 488                         break;
 489                 }
 490                 put_page(vmpage);
 491                 index++;
 492         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 493         EXIT;
 494 }
 495
 496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 497                                 struct lookup_intent *itp)
 498 {
 499         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 500         struct dentry *parent = de->d_parent;
 501         const char *name = NULL;
 502         int len = 0;
 503         struct md_op_data *op_data;
 504         struct ptlrpc_request *req = NULL;
 505         int rc;
 506         ENTRY;
 507
 508         LASSERT(parent != NULL);
 509         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 510
 511         /* if server supports open-by-fid, or file name is invalid, don't pack
 512          * name in open request */
 513         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 514             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 515                 name = de->d_name.name;
 516                 len = de->d_name.len;
 517         }
 518
 519         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 520                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 521         if (IS_ERR(op_data))
 522                 RETURN(PTR_ERR(op_data));
 523         op_data->op_data = lmm;
 524         op_data->op_data_size = lmmsize;
 525
 526         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 527                             &ll_md_blocking_ast, 0);
 528         ll_finish_md_op_data(op_data);
 529         if (rc == -ESTALE) {
 530                 /* reason for keep own exit path - don`t flood log
 531                  * with messages with -ESTALE errors.
 532                  */
 533                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 534                      it_open_error(DISP_OPEN_OPEN, itp))
 535                         GOTO(out, rc);
 536                 ll_release_openhandle(de, itp);
 537                 GOTO(out, rc);
 538         }
 539
 540         if (it_disposition(itp, DISP_LOOKUP_NEG))
 541                 GOTO(out, rc = -ENOENT);
 542
 543         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 544                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 545                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 546                 GOTO(out, rc);
 547         }
 548
 549         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 550
 551         if (!rc && itp->it_lock_mode) {
 552                 ll_dom_finish_open(de->d_inode, req, itp);
 553                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 554         }
 555
 556 out:
 557         ptlrpc_req_finished(req);
 558         ll_intent_drop_lock(itp);
 559
 560         /* We did open by fid, but by the time we got to the server,
 561          * the object disappeared. If this is a create, we cannot really
 562          * tell the userspace that the file it was trying to create
 563          * does not exist. Instead let's return -ESTALE, and the VFS will
 564          * retry the create with LOOKUP_REVAL that we are going to catch
 565          * in ll_revalidate_dentry() and use lookup then.
 566          */
 567         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 568                 rc = -ESTALE;
 569
 570         RETURN(rc);
 571 }
 572
 573 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 574                        struct obd_client_handle *och)
 575 {
 576         struct mdt_body *body;
 577
 578         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 579         och->och_open_handle = body->mbo_open_handle;
 580         och->och_fid = body->mbo_fid1;
 581         och->och_lease_handle.cookie = it->it_lock_handle;
 582         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 583         och->och_flags = it->it_flags;
 584
 585         return md_set_open_replay_data(md_exp, och, it);
 586 }
 587
 588 static int ll_local_open(struct file *file, struct lookup_intent *it,
 589                          struct ll_file_data *fd, struct obd_client_handle *och)
 590 {
 591         struct inode *inode = file_inode(file);
 592         ENTRY;
 593
 594         LASSERT(!LUSTRE_FPRIVATE(file));
 595
 596         LASSERT(fd != NULL);
 597
 598         if (och) {
 599                 int rc;
 600
 601                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 602                 if (rc != 0)
 603                         RETURN(rc);
 604         }
 605
 606         LUSTRE_FPRIVATE(file) = fd;
 607         ll_readahead_init(inode, &fd->fd_ras);
 608         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 609
 610         /* ll_cl_context initialize */
 611         rwlock_init(&fd->fd_lock);
 612         INIT_LIST_HEAD(&fd->fd_lccs);
 613
 614         RETURN(0);
 615 }
 616
 617 /* Open a file, and (for the very first open) create objects on the OSTs at
 618  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 619  * creation or open until ll_lov_setstripe() ioctl is called.
 620  *
 621  * If we already have the stripe MD locally then we don't request it in
 622  * md_open(), by passing a lmm_size = 0.
 623  *
 624  * It is up to the application to ensure no other processes open this file
 625  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 626  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 627  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 628  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 629  */
 630 int ll_file_open(struct inode *inode, struct file *file)
 631 {
 632         struct ll_inode_info *lli = ll_i2info(inode);
 633         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 634                                           .it_flags = file->f_flags };
 635         struct obd_client_handle **och_p = NULL;
 636         __u64 *och_usecount = NULL;
 637         struct ll_file_data *fd;
 638         int rc = 0;
 639         ENTRY;
 640
 641         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 642                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 643
 644         it = file->private_data; /* XXX: compat macro */
 645         file->private_data = NULL; /* prevent ll_local_open assertion */
 646
 647         fd = ll_file_data_get();
 648         if (fd == NULL)
 649                 GOTO(out_nofiledata, rc = -ENOMEM);
 650
 651         fd->fd_file = file;
 652         if (S_ISDIR(inode->i_mode))
 653                 ll_authorize_statahead(inode, fd);
 654
 655         if (inode->i_sb->s_root == file_dentry(file)) {
 656                 LUSTRE_FPRIVATE(file) = fd;
 657                 RETURN(0);
 658         }
 659
 660         if (!it || !it->it_disposition) {
 661                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 662                  * because everything but O_ACCMODE mask was stripped from
 663                  * there */
 664                 if ((oit.it_flags + 1) & O_ACCMODE)
 665                         oit.it_flags++;
 666                 if (file->f_flags & O_TRUNC)
 667                         oit.it_flags |= FMODE_WRITE;
 668
 669                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 670                  * dentry_open after call to open_namei that checks permissions.
 671                  * Only nfsd_open call dentry_open directly without checking
 672                  * permissions and because of that this code below is safe.
 673                  */
 674                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 675                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 676
 677                 /* We do not want O_EXCL here, presumably we opened the file
 678                  * already? XXX - NFS implications? */
 679                 oit.it_flags &= ~O_EXCL;
 680
 681                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 682                  * created if necessary, then "IT_CREAT" should be set to keep
 683                  * consistent with it */
 684                 if (oit.it_flags & O_CREAT)
 685                         oit.it_op |= IT_CREAT;
 686
 687                 it = &oit;
 688         }
 689
 690 restart:
 691         /* Let's see if we have file open on MDS already. */
 692         if (it->it_flags & FMODE_WRITE) {
 693                 och_p = &lli->lli_mds_write_och;
 694                 och_usecount = &lli->lli_open_fd_write_count;
 695         } else if (it->it_flags & FMODE_EXEC) {
 696                 och_p = &lli->lli_mds_exec_och;
 697                 och_usecount = &lli->lli_open_fd_exec_count;
 698          } else {
 699                 och_p = &lli->lli_mds_read_och;
 700                 och_usecount = &lli->lli_open_fd_read_count;
 701         }
 702
 703         mutex_lock(&lli->lli_och_mutex);
 704         if (*och_p) { /* Open handle is present */
 705                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 706                         /* Well, there's extra open request that we do not need,
 707                            let's close it somehow. This will decref request. */
 708                         rc = it_open_error(DISP_OPEN_OPEN, it);
 709                         if (rc) {
 710                                 mutex_unlock(&lli->lli_och_mutex);
 711                                 GOTO(out_openerr, rc);
 712                         }
 713
 714                         ll_release_openhandle(file_dentry(file), it);
 715                 }
 716                 (*och_usecount)++;
 717
 718                 rc = ll_local_open(file, it, fd, NULL);
 719                 if (rc) {
 720                         (*och_usecount)--;
 721                         mutex_unlock(&lli->lli_och_mutex);
 722                         GOTO(out_openerr, rc);
 723                 }
 724         } else {
 725                 LASSERT(*och_usecount == 0);
 726                 if (!it->it_disposition) {
 727                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 728                         /* We cannot just request lock handle now, new ELC code
 729                            means that one of other OPEN locks for this file
 730                            could be cancelled, and since blocking ast handler
 731                            would attempt to grab och_mutex as well, that would
 732                            result in a deadlock */
 733                         mutex_unlock(&lli->lli_och_mutex);
 734                         /*
 735                          * Normally called under two situations:
 736                          * 1. NFS export.
 737                          * 2. A race/condition on MDS resulting in no open
 738                          *    handle to be returned from LOOKUP|OPEN request,
 739                          *    for example if the target entry was a symlink.
 740                          *
 741                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 742                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 743                          *  bit so that it's not confusing later callers.
 744                          *
 745                          *  NB; when ldd is NULL, it must have come via normal
 746                          *  lookup path only, since ll_iget_for_nfs always calls
 747                          *  ll_d_init().
 748                          */
 749                         if (ldd && ldd->lld_nfs_dentry) {
 750                                 ldd->lld_nfs_dentry = 0;
 751                                 it->it_flags |= MDS_OPEN_LOCK;
 752                         }
 753
 754                          /*
 755                          * Always specify MDS_OPEN_BY_FID because we don't want
 756                          * to get file with different fid.
 757                          */
 758                         it->it_flags |= MDS_OPEN_BY_FID;
 759                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 760                                                  it);
 761                         if (rc)
 762                                 GOTO(out_openerr, rc);
 763
 764                         goto restart;
 765                 }
 766                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 767                 if (!*och_p)
 768                         GOTO(out_och_free, rc = -ENOMEM);
 769
 770                 (*och_usecount)++;
 771
 772                 /* md_intent_lock() didn't get a request ref if there was an
 773                  * open error, so don't do cleanup on the request here
 774                  * (bug 3430) */
 775                 /* XXX (green): Should not we bail out on any error here, not
 776                  * just open error? */
 777                 rc = it_open_error(DISP_OPEN_OPEN, it);
 778                 if (rc != 0)
 779                         GOTO(out_och_free, rc);
 780
 781                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 782                          "inode %p: disposition %x, status %d\n", inode,
 783                          it_disposition(it, ~0), it->it_status);
 784
 785                 rc = ll_local_open(file, it, fd, *och_p);
 786                 if (rc)
 787                         GOTO(out_och_free, rc);
 788         }
 789         mutex_unlock(&lli->lli_och_mutex);
 790         fd = NULL;
 791
 792         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 793            different kind of OPEN lock for this same inode gets cancelled
 794            by ldlm_cancel_lru */
 795         if (!S_ISREG(inode->i_mode))
 796                 GOTO(out_och_free, rc);
 797
 798         cl_lov_delay_create_clear(&file->f_flags);
 799         GOTO(out_och_free, rc);
 800
 801 out_och_free:
 802         if (rc) {
 803                 if (och_p && *och_p) {
 804                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 805                         *och_p = NULL; /* OBD_FREE writes some magic there */
 806                         (*och_usecount)--;
 807                 }
 808                 mutex_unlock(&lli->lli_och_mutex);
 809
 810 out_openerr:
 811                 if (lli->lli_opendir_key == fd)
 812                         ll_deauthorize_statahead(inode, fd);
 813                 if (fd != NULL)
 814                         ll_file_data_put(fd);
 815         } else {
 816                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 817         }
 818
 819 out_nofiledata:
 820         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 821                 ptlrpc_req_finished(it->it_request);
 822                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 823         }
 824
 825         return rc;
 826 }
 827
 828 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 829                         struct ldlm_lock_desc *desc, void *data, int flag)
 830 {
 831         int rc;
 832         struct lustre_handle lockh;
 833         ENTRY;
 834
 835         switch (flag) {
 836         case LDLM_CB_BLOCKING:
 837                 ldlm_lock2handle(lock, &lockh);
 838                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 839                 if (rc < 0) {
 840                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 841                         RETURN(rc);
 842                 }
 843                 break;
 844         case LDLM_CB_CANCELING:
 845                 /* do nothing */
 846                 break;
 847         }
 848         RETURN(0);
 849 }
 850
 851 /**
 852  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 853  * and save it as fd->fd_och so as to force client to reopen the file even
 854  * if it has an open lock in cache already.
 855  */
 856 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 857                                 struct lustre_handle *old_open_handle)
 858 {
 859         struct ll_inode_info *lli = ll_i2info(inode);
 860         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 861         struct obd_client_handle **och_p;
 862         __u64 *och_usecount;
 863         int rc = 0;
 864         ENTRY;
 865
 866         /* Get the openhandle of the file */
 867         mutex_lock(&lli->lli_och_mutex);
 868         if (fd->fd_lease_och != NULL)
 869                 GOTO(out_unlock, rc = -EBUSY);
 870
 871         if (fd->fd_och == NULL) {
 872                 if (file->f_mode & FMODE_WRITE) {
 873                         LASSERT(lli->lli_mds_write_och != NULL);
 874                         och_p = &lli->lli_mds_write_och;
 875                         och_usecount = &lli->lli_open_fd_write_count;
 876                 } else {
 877                         LASSERT(lli->lli_mds_read_och != NULL);
 878                         och_p = &lli->lli_mds_read_och;
 879                         och_usecount = &lli->lli_open_fd_read_count;
 880                 }
 881
 882                 if (*och_usecount > 1)
 883                         GOTO(out_unlock, rc = -EBUSY);
 884
 885                 fd->fd_och = *och_p;
 886                 *och_usecount = 0;
 887                 *och_p = NULL;
 888         }
 889
 890         *old_open_handle = fd->fd_och->och_open_handle;
 891
 892         EXIT;
 893 out_unlock:
 894         mutex_unlock(&lli->lli_och_mutex);
 895         return rc;
 896 }
 897
 898 /**
 899  * Release ownership on lli_mds_*_och when putting back a file lease.
 900  */
 901 static int ll_lease_och_release(struct inode *inode, struct file *file)
 902 {
 903         struct ll_inode_info *lli = ll_i2info(inode);
 904         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 905         struct obd_client_handle **och_p;
 906         struct obd_client_handle *old_och = NULL;
 907         __u64 *och_usecount;
 908         int rc = 0;
 909         ENTRY;
 910
 911         mutex_lock(&lli->lli_och_mutex);
 912         if (file->f_mode & FMODE_WRITE) {
 913                 och_p = &lli->lli_mds_write_och;
 914                 och_usecount = &lli->lli_open_fd_write_count;
 915         } else {
 916                 och_p = &lli->lli_mds_read_och;
 917                 och_usecount = &lli->lli_open_fd_read_count;
 918         }
 919
 920         /* The file may have been open by another process (broken lease) so
 921          * *och_p is not NULL. In this case we should simply increase usecount
 922          * and close fd_och.
 923          */
 924         if (*och_p != NULL) {
 925                 old_och = fd->fd_och;
 926                 (*och_usecount)++;
 927         } else {
 928                 *och_p = fd->fd_och;
 929                 *och_usecount = 1;
 930         }
 931         fd->fd_och = NULL;
 932         mutex_unlock(&lli->lli_och_mutex);
 933
 934         if (old_och != NULL)
 935                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 936
 937         RETURN(rc);
 938 }
 939
 940 /**
 941  * Acquire a lease and open the file.
 942  */
 943 static struct obd_client_handle *
 944 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 945               __u64 open_flags)
 946 {
 947         struct lookup_intent it = { .it_op = IT_OPEN };
 948         struct ll_sb_info *sbi = ll_i2sbi(inode);
 949         struct md_op_data *op_data;
 950         struct ptlrpc_request *req = NULL;
 951         struct lustre_handle old_open_handle = { 0 };
 952         struct obd_client_handle *och = NULL;
 953         int rc;
 954         int rc2;
 955         ENTRY;
 956
 957         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 958                 RETURN(ERR_PTR(-EINVAL));
 959
 960         if (file != NULL) {
 961                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 962                         RETURN(ERR_PTR(-EPERM));
 963
 964                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
 965                 if (rc)
 966                         RETURN(ERR_PTR(rc));
 967         }
 968
 969         OBD_ALLOC_PTR(och);
 970         if (och == NULL)
 971                 RETURN(ERR_PTR(-ENOMEM));
 972
 973         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 974                                         LUSTRE_OPC_ANY, NULL);
 975         if (IS_ERR(op_data))
 976                 GOTO(out, rc = PTR_ERR(op_data));
 977
 978         /* To tell the MDT this openhandle is from the same owner */
 979         op_data->op_open_handle = old_open_handle;
 980
 981         it.it_flags = fmode | open_flags;
 982         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 983         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 984                             &ll_md_blocking_lease_ast,
 985         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 986          * it can be cancelled which may mislead applications that the lease is
 987          * broken;
 988          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 989          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 990          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 991                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 992         ll_finish_md_op_data(op_data);
 993         ptlrpc_req_finished(req);
 994         if (rc < 0)
 995                 GOTO(out_release_it, rc);
 996
 997         if (it_disposition(&it, DISP_LOOKUP_NEG))
 998                 GOTO(out_release_it, rc = -ENOENT);
 999
1000         rc = it_open_error(DISP_OPEN_OPEN, &it);
1001         if (rc)
1002                 GOTO(out_release_it, rc);
1003
1004         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1005         ll_och_fill(sbi->ll_md_exp, &it, och);
1006
1007         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1008                 GOTO(out_close, rc = -EOPNOTSUPP);
1009
1010         /* already get lease, handle lease lock */
1011         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1012         if (it.it_lock_mode == 0 ||
1013             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1014                 /* open lock must return for lease */
1015                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1016                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1017                         it.it_lock_bits);
1018                 GOTO(out_close, rc = -EPROTO);
1019         }
1020
1021         ll_intent_release(&it);
1022         RETURN(och);
1023
1024 out_close:
1025         /* Cancel open lock */
1026         if (it.it_lock_mode != 0) {
1027                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1028                                             it.it_lock_mode);
1029                 it.it_lock_mode = 0;
1030                 och->och_lease_handle.cookie = 0ULL;
1031         }
1032         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1033         if (rc2 < 0)
1034                 CERROR("%s: error closing file "DFID": %d\n",
1035                        ll_get_fsname(inode->i_sb, NULL, 0),
1036                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1037         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1038 out_release_it:
1039         ll_intent_release(&it);
1040 out:
1041         if (och != NULL)
1042                 OBD_FREE_PTR(och);
1043         RETURN(ERR_PTR(rc));
1044 }
1045
1046 /**
1047  * Check whether a layout swap can be done between two inodes.
1048  *
1049  * \param[in] inode1  First inode to check
1050  * \param[in] inode2  Second inode to check
1051  *
1052  * \retval 0 on success, layout swap can be performed between both inodes
1053  * \retval negative error code if requirements are not met
1054  */
1055 static int ll_check_swap_layouts_validity(struct inode *inode1,
1056                                           struct inode *inode2)
1057 {
1058         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1059                 return -EINVAL;
1060
1061         if (inode_permission(inode1, MAY_WRITE) ||
1062             inode_permission(inode2, MAY_WRITE))
1063                 return -EPERM;
1064
1065         if (inode1->i_sb != inode2->i_sb)
1066                 return -EXDEV;
1067
1068         return 0;
1069 }
1070
1071 static int ll_swap_layouts_close(struct obd_client_handle *och,
1072                                  struct inode *inode, struct inode *inode2)
1073 {
1074         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1075         const struct lu_fid     *fid2;
1076         int                      rc;
1077         ENTRY;
1078
1079         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1080                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1081
1082         rc = ll_check_swap_layouts_validity(inode, inode2);
1083         if (rc < 0)
1084                 GOTO(out_free_och, rc);
1085
1086         /* We now know that inode2 is a lustre inode */
1087         fid2 = ll_inode2fid(inode2);
1088
1089         rc = lu_fid_cmp(fid1, fid2);
1090         if (rc == 0)
1091                 GOTO(out_free_och, rc = -EINVAL);
1092
1093         /* Close the file and {swap,merge} layouts between inode & inode2.
1094          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1095          * because we still need it to pack l_remote_handle to MDT. */
1096         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1097                                        inode2);
1098
1099         och = NULL; /* freed in ll_close_inode_openhandle() */
1100
1101 out_free_och:
1102         if (och != NULL)
1103                 OBD_FREE_PTR(och);
1104
1105         RETURN(rc);
1106 }
1107
1108 /**
1109  * Release lease and close the file.
1110  * It will check if the lease has ever broken.
1111  */
1112 static int ll_lease_close_intent(struct obd_client_handle *och,
1113                                  struct inode *inode,
1114                                  bool *lease_broken, enum mds_op_bias bias,
1115                                  void *data)
1116 {
1117         struct ldlm_lock *lock;
1118         bool cancelled = true;
1119         int rc;
1120         ENTRY;
1121
1122         lock = ldlm_handle2lock(&och->och_lease_handle);
1123         if (lock != NULL) {
1124                 lock_res_and_lock(lock);
1125                 cancelled = ldlm_is_cancel(lock);
1126                 unlock_res_and_lock(lock);
1127                 LDLM_LOCK_PUT(lock);
1128         }
1129
1130         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1131                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1132
1133         if (lease_broken != NULL)
1134                 *lease_broken = cancelled;
1135
1136         if (!cancelled && !bias)
1137                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1138
1139         if (cancelled) { /* no need to excute intent */
1140                 bias = 0;
1141                 data = NULL;
1142         }
1143
1144         rc = ll_close_inode_openhandle(inode, och, bias, data);
1145         RETURN(rc);
1146 }
1147
1148 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1149                           bool *lease_broken)
1150 {
1151         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1152 }
1153
1154 /**
1155  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1156  */
1157 static int ll_lease_file_resync(struct obd_client_handle *och,
1158                                 struct inode *inode, unsigned long arg)
1159 {
1160         struct ll_sb_info *sbi = ll_i2sbi(inode);
1161         struct md_op_data *op_data;
1162         struct ll_ioc_lease_id ioc;
1163         __u64 data_version_unused;
1164         int rc;
1165         ENTRY;
1166
1167         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1168                                      LUSTRE_OPC_ANY, NULL);
1169         if (IS_ERR(op_data))
1170                 RETURN(PTR_ERR(op_data));
1171
1172         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1173                            sizeof(ioc)))
1174                 RETURN(-EFAULT);
1175
1176         /* before starting file resync, it's necessary to clean up page cache
1177          * in client memory, otherwise once the layout version is increased,
1178          * writing back cached data will be denied the OSTs. */
1179         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1180         if (rc)
1181                 GOTO(out, rc);
1182
1183         op_data->op_lease_handle = och->och_lease_handle;
1184         op_data->op_mirror_id = ioc.lil_mirror_id;
1185         rc = md_file_resync(sbi->ll_md_exp, op_data);
1186         if (rc)
1187                 GOTO(out, rc);
1188
1189         EXIT;
1190 out:
1191         ll_finish_md_op_data(op_data);
1192         return rc;
1193 }
1194
1195 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1196 {
1197         struct ll_inode_info *lli = ll_i2info(inode);
1198         struct cl_object *obj = lli->lli_clob;
1199         struct cl_attr *attr = vvp_env_thread_attr(env);
1200         s64 atime;
1201         s64 mtime;
1202         s64 ctime;
1203         int rc = 0;
1204
1205         ENTRY;
1206
1207         ll_inode_size_lock(inode);
1208
1209         /* Merge timestamps the most recently obtained from MDS with
1210          * timestamps obtained from OSTs.
1211          *
1212          * Do not overwrite atime of inode because it may be refreshed
1213          * by file_accessed() function. If the read was served by cache
1214          * data, there is no RPC to be sent so that atime may not be
1215          * transferred to OSTs at all. MDT only updates atime at close time
1216          * if it's at least 'mdd.*.atime_diff' older.
1217          * All in all, the atime in Lustre does not strictly comply with
1218          * POSIX. Solving this problem needs to send an RPC to MDT for each
1219          * read, this will hurt performance. */
1220         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1221                 LTIME_S(inode->i_atime) = lli->lli_atime;
1222                 lli->lli_update_atime = 0;
1223         }
1224         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1225         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1226
1227         atime = LTIME_S(inode->i_atime);
1228         mtime = LTIME_S(inode->i_mtime);
1229         ctime = LTIME_S(inode->i_ctime);
1230
1231         cl_object_attr_lock(obj);
1232         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1233                 rc = -EINVAL;
1234         else
1235                 rc = cl_object_attr_get(env, obj, attr);
1236         cl_object_attr_unlock(obj);
1237
1238         if (rc != 0)
1239                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1240
1241         if (atime < attr->cat_atime)
1242                 atime = attr->cat_atime;
1243
1244         if (ctime < attr->cat_ctime)
1245                 ctime = attr->cat_ctime;
1246
1247         if (mtime < attr->cat_mtime)
1248                 mtime = attr->cat_mtime;
1249
1250         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1251                PFID(&lli->lli_fid), attr->cat_size);
1252
1253         i_size_write(inode, attr->cat_size);
1254         inode->i_blocks = attr->cat_blocks;
1255
1256         LTIME_S(inode->i_atime) = atime;
1257         LTIME_S(inode->i_mtime) = mtime;
1258         LTIME_S(inode->i_ctime) = ctime;
1259
1260 out_size_unlock:
1261         ll_inode_size_unlock(inode);
1262
1263         RETURN(rc);
1264 }
1265
1266 /**
1267  * Set designated mirror for I/O.
1268  *
1269  * So far only read, write, and truncated can support to issue I/O to
1270  * designated mirror.
1271  */
1272 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1273 {
1274         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1275
1276         /* clear layout version for generic(non-resync) I/O in case it carries
1277          * stale layout version due to I/O restart */
1278         io->ci_layout_version = 0;
1279
1280         /* FLR: disable non-delay for designated mirror I/O because obviously
1281          * only one mirror is available */
1282         if (fd->fd_designated_mirror > 0) {
1283                 io->ci_ndelay = 0;
1284                 io->ci_designated_mirror = fd->fd_designated_mirror;
1285                 io->ci_layout_version = fd->fd_layout_version;
1286         }
1287
1288         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1289                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1290 }
1291
1292 static bool file_is_noatime(const struct file *file)
1293 {
1294         const struct vfsmount *mnt = file->f_path.mnt;
1295         const struct inode *inode = file_inode((struct file *)file);
1296
1297         /* Adapted from file_accessed() and touch_atime().*/
1298         if (file->f_flags & O_NOATIME)
1299                 return true;
1300
1301         if (inode->i_flags & S_NOATIME)
1302                 return true;
1303
1304         if (IS_NOATIME(inode))
1305                 return true;
1306
1307         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1308                 return true;
1309
1310         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1311                 return true;
1312
1313         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1314                 return true;
1315
1316         return false;
1317 }
1318
1319 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1320 {
1321         struct inode *inode = file_inode(file);
1322         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1323
1324         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1325         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1326
1327         if (iot == CIT_WRITE) {
1328                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1329                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1330                                            file->f_flags & O_DIRECT ||
1331                                            IS_SYNC(inode));
1332         }
1333         io->ci_obj = ll_i2info(inode)->lli_clob;
1334         io->ci_lockreq = CILR_MAYBE;
1335         if (ll_file_nolock(file)) {
1336                 io->ci_lockreq = CILR_NEVER;
1337                 io->ci_no_srvlock = 1;
1338         } else if (file->f_flags & O_APPEND) {
1339                 io->ci_lockreq = CILR_MANDATORY;
1340         }
1341         io->ci_noatime = file_is_noatime(file);
1342
1343         /* FLR: only use non-delay I/O for read as there is only one
1344          * avaliable mirror for write. */
1345         io->ci_ndelay = !(iot == CIT_WRITE);
1346
1347         ll_io_set_mirror(io, file);
1348 }
1349
1350 static ssize_t
1351 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1352                    struct file *file, enum cl_io_type iot,
1353                    loff_t *ppos, size_t count)
1354 {
1355         struct vvp_io           *vio = vvp_env_io(env);
1356         struct inode            *inode = file_inode(file);
1357         struct ll_inode_info    *lli = ll_i2info(inode);
1358         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1359         struct range_lock       range;
1360         struct cl_io            *io;
1361         ssize_t                 result = 0;
1362         int                     rc = 0;
1363         unsigned                retried = 0;
1364         bool                    restarted = false;
1365
1366         ENTRY;
1367
1368         CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1369                 file_dentry(file)->d_name.name,
1370                 iot == CIT_READ ? "read" : "write", *ppos, count);
1371
1372 restart:
1373         io = vvp_env_thread_io(env);
1374         ll_io_init(io, file, iot);
1375         io->ci_ndelay_tried = retried;
1376
1377         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1378                 bool range_locked = false;
1379
1380                 if (file->f_flags & O_APPEND)
1381                         range_lock_init(&range, 0, LUSTRE_EOF);
1382                 else
1383                         range_lock_init(&range, *ppos, *ppos + count - 1);
1384
1385                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1386                 vio->vui_io_subtype = args->via_io_subtype;
1387
1388                 switch (vio->vui_io_subtype) {
1389                 case IO_NORMAL:
1390                         vio->vui_iter = args->u.normal.via_iter;
1391                         vio->vui_iocb = args->u.normal.via_iocb;
1392                         /* Direct IO reads must also take range lock,
1393                          * or multiple reads will try to work on the same pages
1394                          * See LU-6227 for details. */
1395                         if (((iot == CIT_WRITE) ||
1396                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1397                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1398                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1399                                        RL_PARA(&range));
1400                                 rc = range_lock(&lli->lli_write_tree, &range);
1401                                 if (rc < 0)
1402                                         GOTO(out, rc);
1403
1404                                 range_locked = true;
1405                         }
1406                         break;
1407                 case IO_SPLICE:
1408                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1409                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1410                         break;
1411                 default:
1412                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1413                         LBUG();
1414                 }
1415
1416                 ll_cl_add(file, env, io, LCC_RW);
1417                 rc = cl_io_loop(env, io);
1418                 ll_cl_remove(file, env);
1419
1420                 if (range_locked) {
1421                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1422                                RL_PARA(&range));
1423                         range_unlock(&lli->lli_write_tree, &range);
1424                 }
1425         } else {
1426                 /* cl_io_rw_init() handled IO */
1427                 rc = io->ci_result;
1428         }
1429
1430         if (io->ci_nob > 0) {
1431                 result += io->ci_nob;
1432                 count  -= io->ci_nob;
1433                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1434
1435                 /* prepare IO restart */
1436                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1437                         args->u.normal.via_iter = vio->vui_iter;
1438         }
1439 out:
1440         cl_io_fini(env, io);
1441
1442         CDEBUG(D_VFSTRACE,
1443                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1444                file->f_path.dentry->d_name.name,
1445                iot, rc, result, io->ci_need_restart);
1446
1447         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1448                 CDEBUG(D_VFSTRACE,
1449                        "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1450                        file_dentry(file)->d_name.name,
1451                        iot == CIT_READ ? "read" : "write",
1452                        *ppos, count, result, rc);
1453                 /* preserve the tried count for FLR */
1454                 retried = io->ci_ndelay_tried;
1455                 restarted = true;
1456                 goto restart;
1457         }
1458
1459         if (iot == CIT_READ) {
1460                 if (result > 0)
1461                         ll_stats_ops_tally(ll_i2sbi(inode),
1462                                            LPROC_LL_READ_BYTES, result);
1463         } else if (iot == CIT_WRITE) {
1464                 if (result > 0) {
1465                         ll_stats_ops_tally(ll_i2sbi(inode),
1466                                            LPROC_LL_WRITE_BYTES, result);
1467                         fd->fd_write_failed = false;
1468                 } else if (result == 0 && rc == 0) {
1469                         rc = io->ci_result;
1470                         if (rc < 0)
1471                                 fd->fd_write_failed = true;
1472                         else
1473                                 fd->fd_write_failed = false;
1474                 } else if (rc != -ERESTARTSYS) {
1475                         fd->fd_write_failed = true;
1476                 }
1477         }
1478
1479         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1480
1481         RETURN(result > 0 ? result : rc);
1482 }
1483
1484 /**
1485  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1486  * especially for small I/O.
1487  *
1488  * To serve a read request, CLIO has to create and initialize a cl_io and
1489  * then request DLM lock. This has turned out to have siginificant overhead
1490  * and affects the performance of small I/O dramatically.
1491  *
1492  * It's not necessary to create a cl_io for each I/O. Under the help of read
1493  * ahead, most of the pages being read are already in memory cache and we can
1494  * read those pages directly because if the pages exist, the corresponding DLM
1495  * lock must exist so that page content must be valid.
1496  *
1497  * In fast read implementation, the llite speculatively finds and reads pages
1498  * in memory cache. There are three scenarios for fast read:
1499  *   - If the page exists and is uptodate, kernel VM will provide the data and
1500  *     CLIO won't be intervened;
1501  *   - If the page was brought into memory by read ahead, it will be exported
1502  *     and read ahead parameters will be updated;
1503  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1504  *     it will go back and invoke normal read, i.e., a cl_io will be created
1505  *     and DLM lock will be requested.
1506  *
1507  * POSIX compliance: posix standard states that read is intended to be atomic.
1508  * Lustre read implementation is in line with Linux kernel read implementation
1509  * and neither of them complies with POSIX standard in this matter. Fast read
1510  * doesn't make the situation worse on single node but it may interleave write
1511  * results from multiple nodes due to short read handling in ll_file_aio_read().
1512  *
1513  * \param env - lu_env
1514  * \param iocb - kiocb from kernel
1515  * \param iter - user space buffers where the data will be copied
1516  *
1517  * \retval - number of bytes have been read, or error code if error occurred.
1518  */
1519 static ssize_t
1520 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1521 {
1522         ssize_t result;
1523
1524         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1525                 return 0;
1526
1527         /* NB: we can't do direct IO for fast read because it will need a lock
1528          * to make IO engine happy. */
1529         if (iocb->ki_filp->f_flags & O_DIRECT)
1530                 return 0;
1531
1532         result = generic_file_read_iter(iocb, iter);
1533
1534         /* If the first page is not in cache, generic_file_aio_read() will be
1535          * returned with -ENODATA.
1536          * See corresponding code in ll_readpage(). */
1537         if (result == -ENODATA)
1538                 result = 0;
1539
1540         if (result > 0)
1541                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1542                                 LPROC_LL_READ_BYTES, result);
1543
1544         return result;
1545 }
1546
1547 /*
1548  * Read from a file (through the page cache).
1549  */
1550 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1551 {
1552         struct lu_env *env;
1553         struct vvp_io_args *args;
1554         ssize_t result;
1555         ssize_t rc2;
1556         __u16 refcheck;
1557
1558         result = ll_do_fast_read(iocb, to);
1559         if (result < 0 || iov_iter_count(to) == 0)
1560                 GOTO(out, result);
1561
1562         env = cl_env_get(&refcheck);
1563         if (IS_ERR(env))
1564                 return PTR_ERR(env);
1565
1566         args = ll_env_args(env, IO_NORMAL);
1567         args->u.normal.via_iter = to;
1568         args->u.normal.via_iocb = iocb;
1569
1570         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1571                                  &iocb->ki_pos, iov_iter_count(to));
1572         if (rc2 > 0)
1573                 result += rc2;
1574         else if (result == 0)
1575                 result = rc2;
1576
1577         cl_env_put(env, &refcheck);
1578 out:
1579         return result;
1580 }
1581
1582 /**
1583  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1584  * If a page is already in the page cache and dirty (and some other things -
1585  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1586  * write to it without doing a full I/O, because Lustre already knows about it
1587  * and will write it out.  This saves a lot of processing time.
1588  *
1589  * All writes here are within one page, so exclusion is handled by the page
1590  * lock on the vm page.  We do not do tiny writes for writes which touch
1591  * multiple pages because it's very unlikely multiple sequential pages are
1592  * are already dirty.
1593  *
1594  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1595  * and are unlikely to be to already dirty pages.
1596  *
1597  * Attribute updates are important here, we do them in ll_tiny_write_end.
1598  */
1599 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1600 {
1601         ssize_t count = iov_iter_count(iter);
1602         struct file *file = iocb->ki_filp;
1603         struct inode *inode = file_inode(file);
1604         ssize_t result = 0;
1605
1606         ENTRY;
1607
1608         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1609          * of function for why.
1610          */
1611         if (count >= PAGE_SIZE ||
1612             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1613                 RETURN(0);
1614
1615         result = __generic_file_write_iter(iocb, iter);
1616
1617         /* If the page is not already dirty, ll_tiny_write_begin returns
1618          * -ENODATA.  We continue on to normal write.
1619          */
1620         if (result == -ENODATA)
1621                 result = 0;
1622
1623         if (result > 0) {
1624                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1625                                    result);
1626                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1627         }
1628
1629         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1630
1631         RETURN(result);
1632 }
1633
1634 /*
1635  * Write to a file (through the page cache).
1636  */
1637 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1638 {
1639         struct vvp_io_args *args;
1640         struct lu_env *env;
1641         ssize_t rc_tiny = 0, rc_normal;
1642         __u16 refcheck;
1643
1644         ENTRY;
1645
1646         /* NB: we can't do direct IO for tiny writes because they use the page
1647          * cache, we can't do sync writes because tiny writes can't flush
1648          * pages, and we can't do append writes because we can't guarantee the
1649          * required DLM locks are held to protect file size.
1650          */
1651         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1652             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1653                 rc_tiny = ll_do_tiny_write(iocb, from);
1654
1655         /* In case of error, go on and try normal write - Only stop if tiny
1656          * write completed I/O.
1657          */
1658         if (iov_iter_count(from) == 0)
1659                 GOTO(out, rc_normal = rc_tiny);
1660
1661         env = cl_env_get(&refcheck);
1662         if (IS_ERR(env))
1663                 return PTR_ERR(env);
1664
1665         args = ll_env_args(env, IO_NORMAL);
1666         args->u.normal.via_iter = from;
1667         args->u.normal.via_iocb = iocb;
1668
1669         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1670                                     &iocb->ki_pos, iov_iter_count(from));
1671
1672         /* On success, combine bytes written. */
1673         if (rc_tiny >= 0 && rc_normal > 0)
1674                 rc_normal += rc_tiny;
1675         /* On error, only return error from normal write if tiny write did not
1676          * write any bytes.  Otherwise return bytes written by tiny write.
1677          */
1678         else if (rc_tiny > 0)
1679                 rc_normal = rc_tiny;
1680
1681         cl_env_put(env, &refcheck);
1682 out:
1683         RETURN(rc_normal);
1684 }
1685
1686 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1687 /*
1688  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1689  */
1690 static int ll_file_get_iov_count(const struct iovec *iov,
1691                                  unsigned long *nr_segs, size_t *count)
1692 {
1693         size_t cnt = 0;
1694         unsigned long seg;
1695
1696         for (seg = 0; seg < *nr_segs; seg++) {
1697                 const struct iovec *iv = &iov[seg];
1698
1699                 /*
1700                  * If any segment has a negative length, or the cumulative
1701                  * length ever wraps negative then return -EINVAL.
1702                  */
1703                 cnt += iv->iov_len;
1704                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1705                         return -EINVAL;
1706                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1707                         continue;
1708                 if (seg == 0)
1709                         return -EFAULT;
1710                 *nr_segs = seg;
1711                 cnt -= iv->iov_len;     /* This segment is no good */
1712                 break;
1713         }
1714         *count = cnt;
1715         return 0;
1716 }
1717
1718 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1719                                 unsigned long nr_segs, loff_t pos)
1720 {
1721         struct iov_iter to;
1722         size_t iov_count;
1723         ssize_t result;
1724         ENTRY;
1725
1726         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1727         if (result)
1728                 RETURN(result);
1729
1730 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1731         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1732 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1733         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1734 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1735
1736         result = ll_file_read_iter(iocb, &to);
1737
1738         RETURN(result);
1739 }
1740
1741 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1742                             loff_t *ppos)
1743 {
1744         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1745         struct kiocb   kiocb;
1746         ssize_t        result;
1747         ENTRY;
1748
1749         init_sync_kiocb(&kiocb, file);
1750         kiocb.ki_pos = *ppos;
1751 #ifdef HAVE_KIOCB_KI_LEFT
1752         kiocb.ki_left = count;
1753 #elif defined(HAVE_KI_NBYTES)
1754         kiocb.i_nbytes = count;
1755 #endif
1756
1757         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1758         *ppos = kiocb.ki_pos;
1759
1760         RETURN(result);
1761 }
1762
1763 /*
1764  * Write to a file (through the page cache).
1765  * AIO stuff
1766  */
1767 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1768                                  unsigned long nr_segs, loff_t pos)
1769 {
1770         struct iov_iter from;
1771         size_t iov_count;
1772         ssize_t result;
1773         ENTRY;
1774
1775         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1776         if (result)
1777                 RETURN(result);
1778
1779 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1780         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1781 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1782         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1783 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1784
1785         result = ll_file_write_iter(iocb, &from);
1786
1787         RETURN(result);
1788 }
1789
1790 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1791                              size_t count, loff_t *ppos)
1792 {
1793         struct iovec   iov = { .iov_base = (void __user *)buf,
1794                                .iov_len = count };
1795         struct kiocb   kiocb;
1796         ssize_t        result;
1797
1798         ENTRY;
1799
1800         init_sync_kiocb(&kiocb, file);
1801         kiocb.ki_pos = *ppos;
1802 #ifdef HAVE_KIOCB_KI_LEFT
1803         kiocb.ki_left = count;
1804 #elif defined(HAVE_KI_NBYTES)
1805         kiocb.ki_nbytes = count;
1806 #endif
1807
1808         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1809         *ppos = kiocb.ki_pos;
1810
1811         RETURN(result);
1812 }
1813 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1814
1815 /*
1816  * Send file content (through pagecache) somewhere with helper
1817  */
1818 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1819                                    struct pipe_inode_info *pipe, size_t count,
1820                                    unsigned int flags)
1821 {
1822         struct lu_env      *env;
1823         struct vvp_io_args *args;
1824         ssize_t             result;
1825         __u16               refcheck;
1826         ENTRY;
1827
1828         env = cl_env_get(&refcheck);
1829         if (IS_ERR(env))
1830                 RETURN(PTR_ERR(env));
1831
1832         args = ll_env_args(env, IO_SPLICE);
1833         args->u.splice.via_pipe = pipe;
1834         args->u.splice.via_flags = flags;
1835
1836         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1837         cl_env_put(env, &refcheck);
1838         RETURN(result);
1839 }
1840
1841 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1842                              __u64 flags, struct lov_user_md *lum, int lum_size)
1843 {
1844         struct lookup_intent oit = {
1845                 .it_op = IT_OPEN,
1846                 .it_flags = flags | MDS_OPEN_BY_FID,
1847         };
1848         int rc;
1849         ENTRY;
1850
1851         ll_inode_size_lock(inode);
1852         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1853         if (rc < 0)
1854                 GOTO(out_unlock, rc);
1855
1856         ll_release_openhandle(dentry, &oit);
1857
1858 out_unlock:
1859         ll_inode_size_unlock(inode);
1860         ll_intent_release(&oit);
1861
1862         RETURN(rc);
1863 }
1864
1865 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1866                              struct lov_mds_md **lmmp, int *lmm_size,
1867                              struct ptlrpc_request **request)
1868 {
1869         struct ll_sb_info *sbi = ll_i2sbi(inode);
1870         struct mdt_body  *body;
1871         struct lov_mds_md *lmm = NULL;
1872         struct ptlrpc_request *req = NULL;
1873         struct md_op_data *op_data;
1874         int rc, lmmsize;
1875
1876         rc = ll_get_default_mdsize(sbi, &lmmsize);
1877         if (rc)
1878                 RETURN(rc);
1879
1880         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1881                                      strlen(filename), lmmsize,
1882                                      LUSTRE_OPC_ANY, NULL);
1883         if (IS_ERR(op_data))
1884                 RETURN(PTR_ERR(op_data));
1885
1886         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1887         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1888         ll_finish_md_op_data(op_data);
1889         if (rc < 0) {
1890                 CDEBUG(D_INFO, "md_getattr_name failed "
1891                        "on %s: rc %d\n", filename, rc);
1892                 GOTO(out, rc);
1893         }
1894
1895         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1896         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1897
1898         lmmsize = body->mbo_eadatasize;
1899
1900         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1901                         lmmsize == 0) {
1902                 GOTO(out, rc = -ENODATA);
1903         }
1904
1905         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1906         LASSERT(lmm != NULL);
1907
1908         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1909             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1910             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1911                 GOTO(out, rc = -EPROTO);
1912
1913         /*
1914          * This is coming from the MDS, so is probably in
1915          * little endian.  We convert it to host endian before
1916          * passing it to userspace.
1917          */
1918         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1919                 int stripe_count;
1920
1921                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1922                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1923                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1924                         if (le32_to_cpu(lmm->lmm_pattern) &
1925                             LOV_PATTERN_F_RELEASED)
1926                                 stripe_count = 0;
1927                 }
1928
1929                 /* if function called for directory - we should
1930                  * avoid swab not existent lsm objects */
1931                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1932                         lustre_swab_lov_user_md_v1(
1933                                         (struct lov_user_md_v1 *)lmm);
1934                         if (S_ISREG(body->mbo_mode))
1935                                 lustre_swab_lov_user_md_objects(
1936                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1937                                     stripe_count);
1938                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1939                         lustre_swab_lov_user_md_v3(
1940                                         (struct lov_user_md_v3 *)lmm);
1941                         if (S_ISREG(body->mbo_mode))
1942                                 lustre_swab_lov_user_md_objects(
1943                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1944                                     stripe_count);
1945                 } else if (lmm->lmm_magic ==
1946                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
1947                         lustre_swab_lov_comp_md_v1(
1948                                         (struct lov_comp_md_v1 *)lmm);
1949                 }
1950         }
1951
1952 out:
1953         *lmmp = lmm;
1954         *lmm_size = lmmsize;
1955         *request = req;
1956         return rc;
1957 }
1958
1959 static int ll_lov_setea(struct inode *inode, struct file *file,
1960                         void __user *arg)
1961 {
1962         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1963         struct lov_user_md      *lump;
1964         int                      lum_size = sizeof(struct lov_user_md) +
1965                                             sizeof(struct lov_user_ost_data);
1966         int                      rc;
1967         ENTRY;
1968
1969         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1970                 RETURN(-EPERM);
1971
1972         OBD_ALLOC_LARGE(lump, lum_size);
1973         if (lump == NULL)
1974                 RETURN(-ENOMEM);
1975
1976         if (copy_from_user(lump, arg, lum_size))
1977                 GOTO(out_lump, rc = -EFAULT);
1978
1979         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
1980                                       lum_size);
1981         cl_lov_delay_create_clear(&file->f_flags);
1982
1983 out_lump:
1984         OBD_FREE_LARGE(lump, lum_size);
1985         RETURN(rc);
1986 }
1987
1988 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
1989 {
1990         struct lu_env   *env;
1991         __u16           refcheck;
1992         int             rc;
1993         ENTRY;
1994
1995         env = cl_env_get(&refcheck);
1996         if (IS_ERR(env))
1997                 RETURN(PTR_ERR(env));
1998
1999         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2000         cl_env_put(env, &refcheck);
2001         RETURN(rc);
2002 }
2003
2004 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2005                             void __user *arg)
2006 {
2007         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2008         struct lov_user_md        *klum;
2009         int                        lum_size, rc;
2010         __u64                      flags = FMODE_WRITE;
2011         ENTRY;
2012
2013         rc = ll_copy_user_md(lum, &klum);
2014         if (rc < 0)
2015                 RETURN(rc);
2016
2017         lum_size = rc;
2018         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2019                                       lum_size);
2020         if (!rc) {
2021                 __u32 gen;
2022
2023                 rc = put_user(0, &lum->lmm_stripe_count);
2024                 if (rc)
2025                         GOTO(out, rc);
2026
2027                 rc = ll_layout_refresh(inode, &gen);
2028                 if (rc)
2029                         GOTO(out, rc);
2030
2031                 rc = ll_file_getstripe(inode, arg, lum_size);
2032         }
2033         cl_lov_delay_create_clear(&file->f_flags);
2034
2035 out:
2036         OBD_FREE(klum, lum_size);
2037         RETURN(rc);
2038 }
2039
2040 static int
2041 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2042 {
2043         struct ll_inode_info *lli = ll_i2info(inode);
2044         struct cl_object *obj = lli->lli_clob;
2045         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2046         struct ll_grouplock grouplock;
2047         int rc;
2048         ENTRY;
2049
2050         if (arg == 0) {
2051                 CWARN("group id for group lock must not be 0\n");
2052                 RETURN(-EINVAL);
2053         }
2054
2055         if (ll_file_nolock(file))
2056                 RETURN(-EOPNOTSUPP);
2057
2058         spin_lock(&lli->lli_lock);
2059         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2060                 CWARN("group lock already existed with gid %lu\n",
2061                       fd->fd_grouplock.lg_gid);
2062                 spin_unlock(&lli->lli_lock);
2063                 RETURN(-EINVAL);
2064         }
2065         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2066         spin_unlock(&lli->lli_lock);
2067
2068         /**
2069          * XXX: group lock needs to protect all OST objects while PFL
2070          * can add new OST objects during the IO, so we'd instantiate
2071          * all OST objects before getting its group lock.
2072          */
2073         if (obj) {
2074                 struct lu_env *env;
2075                 __u16 refcheck;
2076                 struct cl_layout cl = {
2077                         .cl_is_composite = false,
2078                 };
2079                 struct lu_extent ext = {
2080                         .e_start = 0,
2081                         .e_end = OBD_OBJECT_EOF,
2082                 };
2083
2084                 env = cl_env_get(&refcheck);
2085                 if (IS_ERR(env))
2086                         RETURN(PTR_ERR(env));
2087
2088                 rc = cl_object_layout_get(env, obj, &cl);
2089                 if (!rc && cl.cl_is_composite)
2090                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2091                                                     &ext);
2092
2093                 cl_env_put(env, &refcheck);
2094                 if (rc)
2095                         RETURN(rc);
2096         }
2097
2098         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2099                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2100         if (rc)
2101                 RETURN(rc);
2102
2103         spin_lock(&lli->lli_lock);
2104         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2105                 spin_unlock(&lli->lli_lock);
2106                 CERROR("another thread just won the race\n");
2107                 cl_put_grouplock(&grouplock);
2108                 RETURN(-EINVAL);
2109         }
2110
2111         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2112         fd->fd_grouplock = grouplock;
2113         spin_unlock(&lli->lli_lock);
2114
2115         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2116         RETURN(0);
2117 }
2118
2119 static int ll_put_grouplock(struct inode *inode, struct file *file,
2120                             unsigned long arg)
2121 {
2122         struct ll_inode_info   *lli = ll_i2info(inode);
2123         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2124         struct ll_grouplock     grouplock;
2125         ENTRY;
2126
2127         spin_lock(&lli->lli_lock);
2128         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2129                 spin_unlock(&lli->lli_lock);
2130                 CWARN("no group lock held\n");
2131                 RETURN(-EINVAL);
2132         }
2133
2134         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2135
2136         if (fd->fd_grouplock.lg_gid != arg) {
2137                 CWARN("group lock %lu doesn't match current id %lu\n",
2138                       arg, fd->fd_grouplock.lg_gid);
2139                 spin_unlock(&lli->lli_lock);
2140                 RETURN(-EINVAL);
2141         }
2142
2143         grouplock = fd->fd_grouplock;
2144         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2145         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2146         spin_unlock(&lli->lli_lock);
2147
2148         cl_put_grouplock(&grouplock);
2149         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2150         RETURN(0);
2151 }
2152
2153 /**
2154  * Close inode open handle
2155  *
2156  * \param dentry [in]     dentry which contains the inode
2157  * \param it     [in,out] intent which contains open info and result
2158  *
2159  * \retval 0     success
2160  * \retval <0    failure
2161  */
2162 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2163 {
2164         struct inode *inode = dentry->d_inode;
2165         struct obd_client_handle *och;
2166         int rc;
2167         ENTRY;
2168
2169         LASSERT(inode);
2170
2171         /* Root ? Do nothing. */
2172         if (dentry->d_inode->i_sb->s_root == dentry)
2173                 RETURN(0);
2174
2175         /* No open handle to close? Move away */
2176         if (!it_disposition(it, DISP_OPEN_OPEN))
2177                 RETURN(0);
2178
2179         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2180
2181         OBD_ALLOC(och, sizeof(*och));
2182         if (!och)
2183                 GOTO(out, rc = -ENOMEM);
2184
2185         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2186
2187         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2188 out:
2189         /* this one is in place of ll_file_open */
2190         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2191                 ptlrpc_req_finished(it->it_request);
2192                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2193         }
2194         RETURN(rc);
2195 }
2196
2197 /**
2198  * Get size for inode for which FIEMAP mapping is requested.
2199  * Make the FIEMAP get_info call and returns the result.
2200  * \param fiemap        kernel buffer to hold extens
2201  * \param num_bytes     kernel buffer size
2202  */
2203 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2204                         size_t num_bytes)
2205 {
2206         struct lu_env                   *env;
2207         __u16                           refcheck;
2208         int                             rc = 0;
2209         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2210         ENTRY;
2211
2212         /* Checks for fiemap flags */
2213         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2214                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2215                 return -EBADR;
2216         }
2217
2218         /* Check for FIEMAP_FLAG_SYNC */
2219         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2220                 rc = filemap_fdatawrite(inode->i_mapping);
2221                 if (rc)
2222                         return rc;
2223         }
2224
2225         env = cl_env_get(&refcheck);
2226         if (IS_ERR(env))
2227                 RETURN(PTR_ERR(env));
2228
2229         if (i_size_read(inode) == 0) {
2230                 rc = ll_glimpse_size(inode);
2231                 if (rc)
2232                         GOTO(out, rc);
2233         }
2234
2235         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2236         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2237         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2238
2239         /* If filesize is 0, then there would be no objects for mapping */
2240         if (fmkey.lfik_oa.o_size == 0) {
2241                 fiemap->fm_mapped_extents = 0;
2242                 GOTO(out, rc = 0);
2243         }
2244
2245         fmkey.lfik_fiemap = *fiemap;
2246
2247         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2248                               &fmkey, fiemap, &num_bytes);
2249 out:
2250         cl_env_put(env, &refcheck);
2251         RETURN(rc);
2252 }
2253
2254 int ll_fid2path(struct inode *inode, void __user *arg)
2255 {
2256         struct obd_export       *exp = ll_i2mdexp(inode);
2257         const struct getinfo_fid2path __user *gfin = arg;
2258         __u32                    pathlen;
2259         struct getinfo_fid2path *gfout;
2260         size_t                   outsize;
2261         int                      rc;
2262
2263         ENTRY;
2264
2265         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2266             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2267                 RETURN(-EPERM);
2268
2269         /* Only need to get the buflen */
2270         if (get_user(pathlen, &gfin->gf_pathlen))
2271                 RETURN(-EFAULT);
2272
2273         if (pathlen > PATH_MAX)
2274                 RETURN(-EINVAL);
2275
2276         outsize = sizeof(*gfout) + pathlen;
2277         OBD_ALLOC(gfout, outsize);
2278         if (gfout == NULL)
2279                 RETURN(-ENOMEM);
2280
2281         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2282                 GOTO(gf_free, rc = -EFAULT);
2283         /* append root FID after gfout to let MDT know the root FID so that it
2284          * can lookup the correct path, this is mainly for fileset.
2285          * old server without fileset mount support will ignore this. */
2286         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2287
2288         /* Call mdc_iocontrol */
2289         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2290         if (rc != 0)
2291                 GOTO(gf_free, rc);
2292
2293         if (copy_to_user(arg, gfout, outsize))
2294                 rc = -EFAULT;
2295
2296 gf_free:
2297         OBD_FREE(gfout, outsize);
2298         RETURN(rc);
2299 }
2300
2301 static int
2302 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2303 {
2304         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2305         struct lu_env *env;
2306         struct cl_io *io;
2307         __u16  refcheck;
2308         int result;
2309
2310         ENTRY;
2311
2312         ioc->idv_version = 0;
2313         ioc->idv_layout_version = UINT_MAX;
2314
2315         /* If no file object initialized, we consider its version is 0. */
2316         if (obj == NULL)
2317                 RETURN(0);
2318
2319         env = cl_env_get(&refcheck);
2320         if (IS_ERR(env))
2321                 RETURN(PTR_ERR(env));
2322
2323         io = vvp_env_thread_io(env);
2324         io->ci_obj = obj;
2325         io->u.ci_data_version.dv_data_version = 0;
2326         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2327         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2328
2329 restart:
2330         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2331                 result = cl_io_loop(env, io);
2332         else
2333                 result = io->ci_result;
2334
2335         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2336         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2337
2338         cl_io_fini(env, io);
2339
2340         if (unlikely(io->ci_need_restart))
2341                 goto restart;
2342
2343         cl_env_put(env, &refcheck);
2344
2345         RETURN(result);
2346 }
2347
2348 /*
2349  * Read the data_version for inode.
2350  *
2351  * This value is computed using stripe object version on OST.
2352  * Version is computed using server side locking.
2353  *
2354  * @param flags if do sync on the OST side;
2355  *              0: no sync
2356  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2357  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2358  */
2359 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2360 {
2361         struct ioc_data_version ioc = { .idv_flags = flags };
2362         int rc;
2363
2364         rc = ll_ioc_data_version(inode, &ioc);
2365         if (!rc)
2366                 *data_version = ioc.idv_version;
2367
2368         return rc;
2369 }
2370
2371 /*
2372  * Trigger a HSM release request for the provided inode.
2373  */
2374 int ll_hsm_release(struct inode *inode)
2375 {
2376         struct lu_env *env;
2377         struct obd_client_handle *och = NULL;
2378         __u64 data_version = 0;
2379         int rc;
2380         __u16 refcheck;
2381         ENTRY;
2382
2383         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2384                ll_get_fsname(inode->i_sb, NULL, 0),
2385                PFID(&ll_i2info(inode)->lli_fid));
2386
2387         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2388         if (IS_ERR(och))
2389                 GOTO(out, rc = PTR_ERR(och));
2390
2391         /* Grab latest data_version and [am]time values */
2392         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2393         if (rc != 0)
2394                 GOTO(out, rc);
2395
2396         env = cl_env_get(&refcheck);
2397         if (IS_ERR(env))
2398                 GOTO(out, rc = PTR_ERR(env));
2399
2400         rc = ll_merge_attr(env, inode);
2401         cl_env_put(env, &refcheck);
2402
2403         /* If error happen, we have the wrong size for a file.
2404          * Don't release it.
2405          */
2406         if (rc != 0)
2407                 GOTO(out, rc);
2408
2409         /* Release the file.
2410          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2411          * we still need it to pack l_remote_handle to MDT. */
2412         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2413                                        &data_version);
2414         och = NULL;
2415
2416         EXIT;
2417 out:
2418         if (och != NULL && !IS_ERR(och)) /* close the file */
2419                 ll_lease_close(och, inode, NULL);
2420
2421         return rc;
2422 }
2423
/*
 * Working state of ll_swap_layouts().  The "1" and "2" members are kept in
 * matching pairs so that they can all be exchanged together when the two
 * inodes are reordered by FID.
 */
struct ll_swap_stack {
        /* data version expected for inode1 (taken from sl_dv1) */
        __u64                    dv1;
        /* data version expected for inode2 (taken from sl_dv2) */
        __u64                    dv2;
        /* the two inodes whose layouts are being swapped */
        struct inode            *inode1;
        struct inode            *inode2;
        /* true when SWAP_LAYOUTS_CHECK_DV1 was requested: compare dv1 */
        bool                     check_dv1;
        /* true when SWAP_LAYOUTS_CHECK_DV2 was requested: compare dv2 */
        bool                     check_dv2;
};
2432
2433 static int ll_swap_layouts(struct file *file1, struct file *file2,
2434                            struct lustre_swap_layouts *lsl)
2435 {
2436         struct mdc_swap_layouts  msl;
2437         struct md_op_data       *op_data;
2438         __u32                    gid;
2439         __u64                    dv;
2440         struct ll_swap_stack    *llss = NULL;
2441         int                      rc;
2442
2443         OBD_ALLOC_PTR(llss);
2444         if (llss == NULL)
2445                 RETURN(-ENOMEM);
2446
2447         llss->inode1 = file_inode(file1);
2448         llss->inode2 = file_inode(file2);
2449
2450         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2451         if (rc < 0)
2452                 GOTO(free, rc);
2453
2454         /* we use 2 bool because it is easier to swap than 2 bits */
2455         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2456                 llss->check_dv1 = true;
2457
2458         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2459                 llss->check_dv2 = true;
2460
2461         /* we cannot use lsl->sl_dvX directly because we may swap them */
2462         llss->dv1 = lsl->sl_dv1;
2463         llss->dv2 = lsl->sl_dv2;
2464
2465         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2466         if (rc == 0) /* same file, done! */
2467                 GOTO(free, rc);
2468
2469         if (rc < 0) { /* sequentialize it */
2470                 swap(llss->inode1, llss->inode2);
2471                 swap(file1, file2);
2472                 swap(llss->dv1, llss->dv2);
2473                 swap(llss->check_dv1, llss->check_dv2);
2474         }
2475
2476         gid = lsl->sl_gid;
2477         if (gid != 0) { /* application asks to flush dirty cache */
2478                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2479                 if (rc < 0)
2480                         GOTO(free, rc);
2481
2482                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2483                 if (rc < 0) {
2484                         ll_put_grouplock(llss->inode1, file1, gid);
2485                         GOTO(free, rc);
2486                 }
2487         }
2488
2489         /* ultimate check, before swaping the layouts we check if
2490          * dataversion has changed (if requested) */
2491         if (llss->check_dv1) {
2492                 rc = ll_data_version(llss->inode1, &dv, 0);
2493                 if (rc)
2494                         GOTO(putgl, rc);
2495                 if (dv != llss->dv1)
2496                         GOTO(putgl, rc = -EAGAIN);
2497         }
2498
2499         if (llss->check_dv2) {
2500                 rc = ll_data_version(llss->inode2, &dv, 0);
2501                 if (rc)
2502                         GOTO(putgl, rc);
2503                 if (dv != llss->dv2)
2504                         GOTO(putgl, rc = -EAGAIN);
2505         }
2506
2507         /* struct md_op_data is used to send the swap args to the mdt
2508          * only flags is missing, so we use struct mdc_swap_layouts
2509          * through the md_op_data->op_data */
2510         /* flags from user space have to be converted before they are send to
2511          * server, no flag is sent today, they are only used on the client */
2512         msl.msl_flags = 0;
2513         rc = -ENOMEM;
2514         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2515                                      0, LUSTRE_OPC_ANY, &msl);
2516         if (IS_ERR(op_data))
2517                 GOTO(free, rc = PTR_ERR(op_data));
2518
2519         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2520                            sizeof(*op_data), op_data, NULL);
2521         ll_finish_md_op_data(op_data);
2522
2523         if (rc < 0)
2524                 GOTO(putgl, rc);
2525
2526 putgl:
2527         if (gid != 0) {
2528                 ll_put_grouplock(llss->inode2, file2, gid);
2529                 ll_put_grouplock(llss->inode1, file1, gid);
2530         }
2531
2532 free:
2533         if (llss != NULL)
2534                 OBD_FREE_PTR(llss);
2535
2536         RETURN(rc);
2537 }
2538
2539 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2540 {
2541         struct obd_export *exp = ll_i2mdexp(inode);
2542         struct md_op_data *op_data;
2543         int rc;
2544         ENTRY;
2545
2546         /* Detect out-of range masks */
2547         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2548                 RETURN(-EINVAL);
2549
2550         /* Non-root users are forbidden to set or clear flags which are
2551          * NOT defined in HSM_USER_MASK. */
2552         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2553             !cfs_capable(CFS_CAP_SYS_ADMIN))
2554                 RETURN(-EPERM);
2555
2556         if (!exp_connect_archive_id_array(exp)) {
2557                 /* Detect out-of range archive id */
2558                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2559                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2560                         RETURN(-EINVAL);
2561         }
2562
2563         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2564                                      LUSTRE_OPC_ANY, hss);
2565         if (IS_ERR(op_data))
2566                 RETURN(PTR_ERR(op_data));
2567
2568         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2569                            op_data, NULL);
2570
2571         ll_finish_md_op_data(op_data);
2572
2573         RETURN(rc);
2574 }
2575
2576 static int ll_hsm_import(struct inode *inode, struct file *file,
2577                          struct hsm_user_import *hui)
2578 {
2579         struct hsm_state_set    *hss = NULL;
2580         struct iattr            *attr = NULL;
2581         int                      rc;
2582         ENTRY;
2583
2584         if (!S_ISREG(inode->i_mode))
2585                 RETURN(-EINVAL);
2586
2587         /* set HSM flags */
2588         OBD_ALLOC_PTR(hss);
2589         if (hss == NULL)
2590                 GOTO(out, rc = -ENOMEM);
2591
2592         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2593         hss->hss_archive_id = hui->hui_archive_id;
2594         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2595         rc = ll_hsm_state_set(inode, hss);
2596         if (rc != 0)
2597                 GOTO(out, rc);
2598
2599         OBD_ALLOC_PTR(attr);
2600         if (attr == NULL)
2601                 GOTO(out, rc = -ENOMEM);
2602
2603         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2604         attr->ia_mode |= S_IFREG;
2605         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2606         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2607         attr->ia_size = hui->hui_size;
2608         attr->ia_mtime.tv_sec = hui->hui_mtime;
2609         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2610         attr->ia_atime.tv_sec = hui->hui_atime;
2611         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2612
2613         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2614                          ATTR_UID | ATTR_GID |
2615                          ATTR_MTIME | ATTR_MTIME_SET |
2616                          ATTR_ATIME | ATTR_ATIME_SET;
2617
2618         inode_lock(inode);
2619
2620         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2621         if (rc == -ENODATA)
2622                 rc = 0;
2623
2624         inode_unlock(inode);
2625
2626 out:
2627         if (hss != NULL)
2628                 OBD_FREE_PTR(hss);
2629
2630         if (attr != NULL)
2631                 OBD_FREE_PTR(attr);
2632
2633         RETURN(rc);
2634 }
2635
2636 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2637 {
2638         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2639                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2640 }
2641
2642 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2643 {
2644         struct inode *inode = file_inode(file);
2645         struct iattr ia = {
2646                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2647                             ATTR_MTIME | ATTR_MTIME_SET |
2648                             ATTR_CTIME,
2649                 .ia_atime = {
2650                         .tv_sec = lfu->lfu_atime_sec,
2651                         .tv_nsec = lfu->lfu_atime_nsec,
2652                 },
2653                 .ia_mtime = {
2654                         .tv_sec = lfu->lfu_mtime_sec,
2655                         .tv_nsec = lfu->lfu_mtime_nsec,
2656                 },
2657                 .ia_ctime = {
2658                         .tv_sec = lfu->lfu_ctime_sec,
2659                         .tv_nsec = lfu->lfu_ctime_nsec,
2660                 },
2661         };
2662         int rc;
2663         ENTRY;
2664
2665         if (!capable(CAP_SYS_ADMIN))
2666                 RETURN(-EPERM);
2667
2668         if (!S_ISREG(inode->i_mode))
2669                 RETURN(-EINVAL);
2670
2671         inode_lock(inode);
2672         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2673                             false);
2674         inode_unlock(inode);
2675
2676         RETURN(rc);
2677 }
2678
2679 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2680 {
2681         switch (mode) {
2682         case MODE_READ_USER:
2683                 return CLM_READ;
2684         case MODE_WRITE_USER:
2685                 return CLM_WRITE;
2686         default:
2687                 return -EINVAL;
2688         }
2689 }
2690
2691 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2692
2693 /* Used to allow the upper layers of the client to request an LDLM lock
2694  * without doing an actual read or write.
2695  *
2696  * Used for ladvise lockahead to manually request specific locks.
2697  *
2698  * \param[in] file      file this ladvise lock request is on
2699  * \param[in] ladvise   ladvise struct describing this lock request
2700  *
2701  * \retval 0            success, no detailed result available (sync requests
2702  *                      and requests sent to the server [not handled locally]
2703  *                      cannot return detailed results)
2704  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2705  *                                       see definitions for details.
2706  * \retval negative     negative errno on error
2707  */
2708 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2709 {
2710         struct lu_env *env = NULL;
2711         struct cl_io *io  = NULL;
2712         struct cl_lock *lock = NULL;
2713         struct cl_lock_descr *descr = NULL;
2714         struct dentry *dentry = file->f_path.dentry;
2715         struct inode *inode = dentry->d_inode;
2716         enum cl_lock_mode cl_mode;
2717         off_t start = ladvise->lla_start;
2718         off_t end = ladvise->lla_end;
2719         int result;
2720         __u16 refcheck;
2721
2722         ENTRY;
2723
2724         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2725                "start=%llu, end=%llu\n", dentry->d_name.len,
2726                dentry->d_name.name, dentry->d_inode,
2727                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2728                (__u64) end);
2729
2730         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2731         if (cl_mode < 0)
2732                 GOTO(out, result = cl_mode);
2733
2734         /* Get IO environment */
2735         result = cl_io_get(inode, &env, &io, &refcheck);
2736         if (result <= 0)
2737                 GOTO(out, result);
2738
2739         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2740         if (result > 0) {
2741                 /*
2742                  * nothing to do for this io. This currently happens when
2743                  * stripe sub-object's are not yet created.
2744                  */
2745                 result = io->ci_result;
2746         } else if (result == 0) {
2747                 lock = vvp_env_lock(env);
2748                 descr = &lock->cll_descr;
2749
2750                 descr->cld_obj   = io->ci_obj;
2751                 /* Convert byte offsets to pages */
2752                 descr->cld_start = cl_index(io->ci_obj, start);
2753                 descr->cld_end   = cl_index(io->ci_obj, end);
2754                 descr->cld_mode  = cl_mode;
2755                 /* CEF_MUST is used because we do not want to convert a
2756                  * lockahead request to a lockless lock */
2757                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2758                                        CEF_NONBLOCK;
2759
2760                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2761                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2762
2763                 result = cl_lock_request(env, io, lock);
2764
2765                 /* On success, we need to release the lock */
2766                 if (result >= 0)
2767                         cl_lock_release(env, lock);
2768         }
2769         cl_io_fini(env, io);
2770         cl_env_put(env, &refcheck);
2771
2772         /* -ECANCELED indicates a matching lock with a different extent
2773          * was already present, and -EEXIST indicates a matching lock
2774          * on exactly the same extent was already present.
2775          * We convert them to positive values for userspace to make
2776          * recognizing true errors easier.
2777          * Note we can only return these detailed results on async requests,
2778          * as sync requests look the same as i/o requests for locking. */
2779         if (result == -ECANCELED)
2780                 result = LLA_RESULT_DIFFERENT;
2781         else if (result == -EEXIST)
2782                 result = LLA_RESULT_SAME;
2783
2784 out:
2785         RETURN(result);
2786 }
2787 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2788
2789 static int ll_ladvise_sanity(struct inode *inode,
2790                              struct llapi_lu_ladvise *ladvise)
2791 {
2792         enum lu_ladvise_type advice = ladvise->lla_advice;
2793         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2794          * be in the first 32 bits of enum ladvise_flags */
2795         __u32 flags = ladvise->lla_peradvice_flags;
2796         /* 3 lines at 80 characters per line, should be plenty */
2797         int rc = 0;
2798
2799         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2800                 rc = -EINVAL;
2801                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2802                        "last supported advice is %s (value '%d'): rc = %d\n",
2803                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2804                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2805                 GOTO(out, rc);
2806         }
2807
2808         /* Per-advice checks */
2809         switch (advice) {
2810         case LU_LADVISE_LOCKNOEXPAND:
2811                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2812                         rc = -EINVAL;
2813                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2814                                "rc = %d\n",
2815                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2816                                ladvise_names[advice], rc);
2817                         GOTO(out, rc);
2818                 }
2819                 break;
2820         case LU_LADVISE_LOCKAHEAD:
2821                 /* Currently only READ and WRITE modes can be requested */
2822                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2823                     ladvise->lla_lockahead_mode == 0) {
2824                         rc = -EINVAL;
2825                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2826                                "rc = %d\n",
2827                                ll_get_fsname(inode->i_sb, NULL, 0),
2828                                ladvise->lla_lockahead_mode,
2829                                ladvise_names[advice], rc);
2830                         GOTO(out, rc);
2831                 }
2832         case LU_LADVISE_WILLREAD:
2833         case LU_LADVISE_DONTNEED:
2834         default:
2835                 /* Note fall through above - These checks apply to all advices
2836                  * except LOCKNOEXPAND */
2837                 if (flags & ~LF_DEFAULT_MASK) {
2838                         rc = -EINVAL;
2839                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2840                                "rc = %d\n",
2841                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2842                                ladvise_names[advice], rc);
2843                         GOTO(out, rc);
2844                 }
2845                 if (ladvise->lla_start >= ladvise->lla_end) {
2846                         rc = -EINVAL;
2847                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2848                                "for %s: rc = %d\n",
2849                                ll_get_fsname(inode->i_sb, NULL, 0),
2850                                ladvise->lla_start, ladvise->lla_end,
2851                                ladvise_names[advice], rc);
2852                         GOTO(out, rc);
2853                 }
2854                 break;
2855         }
2856
2857 out:
2858         return rc;
2859 }
2860 #undef ERRSIZE
2861
2862 /*
2863  * Give file access advices
2864  *
2865  * The ladvise interface is similar to Linux fadvise() system call, except it
2866  * forwards the advices directly from Lustre client to server. The server side
2867  * codes will apply appropriate read-ahead and caching techniques for the
2868  * corresponding files.
2869  *
2870  * A typical workload for ladvise is e.g. a bunch of different clients are
2871  * doing small random reads of a file, so prefetching pages into OSS cache
2872  * with big linear reads before the random IO is a net benefit. Fetching
2873  * all that data into each client cache with fadvise() may not be, due to
2874  * much more data being sent to the client.
2875  */
2876 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2877                       struct llapi_lu_ladvise *ladvise)
2878 {
2879         struct lu_env *env;
2880         struct cl_io *io;
2881         struct cl_ladvise_io *lio;
2882         int rc;
2883         __u16 refcheck;
2884         ENTRY;
2885
2886         env = cl_env_get(&refcheck);
2887         if (IS_ERR(env))
2888                 RETURN(PTR_ERR(env));
2889
2890         io = vvp_env_thread_io(env);
2891         io->ci_obj = ll_i2info(inode)->lli_clob;
2892
2893         /* initialize parameters for ladvise */
2894         lio = &io->u.ci_ladvise;
2895         lio->li_start = ladvise->lla_start;
2896         lio->li_end = ladvise->lla_end;
2897         lio->li_fid = ll_inode2fid(inode);
2898         lio->li_advice = ladvise->lla_advice;
2899         lio->li_flags = flags;
2900
2901         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2902                 rc = cl_io_loop(env, io);
2903         else
2904                 rc = io->ci_result;
2905
2906         cl_io_fini(env, io);
2907         cl_env_put(env, &refcheck);
2908         RETURN(rc);
2909 }
2910
2911 static int ll_lock_noexpand(struct file *file, int flags)
2912 {
2913         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2914
2915         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2916
2917         return 0;
2918 }
2919
2920 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2921                         unsigned long arg)
2922 {
2923         struct fsxattr fsxattr;
2924
2925         if (copy_from_user(&fsxattr,
2926                            (const struct fsxattr __user *)arg,
2927                            sizeof(fsxattr)))
2928                 RETURN(-EFAULT);
2929
2930         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2931         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2932                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2933         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2934         if (copy_to_user((struct fsxattr __user *)arg,
2935                          &fsxattr, sizeof(fsxattr)))
2936                 RETURN(-EFAULT);
2937
2938         RETURN(0);
2939 }
2940
2941 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
2942 {
2943         /*
2944          * Project Quota ID state is only allowed to change from within the init
2945          * namespace. Enforce that restriction only if we are trying to change
2946          * the quota ID state. Everything else is allowed in user namespaces.
2947          */
2948         if (current_user_ns() == &init_user_ns)
2949                 return 0;
2950
2951         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
2952                 return -EINVAL;
2953
2954         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
2955                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
2956                         return -EINVAL;
2957         } else {
2958                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
2959                         return -EINVAL;
2960         }
2961
2962         return 0;
2963 }
2964
2965 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
2966                         unsigned long arg)
2967 {
2968
2969         struct md_op_data *op_data;
2970         struct ptlrpc_request *req = NULL;
2971         int rc = 0;
2972         struct fsxattr fsxattr;
2973         struct cl_object *obj;
2974         struct iattr *attr;
2975         int flags;
2976
2977         if (copy_from_user(&fsxattr,
2978                            (const struct fsxattr __user *)arg,
2979                            sizeof(fsxattr)))
2980                 RETURN(-EFAULT);
2981
2982         rc = ll_ioctl_check_project(inode, &fsxattr);
2983         if (rc)
2984                 RETURN(rc);
2985
2986         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2987                                      LUSTRE_OPC_ANY, NULL);
2988         if (IS_ERR(op_data))
2989                 RETURN(PTR_ERR(op_data));
2990
2991         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
2992         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
2993         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
2994                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
2995         op_data->op_projid = fsxattr.fsx_projid;
2996         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
2997         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
2998                         0, &req);
2999         ptlrpc_req_finished(req);
3000         if (rc)
3001                 GOTO(out_fsxattr, rc);
3002         ll_update_inode_flags(inode, op_data->op_attr_flags);
3003         obj = ll_i2info(inode)->lli_clob;
3004         if (obj == NULL)
3005                 GOTO(out_fsxattr, rc);
3006
3007         OBD_ALLOC_PTR(attr);
3008         if (attr == NULL)
3009                 GOTO(out_fsxattr, rc = -ENOMEM);
3010
3011         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3012                             fsxattr.fsx_xflags);
3013         OBD_FREE_PTR(attr);
3014 out_fsxattr:
3015         ll_finish_md_op_data(op_data);
3016         RETURN(rc);
3017 }
3018
3019 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3020                                  unsigned long arg)
3021 {
3022         struct inode            *inode = file_inode(file);
3023         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3024         struct ll_inode_info    *lli = ll_i2info(inode);
3025         struct obd_client_handle *och = NULL;
3026         struct split_param sp;
3027         bool lease_broken;
3028         fmode_t fmode = 0;
3029         enum mds_op_bias bias = 0;
3030         struct file *layout_file = NULL;
3031         void *data = NULL;
3032         size_t data_size = 0;
3033         long rc;
3034         ENTRY;
3035
3036         mutex_lock(&lli->lli_och_mutex);
3037         if (fd->fd_lease_och != NULL) {
3038                 och = fd->fd_lease_och;
3039                 fd->fd_lease_och = NULL;
3040         }
3041         mutex_unlock(&lli->lli_och_mutex);
3042
3043         if (och == NULL)
3044                 GOTO(out, rc = -ENOLCK);
3045
3046         fmode = och->och_flags;
3047
3048         switch (ioc->lil_flags) {
3049         case LL_LEASE_RESYNC_DONE:
3050                 if (ioc->lil_count > IOC_IDS_MAX)
3051                         GOTO(out, rc = -EINVAL);
3052
3053                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3054                 OBD_ALLOC(data, data_size);
3055                 if (!data)
3056                         GOTO(out, rc = -ENOMEM);
3057
3058                 if (copy_from_user(data, (void __user *)arg, data_size))
3059                         GOTO(out, rc = -EFAULT);
3060
3061                 bias = MDS_CLOSE_RESYNC_DONE;
3062                 break;
3063         case LL_LEASE_LAYOUT_MERGE: {
3064                 int fd;
3065
3066                 if (ioc->lil_count != 1)
3067                         GOTO(out, rc = -EINVAL);
3068
3069                 arg += sizeof(*ioc);
3070                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3071                         GOTO(out, rc = -EFAULT);
3072
3073                 layout_file = fget(fd);
3074                 if (!layout_file)
3075                         GOTO(out, rc = -EBADF);
3076
3077                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3078                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3079                         GOTO(out, rc = -EPERM);
3080
3081                 data = file_inode(layout_file);
3082                 bias = MDS_CLOSE_LAYOUT_MERGE;
3083                 break;
3084         }
3085         case LL_LEASE_LAYOUT_SPLIT: {
3086                 int fdv;
3087                 int mirror_id;
3088
3089                 if (ioc->lil_count != 2)
3090                         GOTO(out, rc = -EINVAL);
3091
3092                 arg += sizeof(*ioc);
3093                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3094                         GOTO(out, rc = -EFAULT);
3095
3096                 arg += sizeof(__u32);
3097                 if (copy_from_user(&mirror_id, (void __user *)arg,
3098                                    sizeof(__u32)))
3099                         GOTO(out, rc = -EFAULT);
3100
3101                 layout_file = fget(fdv);
3102                 if (!layout_file)
3103                         GOTO(out, rc = -EBADF);
3104
3105                 sp.sp_inode = file_inode(layout_file);
3106                 sp.sp_mirror_id = (__u16)mirror_id;
3107                 data = &sp;
3108                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3109                 break;
3110         }
3111         default:
3112                 /* without close intent */
3113                 break;
3114         }
3115
3116         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3117         if (rc < 0)
3118                 GOTO(out, rc);
3119
3120         rc = ll_lease_och_release(inode, file);
3121         if (rc < 0)
3122                 GOTO(out, rc);
3123
3124         if (lease_broken)
3125                 fmode = 0;
3126         EXIT;
3127
3128 out:
3129         switch (ioc->lil_flags) {
3130         case LL_LEASE_RESYNC_DONE:
3131                 if (data)
3132                         OBD_FREE(data, data_size);
3133                 break;
3134         case LL_LEASE_LAYOUT_MERGE:
3135         case LL_LEASE_LAYOUT_SPLIT:
3136                 if (layout_file)
3137                         fput(layout_file);
3138                 break;
3139         }
3140
3141         if (!rc)
3142                 rc = ll_lease_type_from_fmode(fmode);
3143         RETURN(rc);
3144 }
3145
3146 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3147                               unsigned long arg)
3148 {
3149         struct inode *inode = file_inode(file);
3150         struct ll_inode_info *lli = ll_i2info(inode);
3151         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3152         struct obd_client_handle *och = NULL;
3153         __u64 open_flags = 0;
3154         bool lease_broken;
3155         fmode_t fmode;
3156         long rc;
3157         ENTRY;
3158
3159         switch (ioc->lil_mode) {
3160         case LL_LEASE_WRLCK:
3161                 if (!(file->f_mode & FMODE_WRITE))
3162                         RETURN(-EPERM);
3163                 fmode = FMODE_WRITE;
3164                 break;
3165         case LL_LEASE_RDLCK:
3166                 if (!(file->f_mode & FMODE_READ))
3167                         RETURN(-EPERM);
3168                 fmode = FMODE_READ;
3169                 break;
3170         case LL_LEASE_UNLCK:
3171                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3172         default:
3173                 RETURN(-EINVAL);
3174         }
3175
3176         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3177
3178         /* apply for lease */
3179         if (ioc->lil_flags & LL_LEASE_RESYNC)
3180                 open_flags = MDS_OPEN_RESYNC;
3181         och = ll_lease_open(inode, file, fmode, open_flags);
3182         if (IS_ERR(och))
3183                 RETURN(PTR_ERR(och));
3184
3185         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3186                 rc = ll_lease_file_resync(och, inode, arg);
3187                 if (rc) {
3188                         ll_lease_close(och, inode, NULL);
3189                         RETURN(rc);
3190                 }
3191                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3192                 if (rc) {
3193                         ll_lease_close(och, inode, NULL);
3194                         RETURN(rc);
3195                 }
3196         }
3197
3198         rc = 0;
3199         mutex_lock(&lli->lli_och_mutex);
3200         if (fd->fd_lease_och == NULL) {
3201                 fd->fd_lease_och = och;
3202                 och = NULL;
3203         }
3204         mutex_unlock(&lli->lli_och_mutex);
3205         if (och != NULL) {
3206                 /* impossible now that only excl is supported for now */
3207                 ll_lease_close(och, inode, &lease_broken);
3208                 rc = -EBUSY;
3209         }
3210         RETURN(rc);
3211 }
3212
3213 static long
3214 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3215 {
3216         struct inode            *inode = file_inode(file);
3217         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3218         int                      flags, rc;
3219         ENTRY;
3220
3221         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3222                PFID(ll_inode2fid(inode)), inode, cmd);
3223         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3224
3225         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3226         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3227                 RETURN(-ENOTTY);
3228
3229         switch (cmd) {
3230         case LL_IOC_GETFLAGS:
3231                 /* Get the current value of the file flags */
3232                 return put_user(fd->fd_flags, (int __user *)arg);
3233         case LL_IOC_SETFLAGS:
3234         case LL_IOC_CLRFLAGS:
3235                 /* Set or clear specific file flags */
3236                 /* XXX This probably needs checks to ensure the flags are
3237                  *     not abused, and to handle any flag side effects.
3238                  */
3239                 if (get_user(flags, (int __user *) arg))
3240                         RETURN(-EFAULT);
3241
3242                 if (cmd == LL_IOC_SETFLAGS) {
3243                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3244                             !(file->f_flags & O_DIRECT)) {
3245                                 CERROR("%s: unable to disable locking on "
3246                                        "non-O_DIRECT file\n", current->comm);
3247                                 RETURN(-EINVAL);
3248                         }
3249
3250                         fd->fd_flags |= flags;
3251                 } else {
3252                         fd->fd_flags &= ~flags;
3253                 }
3254                 RETURN(0);
3255         case LL_IOC_LOV_SETSTRIPE:
3256         case LL_IOC_LOV_SETSTRIPE_NEW:
3257                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3258         case LL_IOC_LOV_SETEA:
3259                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3260         case LL_IOC_LOV_SWAP_LAYOUTS: {
3261                 struct file *file2;
3262                 struct lustre_swap_layouts lsl;
3263
3264                 if (copy_from_user(&lsl, (char __user *)arg,
3265                                    sizeof(struct lustre_swap_layouts)))
3266                         RETURN(-EFAULT);
3267
3268                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3269                         RETURN(-EPERM);
3270
3271                 file2 = fget(lsl.sl_fd);
3272                 if (file2 == NULL)
3273                         RETURN(-EBADF);
3274
3275                 /* O_WRONLY or O_RDWR */
3276                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3277                         GOTO(out, rc = -EPERM);
3278
3279                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3280                         struct inode                    *inode2;
3281                         struct ll_inode_info            *lli;
3282                         struct obd_client_handle        *och = NULL;
3283
3284                         lli = ll_i2info(inode);
3285                         mutex_lock(&lli->lli_och_mutex);
3286                         if (fd->fd_lease_och != NULL) {
3287                                 och = fd->fd_lease_och;
3288                                 fd->fd_lease_och = NULL;
3289                         }
3290                         mutex_unlock(&lli->lli_och_mutex);
3291                         if (och == NULL)
3292                                 GOTO(out, rc = -ENOLCK);
3293                         inode2 = file_inode(file2);
3294                         rc = ll_swap_layouts_close(och, inode, inode2);
3295                 } else {
3296                         rc = ll_swap_layouts(file, file2, &lsl);
3297                 }
3298 out:
3299                 fput(file2);
3300                 RETURN(rc);
3301         }
3302         case LL_IOC_LOV_GETSTRIPE:
3303         case LL_IOC_LOV_GETSTRIPE_NEW:
3304                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3305         case FS_IOC_GETFLAGS:
3306         case FS_IOC_SETFLAGS:
3307                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3308         case FSFILT_IOC_GETVERSION:
3309         case FS_IOC_GETVERSION:
3310                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3311         /* We need to special case any other ioctls we want to handle,
3312          * to send them to the MDS/OST as appropriate and to properly
3313          * network encode the arg field. */
3314         case FS_IOC_SETVERSION:
3315                 RETURN(-ENOTSUPP);
3316
3317         case LL_IOC_GROUP_LOCK:
3318                 RETURN(ll_get_grouplock(inode, file, arg));
3319         case LL_IOC_GROUP_UNLOCK:
3320                 RETURN(ll_put_grouplock(inode, file, arg));
3321         case IOC_OBD_STATFS:
3322                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3323
3324         case LL_IOC_FLUSHCTX:
3325                 RETURN(ll_flush_ctx(inode));
3326         case LL_IOC_PATH2FID: {
3327                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3328                                  sizeof(struct lu_fid)))
3329                         RETURN(-EFAULT);
3330
3331                 RETURN(0);
3332         }
3333         case LL_IOC_GETPARENT:
3334                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3335
3336         case OBD_IOC_FID2PATH:
3337                 RETURN(ll_fid2path(inode, (void __user *)arg));
3338         case LL_IOC_DATA_VERSION: {
3339                 struct ioc_data_version idv;
3340                 int rc;
3341
3342                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3343                         RETURN(-EFAULT);
3344
3345                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3346                 rc = ll_ioc_data_version(inode, &idv);
3347
3348                 if (rc == 0 &&
3349                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3350                         RETURN(-EFAULT);
3351
3352                 RETURN(rc);
3353         }
3354
3355         case LL_IOC_GET_MDTIDX: {
3356                 int mdtidx;
3357
3358                 mdtidx = ll_get_mdt_idx(inode);
3359                 if (mdtidx < 0)
3360                         RETURN(mdtidx);
3361
3362                 if (put_user((int)mdtidx, (int __user *)arg))
3363                         RETURN(-EFAULT);
3364
3365                 RETURN(0);
3366         }
3367         case OBD_IOC_GETDTNAME:
3368         case OBD_IOC_GETMDNAME:
3369                 RETURN(ll_get_obd_name(inode, cmd, arg));
3370         case LL_IOC_HSM_STATE_GET: {
3371                 struct md_op_data       *op_data;
3372                 struct hsm_user_state   *hus;
3373                 int                      rc;
3374
3375                 OBD_ALLOC_PTR(hus);
3376                 if (hus == NULL)
3377                         RETURN(-ENOMEM);
3378
3379                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3380                                              LUSTRE_OPC_ANY, hus);
3381                 if (IS_ERR(op_data)) {
3382                         OBD_FREE_PTR(hus);
3383                         RETURN(PTR_ERR(op_data));
3384                 }
3385
3386                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3387                                    op_data, NULL);
3388
3389                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3390                         rc = -EFAULT;
3391
3392                 ll_finish_md_op_data(op_data);
3393                 OBD_FREE_PTR(hus);
3394                 RETURN(rc);
3395         }
3396         case LL_IOC_HSM_STATE_SET: {
3397                 struct hsm_state_set    *hss;
3398                 int                      rc;
3399
3400                 OBD_ALLOC_PTR(hss);
3401                 if (hss == NULL)
3402                         RETURN(-ENOMEM);
3403
3404                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3405                         OBD_FREE_PTR(hss);
3406                         RETURN(-EFAULT);
3407                 }
3408
3409                 rc = ll_hsm_state_set(inode, hss);
3410
3411                 OBD_FREE_PTR(hss);
3412                 RETURN(rc);
3413         }
3414         case LL_IOC_HSM_ACTION: {
3415                 struct md_op_data               *op_data;
3416                 struct hsm_current_action       *hca;
3417                 int                              rc;
3418
3419                 OBD_ALLOC_PTR(hca);
3420                 if (hca == NULL)
3421                         RETURN(-ENOMEM);
3422
3423                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3424                                              LUSTRE_OPC_ANY, hca);
3425                 if (IS_ERR(op_data)) {
3426                         OBD_FREE_PTR(hca);
3427                         RETURN(PTR_ERR(op_data));
3428                 }
3429
3430                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3431                                    op_data, NULL);
3432
3433                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3434                         rc = -EFAULT;
3435
3436                 ll_finish_md_op_data(op_data);
3437                 OBD_FREE_PTR(hca);
3438                 RETURN(rc);
3439         }
3440         case LL_IOC_SET_LEASE_OLD: {
3441                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3442
3443                 RETURN(ll_file_set_lease(file, &ioc, 0));
3444         }
3445         case LL_IOC_SET_LEASE: {
3446                 struct ll_ioc_lease ioc;
3447
3448                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3449                         RETURN(-EFAULT);
3450
3451                 RETURN(ll_file_set_lease(file, &ioc, arg));
3452         }
3453         case LL_IOC_GET_LEASE: {
3454                 struct ll_inode_info *lli = ll_i2info(inode);
3455                 struct ldlm_lock *lock = NULL;
3456                 fmode_t fmode = 0;
3457
3458                 mutex_lock(&lli->lli_och_mutex);
3459                 if (fd->fd_lease_och != NULL) {
3460                         struct obd_client_handle *och = fd->fd_lease_och;
3461
3462                         lock = ldlm_handle2lock(&och->och_lease_handle);
3463                         if (lock != NULL) {
3464                                 lock_res_and_lock(lock);
3465                                 if (!ldlm_is_cancel(lock))
3466                                         fmode = och->och_flags;
3467
3468                                 unlock_res_and_lock(lock);
3469                                 LDLM_LOCK_PUT(lock);
3470                         }
3471                 }
3472                 mutex_unlock(&lli->lli_och_mutex);
3473
3474                 RETURN(ll_lease_type_from_fmode(fmode));
3475         }
3476         case LL_IOC_HSM_IMPORT: {
3477                 struct hsm_user_import *hui;
3478
3479                 OBD_ALLOC_PTR(hui);
3480                 if (hui == NULL)
3481                         RETURN(-ENOMEM);
3482
3483                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3484                         OBD_FREE_PTR(hui);
3485                         RETURN(-EFAULT);
3486                 }
3487
3488                 rc = ll_hsm_import(inode, file, hui);
3489
3490                 OBD_FREE_PTR(hui);
3491                 RETURN(rc);
3492         }
3493         case LL_IOC_FUTIMES_3: {
3494                 struct ll_futimes_3 lfu;
3495
3496                 if (copy_from_user(&lfu,
3497                                    (const struct ll_futimes_3 __user *)arg,
3498                                    sizeof(lfu)))
3499                         RETURN(-EFAULT);
3500
3501                 RETURN(ll_file_futimes_3(file, &lfu));
3502         }
3503         case LL_IOC_LADVISE: {
3504                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3505                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3506                 int i;
3507                 int num_advise;
3508                 int alloc_size = sizeof(*k_ladvise_hdr);
3509
3510                 rc = 0;
3511                 u_ladvise_hdr = (void __user *)arg;
3512                 OBD_ALLOC_PTR(k_ladvise_hdr);
3513                 if (k_ladvise_hdr == NULL)
3514                         RETURN(-ENOMEM);
3515
3516                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3517                         GOTO(out_ladvise, rc = -EFAULT);
3518
3519                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3520                     k_ladvise_hdr->lah_count < 1)
3521                         GOTO(out_ladvise, rc = -EINVAL);
3522
3523                 num_advise = k_ladvise_hdr->lah_count;
3524                 if (num_advise >= LAH_COUNT_MAX)
3525                         GOTO(out_ladvise, rc = -EFBIG);
3526
3527                 OBD_FREE_PTR(k_ladvise_hdr);
3528                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3529                                       lah_advise[num_advise]);
3530                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3531                 if (k_ladvise_hdr == NULL)
3532                         RETURN(-ENOMEM);
3533
3534                 /*
3535                  * TODO: submit multiple advices to one server in a single RPC
3536                  */
3537                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3538                         GOTO(out_ladvise, rc = -EFAULT);
3539
3540                 for (i = 0; i < num_advise; i++) {
3541                         struct llapi_lu_ladvise *k_ladvise =
3542                                         &k_ladvise_hdr->lah_advise[i];
3543                         struct llapi_lu_ladvise __user *u_ladvise =
3544                                         &u_ladvise_hdr->lah_advise[i];
3545
3546                         rc = ll_ladvise_sanity(inode, k_ladvise);
3547                         if (rc)
3548                                 GOTO(out_ladvise, rc);
3549
3550                         switch (k_ladvise->lla_advice) {
3551                         case LU_LADVISE_LOCKNOEXPAND:
3552                                 rc = ll_lock_noexpand(file,
3553                                                k_ladvise->lla_peradvice_flags);
3554                                 GOTO(out_ladvise, rc);
3555                         case LU_LADVISE_LOCKAHEAD:
3556
3557                                 rc = ll_file_lock_ahead(file, k_ladvise);
3558
3559                                 if (rc < 0)
3560                                         GOTO(out_ladvise, rc);
3561
3562                                 if (put_user(rc,
3563                                              &u_ladvise->lla_lockahead_result))
3564                                         GOTO(out_ladvise, rc = -EFAULT);
3565                                 break;
3566                         default:
3567                                 rc = ll_ladvise(inode, file,
3568                                                 k_ladvise_hdr->lah_flags,
3569                                                 k_ladvise);
3570                                 if (rc)
3571                                         GOTO(out_ladvise, rc);
3572                                 break;
3573                         }
3574
3575                 }
3576
3577 out_ladvise:
3578                 OBD_FREE(k_ladvise_hdr, alloc_size);
3579                 RETURN(rc);
3580         }
3581         case LL_IOC_FLR_SET_MIRROR: {
3582                 /* mirror I/O must be direct to avoid polluting page cache
3583                  * by stale data. */
3584                 if (!(file->f_flags & O_DIRECT))
3585                         RETURN(-EINVAL);
3586
3587                 fd->fd_designated_mirror = (__u32)arg;
3588                 RETURN(0);
3589         }
3590         case LL_IOC_FSGETXATTR:
3591                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3592         case LL_IOC_FSSETXATTR:
3593                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3594         case BLKSSZGET:
3595                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3596         default:
3597                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3598                                      (void __user *)arg));
3599         }
3600 }
3601
3602 #ifndef HAVE_FILE_LLSEEK_SIZE
3603 static inline loff_t
3604 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3605 {
3606         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3607                 return -EINVAL;
3608         if (offset > maxsize)
3609                 return -EINVAL;
3610
3611         if (offset != file->f_pos) {
3612                 file->f_pos = offset;
3613                 file->f_version = 0;
3614         }
3615         return offset;
3616 }
3617
3618 static loff_t
3619 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3620                 loff_t maxsize, loff_t eof)
3621 {
3622         struct inode *inode = file_inode(file);
3623
3624         switch (origin) {
3625         case SEEK_END:
3626                 offset += eof;
3627                 break;
3628         case SEEK_CUR:
3629                 /*
3630                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3631                  * position-querying operation.  Avoid rewriting the "same"
3632                  * f_pos value back to the file because a concurrent read(),
3633                  * write() or lseek() might have altered it
3634                  */
3635                 if (offset == 0)
3636                         return file->f_pos;
3637                 /*
3638                  * f_lock protects against read/modify/write race with other
3639                  * SEEK_CURs. Note that parallel writes and reads behave
3640                  * like SEEK_SET.
3641                  */
3642                 inode_lock(inode);
3643                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3644                 inode_unlock(inode);
3645                 return offset;
3646         case SEEK_DATA:
3647                 /*
3648                  * In the generic case the entire file is data, so as long as
3649                  * offset isn't at the end of the file then the offset is data.
3650                  */
3651                 if (offset >= eof)
3652                         return -ENXIO;
3653                 break;
3654         case SEEK_HOLE:
3655                 /*
3656                  * There is a virtual hole at the end of the file, so as long as
3657                  * offset isn't i_size or larger, return i_size.
3658                  */
3659                 if (offset >= eof)
3660                         return -ENXIO;
3661                 offset = eof;
3662                 break;
3663         }
3664
3665         return llseek_execute(file, offset, maxsize);
3666 }
3667 #endif
3668
3669 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3670 {
3671         struct inode *inode = file_inode(file);
3672         loff_t retval, eof = 0;
3673
3674         ENTRY;
3675         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3676                            (origin == SEEK_CUR) ? file->f_pos : 0);
3677         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3678                PFID(ll_inode2fid(inode)), inode, retval, retval,
3679                origin);
3680         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3681
3682         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3683                 retval = ll_glimpse_size(inode);
3684                 if (retval != 0)
3685                         RETURN(retval);
3686                 eof = i_size_read(inode);
3687         }
3688
3689         retval = ll_generic_file_llseek_size(file, offset, origin,
3690                                           ll_file_maxbytes(inode), eof);
3691         RETURN(retval);
3692 }
3693
3694 static int ll_flush(struct file *file, fl_owner_t id)
3695 {
3696         struct inode *inode = file_inode(file);
3697         struct ll_inode_info *lli = ll_i2info(inode);
3698         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3699         int rc, err;
3700
3701         LASSERT(!S_ISDIR(inode->i_mode));
3702
3703         /* catch async errors that were recorded back when async writeback
3704          * failed for pages in this mapping. */
3705         rc = lli->lli_async_rc;
3706         lli->lli_async_rc = 0;
3707         if (lli->lli_clob != NULL) {
3708                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3709                 if (rc == 0)
3710                         rc = err;
3711         }
3712
3713         /* The application has been told write failure already.
3714          * Do not report failure again. */
3715         if (fd->fd_write_failed)
3716                 return 0;
3717         return rc ? -EIO : 0;
3718 }
3719
3720 /**
3721  * Called to make sure a portion of file has been written out.
3722  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3723  *
3724  * Return how many pages have been written.
3725  */
3726 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3727                        enum cl_fsync_mode mode, int ignore_layout)
3728 {
3729         struct lu_env *env;
3730         struct cl_io *io;
3731         struct cl_fsync_io *fio;
3732         int result;
3733         __u16 refcheck;
3734         ENTRY;
3735
3736         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3737             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3738                 RETURN(-EINVAL);
3739
3740         env = cl_env_get(&refcheck);
3741         if (IS_ERR(env))
3742                 RETURN(PTR_ERR(env));
3743
3744         io = vvp_env_thread_io(env);
3745         io->ci_obj = ll_i2info(inode)->lli_clob;
3746         io->ci_ignore_layout = ignore_layout;
3747
3748         /* initialize parameters for sync */
3749         fio = &io->u.ci_fsync;
3750         fio->fi_start = start;
3751         fio->fi_end = end;
3752         fio->fi_fid = ll_inode2fid(inode);
3753         fio->fi_mode = mode;
3754         fio->fi_nr_written = 0;
3755
3756         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3757                 result = cl_io_loop(env, io);
3758         else
3759                 result = io->ci_result;
3760         if (result == 0)
3761                 result = fio->fi_nr_written;
3762         cl_io_fini(env, io);
3763         cl_env_put(env, &refcheck);
3764
3765         RETURN(result);
3766 }
3767
3768 /*
3769  * When dentry is provided (the 'else' case), file_dentry() may be
3770  * null and dentry must be used directly rather than pulled from
3771  * file_dentry() as is done otherwise.
3772  */
3773
3774 #ifdef HAVE_FILE_FSYNC_4ARGS
3775 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3776 {
3777         struct dentry *dentry = file_dentry(file);
3778 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3779 int ll_fsync(struct file *file, int datasync)
3780 {
3781         struct dentry *dentry = file_dentry(file);
3782         loff_t start = 0;
3783         loff_t end = LLONG_MAX;
3784 #else
3785 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3786 {
3787         loff_t start = 0;
3788         loff_t end = LLONG_MAX;
3789 #endif
3790         struct inode *inode = dentry->d_inode;
3791         struct ll_inode_info *lli = ll_i2info(inode);
3792         struct ptlrpc_request *req;
3793         int rc, err;
3794         ENTRY;
3795
3796         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3797                PFID(ll_inode2fid(inode)), inode);
3798         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3799
3800 #ifdef HAVE_FILE_FSYNC_4ARGS
3801         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3802         inode_lock(inode);
3803 #else
3804         /* fsync's caller has already called _fdata{sync,write}, we want
3805          * that IO to finish before calling the osc and mdc sync methods */
3806         rc = filemap_fdatawait(inode->i_mapping);
3807 #endif
3808
3809         /* catch async errors that were recorded back when async writeback
3810          * failed for pages in this mapping. */
3811         if (!S_ISDIR(inode->i_mode)) {
3812                 err = lli->lli_async_rc;
3813                 lli->lli_async_rc = 0;
3814                 if (rc == 0)
3815                         rc = err;
3816                 if (lli->lli_clob != NULL) {
3817                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3818                         if (rc == 0)
3819                                 rc = err;
3820                 }
3821         }
3822
3823         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3824         if (!rc)
3825                 rc = err;
3826         if (!err)
3827                 ptlrpc_req_finished(req);
3828
3829         if (S_ISREG(inode->i_mode)) {
3830                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3831
3832                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3833                 if (rc == 0 && err < 0)
3834                         rc = err;
3835                 if (rc < 0)
3836                         fd->fd_write_failed = true;
3837                 else
3838                         fd->fd_write_failed = false;
3839         }
3840
3841 #ifdef HAVE_FILE_FSYNC_4ARGS
3842         inode_unlock(inode);
3843 #endif
3844         RETURN(rc);
3845 }
3846
3847 static int
3848 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3849 {
3850         struct inode *inode = file_inode(file);
3851         struct ll_sb_info *sbi = ll_i2sbi(inode);
3852         struct ldlm_enqueue_info einfo = {
3853                 .ei_type        = LDLM_FLOCK,
3854                 .ei_cb_cp       = ldlm_flock_completion_ast,
3855                 .ei_cbdata      = file_lock,
3856         };
3857         struct md_op_data *op_data;
3858         struct lustre_handle lockh = { 0 };
3859         union ldlm_policy_data flock = { { 0 } };
3860         int fl_type = file_lock->fl_type;
3861         __u64 flags = 0;
3862         int rc;
3863         int rc2 = 0;
3864         ENTRY;
3865
3866         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3867                PFID(ll_inode2fid(inode)), file_lock);
3868
3869         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3870
3871         if (file_lock->fl_flags & FL_FLOCK) {
3872                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3873                 /* flocks are whole-file locks */
3874                 flock.l_flock.end = OFFSET_MAX;
3875                 /* For flocks owner is determined by the local file desctiptor*/
3876                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3877         } else if (file_lock->fl_flags & FL_POSIX) {
3878                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3879                 flock.l_flock.start = file_lock->fl_start;
3880                 flock.l_flock.end = file_lock->fl_end;
3881         } else {
3882                 RETURN(-EINVAL);
3883         }
3884         flock.l_flock.pid = file_lock->fl_pid;
3885
3886         /* Somewhat ugly workaround for svc lockd.
3887          * lockd installs custom fl_lmops->lm_compare_owner that checks
3888          * for the fl_owner to be the same (which it always is on local node
3889          * I guess between lockd processes) and then compares pid.
3890          * As such we assign pid to the owner field to make it all work,
3891          * conflict with normal locks is unlikely since pid space and
3892          * pointer space for current->files are not intersecting */
3893         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3894                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3895
3896         switch (fl_type) {
3897         case F_RDLCK:
3898                 einfo.ei_mode = LCK_PR;
3899                 break;
3900         case F_UNLCK:
3901                 /* An unlock request may or may not have any relation to
3902                  * existing locks so we may not be able to pass a lock handle
3903                  * via a normal ldlm_lock_cancel() request. The request may even
3904                  * unlock a byte range in the middle of an existing lock. In
3905                  * order to process an unlock request we need all of the same
3906                  * information that is given with a normal read or write record
3907                  * lock request. To avoid creating another ldlm unlock (cancel)
3908                  * message we'll treat a LCK_NL flock request as an unlock. */
3909                 einfo.ei_mode = LCK_NL;
3910                 break;
3911         case F_WRLCK:
3912                 einfo.ei_mode = LCK_PW;
3913                 break;
3914         default:
3915                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3916                 RETURN (-ENOTSUPP);
3917         }
3918
3919         switch (cmd) {
3920         case F_SETLKW:
3921 #ifdef F_SETLKW64
3922         case F_SETLKW64:
3923 #endif
3924                 flags = 0;
3925                 break;
3926         case F_SETLK:
3927 #ifdef F_SETLK64
3928         case F_SETLK64:
3929 #endif
3930                 flags = LDLM_FL_BLOCK_NOWAIT;
3931                 break;
3932         case F_GETLK:
3933 #ifdef F_GETLK64
3934         case F_GETLK64:
3935 #endif
3936                 flags = LDLM_FL_TEST_LOCK;
3937                 break;
3938         default:
3939                 CERROR("unknown fcntl lock command: %d\n", cmd);
3940                 RETURN (-EINVAL);
3941         }
3942
3943         /* Save the old mode so that if the mode in the lock changes we
3944          * can decrement the appropriate reader or writer refcount. */
3945         file_lock->fl_type = einfo.ei_mode;
3946
3947         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3948                                      LUSTRE_OPC_ANY, NULL);
3949         if (IS_ERR(op_data))
3950                 RETURN(PTR_ERR(op_data));
3951
3952         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
3953                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
3954                flock.l_flock.pid, flags, einfo.ei_mode,
3955                flock.l_flock.start, flock.l_flock.end);
3956
3957         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
3958                         flags);
3959
3960         /* Restore the file lock type if not TEST lock. */
3961         if (!(flags & LDLM_FL_TEST_LOCK))
3962                 file_lock->fl_type = fl_type;
3963
3964 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
3965         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
3966             !(flags & LDLM_FL_TEST_LOCK))
3967                 rc2  = locks_lock_file_wait(file, file_lock);
3968 #else
3969         if ((file_lock->fl_flags & FL_FLOCK) &&
3970             (rc == 0 || file_lock->fl_type == F_UNLCK))
3971                 rc2  = flock_lock_file_wait(file, file_lock);
3972         if ((file_lock->fl_flags & FL_POSIX) &&
3973             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3974             !(flags & LDLM_FL_TEST_LOCK))
3975                 rc2  = posix_lock_file_wait(file, file_lock);
3976 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
3977
3978         if (rc2 && file_lock->fl_type != F_UNLCK) {
3979                 einfo.ei_mode = LCK_NL;
3980                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
3981                            &lockh, flags);
3982                 rc = rc2;
3983         }
3984
3985         ll_finish_md_op_data(op_data);
3986
3987         RETURN(rc);
3988 }
3989
3990 int ll_get_fid_by_name(struct inode *parent, const char *name,
3991                        int namelen, struct lu_fid *fid,
3992                        struct inode **inode)
3993 {
3994         struct md_op_data       *op_data = NULL;
3995         struct mdt_body         *body;
3996         struct ptlrpc_request   *req;
3997         int                     rc;
3998         ENTRY;
3999
4000         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4001                                      LUSTRE_OPC_ANY, NULL);
4002         if (IS_ERR(op_data))
4003                 RETURN(PTR_ERR(op_data));
4004
4005         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4006         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4007         ll_finish_md_op_data(op_data);
4008         if (rc < 0)
4009                 RETURN(rc);
4010
4011         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4012         if (body == NULL)
4013                 GOTO(out_req, rc = -EFAULT);
4014         if (fid != NULL)
4015                 *fid = body->mbo_fid1;
4016
4017         if (inode != NULL)
4018                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4019 out_req:
4020         ptlrpc_req_finished(req);
4021         RETURN(rc);
4022 }
4023
/**
 * Ask the MDT to migrate the entry \a name of directory \a parent as
 * described by \a lum.
 *
 * The request is sent through md_rename() with identical source and target
 * names and with CLI_MIGRATE | CLI_SET_MEA set in op_cli_flags. For regular
 * files a write lease is opened first and its handles plus the current data
 * version are attached to the request (MDS_CLOSE_MIGRATE).
 *
 * \param[in] parent  directory inode containing \a name
 * \param[in] file    open file whose dentry is used as the parent for the
 *                    dcache lookup of \a name
 * \param[in] lum     migration target description; byte-swapped in place
 *                    when its magic is not one of the little-endian
 *                    LMV_USER_MAGIC values
 * \param[in] name    NUL-terminated name of the entry to migrate
 *
 * \retval 0 on success (the child inode's link count is cleared)
 * \retval -ENOENT if no child inode could be obtained
 * \retval -EOPNOTSUPP if the MDT lacks OBD_CONNECT2_DIR_MIGRATE and a
 *         striped directory is involved
 * \retval -EINVAL if the child is the filesystem root or its FID is not sane
 * \retval other negative errno from the helpers called below
 */
int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
               const char *name)
{
        struct dentry *dchild = NULL;
        struct inode *child_inode = NULL;
        struct md_op_data *op_data;
        struct ptlrpc_request *request = NULL;
        struct obd_client_handle *och = NULL;
        struct qstr qstr;
        struct mdt_body *body;
        __u64 data_version = 0;
        size_t namelen = strlen(name);
        /* computed from the fields as passed in, i.e. before any swab below */
        int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
        int rc;
        ENTRY;

        CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
               PFID(ll_inode2fid(parent)), name,
               lum->lum_stripe_offset, lum->lum_stripe_count);

        /* swab unless the magic already matches a little-endian user magic */
        if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
            lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
                lustre_swab_lmv_user_md(lum);

        /* Get child FID first */
        qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
        qstr.name = name;
        qstr.len = namelen;
        dchild = d_lookup(file_dentry(file), &qstr);
        if (dchild) {
                /* take our own inode reference; the dentry one is dropped */
                if (dchild->d_inode)
                        child_inode = igrab(dchild->d_inode);
                dput(dchild);
        }

        /* nothing usable in the dcache: look the name up on the MDT */
        if (!child_inode) {
                rc = ll_get_fid_by_name(parent, name, namelen, NULL,
                                        &child_inode);
                if (rc)
                        RETURN(rc);
        }

        if (!child_inode)
                RETURN(-ENOENT);

        /* without DIR_MIGRATE support, refuse multi-stripe targets and
         * children that already carry striped-directory metadata */
        if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
              OBD_CONNECT2_DIR_MIGRATE)) {
                if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
                    ll_i2info(child_inode)->lli_lsm_md) {
                        CERROR("%s: MDT doesn't support stripe directory "
                               "migration!\n",
                               ll_get_fsname(parent->i_sb, NULL, 0));
                        GOTO(out_iput, rc = -EOPNOTSUPP);
                }
        }

        /*
         * lfs migrate command needs to be blocked on the client
         * by checking the migrate FID against the FID of the
         * filesystem root.
         */
        if (child_inode == parent->i_sb->s_root->d_inode)
                GOTO(out_iput, rc = -EINVAL);

        op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
                                     child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out_iput, rc = PTR_ERR(op_data));

        /* the child stays locked until out_unlock */
        inode_lock(child_inode);
        op_data->op_fid3 = *ll_inode2fid(child_inode);
        if (!fid_is_sane(&op_data->op_fid3)) {
                CERROR("%s: migrate %s, but FID "DFID" is insane\n",
                       ll_get_fsname(parent->i_sb, NULL, 0), name,
                       PFID(&op_data->op_fid3));
                GOTO(out_unlock, rc = -EINVAL);
        }

        op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
        op_data->op_data = lum;
        op_data->op_data_size = lumlen;

again:
        /* regular files: open a write lease and pass its handles and the
         * flushed data version along with the request */
        if (S_ISREG(child_inode->i_mode)) {
                och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
                if (IS_ERR(och)) {
                        rc = PTR_ERR(och);
                        /* keep out_close from seeing an error pointer */
                        och = NULL;
                        GOTO(out_unlock, rc);
                }

                rc = ll_data_version(child_inode, &data_version,
                                     LL_DV_WR_FLUSH);
                if (rc != 0)
                        GOTO(out_close, rc);

                op_data->op_open_handle = och->och_open_handle;
                op_data->op_data_version = data_version;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_bias |= MDS_CLOSE_MIGRATE;

                /* clear rq_replay on the open request under its rq_lock */
                spin_lock(&och->och_mod->mod_open_req->rq_lock);
                och->och_mod->mod_open_req->rq_replay = 0;
                spin_unlock(&och->och_mod->mod_open_req->rq_lock);
        }

        /* the migration itself: rename with the same old and new name */
        rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
                       name, namelen, &request);
        if (rc == 0) {
                LASSERT(request != NULL);
                ll_update_times(request, parent);

                body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
                LASSERT(body != NULL);

                /* If the server does release layout lock, then we cleanup
                 * the client och here, otherwise release it in out_close: */
                if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
                        obd_mod_put(och->och_mod);
                        md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
                                                  och);
                        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
                        OBD_FREE_PTR(och);
                        och = NULL;
                }
        }

        /* drop the reply before a possible retry reuses 'request' */
        if (request != NULL) {
                ptlrpc_req_finished(request);
                request = NULL;
        }

        /* Try again if the file layout has changed. */
        /* NOTE(review): on this path 'och' is still set and is overwritten
         * by ll_lease_open() at 'again:' without ll_lease_close() being
         * called here - confirm the previous lease is released elsewhere. */
        if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
                goto again;

out_close:
        if (och)
                ll_lease_close(och, child_inode, NULL);
        /* on success the local inode is marked as having no links */
        if (!rc)
                clear_nlink(child_inode);
out_unlock:
        inode_unlock(child_inode);
        ll_finish_md_op_data(op_data);
out_iput:
        iput(child_inode);
        RETURN(rc);
}
4172
4173 static int
4174 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4175 {
4176         ENTRY;
4177
4178         RETURN(-ENOSYS);
4179 }
4180
4181 /**
4182  * test if some locks matching bits and l_req_mode are acquired
4183  * - bits can be in different locks
4184  * - if found clear the common lock bits in *bits
4185  * - the bits not found, are kept in *bits
4186  * \param inode [IN]
4187  * \param bits [IN] searched lock bits [IN]
4188  * \param l_req_mode [IN] searched lock mode
4189  * \retval boolean, true iff all bits are found
4190  */
4191 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4192 {
4193         struct lustre_handle lockh;
4194         union ldlm_policy_data policy;
4195         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4196                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4197         struct lu_fid *fid;
4198         __u64 flags;
4199         int i;
4200         ENTRY;
4201
4202         if (!inode)
4203                RETURN(0);
4204
4205         fid = &ll_i2info(inode)->lli_fid;
4206         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4207                ldlm_lockname[mode]);
4208
4209         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4210         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4211                 policy.l_inodebits.bits = *bits & (1 << i);
4212                 if (policy.l_inodebits.bits == 0)
4213                         continue;
4214
4215                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4216                                   &policy, mode, &lockh)) {
4217                         struct ldlm_lock *lock;
4218
4219                         lock = ldlm_handle2lock(&lockh);
4220                         if (lock) {
4221                                 *bits &=
4222                                       ~(lock->l_policy_data.l_inodebits.bits);
4223                                 LDLM_LOCK_PUT(lock);
4224                         } else {
4225                                 *bits &= ~policy.l_inodebits.bits;
4226                         }
4227                 }
4228         }
4229         RETURN(*bits == 0);
4230 }
4231
4232 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4233                                struct lustre_handle *lockh, __u64 flags,
4234                                enum ldlm_mode mode)
4235 {
4236         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4237         struct lu_fid *fid;
4238         enum ldlm_mode rc;
4239         ENTRY;
4240
4241         fid = &ll_i2info(inode)->lli_fid;
4242         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4243
4244         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4245                            fid, LDLM_IBITS, &policy, mode, lockh);
4246
4247         RETURN(rc);
4248 }
4249
4250 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4251 {
4252         /* Already unlinked. Just update nlink and return success */
4253         if (rc == -ENOENT) {
4254                 clear_nlink(inode);
4255                 /* If it is striped directory, and there is bad stripe
4256                  * Let's revalidate the dentry again, instead of returning
4257                  * error */
4258                 if (S_ISDIR(inode->i_mode) &&
4259                     ll_i2info(inode)->lli_lsm_md != NULL)
4260                         return 0;
4261
4262                 /* This path cannot be hit for regular files unless in
4263                  * case of obscure races, so no need to to validate
4264                  * size. */
4265                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4266                         return 0;
4267         } else if (rc != 0) {
4268                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4269                              "%s: revalidate FID "DFID" error: rc = %d\n",
4270                              ll_get_fsname(inode->i_sb, NULL, 0),
4271                              PFID(ll_inode2fid(inode)), rc);
4272         }
4273
4274         return rc;
4275 }
4276
4277 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4278 {
4279         struct inode *inode = dentry->d_inode;
4280         struct obd_export *exp = ll_i2mdexp(inode);
4281         struct lookup_intent oit = {
4282                 .it_op = op,
4283         };
4284         struct ptlrpc_request *req = NULL;
4285         struct md_op_data *op_data;
4286         int rc = 0;
4287         ENTRY;
4288
4289         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4290                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4291
4292         /* Call getattr by fid, so do not provide name at all. */
4293         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4294                                      LUSTRE_OPC_ANY, NULL);
4295         if (IS_ERR(op_data))
4296                 RETURN(PTR_ERR(op_data));
4297
4298         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4299         ll_finish_md_op_data(op_data);
4300         if (rc < 0) {
4301                 rc = ll_inode_revalidate_fini(inode, rc);
4302                 GOTO(out, rc);
4303         }
4304
4305         rc = ll_revalidate_it_finish(req, &oit, dentry);
4306         if (rc != 0) {
4307                 ll_intent_release(&oit);
4308                 GOTO(out, rc);
4309         }
4310
4311         /* Unlinked? Unhash dentry, so it is not picked up later by
4312          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4313          * here to preserve get_cwd functionality on 2.6.
4314          * Bug 10503 */
4315         if (!dentry->d_inode->i_nlink) {
4316                 ll_lock_dcache(inode);
4317                 d_lustre_invalidate(dentry, 0);
4318                 ll_unlock_dcache(inode);
4319         }
4320
4321         ll_lookup_finish_locks(&oit, dentry);
4322 out:
4323         ptlrpc_req_finished(req);
4324
4325         return rc;
4326 }
4327
4328 static int ll_merge_md_attr(struct inode *inode)
4329 {
4330         struct ll_inode_info *lli = ll_i2info(inode);
4331         struct cl_attr attr = { 0 };
4332         int rc;
4333
4334         LASSERT(lli->lli_lsm_md != NULL);
4335         down_read(&lli->lli_lsm_sem);
4336         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4337                            &attr, ll_md_blocking_ast);
4338         up_read(&lli->lli_lsm_sem);
4339         if (rc != 0)
4340                 RETURN(rc);
4341
4342         set_nlink(inode, attr.cat_nlink);
4343         inode->i_blocks = attr.cat_blocks;
4344         i_size_write(inode, attr.cat_size);
4345
4346         ll_i2info(inode)->lli_atime = attr.cat_atime;
4347         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4348         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4349
4350         RETURN(0);
4351 }
4352
4353 static inline dev_t ll_compat_encode_dev(dev_t dev)
4354 {
4355         /* The compat_sys_*stat*() syscalls will fail unless the
4356          * device majors and minors are both less than 256. Note that
4357          * the value returned here will be passed through
4358          * old_encode_dev() in cp_compat_stat(). And so we are not
4359          * trying to return a valid compat (u16) device number, just
4360          * one that will pass the old_valid_dev() check. */
4361
4362         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4363 }
4364
4365 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4366 int ll_getattr(const struct path *path, struct kstat *stat,
4367                u32 request_mask, unsigned int flags)
4368 {
4369         struct dentry *de = path->dentry;
4370 #else
4371 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4372 {
4373 #endif
4374         struct inode *inode = de->d_inode;
4375         struct ll_sb_info *sbi = ll_i2sbi(inode);
4376         struct ll_inode_info *lli = ll_i2info(inode);
4377         int rc;
4378
4379         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4380
4381         rc = ll_inode_revalidate(de, IT_GETATTR);
4382         if (rc < 0)
4383                 RETURN(rc);
4384
4385         if (S_ISREG(inode->i_mode)) {
4386                 /* In case of restore, the MDT has the right size and has
4387                  * already send it back without granting the layout lock,
4388                  * inode is up-to-date so glimpse is useless.
4389                  * Also to glimpse we need the layout, in case of a running
4390                  * restore the MDT holds the layout lock so the glimpse will
4391                  * block up to the end of restore (getattr will block)
4392                  */
4393                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4394                         rc = ll_glimpse_size(inode);
4395                         if (rc < 0)
4396                                 RETURN(rc);
4397                 }
4398         } else {
4399                 /* If object isn't regular a file then don't validate size. */
4400                 if (S_ISDIR(inode->i_mode) &&
4401                     lli->lli_lsm_md != NULL) {
4402                         rc = ll_merge_md_attr(inode);
4403                         if (rc < 0)
4404                                 RETURN(rc);
4405                 }
4406
4407                 LTIME_S(inode->i_atime) = lli->lli_atime;
4408                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4409                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4410         }
4411
4412         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4413
4414         if (ll_need_32bit_api(sbi)) {
4415                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4416                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4417                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4418         } else {
4419                 stat->ino = inode->i_ino;
4420                 stat->dev = inode->i_sb->s_dev;
4421                 stat->rdev = inode->i_rdev;
4422         }
4423
4424         stat->mode = inode->i_mode;
4425         stat->uid = inode->i_uid;
4426         stat->gid = inode->i_gid;
4427         stat->atime = inode->i_atime;
4428         stat->mtime = inode->i_mtime;
4429         stat->ctime = inode->i_ctime;
4430         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4431
4432         stat->nlink = inode->i_nlink;
4433         stat->size = i_size_read(inode);
4434         stat->blocks = inode->i_blocks;
4435
4436         return 0;
4437 }
4438
4439 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4440                      __u64 start, __u64 len)
4441 {
4442         int             rc;
4443         size_t          num_bytes;
4444         struct fiemap   *fiemap;
4445         unsigned int    extent_count = fieinfo->fi_extents_max;
4446
4447         num_bytes = sizeof(*fiemap) + (extent_count *
4448                                        sizeof(struct fiemap_extent));
4449         OBD_ALLOC_LARGE(fiemap, num_bytes);
4450
4451         if (fiemap == NULL)
4452                 RETURN(-ENOMEM);
4453
4454         fiemap->fm_flags = fieinfo->fi_flags;
4455         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4456         fiemap->fm_start = start;
4457         fiemap->fm_length = len;
4458         if (extent_count > 0 &&
4459             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4460                            sizeof(struct fiemap_extent)) != 0)
4461                 GOTO(out, rc = -EFAULT);
4462
4463         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4464
4465         fieinfo->fi_flags = fiemap->fm_flags;
4466         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4467         if (extent_count > 0 &&
4468             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4469                          fiemap->fm_mapped_extents *
4470                          sizeof(struct fiemap_extent)) != 0)
4471                 GOTO(out, rc = -EFAULT);
4472 out:
4473         OBD_FREE_LARGE(fiemap, num_bytes);
4474         return rc;
4475 }
4476
4477 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4478 {
4479         struct ll_inode_info *lli = ll_i2info(inode);
4480         struct posix_acl *acl = NULL;
4481         ENTRY;
4482
4483         spin_lock(&lli->lli_lock);
4484         /* VFS' acl_permission_check->check_acl will release the refcount */
4485         acl = posix_acl_dup(lli->lli_posix_acl);
4486         spin_unlock(&lli->lli_lock);
4487
4488         RETURN(acl);
4489 }
4490
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * ->set_acl() method: store or remove a POSIX ACL as an xattr on the MDT.
 *
 * \param[in] inode  inode the ACL belongs to
 * \param[in] acl    ACL to set, or NULL to remove the xattr
 * \param[in] type   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0 on success (the cached ACL is updated)
 * \retval -EINVAL for any other \a type
 * \retval -EACCES when a default ACL is set on a non-directory
 * \retval -ENOMEM if the xattr buffer cannot be allocated
 * \retval other negative errno from the helpers called below
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *request = NULL;
        const char *xattr_name = NULL;
        char *xattr_buf = NULL;
        size_t xattr_len = 0;
        int rc = 0;
        ENTRY;

        if (type == ACL_TYPE_ACCESS) {
                xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
                /* may rewrite i_mode and/or 'acl' itself */
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
        } else if (type == ACL_TYPE_DEFAULT) {
                xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* default ACLs only make sense on directories; removing
                 * one from another file type is silently accepted */
                if (!S_ISDIR(inode->i_mode) && acl)
                        rc = -EACCES;
        } else {
                rc = -EINVAL;
        }
        if (rc)
                return rc;

        if (acl) {
                xattr_len = posix_acl_xattr_size(acl->a_count);
                xattr_buf = kmalloc(xattr_len, GFP_NOFS);
                if (!xattr_buf)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, xattr_buf,
                                        xattr_len);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* no encoded value means the xattr is to be removed */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         xattr_buf ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         xattr_name, xattr_buf, xattr_len, 0, 0, &request);

        ptlrpc_req_finished(request);
out_value:
        kfree(xattr_buf);
out:
        /* keep the ACL cache in step with the outcome */
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4550
4551 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4552 static int
4553 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4554 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4555 # else
4556 ll_check_acl(struct inode *inode, int mask)
4557 # endif
4558 {
4559 # ifdef CONFIG_FS_POSIX_ACL
4560         struct posix_acl *acl;
4561         int rc;
4562         ENTRY;
4563
4564 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4565         if (flags & IPERM_FLAG_RCU)
4566                 return -ECHILD;
4567 #  endif
4568         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4569
4570         if (!acl)
4571                 RETURN(-EAGAIN);
4572
4573         rc = posix_acl_permission(inode, acl, mask);
4574         posix_acl_release(acl);
4575
4576         RETURN(rc);
4577 # else /* !CONFIG_FS_POSIX_ACL */
4578         return -EAGAIN;
4579 # endif /* CONFIG_FS_POSIX_ACL */
4580 }
4581 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4582
4583 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4584 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4585 #else
4586 # ifdef HAVE_INODE_PERMISION_2ARGS
4587 int ll_inode_permission(struct inode *inode, int mask)
4588 # else
4589 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4590 # endif
4591 #endif
4592 {
4593         int rc = 0;
4594         struct ll_sb_info *sbi;
4595         struct root_squash_info *squash;
4596         struct cred *cred = NULL;
4597         const struct cred *old_cred = NULL;
4598         cfs_cap_t cap;
4599         bool squash_id = false;
4600         ENTRY;
4601
4602 #ifdef MAY_NOT_BLOCK
4603         if (mask & MAY_NOT_BLOCK)
4604                 return -ECHILD;
4605 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4606         if (flags & IPERM_FLAG_RCU)
4607                 return -ECHILD;
4608 #endif
4609
4610        /* as root inode are NOT getting validated in lookup operation,
4611         * need to do it before permission check. */
4612
4613         if (inode == inode->i_sb->s_root->d_inode) {
4614                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4615                 if (rc)
4616                         RETURN(rc);
4617         }
4618
4619         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4620                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4621
4622         /* squash fsuid/fsgid if needed */
4623         sbi = ll_i2sbi(inode);
4624         squash = &sbi->ll_squash;
4625         if (unlikely(squash->rsi_uid != 0 &&
4626                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4627                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4628                         squash_id = true;
4629         }
4630         if (squash_id) {
4631                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4632                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4633                        squash->rsi_uid, squash->rsi_gid);
4634
4635                 /* update current process's credentials
4636                  * and FS capability */
4637                 cred = prepare_creds();
4638                 if (cred == NULL)
4639                         RETURN(-ENOMEM);
4640
4641                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4642                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4643                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4644                         if ((1 << cap) & CFS_CAP_FS_MASK)
4645                                 cap_lower(cred->cap_effective, cap);
4646                 }
4647                 old_cred = override_creds(cred);
4648         }
4649
4650         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4651         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4652         /* restore current process's credentials and FS capability */
4653         if (squash_id) {
4654                 revert_creds(old_cred);
4655                 put_cred(cred);
4656         }
4657
4658         RETURN(rc);
4659 }
4660
4661 /* -o localflock - only provides locally consistent flock locks */
4662 struct file_operations ll_file_operations = {
4663 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4664 # ifdef HAVE_SYNC_READ_WRITE
4665         .read           = new_sync_read,
4666         .write          = new_sync_write,
4667 # endif
4668         .read_iter      = ll_file_read_iter,
4669         .write_iter     = ll_file_write_iter,
4670 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4671         .read           = ll_file_read,
4672         .aio_read       = ll_file_aio_read,
4673         .write          = ll_file_write,
4674         .aio_write      = ll_file_aio_write,
4675 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4676         .unlocked_ioctl = ll_file_ioctl,
4677         .open           = ll_file_open,
4678         .release        = ll_file_release,
4679         .mmap           = ll_file_mmap,
4680         .llseek         = ll_file_seek,
4681         .splice_read    = ll_file_splice_read,
4682         .fsync          = ll_fsync,
4683         .flush          = ll_flush
4684 };
4685
4686 struct file_operations ll_file_operations_flock = {
4687 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4688 # ifdef HAVE_SYNC_READ_WRITE
4689         .read           = new_sync_read,
4690         .write          = new_sync_write,
4691 # endif /* HAVE_SYNC_READ_WRITE */
4692         .read_iter      = ll_file_read_iter,
4693         .write_iter     = ll_file_write_iter,
4694 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4695         .read           = ll_file_read,
4696         .aio_read       = ll_file_aio_read,
4697         .write          = ll_file_write,
4698         .aio_write      = ll_file_aio_write,
4699 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4700         .unlocked_ioctl = ll_file_ioctl,
4701         .open           = ll_file_open,
4702         .release        = ll_file_release,
4703         .mmap           = ll_file_mmap,
4704         .llseek         = ll_file_seek,
4705         .splice_read    = ll_file_splice_read,
4706         .fsync          = ll_fsync,
4707         .flush          = ll_flush,
4708         .flock          = ll_file_flock,
4709         .lock           = ll_file_flock
4710 };
4711
4712 /* These are for -o noflock - to return ENOSYS on flock calls */
4713 struct file_operations ll_file_operations_noflock = {
4714 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4715 # ifdef HAVE_SYNC_READ_WRITE
4716         .read           = new_sync_read,
4717         .write          = new_sync_write,
4718 # endif /* HAVE_SYNC_READ_WRITE */
4719         .read_iter      = ll_file_read_iter,
4720         .write_iter     = ll_file_write_iter,
4721 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4722         .read           = ll_file_read,
4723         .aio_read       = ll_file_aio_read,
4724         .write          = ll_file_write,
4725         .aio_write      = ll_file_aio_write,
4726 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4727         .unlocked_ioctl = ll_file_ioctl,
4728         .open           = ll_file_open,
4729         .release        = ll_file_release,
4730         .mmap           = ll_file_mmap,
4731         .llseek         = ll_file_seek,
4732         .splice_read    = ll_file_splice_read,
4733         .fsync          = ll_fsync,
4734         .flush          = ll_flush,
4735         .flock          = ll_file_noflock,
4736         .lock           = ll_file_noflock
4737 };
4738
4739 struct inode_operations ll_file_inode_operations = {
4740         .setattr        = ll_setattr,
4741         .getattr        = ll_getattr,
4742         .permission     = ll_inode_permission,
4743 #ifdef HAVE_IOP_XATTR
4744         .setxattr       = ll_setxattr,
4745         .getxattr       = ll_getxattr,
4746         .removexattr    = ll_removexattr,
4747 #endif
4748         .listxattr      = ll_listxattr,
4749         .fiemap         = ll_fiemap,
4750 #ifdef HAVE_IOP_GET_ACL
4751         .get_acl        = ll_get_acl,
4752 #endif
4753 #ifdef HAVE_IOP_SET_ACL
4754         .set_acl        = ll_set_acl,
4755 #endif
4756 };
4757
4758 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4759 {
4760         struct ll_inode_info *lli = ll_i2info(inode);
4761         struct cl_object *obj = lli->lli_clob;
4762         struct lu_env *env;
4763         int rc;
4764         __u16 refcheck;
4765         ENTRY;
4766
4767         if (obj == NULL)
4768                 RETURN(0);
4769
4770         env = cl_env_get(&refcheck);
4771         if (IS_ERR(env))
4772                 RETURN(PTR_ERR(env));
4773
4774         rc = cl_conf_set(env, lli->lli_clob, conf);
4775         if (rc < 0)
4776                 GOTO(out, rc);
4777
4778         if (conf->coc_opc == OBJECT_CONF_SET) {
4779                 struct ldlm_lock *lock = conf->coc_lock;
4780                 struct cl_layout cl = {
4781                         .cl_layout_gen = 0,
4782                 };
4783
4784                 LASSERT(lock != NULL);
4785                 LASSERT(ldlm_has_layout(lock));
4786
4787                 /* it can only be allowed to match after layout is
4788                  * applied to inode otherwise false layout would be
4789                  * seen. Applying layout shoud happen before dropping
4790                  * the intent lock. */
4791                 ldlm_lock_allow_match(lock);
4792
4793                 rc = cl_object_layout_get(env, obj, &cl);
4794                 if (rc < 0)
4795                         GOTO(out, rc);
4796
4797                 CDEBUG(D_VFSTRACE,
4798                        DFID": layout version change: %u -> %u\n",
4799                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4800                        cl.cl_layout_gen);
4801                 ll_layout_version_set(lli, cl.cl_layout_gen);
4802         }
4803
4804 out:
4805         cl_env_put(env, &refcheck);
4806
4807         RETURN(rc);
4808 }
4809
4810 /**
      * Fetch layout from MDT with getxattr request, if it's not ready yet, and
      * attach it to \a lock as its LVB data.
      *
      * Nothing is done when lock->l_lvb_data is already set. Otherwise the
      * XATTR_NAME_LOV xattr is read with md_getxattr(), copied into a newly
      * allocated buffer and installed as the lock's LVB (type LVB_T_LAYOUT)
      * under the resource lock. If LVB data showed up in the meantime, the
      * local copy is freed and the existing data is kept.
      *
      * \param[in] inode      inode whose layout is fetched
      * \param[in,out] lock   layout lock that receives the LVB data
      *
      * \retval 0             LVB data already present, layout fetched, or empty
      *                       layout (-ENODATA or zero-length xattr; the LVB is
      *                       left untouched in that case)
      * \retval -EFAULT       RMF_EADATA buffer could not be obtained from the
      *                       request capsule
      * \retval -ENOMEM       the LVB copy could not be allocated
      * \retval < 0           error from ll_get_default_mdsize() or md_getxattr()
      */
4811 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4812
4813 {
4814         struct ll_sb_info *sbi = ll_i2sbi(inode);
             /* NULL so that the cleanup at "out" never sees an indeterminate
              * pointer if md_getxattr() fails without storing a request.
              * NOTE(review): assumes ptlrpc_req_finished(NULL) is a no-op --
              * confirm. */
4815         struct ptlrpc_request *req = NULL;
4816         void *lvbdata;
4817         void *lmm;
4818         int lmmsize;
4819         int rc;
4820         ENTRY;
4821
4822         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4823                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4824                lock->l_lvb_data, lock->l_lvb_len);
4825
             /* unlocked fast path; the test is repeated under the resource lock
              * before the LVB is actually installed below */
4826         if (lock->l_lvb_data != NULL)
4827                 RETURN(0);
4828
4829         /* if layout lock was granted right away, the layout is returned
4830          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4831          * blocked and then granted via completion ast, we have to fetch
4832          * layout here. Please note that we can't use the LVB buffer in
4833          * completion AST because it doesn't have a large enough buffer */
4834         rc = ll_get_default_mdsize(sbi, &lmmsize);
4835         if (rc < 0)
4836                 RETURN(rc);
4837
4838         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4839                          XATTR_NAME_LOV, lmmsize, &req);
4840         if (rc < 0) {
4841                 if (rc == -ENODATA)
4842                         GOTO(out, rc = 0); /* empty layout */
4843                 else
4844                         RETURN(rc);
4845         }
4846
             /* a non-negative return value is used as the actual xattr size */
4847         lmmsize = rc;
4848         rc = 0;
4849         if (lmmsize == 0) /* empty layout */
4850                 GOTO(out, rc = 0);
4851
4852         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4853         if (lmm == NULL)
4854                 GOTO(out, rc = -EFAULT);
4855
             /* private copy: lmm points into req, which is released at "out" */
4856         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4857         if (lvbdata == NULL)
4858                 GOTO(out, rc = -ENOMEM);
4859
4860         memcpy(lvbdata, lmm, lmmsize);
4861         lock_res_and_lock(lock);
             /* install our copy only if the lock still has no LVB data;
              * ownership of lvbdata moves to the lock in that case */
4862         if (unlikely(lock->l_lvb_data == NULL)) {
4863                 lock->l_lvb_type = LVB_T_LAYOUT;
4864                 lock->l_lvb_data = lvbdata;
4865                 lock->l_lvb_len = lmmsize;
4866                 lvbdata = NULL;
4867         }
4868         unlock_res_and_lock(lock);
4869
             /* still ours: the lock already had LVB data, drop the copy */
4870         if (lvbdata)
4871                 OBD_FREE_LARGE(lvbdata, lmmsize);
4872
4873         EXIT;
4874
4875 out:
4876         ptlrpc_req_finished(req);
4877         return rc;
4878 }
4879
4880 /**
4881  * Apply the layout to the inode. Layout lock is held and will be released
4882  * in this function.
      *
      * The lock behind \a lockh is looked up; if its LVB is already marked
      * ready, nothing is configured. Otherwise the layout is fetched into the
      * lock's LVB with ll_layout_fetch() and applied to the inode with an
      * OBJECT_CONF_SET request. On every path the lock pointer is put and
      * \a lockh is decref'ed in \a mode before returning.
      *
      * If applying the layout fails with -EBUSY, an OBJECT_CONF_WAIT request
      * is issued after the lock has been released; when that wait returns 0
      * the function returns -EAGAIN.
      *
      * \param[in] lockh     handle of the layout lock; must be in use
      * \param[in] mode      lock mode passed to ldlm_lock_decref()
      * \param[in] inode     inode the layout is applied to
      *
      * \retval 0            LVB was already ready, or the layout was applied
      * \retval -EAGAIN      layout was busy and the wait for it completed
      * \retval < 0          other error from ll_layout_fetch() or
      *                      ll_layout_conf()
4883  */
4884 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4885                               struct inode *inode)
4886 {
4887         struct ll_inode_info *lli = ll_i2info(inode);
4888         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4889         struct ldlm_lock *lock;
4890         struct cl_object_conf conf;
4891         int rc = 0;
4892         bool lvb_ready;
4893         bool wait_layout = false;
4894         ENTRY;
4895
4896         LASSERT(lustre_handle_is_used(lockh));
4897
4898         lock = ldlm_handle2lock(lockh);
4899         LASSERT(lock != NULL);
4900         LASSERT(ldlm_has_layout(lock));
4901
4902         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4903                    PFID(&lli->lli_fid), inode);
4904
4905         /* in case this is a caching lock and reinstate with new inode */
4906         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4907
4908         lock_res_and_lock(lock);
4909         lvb_ready = ldlm_is_lvb_ready(lock);
4910         unlock_res_and_lock(lock);
4911
4912         /* checking lvb_ready is racy but this is okay. The worst case is
4913          * that multi processes may configure the file on the same time. */
4914         if (lvb_ready)
4915                 GOTO(out, rc = 0);
4916
4917         rc = ll_layout_fetch(inode, lock);
4918         if (rc < 0)
4919                 GOTO(out, rc);
4920
4921         /* for layout lock, lmm is stored in lock's lvb.
4922          * lvb_data is immutable if the lock is held so it's safe to access it
4923          * without res lock.
4924          *
4925          * set layout to file. Unlikely this will fail as old layout was
4926          * surely eliminated */
4927         memset(&conf, 0, sizeof conf);
4928         conf.coc_opc = OBJECT_CONF_SET;
4929         conf.coc_inode = inode;
4930         conf.coc_lock = lock;
4931         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4932         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4933         rc = ll_layout_conf(inode, &conf);
4934
4935         /* refresh layout failed, need to wait */
4936         wait_layout = rc == -EBUSY;
4937         EXIT;
4938 out:
             /* release the lock pointer obtained above and the reference held
              * through lockh; the wait below runs only after both are gone */
4939         LDLM_LOCK_PUT(lock);
4940         ldlm_lock_decref(lockh, mode);
4941
4942         /* wait for IO to complete if it's still being used. */
4943         if (wait_layout) {
4944                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
4945                        ll_get_fsname(inode->i_sb, NULL, 0),
4946                        PFID(&lli->lli_fid), inode);
4947
4948                 memset(&conf, 0, sizeof conf);
4949                 conf.coc_opc = OBJECT_CONF_WAIT;
4950                 conf.coc_inode = inode;
4951                 rc = ll_layout_conf(inode, &conf);
                     /* the wait finished, but the layout has not been applied
                      * by this call: report -EAGAIN, on which
                      * ll_layout_refresh() tries again */
4952                 if (rc == 0)
4953                         rc = -EAGAIN;
4954
4955                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
4956                        ll_get_fsname(inode->i_sb, NULL, 0),
4957                        PFID(&lli->lli_fid), rc);
4958         }
4959         RETURN(rc);
4960 }
4961
4962 /**
4963  * Issue layout intent RPC to MDS.
      *
      * \a intent is attached to the operation data and an IT_LAYOUT lookup
      * intent is enqueued with md_intent_lock(). FMODE_WRITE is set in the
      * intent flags for LAYOUT_INTENT_WRITE and LAYOUT_INTENT_TRUNC. Before
      * returning, the intent's request is finished and ll_intent_drop_lock()
      * is called on the intent.
      *
4964  * \param inode [in]    file inode
4965  * \param intent [in]   layout intent
4966  *
4967  * \retval 0    on success
4968  * \retval < 0  error code
4969  */
4970 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
4971 {
4972         struct ll_inode_info  *lli = ll_i2info(inode);
4973         struct ll_sb_info     *sbi = ll_i2sbi(inode);
4974         struct md_op_data     *op_data;
4975         struct lookup_intent it;
             /* out-parameter for md_intent_lock(); not read afterwards in this
              * function, the request is released through it.it_request */
4976         struct ptlrpc_request *req;
4977         int rc;
4978         ENTRY;
4979
4980         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
4981                                      0, 0, LUSTRE_OPC_ANY, NULL);
4982         if (IS_ERR(op_data))
4983                 RETURN(PTR_ERR(op_data));
4984
             /* op_data only borrows the caller's intent, it is not copied */
4985         op_data->op_data = intent;
4986         op_data->op_data_size = sizeof(*intent);
4987
4988         memset(&it, 0, sizeof(it));
4989         it.it_op = IT_LAYOUT;
             /* write and truncate intents are flagged as write access */
4990         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
4991             intent->li_opc == LAYOUT_INTENT_TRUNC)
4992                 it.it_flags = FMODE_WRITE;
4993
4994         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
4995                           ll_get_fsname(inode->i_sb, NULL, 0),
4996                           PFID(&lli->lli_fid), inode);
4997
4998         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
4999                             &ll_md_blocking_ast, 0);
             /* the request is released whether or not the enqueue succeeded */
5000         if (it.it_request != NULL)
5001                 ptlrpc_req_finished(it.it_request);
5002         it.it_request = NULL;
5003
5004         ll_finish_md_op_data(op_data);
5005
5006         /* set lock data in case this is a new lock */
5007         if (!rc)
5008                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5009
5010         ll_intent_drop_lock(&it);
5011
5012         RETURN(rc);
5013 }
5014
5015 /**
5016  * This function checks if there exists a LAYOUT lock on the client side,
5017  * or enqueues it if it doesn't have one in cache.
5018  *
5019  * This function will not hold layout lock so it may be revoked any time after
5020  * this function returns. Any operations depend on layout should be redone
5021  * in that case.
5022  *
5023  * This function should be called before lov_io_init() to get an uptodate
5024  * layout version, the caller should save the version number and after IO
5025  * is finished, this function should be called again to verify that layout
5026  * is not changed during IO time.
      *
      * \param[in]  inode    file inode; asserted to be a regular file with a
      *                      sane FID once the fast path below is passed
      * \param[out] gen      layout version of the inode; always written on
      *                      entry and re-read after a successful refresh
      *
      * \retval 0            layout lock is not enabled on this mount
      *                      (LL_SBI_LAYOUT_LOCK unset), the inode already has
      *                      a layout version other than CL_LAYOUT_GEN_NONE, or
      *                      the layout was refreshed
      * \retval < 0          error from ll_layout_lock_set() or
      *                      ll_layout_intent()
5027  */
5028 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5029 {
5030         struct ll_inode_info    *lli = ll_i2info(inode);
5031         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5032         struct lustre_handle lockh;
5033         struct layout_intent intent = {
5034                 .li_opc = LAYOUT_INTENT_ACCESS,
5035         };
5036         enum ldlm_mode mode;
5037         int rc;
5038         ENTRY;
5039
             /* fast path, taken without lli_layout_mutex */
5040         *gen = ll_layout_version_get(lli);
5041         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5042                 RETURN(0);
5043
5044         /* sanity checks */
5045         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5046         LASSERT(S_ISREG(inode->i_mode));
5047
5048         /* take layout lock mutex to enqueue layout lock exclusively. */
5049         mutex_lock(&lli->lli_layout_mutex);
5050
5051         while (1) {
5052                 /* mostly layout lock is cached on the local side, so try to
5053                  * match it before enqueuing a new one. Note that
                      * lli_layout_mutex is already held at this point. */
5054                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5055                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5056                 if (mode != 0) { /* hit cached lock */
5057                         rc = ll_layout_lock_set(&lockh, mode, inode);
                             /* -EAGAIN: the layout was busy and has been waited
                              * for; match the lock again and retry */
5058                         if (rc == -EAGAIN)
5059                                 continue;
5060                         break;
5061                 }
5062
                     /* no cached lock: enqueue one; on success loop again so
                      * that the new lock is matched and applied above */
5063                 rc = ll_layout_intent(inode, &intent);
5064                 if (rc != 0)
5065                         break;
5066         }
5067
5068         if (rc == 0)
5069                 *gen = ll_layout_version_get(lli);
5070         mutex_unlock(&lli->lli_layout_mutex);
5071
5072         RETURN(rc);
5073 }
5074
5075 /**
5076  * Send a layout intent RPC describing the range an IO is about to write.
5077  *
5078  * \param[in] inode     file inode.
5079  * \param[in] opc       layout intent opcode to put in the request.
5080  * \param[in] ext       write range: start offset of the file in bytes and
5081  *                      exclusive end offset in bytes.
5082  *
5083  * \retval 0    on success
5084  * \retval < 0  error code
5085  */
5086 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5087                            struct lu_extent *ext)
5088 {
5089         struct layout_intent write_intent;
5090         int result;
5091         ENTRY;
5092
5093         memset(&write_intent, 0, sizeof(write_intent));
5094         write_intent.li_opc = opc;
5095         write_intent.li_extent.e_start = ext->e_start;
5096         write_intent.li_extent.e_end = ext->e_end;
5097
5098         result = ll_layout_intent(inode, &write_intent);
5099         RETURN(result);
5100 }
5101
5102 /**
5103  * Send a one-item HUA_RESTORE request for \a inode's FID and the extent
5104  * (\a offset, \a length) to the MDT via the LL_IOC_HSM_REQUEST ioctl.
5105  */
5106 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5107 {
5108         struct hsm_user_request *hur;
5109         int hur_size, rc;
5110         ENTRY;
5111
5112         hur_size = sizeof(struct hsm_user_item) +
5113                    sizeof(struct hsm_user_request);
5114         OBD_ALLOC(hur, hur_size);
5115         if (hur == NULL)
5116                 RETURN(-ENOMEM);
5117         hur->hur_request.hr_itemcount = 1;
5118         hur->hur_request.hr_action = HUA_RESTORE;
5119         hur->hur_request.hr_archive_id = 0;
5120         hur->hur_request.hr_flags = 0;
5121         hur->hur_user_item[0].hui_extent.offset = offset;
5122         hur->hur_user_item[0].hui_extent.length = length;
5123         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5124                sizeof(hur->hur_user_item[0].hui_fid));
5125         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5126                            hur_size, hur, NULL);
5127         OBD_FREE(hur, hur_size);
5128         RETURN(rc);
5129 }