lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 struct pcc_param {
  62         __u64   pa_data_version;
  63         __u32   pa_archive_id;
  64         __u32   pa_layout_gen;
  65 };
  66
  67 static int
  68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  69
  70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  71                           bool *lease_broken);
  72
  73 static struct ll_file_data *ll_file_data_get(void)
  74 {
  75         struct ll_file_data *fd;
  76
  77         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  78         if (fd == NULL)
  79                 return NULL;
  80
  81         fd->fd_write_failed = false;
  82         pcc_file_init(&fd->fd_pcc_file);
  83
  84         return fd;
  85 }
  86
  87 static void ll_file_data_put(struct ll_file_data *fd)
  88 {
  89         if (fd != NULL)
  90                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  91 }
  92
  93 /**
  94  * Packs all the attributes into @op_data for the CLOSE rpc.
  95  */
  96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  97                              struct obd_client_handle *och)
  98 {
  99         ENTRY;
 100
 101         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 102                            0, 0, LUSTRE_OPC_ANY, NULL);
 103
 104         op_data->op_attr.ia_mode = inode->i_mode;
 105         op_data->op_attr.ia_atime = inode->i_atime;
 106         op_data->op_attr.ia_mtime = inode->i_mtime;
 107         op_data->op_attr.ia_ctime = inode->i_ctime;
 108         op_data->op_attr.ia_size = i_size_read(inode);
 109         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 110                                       ATTR_MTIME | ATTR_MTIME_SET |
 111                                       ATTR_CTIME);
 112         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 113         op_data->op_attr_blocks = inode->i_blocks;
 114         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 115         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 116                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 117         op_data->op_open_handle = och->och_open_handle;
 118
 119         if (och->och_flags & FMODE_WRITE &&
 120             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 121                 /* For HSM: if inode data has been modified, pack it so that
 122                  * MDT can set data dirty flag in the archive. */
 123                 op_data->op_bias |= MDS_DATA_MODIFIED;
 124
 125         EXIT;
 126 }
 127
 128 /**
 129  * Perform a close, possibly with a bias.
 130  * The meaning of "data" depends on the value of "bias".
 131  *
 132  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 133  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 134  * swap layouts with.
 135  */
 136 static int ll_close_inode_openhandle(struct inode *inode,
 137                                      struct obd_client_handle *och,
 138                                      enum mds_op_bias bias, void *data)
 139 {
 140         struct obd_export *md_exp = ll_i2mdexp(inode);
 141         const struct ll_inode_info *lli = ll_i2info(inode);
 142         struct md_op_data *op_data;
 143         struct ptlrpc_request *req = NULL;
 144         int rc;
 145         ENTRY;
 146
 147         if (class_exp2obd(md_exp) == NULL) {
 148                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 149                        ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
 150                 GOTO(out, rc = 0);
 151         }
 152
 153         OBD_ALLOC_PTR(op_data);
 154         /* We leak openhandle and request here on error, but not much to be
 155          * done in OOM case since app won't retry close on error either. */
 156         if (op_data == NULL)
 157                 GOTO(out, rc = -ENOMEM);
 158
 159         ll_prepare_close(inode, op_data, och);
 160         switch (bias) {
 161         case MDS_CLOSE_LAYOUT_MERGE:
 162                 /* merge blocks from the victim inode */
 163                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 164                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 165                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 166                 /* fallthrough */
 167         case MDS_CLOSE_LAYOUT_SPLIT:
 168         case MDS_CLOSE_LAYOUT_SWAP: {
 169                 struct split_param *sp = data;
 170
 171                 LASSERT(data != NULL);
 172                 op_data->op_bias |= bias;
 173                 op_data->op_data_version = 0;
 174                 op_data->op_lease_handle = och->och_lease_handle;
 175                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 176                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 177                         op_data->op_mirror_id = sp->sp_mirror_id;
 178                 } else {
 179                         op_data->op_fid2 = *ll_inode2fid(data);
 180                 }
 181                 break;
 182         }
 183
 184         case MDS_CLOSE_RESYNC_DONE: {
 185                 struct ll_ioc_lease *ioc = data;
 186
 187                 LASSERT(data != NULL);
 188                 op_data->op_attr_blocks +=
 189                         ioc->lil_count * op_data->op_attr_blocks;
 190                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 191                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 192                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 193
 194                 op_data->op_lease_handle = och->och_lease_handle;
 195                 op_data->op_data = &ioc->lil_ids[0];
 196                 op_data->op_data_size =
 197                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 198                 break;
 199         }
 200
 201         case MDS_PCC_ATTACH: {
 202                 struct pcc_param *param = data;
 203
 204                 LASSERT(data != NULL);
 205                 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
 206                 op_data->op_archive_id = param->pa_archive_id;
 207                 op_data->op_data_version = param->pa_data_version;
 208                 op_data->op_lease_handle = och->och_lease_handle;
 209                 break;
 210         }
 211
 212         case MDS_HSM_RELEASE:
 213                 LASSERT(data != NULL);
 214                 op_data->op_bias |= MDS_HSM_RELEASE;
 215                 op_data->op_data_version = *(__u64 *)data;
 216                 op_data->op_lease_handle = och->och_lease_handle;
 217                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 218                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 219                 break;
 220
 221         default:
 222                 LASSERT(data == NULL);
 223                 break;
 224         }
 225
 226         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 227                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 228         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 229                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 230
 231         rc = md_close(md_exp, op_data, och->och_mod, &req);
 232         if (rc != 0 && rc != -EINTR)
 233                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 234                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 235
 236         if (rc == 0 && op_data->op_bias & bias) {
 237                 struct mdt_body *body;
 238
 239                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 240                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 241                         rc = -EBUSY;
 242
 243                 if (bias & MDS_PCC_ATTACH) {
 244                         struct pcc_param *param = data;
 245
 246                         param->pa_layout_gen = body->mbo_layout_gen;
 247                 }
 248         }
 249
 250         ll_finish_md_op_data(op_data);
 251         EXIT;
 252 out:
 253
 254         md_clear_open_replay_data(md_exp, och);
 255         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 256         OBD_FREE_PTR(och);
 257
 258         ptlrpc_req_finished(req);       /* This is close request */
 259         return rc;
 260 }
 261
 262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 263 {
 264         struct ll_inode_info *lli = ll_i2info(inode);
 265         struct obd_client_handle **och_p;
 266         struct obd_client_handle *och;
 267         __u64 *och_usecount;
 268         int rc = 0;
 269         ENTRY;
 270
 271         if (fmode & FMODE_WRITE) {
 272                 och_p = &lli->lli_mds_write_och;
 273                 och_usecount = &lli->lli_open_fd_write_count;
 274         } else if (fmode & FMODE_EXEC) {
 275                 och_p = &lli->lli_mds_exec_och;
 276                 och_usecount = &lli->lli_open_fd_exec_count;
 277         } else {
 278                 LASSERT(fmode & FMODE_READ);
 279                 och_p = &lli->lli_mds_read_och;
 280                 och_usecount = &lli->lli_open_fd_read_count;
 281         }
 282
 283         mutex_lock(&lli->lli_och_mutex);
 284         if (*och_usecount > 0) {
 285                 /* There are still users of this handle, so skip
 286                  * freeing it. */
 287                 mutex_unlock(&lli->lli_och_mutex);
 288                 RETURN(0);
 289         }
 290
 291         och = *och_p;
 292         *och_p = NULL;
 293         mutex_unlock(&lli->lli_och_mutex);
 294
 295         if (och != NULL) {
 296                 /* There might be a race and this handle may already
 297                  * be closed. */
 298                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 299         }
 300
 301         RETURN(rc);
 302 }
 303
 304 static int ll_md_close(struct inode *inode, struct file *file)
 305 {
 306         union ldlm_policy_data policy = {
 307                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 308         };
 309         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 310         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 311         struct ll_inode_info *lli = ll_i2info(inode);
 312         struct lustre_handle lockh;
 313         enum ldlm_mode lockmode;
 314         int rc = 0;
 315         ENTRY;
 316
 317         /* clear group lock, if present */
 318         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 319                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 320
 321         if (fd->fd_lease_och != NULL) {
 322                 bool lease_broken;
 323
 324                 /* Usually the lease is not released when the
 325                  * application crashed, we need to release here. */
 326                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 327                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 328                         PFID(&lli->lli_fid), rc, lease_broken);
 329
 330                 fd->fd_lease_och = NULL;
 331         }
 332
 333         if (fd->fd_och != NULL) {
 334                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 335                 fd->fd_och = NULL;
 336                 GOTO(out, rc);
 337         }
 338
 339         /* Let's see if we have good enough OPEN lock on the file and if
 340            we can skip talking to MDS */
 341         mutex_lock(&lli->lli_och_mutex);
 342         if (fd->fd_omode & FMODE_WRITE) {
 343                 lockmode = LCK_CW;
 344                 LASSERT(lli->lli_open_fd_write_count);
 345                 lli->lli_open_fd_write_count--;
 346         } else if (fd->fd_omode & FMODE_EXEC) {
 347                 lockmode = LCK_PR;
 348                 LASSERT(lli->lli_open_fd_exec_count);
 349                 lli->lli_open_fd_exec_count--;
 350         } else {
 351                 lockmode = LCK_CR;
 352                 LASSERT(lli->lli_open_fd_read_count);
 353                 lli->lli_open_fd_read_count--;
 354         }
 355         mutex_unlock(&lli->lli_och_mutex);
 356
 357         /* LU-4398: do not cache write open lock if the file has exec bit */
 358         if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
 359             !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 360                            LDLM_IBITS, &policy, lockmode, &lockh))
 361                 rc = ll_md_real_close(inode, fd->fd_omode);
 362
 363 out:
 364         LUSTRE_FPRIVATE(file) = NULL;
 365         ll_file_data_put(fd);
 366
 367         RETURN(rc);
 368 }
 369
 370 /* While this returns an error code, fput() the caller does not, so we need
 371  * to make every effort to clean up all of our state here.  Also, applications
 372  * rarely check close errors and even if an error is returned they will not
 373  * re-try the close call.
 374  */
 375 int ll_file_release(struct inode *inode, struct file *file)
 376 {
 377         struct ll_file_data *fd;
 378         struct ll_sb_info *sbi = ll_i2sbi(inode);
 379         struct ll_inode_info *lli = ll_i2info(inode);
 380         ktime_t kstart = ktime_get();
 381         int rc;
 382
 383         ENTRY;
 384
 385         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 386                PFID(ll_inode2fid(inode)), inode);
 387
 388         fd = LUSTRE_FPRIVATE(file);
 389         LASSERT(fd != NULL);
 390
 391         /* The last ref on @file, maybe not the the owner pid of statahead,
 392          * because parent and child process can share the same file handle. */
 393         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 394                 ll_deauthorize_statahead(inode, fd);
 395
 396         if (inode->i_sb->s_root == file_dentry(file)) {
 397                 LUSTRE_FPRIVATE(file) = NULL;
 398                 ll_file_data_put(fd);
 399                 GOTO(out, rc = 0);
 400         }
 401
 402         pcc_file_release(inode, file);
 403
 404         if (!S_ISDIR(inode->i_mode)) {
 405                 if (lli->lli_clob != NULL)
 406                         lov_read_and_clear_async_rc(lli->lli_clob);
 407                 lli->lli_async_rc = 0;
 408         }
 409
 410         rc = ll_md_close(inode, file);
 411
 412         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 413                 libcfs_debug_dumplog();
 414
 415 out:
 416         if (!rc && inode->i_sb->s_root != file_dentry(file))
 417                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
 418                                    ktime_us_delta(ktime_get(), kstart));
 419         RETURN(rc);
 420 }
 421
 422 static inline int ll_dom_readpage(void *data, struct page *page)
 423 {
 424         struct niobuf_local *lnb = data;
 425         void *kaddr;
 426
 427         kaddr = ll_kmap_atomic(page, KM_USER0);
 428         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 429         if (lnb->lnb_len < PAGE_SIZE)
 430                 memset(kaddr + lnb->lnb_len, 0,
 431                        PAGE_SIZE - lnb->lnb_len);
 432         flush_dcache_page(page);
 433         SetPageUptodate(page);
 434         ll_kunmap_atomic(kaddr, KM_USER0);
 435         unlock_page(page);
 436
 437         return 0;
 438 }
 439
 440 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 441                         struct lookup_intent *it)
 442 {
 443         struct ll_inode_info *lli = ll_i2info(inode);
 444         struct cl_object *obj = lli->lli_clob;
 445         struct address_space *mapping = inode->i_mapping;
 446         struct page *vmpage;
 447         struct niobuf_remote *rnb;
 448         struct mdt_body *body;
 449         char *data;
 450         unsigned long index, start;
 451         struct niobuf_local lnb;
 452
 453         ENTRY;
 454
 455         if (obj == NULL)
 456                 RETURN_EXIT;
 457
 458         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 459                                    RCL_SERVER))
 460                 RETURN_EXIT;
 461
 462         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 463         if (rnb == NULL || rnb->rnb_len == 0)
 464                 RETURN_EXIT;
 465
 466         /* LU-11595: Server may return whole file and that is OK always or
 467          * it may return just file tail and its offset must be aligned with
 468          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 469          * smaller then offset may be not aligned and that data is just ignored.
 470          */
 471         if (rnb->rnb_offset % PAGE_SIZE)
 472                 RETURN_EXIT;
 473
 474         /* Server returns whole file or just file tail if it fills in reply
 475          * buffer, in both cases total size should be equal to the file size.
 476          */
 477         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 478         if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
 479                 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
 480                        ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
 481                        rnb->rnb_len, body->mbo_dom_size);
 482                 RETURN_EXIT;
 483         }
 484
 485         CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
 486                rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
 487
 488         data = (char *)rnb + sizeof(*rnb);
 489
 490         lnb.lnb_file_offset = rnb->rnb_offset;
 491         start = lnb.lnb_file_offset / PAGE_SIZE;
 492         index = 0;
 493         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 494         lnb.lnb_page_offset = 0;
 495         do {
 496                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 497                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 498                 if (lnb.lnb_len > PAGE_SIZE)
 499                         lnb.lnb_len = PAGE_SIZE;
 500
 501                 vmpage = read_cache_page(mapping, index + start,
 502                                          ll_dom_readpage, &lnb);
 503                 if (IS_ERR(vmpage)) {
 504                         CWARN("%s: cannot fill page %lu for "DFID
 505                               " with data: rc = %li\n",
 506                               ll_i2sbi(inode)->ll_fsname, index + start,
 507                               PFID(lu_object_fid(&obj->co_lu)),
 508                               PTR_ERR(vmpage));
 509                         break;
 510                 }
 511                 put_page(vmpage);
 512                 index++;
 513         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 514         EXIT;
 515 }
 516
 517 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 518                                 struct lookup_intent *itp)
 519 {
 520         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 521         struct dentry *parent = de->d_parent;
 522         char *name = NULL;
 523         int len = 0;
 524         struct md_op_data *op_data;
 525         struct ptlrpc_request *req = NULL;
 526         int rc;
 527         ENTRY;
 528
 529         LASSERT(parent != NULL);
 530         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 531
 532         /* if server supports open-by-fid, or file name is invalid, don't pack
 533          * name in open request */
 534         if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
 535             !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
 536 retry:
 537                 len = de->d_name.len;
 538                 name = kmalloc(len + 1, GFP_NOFS);
 539                 if (!name)
 540                         RETURN(-ENOMEM);
 541
 542                 /* race here */
 543                 spin_lock(&de->d_lock);
 544                 if (len != de->d_name.len) {
 545                         spin_unlock(&de->d_lock);
 546                         kfree(name);
 547                         goto retry;
 548                 }
 549                 memcpy(name, de->d_name.name, len);
 550                 name[len] = '\0';
 551                 spin_unlock(&de->d_lock);
 552
 553                 if (!lu_name_is_valid_2(name, len)) {
 554                         kfree(name);
 555                         RETURN(-ESTALE);
 556                 }
 557         }
 558
 559         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 560                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 561         if (IS_ERR(op_data)) {
 562                 kfree(name);
 563                 RETURN(PTR_ERR(op_data));
 564         }
 565         op_data->op_data = lmm;
 566         op_data->op_data_size = lmmsize;
 567
 568         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 569                             &ll_md_blocking_ast, 0);
 570         kfree(name);
 571         ll_finish_md_op_data(op_data);
 572         if (rc == -ESTALE) {
 573                 /* reason for keep own exit path - don`t flood log
 574                  * with messages with -ESTALE errors.
 575                  */
 576                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 577                      it_open_error(DISP_OPEN_OPEN, itp))
 578                         GOTO(out, rc);
 579                 ll_release_openhandle(de, itp);
 580                 GOTO(out, rc);
 581         }
 582
 583         if (it_disposition(itp, DISP_LOOKUP_NEG))
 584                 GOTO(out, rc = -ENOENT);
 585
 586         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 587                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 588                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 589                 GOTO(out, rc);
 590         }
 591
 592         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 593
 594         if (!rc && itp->it_lock_mode) {
 595                 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
 596                 struct ldlm_lock *lock;
 597                 bool has_dom_bit = false;
 598
 599                 /* If we got a lock back and it has a LOOKUP bit set,
 600                  * make sure the dentry is marked as valid so we can find it.
 601                  * We don't need to care about actual hashing since other bits
 602                  * of kernel will deal with that later.
 603                  */
 604                 lock = ldlm_handle2lock(&handle);
 605                 if (lock) {
 606                         has_dom_bit = ldlm_has_dom(lock);
 607                         if (lock->l_policy_data.l_inodebits.bits &
 608                             MDS_INODELOCK_LOOKUP)
 609                                 d_lustre_revalidate(de);
 610
 611                         LDLM_LOCK_PUT(lock);
 612                 }
 613                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 614                 if (has_dom_bit)
 615                         ll_dom_finish_open(de->d_inode, req, itp);
 616         }
 617
 618 out:
 619         ptlrpc_req_finished(req);
 620         ll_intent_drop_lock(itp);
 621
 622         /* We did open by fid, but by the time we got to the server,
 623          * the object disappeared. If this is a create, we cannot really
 624          * tell the userspace that the file it was trying to create
 625          * does not exist. Instead let's return -ESTALE, and the VFS will
 626          * retry the create with LOOKUP_REVAL that we are going to catch
 627          * in ll_revalidate_dentry() and use lookup then.
 628          */
 629         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 630                 rc = -ESTALE;
 631
 632         RETURN(rc);
 633 }
 634
 635 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 636                        struct obd_client_handle *och)
 637 {
 638         struct mdt_body *body;
 639
 640         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 641         och->och_open_handle = body->mbo_open_handle;
 642         och->och_fid = body->mbo_fid1;
 643         och->och_lease_handle.cookie = it->it_lock_handle;
 644         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 645         och->och_flags = it->it_flags;
 646
 647         return md_set_open_replay_data(md_exp, och, it);
 648 }
 649
 650 static int ll_local_open(struct file *file, struct lookup_intent *it,
 651                          struct ll_file_data *fd, struct obd_client_handle *och)
 652 {
 653         struct inode *inode = file_inode(file);
 654         ENTRY;
 655
 656         LASSERT(!LUSTRE_FPRIVATE(file));
 657
 658         LASSERT(fd != NULL);
 659
 660         if (och) {
 661                 int rc;
 662
 663                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 664                 if (rc != 0)
 665                         RETURN(rc);
 666         }
 667
 668         LUSTRE_FPRIVATE(file) = fd;
 669         ll_readahead_init(inode, &fd->fd_ras);
 670         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 671
 672         /* ll_cl_context initialize */
 673         rwlock_init(&fd->fd_lock);
 674         INIT_LIST_HEAD(&fd->fd_lccs);
 675
 676         RETURN(0);
 677 }
 678
 679 /* Open a file, and (for the very first open) create objects on the OSTs at
 680  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 681  * creation or open until ll_lov_setstripe() ioctl is called.
 682  *
 683  * If we already have the stripe MD locally then we don't request it in
 684  * md_open(), by passing a lmm_size = 0.
 685  *
 686  * It is up to the application to ensure no other processes open this file
 687  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 688  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 689  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 690  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 691  */
 692 int ll_file_open(struct inode *inode, struct file *file)
 693 {
 694         struct ll_inode_info *lli = ll_i2info(inode);
 695         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 696                                           .it_flags = file->f_flags };
 697         struct obd_client_handle **och_p = NULL;
 698         __u64 *och_usecount = NULL;
 699         struct ll_file_data *fd;
 700         ktime_t kstart = ktime_get();
 701         int rc = 0;
 702         ENTRY;
 703
 704         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 705                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 706
 707         it = file->private_data; /* XXX: compat macro */
 708         file->private_data = NULL; /* prevent ll_local_open assertion */
 709
 710         fd = ll_file_data_get();
 711         if (fd == NULL)
 712                 GOTO(out_nofiledata, rc = -ENOMEM);
 713
 714         fd->fd_file = file;
 715         if (S_ISDIR(inode->i_mode))
 716                 ll_authorize_statahead(inode, fd);
 717
 718         if (inode->i_sb->s_root == file_dentry(file)) {
 719                 LUSTRE_FPRIVATE(file) = fd;
 720                 RETURN(0);
 721         }
 722
 723         if (!it || !it->it_disposition) {
 724                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 725                  * because everything but O_ACCMODE mask was stripped from
 726                  * there */
 727                 if ((oit.it_flags + 1) & O_ACCMODE)
 728                         oit.it_flags++;
 729                 if (file->f_flags & O_TRUNC)
 730                         oit.it_flags |= FMODE_WRITE;
 731
 732                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 733                  * dentry_open after call to open_namei that checks permissions.
 734                  * Only nfsd_open call dentry_open directly without checking
 735                  * permissions and because of that this code below is safe.
 736                  */
 737                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 738                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 739
 740                 /* We do not want O_EXCL here, presumably we opened the file
 741                  * already? XXX - NFS implications? */
 742                 oit.it_flags &= ~O_EXCL;
 743
 744                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 745                  * created if necessary, then "IT_CREAT" should be set to keep
 746                  * consistent with it */
 747                 if (oit.it_flags & O_CREAT)
 748                         oit.it_op |= IT_CREAT;
 749
 750                 it = &oit;
 751         }
 752
 753 restart:
 754         /* Let's see if we have file open on MDS already. */
 755         if (it->it_flags & FMODE_WRITE) {
 756                 och_p = &lli->lli_mds_write_och;
 757                 och_usecount = &lli->lli_open_fd_write_count;
 758         } else if (it->it_flags & FMODE_EXEC) {
 759                 och_p = &lli->lli_mds_exec_och;
 760                 och_usecount = &lli->lli_open_fd_exec_count;
 761          } else {
 762                 och_p = &lli->lli_mds_read_och;
 763                 och_usecount = &lli->lli_open_fd_read_count;
 764         }
 765
 766         mutex_lock(&lli->lli_och_mutex);
 767         if (*och_p) { /* Open handle is present */
 768                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 769                         /* Well, there's extra open request that we do not need,
 770                            let's close it somehow. This will decref request. */
 771                         rc = it_open_error(DISP_OPEN_OPEN, it);
 772                         if (rc) {
 773                                 mutex_unlock(&lli->lli_och_mutex);
 774                                 GOTO(out_openerr, rc);
 775                         }
 776
 777                         ll_release_openhandle(file_dentry(file), it);
 778                 }
 779                 (*och_usecount)++;
 780
 781                 rc = ll_local_open(file, it, fd, NULL);
 782                 if (rc) {
 783                         (*och_usecount)--;
 784                         mutex_unlock(&lli->lli_och_mutex);
 785                         GOTO(out_openerr, rc);
 786                 }
 787         } else {
 788                 LASSERT(*och_usecount == 0);
 789                 if (!it->it_disposition) {
 790                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 791                         /* We cannot just request lock handle now, new ELC code
 792                            means that one of other OPEN locks for this file
 793                            could be cancelled, and since blocking ast handler
 794                            would attempt to grab och_mutex as well, that would
 795                            result in a deadlock */
 796                         mutex_unlock(&lli->lli_och_mutex);
 797                         /*
 798                          * Normally called under two situations:
 799                          * 1. NFS export.
 800                          * 2. A race/condition on MDS resulting in no open
 801                          *    handle to be returned from LOOKUP|OPEN request,
 802                          *    for example if the target entry was a symlink.
 803                          *
 804                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 805                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 806                          *  bit so that it's not confusing later callers.
 807                          *
 808                          *  NB; when ldd is NULL, it must have come via normal
 809                          *  lookup path only, since ll_iget_for_nfs always calls
 810                          *  ll_d_init().
 811                          */
 812                         if (ldd && ldd->lld_nfs_dentry) {
 813                                 ldd->lld_nfs_dentry = 0;
 814                                 it->it_flags |= MDS_OPEN_LOCK;
 815                         }
 816
 817                          /*
 818                          * Always specify MDS_OPEN_BY_FID because we don't want
 819                          * to get file with different fid.
 820                          */
 821                         it->it_flags |= MDS_OPEN_BY_FID;
 822                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 823                                                  it);
 824                         if (rc)
 825                                 GOTO(out_openerr, rc);
 826
 827                         goto restart;
 828                 }
 829                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 830                 if (!*och_p)
 831                         GOTO(out_och_free, rc = -ENOMEM);
 832
 833                 (*och_usecount)++;
 834
 835                 /* md_intent_lock() didn't get a request ref if there was an
 836                  * open error, so don't do cleanup on the request here
 837                  * (bug 3430) */
 838                 /* XXX (green): Should not we bail out on any error here, not
 839                  * just open error? */
 840                 rc = it_open_error(DISP_OPEN_OPEN, it);
 841                 if (rc != 0)
 842                         GOTO(out_och_free, rc);
 843
 844                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 845                          "inode %p: disposition %x, status %d\n", inode,
 846                          it_disposition(it, ~0), it->it_status);
 847
 848                 rc = ll_local_open(file, it, fd, *och_p);
 849                 if (rc)
 850                         GOTO(out_och_free, rc);
 851         }
 852
 853         rc = pcc_file_open(inode, file);
 854         if (rc)
 855                 GOTO(out_och_free, rc);
 856
 857         mutex_unlock(&lli->lli_och_mutex);
 858         fd = NULL;
 859
 860         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 861            different kind of OPEN lock for this same inode gets cancelled
 862            by ldlm_cancel_lru */
 863         if (!S_ISREG(inode->i_mode))
 864                 GOTO(out_och_free, rc);
 865
 866         cl_lov_delay_create_clear(&file->f_flags);
 867         GOTO(out_och_free, rc);
 868
 869 out_och_free:
 870         if (rc) {
 871                 if (och_p && *och_p) {
 872                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 873                         *och_p = NULL; /* OBD_FREE writes some magic there */
 874                         (*och_usecount)--;
 875                 }
 876                 mutex_unlock(&lli->lli_och_mutex);
 877
 878 out_openerr:
 879                 if (lli->lli_opendir_key == fd)
 880                         ll_deauthorize_statahead(inode, fd);
 881
 882                 if (fd != NULL)
 883                         ll_file_data_put(fd);
 884         } else {
 885                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
 886                                    ktime_us_delta(ktime_get(), kstart));
 887         }
 888
 889 out_nofiledata:
 890         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 891                 ptlrpc_req_finished(it->it_request);
 892                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 893         }
 894
 895         return rc;
 896 }
 897
 898 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 899                         struct ldlm_lock_desc *desc, void *data, int flag)
 900 {
 901         int rc;
 902         struct lustre_handle lockh;
 903         ENTRY;
 904
 905         switch (flag) {
 906         case LDLM_CB_BLOCKING:
 907                 ldlm_lock2handle(lock, &lockh);
 908                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 909                 if (rc < 0) {
 910                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 911                         RETURN(rc);
 912                 }
 913                 break;
 914         case LDLM_CB_CANCELING:
 915                 /* do nothing */
 916                 break;
 917         }
 918         RETURN(0);
 919 }
 920
 921 /**
 922  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 923  * and save it as fd->fd_och so as to force client to reopen the file even
 924  * if it has an open lock in cache already.
 925  */
 926 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 927                                 struct lustre_handle *old_open_handle)
 928 {
 929         struct ll_inode_info *lli = ll_i2info(inode);
 930         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 931         struct obd_client_handle **och_p;
 932         __u64 *och_usecount;
 933         int rc = 0;
 934         ENTRY;
 935
 936         /* Get the openhandle of the file */
 937         mutex_lock(&lli->lli_och_mutex);
 938         if (fd->fd_lease_och != NULL)
 939                 GOTO(out_unlock, rc = -EBUSY);
 940
 941         if (fd->fd_och == NULL) {
 942                 if (file->f_mode & FMODE_WRITE) {
 943                         LASSERT(lli->lli_mds_write_och != NULL);
 944                         och_p = &lli->lli_mds_write_och;
 945                         och_usecount = &lli->lli_open_fd_write_count;
 946                 } else {
 947                         LASSERT(lli->lli_mds_read_och != NULL);
 948                         och_p = &lli->lli_mds_read_och;
 949                         och_usecount = &lli->lli_open_fd_read_count;
 950                 }
 951
 952                 if (*och_usecount > 1)
 953                         GOTO(out_unlock, rc = -EBUSY);
 954
 955                 fd->fd_och = *och_p;
 956                 *och_usecount = 0;
 957                 *och_p = NULL;
 958         }
 959
 960         *old_open_handle = fd->fd_och->och_open_handle;
 961
 962         EXIT;
 963 out_unlock:
 964         mutex_unlock(&lli->lli_och_mutex);
 965         return rc;
 966 }
 967
 968 /**
 969  * Release ownership on lli_mds_*_och when putting back a file lease.
 970  */
 971 static int ll_lease_och_release(struct inode *inode, struct file *file)
 972 {
 973         struct ll_inode_info *lli = ll_i2info(inode);
 974         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 975         struct obd_client_handle **och_p;
 976         struct obd_client_handle *old_och = NULL;
 977         __u64 *och_usecount;
 978         int rc = 0;
 979         ENTRY;
 980
 981         mutex_lock(&lli->lli_och_mutex);
 982         if (file->f_mode & FMODE_WRITE) {
 983                 och_p = &lli->lli_mds_write_och;
 984                 och_usecount = &lli->lli_open_fd_write_count;
 985         } else {
 986                 och_p = &lli->lli_mds_read_och;
 987                 och_usecount = &lli->lli_open_fd_read_count;
 988         }
 989
 990         /* The file may have been open by another process (broken lease) so
 991          * *och_p is not NULL. In this case we should simply increase usecount
 992          * and close fd_och.
 993          */
 994         if (*och_p != NULL) {
 995                 old_och = fd->fd_och;
 996                 (*och_usecount)++;
 997         } else {
 998                 *och_p = fd->fd_och;
 999                 *och_usecount = 1;
1000         }
1001         fd->fd_och = NULL;
1002         mutex_unlock(&lli->lli_och_mutex);
1003
1004         if (old_och != NULL)
1005                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1006
1007         RETURN(rc);
1008 }
1009
1010 /**
1011  * Acquire a lease and open the file.
1012  */
1013 static struct obd_client_handle *
1014 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1015               __u64 open_flags)
1016 {
1017         struct lookup_intent it = { .it_op = IT_OPEN };
1018         struct ll_sb_info *sbi = ll_i2sbi(inode);
1019         struct md_op_data *op_data;
1020         struct ptlrpc_request *req = NULL;
1021         struct lustre_handle old_open_handle = { 0 };
1022         struct obd_client_handle *och = NULL;
1023         int rc;
1024         int rc2;
1025         ENTRY;
1026
1027         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1028                 RETURN(ERR_PTR(-EINVAL));
1029
1030         if (file != NULL) {
1031                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1032                         RETURN(ERR_PTR(-EPERM));
1033
1034                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1035                 if (rc)
1036                         RETURN(ERR_PTR(rc));
1037         }
1038
1039         OBD_ALLOC_PTR(och);
1040         if (och == NULL)
1041                 RETURN(ERR_PTR(-ENOMEM));
1042
1043         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1044                                         LUSTRE_OPC_ANY, NULL);
1045         if (IS_ERR(op_data))
1046                 GOTO(out, rc = PTR_ERR(op_data));
1047
1048         /* To tell the MDT this openhandle is from the same owner */
1049         op_data->op_open_handle = old_open_handle;
1050
1051         it.it_flags = fmode | open_flags;
1052         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1053         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1054                             &ll_md_blocking_lease_ast,
1055         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1056          * it can be cancelled which may mislead applications that the lease is
1057          * broken;
1058          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1059          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1060          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1061                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1062         ll_finish_md_op_data(op_data);
1063         ptlrpc_req_finished(req);
1064         if (rc < 0)
1065                 GOTO(out_release_it, rc);
1066
1067         if (it_disposition(&it, DISP_LOOKUP_NEG))
1068                 GOTO(out_release_it, rc = -ENOENT);
1069
1070         rc = it_open_error(DISP_OPEN_OPEN, &it);
1071         if (rc)
1072                 GOTO(out_release_it, rc);
1073
1074         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1075         rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1076         if (rc)
1077                 GOTO(out_release_it, rc);
1078
1079         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1080                 GOTO(out_close, rc = -EOPNOTSUPP);
1081
1082         /* already get lease, handle lease lock */
1083         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1084         if (it.it_lock_mode == 0 ||
1085             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1086                 /* open lock must return for lease */
1087                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1088                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1089                         it.it_lock_bits);
1090                 GOTO(out_close, rc = -EPROTO);
1091         }
1092
1093         ll_intent_release(&it);
1094         RETURN(och);
1095
1096 out_close:
1097         /* Cancel open lock */
1098         if (it.it_lock_mode != 0) {
1099                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1100                                             it.it_lock_mode);
1101                 it.it_lock_mode = 0;
1102                 och->och_lease_handle.cookie = 0ULL;
1103         }
1104         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1105         if (rc2 < 0)
1106                 CERROR("%s: error closing file "DFID": %d\n",
1107                        sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1108         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1109 out_release_it:
1110         ll_intent_release(&it);
1111 out:
1112         if (och != NULL)
1113                 OBD_FREE_PTR(och);
1114         RETURN(ERR_PTR(rc));
1115 }
1116
1117 /**
1118  * Check whether a layout swap can be done between two inodes.
1119  *
1120  * \param[in] inode1  First inode to check
1121  * \param[in] inode2  Second inode to check
1122  *
1123  * \retval 0 on success, layout swap can be performed between both inodes
1124  * \retval negative error code if requirements are not met
1125  */
1126 static int ll_check_swap_layouts_validity(struct inode *inode1,
1127                                           struct inode *inode2)
1128 {
1129         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1130                 return -EINVAL;
1131
1132         if (inode_permission(inode1, MAY_WRITE) ||
1133             inode_permission(inode2, MAY_WRITE))
1134                 return -EPERM;
1135
1136         if (inode1->i_sb != inode2->i_sb)
1137                 return -EXDEV;
1138
1139         return 0;
1140 }
1141
1142 static int ll_swap_layouts_close(struct obd_client_handle *och,
1143                                  struct inode *inode, struct inode *inode2)
1144 {
1145         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1146         const struct lu_fid     *fid2;
1147         int                      rc;
1148         ENTRY;
1149
1150         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1151                ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1152
1153         rc = ll_check_swap_layouts_validity(inode, inode2);
1154         if (rc < 0)
1155                 GOTO(out_free_och, rc);
1156
1157         /* We now know that inode2 is a lustre inode */
1158         fid2 = ll_inode2fid(inode2);
1159
1160         rc = lu_fid_cmp(fid1, fid2);
1161         if (rc == 0)
1162                 GOTO(out_free_och, rc = -EINVAL);
1163
1164         /* Close the file and {swap,merge} layouts between inode & inode2.
1165          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1166          * because we still need it to pack l_remote_handle to MDT. */
1167         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1168                                        inode2);
1169
1170         och = NULL; /* freed in ll_close_inode_openhandle() */
1171
1172 out_free_och:
1173         if (och != NULL)
1174                 OBD_FREE_PTR(och);
1175
1176         RETURN(rc);
1177 }
1178
1179 /**
1180  * Release lease and close the file.
1181  * It will check if the lease has ever broken.
1182  */
1183 static int ll_lease_close_intent(struct obd_client_handle *och,
1184                                  struct inode *inode,
1185                                  bool *lease_broken, enum mds_op_bias bias,
1186                                  void *data)
1187 {
1188         struct ldlm_lock *lock;
1189         bool cancelled = true;
1190         int rc;
1191         ENTRY;
1192
1193         lock = ldlm_handle2lock(&och->och_lease_handle);
1194         if (lock != NULL) {
1195                 lock_res_and_lock(lock);
1196                 cancelled = ldlm_is_cancel(lock);
1197                 unlock_res_and_lock(lock);
1198                 LDLM_LOCK_PUT(lock);
1199         }
1200
1201         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1202                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1203
1204         if (lease_broken != NULL)
1205                 *lease_broken = cancelled;
1206
1207         if (!cancelled && !bias)
1208                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1209
1210         if (cancelled) { /* no need to excute intent */
1211                 bias = 0;
1212                 data = NULL;
1213         }
1214
1215         rc = ll_close_inode_openhandle(inode, och, bias, data);
1216         RETURN(rc);
1217 }
1218
1219 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1220                           bool *lease_broken)
1221 {
1222         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1223 }
1224
1225 /**
1226  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1227  */
1228 static int ll_lease_file_resync(struct obd_client_handle *och,
1229                                 struct inode *inode, unsigned long arg)
1230 {
1231         struct ll_sb_info *sbi = ll_i2sbi(inode);
1232         struct md_op_data *op_data;
1233         struct ll_ioc_lease_id ioc;
1234         __u64 data_version_unused;
1235         int rc;
1236         ENTRY;
1237
1238         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1239                                      LUSTRE_OPC_ANY, NULL);
1240         if (IS_ERR(op_data))
1241                 RETURN(PTR_ERR(op_data));
1242
1243         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1244                            sizeof(ioc)))
1245                 RETURN(-EFAULT);
1246
1247         /* before starting file resync, it's necessary to clean up page cache
1248          * in client memory, otherwise once the layout version is increased,
1249          * writing back cached data will be denied the OSTs. */
1250         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1251         if (rc)
1252                 GOTO(out, rc);
1253
1254         op_data->op_lease_handle = och->och_lease_handle;
1255         op_data->op_mirror_id = ioc.lil_mirror_id;
1256         rc = md_file_resync(sbi->ll_md_exp, op_data);
1257         if (rc)
1258                 GOTO(out, rc);
1259
1260         EXIT;
1261 out:
1262         ll_finish_md_op_data(op_data);
1263         return rc;
1264 }
1265
1266 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1267 {
1268         struct ll_inode_info *lli = ll_i2info(inode);
1269         struct cl_object *obj = lli->lli_clob;
1270         struct cl_attr *attr = vvp_env_thread_attr(env);
1271         s64 atime;
1272         s64 mtime;
1273         s64 ctime;
1274         int rc = 0;
1275
1276         ENTRY;
1277
1278         ll_inode_size_lock(inode);
1279
1280         /* Merge timestamps the most recently obtained from MDS with
1281          * timestamps obtained from OSTs.
1282          *
1283          * Do not overwrite atime of inode because it may be refreshed
1284          * by file_accessed() function. If the read was served by cache
1285          * data, there is no RPC to be sent so that atime may not be
1286          * transferred to OSTs at all. MDT only updates atime at close time
1287          * if it's at least 'mdd.*.atime_diff' older.
1288          * All in all, the atime in Lustre does not strictly comply with
1289          * POSIX. Solving this problem needs to send an RPC to MDT for each
1290          * read, this will hurt performance.
1291          */
1292         if (inode->i_atime.tv_sec < lli->lli_atime ||
1293             lli->lli_update_atime) {
1294                 inode->i_atime.tv_sec = lli->lli_atime;
1295                 lli->lli_update_atime = 0;
1296         }
1297         inode->i_mtime.tv_sec = lli->lli_mtime;
1298         inode->i_ctime.tv_sec = lli->lli_ctime;
1299
1300         mtime = inode->i_mtime.tv_sec;
1301         atime = inode->i_atime.tv_sec;
1302         ctime = inode->i_ctime.tv_sec;
1303
1304         cl_object_attr_lock(obj);
1305         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1306                 rc = -EINVAL;
1307         else
1308                 rc = cl_object_attr_get(env, obj, attr);
1309         cl_object_attr_unlock(obj);
1310
1311         if (rc != 0)
1312                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1313
1314         if (atime < attr->cat_atime)
1315                 atime = attr->cat_atime;
1316
1317         if (ctime < attr->cat_ctime)
1318                 ctime = attr->cat_ctime;
1319
1320         if (mtime < attr->cat_mtime)
1321                 mtime = attr->cat_mtime;
1322
1323         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1324                PFID(&lli->lli_fid), attr->cat_size);
1325
1326         i_size_write(inode, attr->cat_size);
1327         inode->i_blocks = attr->cat_blocks;
1328
1329         inode->i_mtime.tv_sec = mtime;
1330         inode->i_atime.tv_sec = atime;
1331         inode->i_ctime.tv_sec = ctime;
1332
1333 out_size_unlock:
1334         ll_inode_size_unlock(inode);
1335
1336         RETURN(rc);
1337 }
1338
1339 /**
1340  * Set designated mirror for I/O.
1341  *
1342  * So far only read, write, and truncated can support to issue I/O to
1343  * designated mirror.
1344  */
1345 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1346 {
1347         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1348
1349         /* clear layout version for generic(non-resync) I/O in case it carries
1350          * stale layout version due to I/O restart */
1351         io->ci_layout_version = 0;
1352
1353         /* FLR: disable non-delay for designated mirror I/O because obviously
1354          * only one mirror is available */
1355         if (fd->fd_designated_mirror > 0) {
1356                 io->ci_ndelay = 0;
1357                 io->ci_designated_mirror = fd->fd_designated_mirror;
1358                 io->ci_layout_version = fd->fd_layout_version;
1359         }
1360
1361         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1362                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1363 }
1364
1365 static bool file_is_noatime(const struct file *file)
1366 {
1367         const struct vfsmount *mnt = file->f_path.mnt;
1368         const struct inode *inode = file_inode((struct file *)file);
1369
1370         /* Adapted from file_accessed() and touch_atime().*/
1371         if (file->f_flags & O_NOATIME)
1372                 return true;
1373
1374         if (inode->i_flags & S_NOATIME)
1375                 return true;
1376
1377         if (IS_NOATIME(inode))
1378                 return true;
1379
1380         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1381                 return true;
1382
1383         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1384                 return true;
1385
1386         if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
1387                 return true;
1388
1389         return false;
1390 }
1391
1392 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1393                 struct vvp_io_args *args)
1394 {
1395         struct inode *inode = file_inode(file);
1396         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1397
1398         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1399         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1400
1401         if (iot == CIT_WRITE) {
1402                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1403                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1404                                            file->f_flags & O_DIRECT ||
1405                                            IS_SYNC(inode));
1406 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1407                 io->u.ci_wr.wr_sync  |= !!(args &&
1408                                            args->via_io_subtype == IO_NORMAL &&
1409                                            args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1410 #endif
1411         }
1412
1413         io->ci_obj = ll_i2info(inode)->lli_clob;
1414         io->ci_lockreq = CILR_MAYBE;
1415         if (ll_file_nolock(file)) {
1416                 io->ci_lockreq = CILR_NEVER;
1417                 io->ci_no_srvlock = 1;
1418         } else if (file->f_flags & O_APPEND) {
1419                 io->ci_lockreq = CILR_MANDATORY;
1420         }
1421         io->ci_noatime = file_is_noatime(file);
1422         io->ci_async_readahead = false;
1423
1424         /* FLR: only use non-delay I/O for read as there is only one
1425          * avaliable mirror for write. */
1426         io->ci_ndelay = !(iot == CIT_WRITE);
1427
1428         ll_io_set_mirror(io, file);
1429 }
1430
1431 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1432                         __u64 count)
1433 {
1434         struct ll_inode_info *lli = ll_i2info(inode);
1435         struct ll_sb_info *sbi = ll_i2sbi(inode);
1436         enum obd_heat_type sample_type;
1437         enum obd_heat_type iobyte_type;
1438         __u64 now = ktime_get_real_seconds();
1439
1440         if (!ll_sbi_has_file_heat(sbi) ||
1441             lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1442                 return;
1443
1444         if (iot == CIT_READ) {
1445                 sample_type = OBD_HEAT_READSAMPLE;
1446                 iobyte_type = OBD_HEAT_READBYTE;
1447         } else if (iot == CIT_WRITE) {
1448                 sample_type = OBD_HEAT_WRITESAMPLE;
1449                 iobyte_type = OBD_HEAT_WRITEBYTE;
1450         } else {
1451                 return;
1452         }
1453
1454         spin_lock(&lli->lli_heat_lock);
1455         obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1456                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1457         obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1458                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1459         spin_unlock(&lli->lli_heat_lock);
1460 }
1461
1462 static ssize_t
1463 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1464                    struct file *file, enum cl_io_type iot,
1465                    loff_t *ppos, size_t count)
1466 {
1467         struct vvp_io           *vio = vvp_env_io(env);
1468         struct inode            *inode = file_inode(file);
1469         struct ll_inode_info    *lli = ll_i2info(inode);
1470         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1471         struct range_lock       range;
1472         struct cl_io            *io;
1473         ssize_t                 result = 0;
1474         int                     rc = 0;
1475         unsigned                retried = 0;
1476         bool                    restarted = false;
1477
1478         ENTRY;
1479
1480         CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1481                 file_dentry(file)->d_name.name,
1482                 iot == CIT_READ ? "read" : "write", *ppos, count);
1483
1484 restart:
1485         io = vvp_env_thread_io(env);
1486         ll_io_init(io, file, iot, args);
1487         io->ci_ndelay_tried = retried;
1488
1489         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1490                 bool range_locked = false;
1491
1492                 if (file->f_flags & O_APPEND)
1493                         range_lock_init(&range, 0, LUSTRE_EOF);
1494                 else
1495                         range_lock_init(&range, *ppos, *ppos + count - 1);
1496
1497                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1498                 vio->vui_io_subtype = args->via_io_subtype;
1499
1500                 switch (vio->vui_io_subtype) {
1501                 case IO_NORMAL:
1502                         vio->vui_iter = args->u.normal.via_iter;
1503                         vio->vui_iocb = args->u.normal.via_iocb;
1504                         /* Direct IO reads must also take range lock,
1505                          * or multiple reads will try to work on the same pages
1506                          * See LU-6227 for details. */
1507                         if (((iot == CIT_WRITE) ||
1508                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1509                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1510                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1511                                        RL_PARA(&range));
1512                                 rc = range_lock(&lli->lli_write_tree, &range);
1513                                 if (rc < 0)
1514                                         GOTO(out, rc);
1515
1516                                 range_locked = true;
1517                         }
1518                         break;
1519                 case IO_SPLICE:
1520                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1521                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1522                         break;
1523                 default:
1524                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1525                         LBUG();
1526                 }
1527
1528                 ll_cl_add(file, env, io, LCC_RW);
1529                 rc = cl_io_loop(env, io);
1530                 ll_cl_remove(file, env);
1531
1532                 if (range_locked) {
1533                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1534                                RL_PARA(&range));
1535                         range_unlock(&lli->lli_write_tree, &range);
1536                 }
1537         } else {
1538                 /* cl_io_rw_init() handled IO */
1539                 rc = io->ci_result;
1540         }
1541
1542         if (io->ci_nob > 0) {
1543                 result += io->ci_nob;
1544                 count  -= io->ci_nob;
1545                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1546
1547                 /* prepare IO restart */
1548                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1549                         args->u.normal.via_iter = vio->vui_iter;
1550         }
1551 out:
1552         cl_io_fini(env, io);
1553
1554         CDEBUG(D_VFSTRACE,
1555                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1556                file->f_path.dentry->d_name.name,
1557                iot, rc, result, io->ci_need_restart);
1558
1559         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1560                 CDEBUG(D_VFSTRACE,
1561                        "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1562                        file_dentry(file)->d_name.name,
1563                        iot == CIT_READ ? "read" : "write",
1564                        *ppos, count, result, rc);
1565                 /* preserve the tried count for FLR */
1566                 retried = io->ci_ndelay_tried;
1567                 restarted = true;
1568                 goto restart;
1569         }
1570
1571         if (iot == CIT_READ) {
1572                 if (result > 0)
1573                         ll_stats_ops_tally(ll_i2sbi(inode),
1574                                            LPROC_LL_READ_BYTES, result);
1575         } else if (iot == CIT_WRITE) {
1576                 if (result > 0) {
1577                         ll_stats_ops_tally(ll_i2sbi(inode),
1578                                            LPROC_LL_WRITE_BYTES, result);
1579                         fd->fd_write_failed = false;
1580                 } else if (result == 0 && rc == 0) {
1581                         rc = io->ci_result;
1582                         if (rc < 0)
1583                                 fd->fd_write_failed = true;
1584                         else
1585                                 fd->fd_write_failed = false;
1586                 } else if (rc != -ERESTARTSYS) {
1587                         fd->fd_write_failed = true;
1588                 }
1589         }
1590
1591         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1592         if (result > 0)
1593                 ll_heat_add(inode, iot, result);
1594
1595         RETURN(result > 0 ? result : rc);
1596 }
1597
1598 /**
1599  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1600  * especially for small I/O.
1601  *
1602  * To serve a read request, CLIO has to create and initialize a cl_io and
1603  * then request DLM lock. This has turned out to have siginificant overhead
1604  * and affects the performance of small I/O dramatically.
1605  *
1606  * It's not necessary to create a cl_io for each I/O. Under the help of read
1607  * ahead, most of the pages being read are already in memory cache and we can
1608  * read those pages directly because if the pages exist, the corresponding DLM
1609  * lock must exist so that page content must be valid.
1610  *
1611  * In fast read implementation, the llite speculatively finds and reads pages
1612  * in memory cache. There are three scenarios for fast read:
1613  *   - If the page exists and is uptodate, kernel VM will provide the data and
1614  *     CLIO won't be intervened;
1615  *   - If the page was brought into memory by read ahead, it will be exported
1616  *     and read ahead parameters will be updated;
1617  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1618  *     it will go back and invoke normal read, i.e., a cl_io will be created
1619  *     and DLM lock will be requested.
1620  *
1621  * POSIX compliance: posix standard states that read is intended to be atomic.
1622  * Lustre read implementation is in line with Linux kernel read implementation
1623  * and neither of them complies with POSIX standard in this matter. Fast read
1624  * doesn't make the situation worse on single node but it may interleave write
1625  * results from multiple nodes due to short read handling in ll_file_aio_read().
1626  *
1627  * \param env - lu_env
1628  * \param iocb - kiocb from kernel
1629  * \param iter - user space buffers where the data will be copied
1630  *
1631  * \retval - number of bytes have been read, or error code if error occurred.
1632  */
1633 static ssize_t
1634 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1635 {
1636         ssize_t result;
1637
1638         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1639                 return 0;
1640
1641         /* NB: we can't do direct IO for fast read because it will need a lock
1642          * to make IO engine happy. */
1643         if (iocb->ki_filp->f_flags & O_DIRECT)
1644                 return 0;
1645
1646         result = generic_file_read_iter(iocb, iter);
1647
1648         /* If the first page is not in cache, generic_file_aio_read() will be
1649          * returned with -ENODATA.
1650          * See corresponding code in ll_readpage(). */
1651         if (result == -ENODATA)
1652                 result = 0;
1653
1654         if (result > 0) {
1655                 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1656                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1657                                    LPROC_LL_READ_BYTES, result);
1658         }
1659
1660         return result;
1661 }
1662
1663 /*
1664  * Read from a file (through the page cache).
1665  */
1666 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1667 {
1668         struct lu_env *env;
1669         struct vvp_io_args *args;
1670         struct file *file = iocb->ki_filp;
1671         ssize_t result;
1672         ssize_t rc2;
1673         __u16 refcheck;
1674         ktime_t kstart = ktime_get();
1675         bool cached;
1676
1677         if (!iov_iter_count(to))
1678                 return 0;
1679
1680         /**
1681          * Currently when PCC read failed, we do not fall back to the
1682          * normal read path, just return the error.
1683          * The resaon is that: for RW-PCC, the file data may be modified
1684          * in the PCC and inconsistent with the data on OSTs (or file
1685          * data has been removed from the Lustre file system), at this
1686          * time, fallback to the normal read path may read the wrong
1687          * data.
1688          * TODO: for RO-PCC (readonly PCC), fall back to normal read
1689          * path: read data from data copy on OSTs.
1690          */
1691         result = pcc_file_read_iter(iocb, to, &cached);
1692         if (cached)
1693                 GOTO(out, result);
1694
1695         ll_ras_enter(file);
1696
1697         result = ll_do_fast_read(iocb, to);
1698         if (result < 0 || iov_iter_count(to) == 0)
1699                 GOTO(out, result);
1700
1701         env = cl_env_get(&refcheck);
1702         if (IS_ERR(env))
1703                 return PTR_ERR(env);
1704
1705         args = ll_env_args(env, IO_NORMAL);
1706         args->u.normal.via_iter = to;
1707         args->u.normal.via_iocb = iocb;
1708
1709         rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1710                                  &iocb->ki_pos, iov_iter_count(to));
1711         if (rc2 > 0)
1712                 result += rc2;
1713         else if (result == 0)
1714                 result = rc2;
1715
1716         cl_env_put(env, &refcheck);
1717 out:
1718         if (result > 0) {
1719                 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1720                                   LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1721                                   READ);
1722                 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
1723                                    ktime_us_delta(ktime_get(), kstart));
1724         }
1725
1726         return result;
1727 }
1728
1729 /**
1730  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1731  * If a page is already in the page cache and dirty (and some other things -
1732  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1733  * write to it without doing a full I/O, because Lustre already knows about it
1734  * and will write it out.  This saves a lot of processing time.
1735  *
1736  * All writes here are within one page, so exclusion is handled by the page
1737  * lock on the vm page.  We do not do tiny writes for writes which touch
1738  * multiple pages because it's very unlikely multiple sequential pages are
1739  * are already dirty.
1740  *
1741  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1742  * and are unlikely to be to already dirty pages.
1743  *
1744  * Attribute updates are important here, we do them in ll_tiny_write_end.
1745  */
1746 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1747 {
1748         ssize_t count = iov_iter_count(iter);
1749         struct  file *file = iocb->ki_filp;
1750         struct  inode *inode = file_inode(file);
1751         bool    lock_inode = !IS_NOSEC(inode);
1752         ssize_t result = 0;
1753
1754         ENTRY;
1755
1756         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1757          * of function for why.
1758          */
1759         if (count >= PAGE_SIZE ||
1760             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1761                 RETURN(0);
1762
1763         if (unlikely(lock_inode))
1764                 inode_lock(inode);
1765         result = __generic_file_write_iter(iocb, iter);
1766
1767         if (unlikely(lock_inode))
1768                 inode_unlock(inode);
1769
1770         /* If the page is not already dirty, ll_tiny_write_begin returns
1771          * -ENODATA.  We continue on to normal write.
1772          */
1773         if (result == -ENODATA)
1774                 result = 0;
1775
1776         if (result > 0) {
1777                 ll_heat_add(inode, CIT_WRITE, result);
1778                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1779                                    result);
1780                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1781         }
1782
1783         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1784
1785         RETURN(result);
1786 }
1787
1788 /*
1789  * Write to a file (through the page cache).
1790  */
1791 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1792 {
1793         struct vvp_io_args *args;
1794         struct lu_env *env;
1795         ssize_t rc_tiny = 0, rc_normal;
1796         struct file *file = iocb->ki_filp;
1797         __u16 refcheck;
1798         bool cached;
1799         ktime_t kstart = ktime_get();
1800         int result;
1801
1802         ENTRY;
1803
1804         if (!iov_iter_count(from))
1805                 GOTO(out, rc_normal = 0);
1806
1807         /**
1808          * When PCC write failed, we usually do not fall back to the normal
1809          * write path, just return the error. But there is a special case when
1810          * returned error code is -ENOSPC due to running out of space on PCC HSM
1811          * bakcend. At this time, it will fall back to normal I/O path and
1812          * retry the I/O. As the file is in HSM released state, it will restore
1813          * the file data to OSTs first and redo the write again. And the
1814          * restore process will revoke the layout lock and detach the file
1815          * from PCC cache automatically.
1816          */
1817         result = pcc_file_write_iter(iocb, from, &cached);
1818         if (cached && result != -ENOSPC && result != -EDQUOT)
1819                 GOTO(out, rc_normal = result);
1820
1821         /* NB: we can't do direct IO for tiny writes because they use the page
1822          * cache, we can't do sync writes because tiny writes can't flush
1823          * pages, and we can't do append writes because we can't guarantee the
1824          * required DLM locks are held to protect file size.
1825          */
1826         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1827             !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1828                 rc_tiny = ll_do_tiny_write(iocb, from);
1829
1830         /* In case of error, go on and try normal write - Only stop if tiny
1831          * write completed I/O.
1832          */
1833         if (iov_iter_count(from) == 0)
1834                 GOTO(out, rc_normal = rc_tiny);
1835
1836         env = cl_env_get(&refcheck);
1837         if (IS_ERR(env))
1838                 return PTR_ERR(env);
1839
1840         args = ll_env_args(env, IO_NORMAL);
1841         args->u.normal.via_iter = from;
1842         args->u.normal.via_iocb = iocb;
1843
1844         rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1845                                        &iocb->ki_pos, iov_iter_count(from));
1846
1847         /* On success, combine bytes written. */
1848         if (rc_tiny >= 0 && rc_normal > 0)
1849                 rc_normal += rc_tiny;
1850         /* On error, only return error from normal write if tiny write did not
1851          * write any bytes.  Otherwise return bytes written by tiny write.
1852          */
1853         else if (rc_tiny > 0)
1854                 rc_normal = rc_tiny;
1855
1856         cl_env_put(env, &refcheck);
1857 out:
1858         if (rc_normal > 0) {
1859                 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1860                                   LUSTRE_FPRIVATE(file), iocb->ki_pos,
1861                                   rc_normal, WRITE);
1862                 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
1863                                    ktime_us_delta(ktime_get(), kstart));
1864         }
1865
1866         RETURN(rc_normal);
1867 }
1868
1869 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1870 /*
1871  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1872  */
1873 static int ll_file_get_iov_count(const struct iovec *iov,
1874                                  unsigned long *nr_segs, size_t *count)
1875 {
1876         size_t cnt = 0;
1877         unsigned long seg;
1878
1879         for (seg = 0; seg < *nr_segs; seg++) {
1880                 const struct iovec *iv = &iov[seg];
1881
1882                 /*
1883                  * If any segment has a negative length, or the cumulative
1884                  * length ever wraps negative then return -EINVAL.
1885                  */
1886                 cnt += iv->iov_len;
1887                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1888                         return -EINVAL;
1889                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1890                         continue;
1891                 if (seg == 0)
1892                         return -EFAULT;
1893                 *nr_segs = seg;
1894                 cnt -= iv->iov_len;     /* This segment is no good */
1895                 break;
1896         }
1897         *count = cnt;
1898         return 0;
1899 }
1900
1901 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1902                                 unsigned long nr_segs, loff_t pos)
1903 {
1904         struct iov_iter to;
1905         size_t iov_count;
1906         ssize_t result;
1907         ENTRY;
1908
1909         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1910         if (result)
1911                 RETURN(result);
1912
1913         if (!iov_count)
1914                 RETURN(0);
1915
1916 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1917         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1918 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1919         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1920 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1921
1922         result = ll_file_read_iter(iocb, &to);
1923
1924         RETURN(result);
1925 }
1926
1927 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1928                             loff_t *ppos)
1929 {
1930         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1931         struct kiocb   kiocb;
1932         ssize_t        result;
1933
1934         ENTRY;
1935
1936         if (!count)
1937                 RETURN(0);
1938
1939         init_sync_kiocb(&kiocb, file);
1940         kiocb.ki_pos = *ppos;
1941 #ifdef HAVE_KIOCB_KI_LEFT
1942         kiocb.ki_left = count;
1943 #elif defined(HAVE_KI_NBYTES)
1944         kiocb.i_nbytes = count;
1945 #endif
1946
1947         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1948         *ppos = kiocb.ki_pos;
1949
1950         RETURN(result);
1951 }
1952
1953 /*
1954  * Write to a file (through the page cache).
1955  * AIO stuff
1956  */
1957 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1958                                  unsigned long nr_segs, loff_t pos)
1959 {
1960         struct iov_iter from;
1961         size_t iov_count;
1962         ssize_t result;
1963         ENTRY;
1964
1965         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1966         if (result)
1967                 RETURN(result);
1968
1969         if (!iov_count)
1970                 RETURN(0);
1971
1972 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1973         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1974 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1975         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1976 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1977
1978         result = ll_file_write_iter(iocb, &from);
1979
1980         RETURN(result);
1981 }
1982
1983 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1984                              size_t count, loff_t *ppos)
1985 {
1986         struct iovec   iov = { .iov_base = (void __user *)buf,
1987                                .iov_len = count };
1988         struct kiocb   kiocb;
1989         ssize_t        result;
1990
1991         ENTRY;
1992
1993         if (!count)
1994                 RETURN(0);
1995
1996         init_sync_kiocb(&kiocb, file);
1997         kiocb.ki_pos = *ppos;
1998 #ifdef HAVE_KIOCB_KI_LEFT
1999         kiocb.ki_left = count;
2000 #elif defined(HAVE_KI_NBYTES)
2001         kiocb.ki_nbytes = count;
2002 #endif
2003
2004         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
2005         *ppos = kiocb.ki_pos;
2006
2007         RETURN(result);
2008 }
2009 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2010
2011 /*
2012  * Send file content (through pagecache) somewhere with helper
2013  */
2014 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2015                                    struct pipe_inode_info *pipe, size_t count,
2016                                    unsigned int flags)
2017 {
2018         struct lu_env *env;
2019         struct vvp_io_args *args;
2020         ssize_t result;
2021         __u16 refcheck;
2022         bool cached;
2023
2024         ENTRY;
2025
2026         result = pcc_file_splice_read(in_file, ppos, pipe,
2027                                       count, flags, &cached);
2028         if (cached)
2029                 RETURN(result);
2030
2031         ll_ras_enter(in_file);
2032
2033         env = cl_env_get(&refcheck);
2034         if (IS_ERR(env))
2035                 RETURN(PTR_ERR(env));
2036
2037         args = ll_env_args(env, IO_SPLICE);
2038         args->u.splice.via_pipe = pipe;
2039         args->u.splice.via_flags = flags;
2040
2041         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2042         cl_env_put(env, &refcheck);
2043
2044         if (result > 0)
2045                 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2046                                   LUSTRE_FPRIVATE(in_file), *ppos, result,
2047                                   READ);
2048         RETURN(result);
2049 }
2050
2051 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2052                              __u64 flags, struct lov_user_md *lum, int lum_size)
2053 {
2054         struct lookup_intent oit = {
2055                 .it_op = IT_OPEN,
2056                 .it_flags = flags | MDS_OPEN_BY_FID,
2057         };
2058         int rc;
2059         ENTRY;
2060
2061         if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2062             le32_to_cpu(LOV_MAGIC_MAGIC)) {
2063                 /* this code will only exist for big-endian systems */
2064                 lustre_swab_lov_user_md(lum, 0);
2065         }
2066
2067         ll_inode_size_lock(inode);
2068         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2069         if (rc < 0)
2070                 GOTO(out_unlock, rc);
2071
2072         ll_release_openhandle(dentry, &oit);
2073
2074 out_unlock:
2075         ll_inode_size_unlock(inode);
2076         ll_intent_release(&oit);
2077
2078         RETURN(rc);
2079 }
2080
2081 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2082                              struct lov_mds_md **lmmp, int *lmm_size,
2083                              struct ptlrpc_request **request)
2084 {
2085         struct ll_sb_info *sbi = ll_i2sbi(inode);
2086         struct mdt_body  *body;
2087         struct lov_mds_md *lmm = NULL;
2088         struct ptlrpc_request *req = NULL;
2089         struct md_op_data *op_data;
2090         int rc, lmmsize;
2091
2092         rc = ll_get_default_mdsize(sbi, &lmmsize);
2093         if (rc)
2094                 RETURN(rc);
2095
2096         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2097                                      strlen(filename), lmmsize,
2098                                      LUSTRE_OPC_ANY, NULL);
2099         if (IS_ERR(op_data))
2100                 RETURN(PTR_ERR(op_data));
2101
2102         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2103         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2104         ll_finish_md_op_data(op_data);
2105         if (rc < 0) {
2106                 CDEBUG(D_INFO, "md_getattr_name failed "
2107                        "on %s: rc %d\n", filename, rc);
2108                 GOTO(out, rc);
2109         }
2110
2111         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2112         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2113
2114         lmmsize = body->mbo_eadatasize;
2115
2116         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2117                         lmmsize == 0) {
2118                 GOTO(out, rc = -ENODATA);
2119         }
2120
2121         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2122         LASSERT(lmm != NULL);
2123
2124         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2125             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2126             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2127             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2128                 GOTO(out, rc = -EPROTO);
2129
2130         /*
2131          * This is coming from the MDS, so is probably in
2132          * little endian.  We convert it to host endian before
2133          * passing it to userspace.
2134          */
2135         if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2136             __swab32(LOV_MAGIC_MAGIC)) {
2137                 int stripe_count = 0;
2138
2139                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2140                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2141                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2142                         if (le32_to_cpu(lmm->lmm_pattern) &
2143                             LOV_PATTERN_F_RELEASED)
2144                                 stripe_count = 0;
2145                 }
2146
2147                 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2148
2149                 /* if function called for directory - we should
2150                  * avoid swab not existent lsm objects */
2151                 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2152                         lustre_swab_lov_user_md_objects(
2153                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2154                                 stripe_count);
2155                 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2156                          S_ISREG(body->mbo_mode))
2157                         lustre_swab_lov_user_md_objects(
2158                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2159                                 stripe_count);
2160         }
2161
2162 out:
2163         *lmmp = lmm;
2164         *lmm_size = lmmsize;
2165         *request = req;
2166         return rc;
2167 }
2168
2169 static int ll_lov_setea(struct inode *inode, struct file *file,
2170                         void __user *arg)
2171 {
2172         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2173         struct lov_user_md      *lump;
2174         int                      lum_size = sizeof(struct lov_user_md) +
2175                                             sizeof(struct lov_user_ost_data);
2176         int                      rc;
2177         ENTRY;
2178
2179         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2180                 RETURN(-EPERM);
2181
2182         OBD_ALLOC_LARGE(lump, lum_size);
2183         if (lump == NULL)
2184                 RETURN(-ENOMEM);
2185
2186         if (copy_from_user(lump, arg, lum_size))
2187                 GOTO(out_lump, rc = -EFAULT);
2188
2189         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2190                                       lum_size);
2191         cl_lov_delay_create_clear(&file->f_flags);
2192
2193 out_lump:
2194         OBD_FREE_LARGE(lump, lum_size);
2195         RETURN(rc);
2196 }
2197
2198 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2199 {
2200         struct lu_env   *env;
2201         __u16           refcheck;
2202         int             rc;
2203         ENTRY;
2204
2205         env = cl_env_get(&refcheck);
2206         if (IS_ERR(env))
2207                 RETURN(PTR_ERR(env));
2208
2209         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2210         cl_env_put(env, &refcheck);
2211         RETURN(rc);
2212 }
2213
2214 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2215                             void __user *arg)
2216 {
2217         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2218         struct lov_user_md        *klum;
2219         int                        lum_size, rc;
2220         __u64                      flags = FMODE_WRITE;
2221         ENTRY;
2222
2223         rc = ll_copy_user_md(lum, &klum);
2224         if (rc < 0)
2225                 RETURN(rc);
2226
2227         lum_size = rc;
2228         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2229                                       lum_size);
2230         if (!rc) {
2231                 __u32 gen;
2232
2233                 rc = put_user(0, &lum->lmm_stripe_count);
2234                 if (rc)
2235                         GOTO(out, rc);
2236
2237                 rc = ll_layout_refresh(inode, &gen);
2238                 if (rc)
2239                         GOTO(out, rc);
2240
2241                 rc = ll_file_getstripe(inode, arg, lum_size);
2242         }
2243         cl_lov_delay_create_clear(&file->f_flags);
2244
2245 out:
2246         OBD_FREE_LARGE(klum, lum_size);
2247         RETURN(rc);
2248 }
2249
2250
2251 static int
2252 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2253 {
2254         struct ll_inode_info *lli = ll_i2info(inode);
2255         struct cl_object *obj = lli->lli_clob;
2256         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2257         struct ll_grouplock grouplock;
2258         int rc;
2259         ENTRY;
2260
2261         if (arg == 0) {
2262                 CWARN("group id for group lock must not be 0\n");
2263                 RETURN(-EINVAL);
2264         }
2265
2266         if (ll_file_nolock(file))
2267                 RETURN(-EOPNOTSUPP);
2268 retry:
2269         if (file->f_flags & O_NONBLOCK) {
2270                 if (!mutex_trylock(&lli->lli_group_mutex))
2271                         RETURN(-EAGAIN);
2272         } else
2273                 mutex_lock(&lli->lli_group_mutex);
2274
2275         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2276                 CWARN("group lock already existed with gid %lu\n",
2277                       fd->fd_grouplock.lg_gid);
2278                 GOTO(out, rc = -EINVAL);
2279         }
2280         if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
2281                 if (file->f_flags & O_NONBLOCK)
2282                         GOTO(out, rc = -EAGAIN);
2283                 mutex_unlock(&lli->lli_group_mutex);
2284                 wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
2285                 GOTO(retry, rc = 0);
2286         }
2287         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2288
2289         /**
2290          * XXX: group lock needs to protect all OST objects while PFL
2291          * can add new OST objects during the IO, so we'd instantiate
2292          * all OST objects before getting its group lock.
2293          */
2294         if (obj) {
2295                 struct lu_env *env;
2296                 __u16 refcheck;
2297                 struct cl_layout cl = {
2298                         .cl_is_composite = false,
2299                 };
2300                 struct lu_extent ext = {
2301                         .e_start = 0,
2302                         .e_end = OBD_OBJECT_EOF,
2303                 };
2304
2305                 env = cl_env_get(&refcheck);
2306                 if (IS_ERR(env))
2307                         GOTO(out, rc = PTR_ERR(env));
2308
2309                 rc = cl_object_layout_get(env, obj, &cl);
2310                 if (!rc && cl.cl_is_composite)
2311                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2312                                                     &ext);
2313
2314                 cl_env_put(env, &refcheck);
2315                 if (rc)
2316                         GOTO(out, rc);
2317         }
2318
2319         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2320                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2321
2322         if (rc)
2323                 GOTO(out, rc);
2324
2325         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2326         fd->fd_grouplock = grouplock;
2327         if (lli->lli_group_users == 0)
2328                 lli->lli_group_gid = grouplock.lg_gid;
2329         lli->lli_group_users++;
2330
2331         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2332 out:
2333         mutex_unlock(&lli->lli_group_mutex);
2334
2335         RETURN(rc);
2336 }
2337
2338 static int ll_put_grouplock(struct inode *inode, struct file *file,
2339                             unsigned long arg)
2340 {
2341         struct ll_inode_info   *lli = ll_i2info(inode);
2342         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2343         struct ll_grouplock     grouplock;
2344         int                     rc;
2345         ENTRY;
2346
2347         mutex_lock(&lli->lli_group_mutex);
2348         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2349                 CWARN("no group lock held\n");
2350                 GOTO(out, rc = -EINVAL);
2351         }
2352
2353         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2354
2355         if (fd->fd_grouplock.lg_gid != arg) {
2356                 CWARN("group lock %lu doesn't match current id %lu\n",
2357                       arg, fd->fd_grouplock.lg_gid);
2358                 GOTO(out, rc = -EINVAL);
2359         }
2360
2361         grouplock = fd->fd_grouplock;
2362         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2363         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2364
2365         cl_put_grouplock(&grouplock);
2366
2367         lli->lli_group_users--;
2368         if (lli->lli_group_users == 0) {
2369                 lli->lli_group_gid = 0;
2370                 wake_up_var(&lli->lli_group_users);
2371         }
2372         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2373         GOTO(out, rc = 0);
2374 out:
2375         mutex_unlock(&lli->lli_group_mutex);
2376
2377         RETURN(rc);
2378 }
2379
2380 /**
2381  * Close inode open handle
2382  *
2383  * \param dentry [in]     dentry which contains the inode
2384  * \param it     [in,out] intent which contains open info and result
2385  *
2386  * \retval 0     success
2387  * \retval <0    failure
2388  */
2389 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2390 {
2391         struct inode *inode = dentry->d_inode;
2392         struct obd_client_handle *och;
2393         int rc;
2394         ENTRY;
2395
2396         LASSERT(inode);
2397
2398         /* Root ? Do nothing. */
2399         if (dentry->d_inode->i_sb->s_root == dentry)
2400                 RETURN(0);
2401
2402         /* No open handle to close? Move away */
2403         if (!it_disposition(it, DISP_OPEN_OPEN))
2404                 RETURN(0);
2405
2406         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2407
2408         OBD_ALLOC(och, sizeof(*och));
2409         if (!och)
2410                 GOTO(out, rc = -ENOMEM);
2411
2412         rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2413         if (rc)
2414                 GOTO(out, rc);
2415
2416         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2417 out:
2418         /* this one is in place of ll_file_open */
2419         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2420                 ptlrpc_req_finished(it->it_request);
2421                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2422         }
2423         RETURN(rc);
2424 }
2425
2426 /**
2427  * Get size for inode for which FIEMAP mapping is requested.
2428  * Make the FIEMAP get_info call and returns the result.
2429  * \param fiemap        kernel buffer to hold extens
2430  * \param num_bytes     kernel buffer size
2431  */
2432 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2433                         size_t num_bytes)
2434 {
2435         struct lu_env                   *env;
2436         __u16                           refcheck;
2437         int                             rc = 0;
2438         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2439         ENTRY;
2440
2441         /* Checks for fiemap flags */
2442         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2443                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2444                 return -EBADR;
2445         }
2446
2447         /* Check for FIEMAP_FLAG_SYNC */
2448         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2449                 rc = filemap_fdatawrite(inode->i_mapping);
2450                 if (rc)
2451                         return rc;
2452         }
2453
2454         env = cl_env_get(&refcheck);
2455         if (IS_ERR(env))
2456                 RETURN(PTR_ERR(env));
2457
2458         if (i_size_read(inode) == 0) {
2459                 rc = ll_glimpse_size(inode);
2460                 if (rc)
2461                         GOTO(out, rc);
2462         }
2463
2464         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2465         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2466         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2467
2468         /* If filesize is 0, then there would be no objects for mapping */
2469         if (fmkey.lfik_oa.o_size == 0) {
2470                 fiemap->fm_mapped_extents = 0;
2471                 GOTO(out, rc = 0);
2472         }
2473
2474         fmkey.lfik_fiemap = *fiemap;
2475
2476         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2477                               &fmkey, fiemap, &num_bytes);
2478 out:
2479         cl_env_put(env, &refcheck);
2480         RETURN(rc);
2481 }
2482
2483 int ll_fid2path(struct inode *inode, void __user *arg)
2484 {
2485         struct obd_export       *exp = ll_i2mdexp(inode);
2486         const struct getinfo_fid2path __user *gfin = arg;
2487         __u32                    pathlen;
2488         struct getinfo_fid2path *gfout;
2489         size_t                   outsize;
2490         int                      rc;
2491
2492         ENTRY;
2493
2494         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2495             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2496                 RETURN(-EPERM);
2497
2498         /* Only need to get the buflen */
2499         if (get_user(pathlen, &gfin->gf_pathlen))
2500                 RETURN(-EFAULT);
2501
2502         if (pathlen > PATH_MAX)
2503                 RETURN(-EINVAL);
2504
2505         outsize = sizeof(*gfout) + pathlen;
2506         OBD_ALLOC(gfout, outsize);
2507         if (gfout == NULL)
2508                 RETURN(-ENOMEM);
2509
2510         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2511                 GOTO(gf_free, rc = -EFAULT);
2512         /* append root FID after gfout to let MDT know the root FID so that it
2513          * can lookup the correct path, this is mainly for fileset.
2514          * old server without fileset mount support will ignore this. */
2515         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2516
2517         /* Call mdc_iocontrol */
2518         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2519         if (rc != 0)
2520                 GOTO(gf_free, rc);
2521
2522         if (copy_to_user(arg, gfout, outsize))
2523                 rc = -EFAULT;
2524
2525 gf_free:
2526         OBD_FREE(gfout, outsize);
2527         RETURN(rc);
2528 }
2529
2530 static int
2531 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2532 {
2533         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2534         struct lu_env *env;
2535         struct cl_io *io;
2536         __u16  refcheck;
2537         int result;
2538
2539         ENTRY;
2540
2541         ioc->idv_version = 0;
2542         ioc->idv_layout_version = UINT_MAX;
2543
2544         /* If no file object initialized, we consider its version is 0. */
2545         if (obj == NULL)
2546                 RETURN(0);
2547
2548         env = cl_env_get(&refcheck);
2549         if (IS_ERR(env))
2550                 RETURN(PTR_ERR(env));
2551
2552         io = vvp_env_thread_io(env);
2553         io->ci_obj = obj;
2554         io->u.ci_data_version.dv_data_version = 0;
2555         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2556         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2557
2558 restart:
2559         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2560                 result = cl_io_loop(env, io);
2561         else
2562                 result = io->ci_result;
2563
2564         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2565         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2566
2567         cl_io_fini(env, io);
2568
2569         if (unlikely(io->ci_need_restart))
2570                 goto restart;
2571
2572         cl_env_put(env, &refcheck);
2573
2574         RETURN(result);
2575 }
2576
2577 /*
2578  * Read the data_version for inode.
2579  *
2580  * This value is computed using stripe object version on OST.
2581  * Version is computed using server side locking.
2582  *
2583  * @param flags if do sync on the OST side;
2584  *              0: no sync
2585  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2586  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2587  */
2588 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2589 {
2590         struct ioc_data_version ioc = { .idv_flags = flags };
2591         int rc;
2592
2593         rc = ll_ioc_data_version(inode, &ioc);
2594         if (!rc)
2595                 *data_version = ioc.idv_version;
2596
2597         return rc;
2598 }
2599
2600 /*
2601  * Trigger a HSM release request for the provided inode.
2602  */
2603 int ll_hsm_release(struct inode *inode)
2604 {
2605         struct lu_env *env;
2606         struct obd_client_handle *och = NULL;
2607         __u64 data_version = 0;
2608         int rc;
2609         __u16 refcheck;
2610         ENTRY;
2611
2612         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2613                ll_i2sbi(inode)->ll_fsname,
2614                PFID(&ll_i2info(inode)->lli_fid));
2615
2616         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2617         if (IS_ERR(och))
2618                 GOTO(out, rc = PTR_ERR(och));
2619
2620         /* Grab latest data_version and [am]time values */
2621         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2622         if (rc != 0)
2623                 GOTO(out, rc);
2624
2625         env = cl_env_get(&refcheck);
2626         if (IS_ERR(env))
2627                 GOTO(out, rc = PTR_ERR(env));
2628
2629         rc = ll_merge_attr(env, inode);
2630         cl_env_put(env, &refcheck);
2631
2632         /* If error happen, we have the wrong size for a file.
2633          * Don't release it.
2634          */
2635         if (rc != 0)
2636                 GOTO(out, rc);
2637
2638         /* Release the file.
2639          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2640          * we still need it to pack l_remote_handle to MDT. */
2641         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2642                                        &data_version);
2643         och = NULL;
2644
2645         EXIT;
2646 out:
2647         if (och != NULL && !IS_ERR(och)) /* close the file */
2648                 ll_lease_close(och, inode, NULL);
2649
2650         return rc;
2651 }
2652
/*
 * Working state of ll_swap_layouts(). The two sides are kept as parallel
 * members so that they can be swapped pairwise when the files are put into
 * FID order.
 */
struct ll_swap_stack {
        /* expected data version of inode1 (from sl_dv1/sl_dv2) */
        __u64                    dv1;
        /* expected data version of inode2 (from sl_dv1/sl_dv2) */
        __u64                    dv2;
        /* the two inodes whose layouts are exchanged */
        struct inode            *inode1;
        struct inode            *inode2;
        /* compare inode1's data version against dv1 before swapping */
        bool                     check_dv1;
        /* compare inode2's data version against dv2 before swapping */
        bool                     check_dv2;
};
2661
2662 static int ll_swap_layouts(struct file *file1, struct file *file2,
2663                            struct lustre_swap_layouts *lsl)
2664 {
2665         struct mdc_swap_layouts  msl;
2666         struct md_op_data       *op_data;
2667         __u32                    gid;
2668         __u64                    dv;
2669         struct ll_swap_stack    *llss = NULL;
2670         int                      rc;
2671
2672         OBD_ALLOC_PTR(llss);
2673         if (llss == NULL)
2674                 RETURN(-ENOMEM);
2675
2676         llss->inode1 = file_inode(file1);
2677         llss->inode2 = file_inode(file2);
2678
2679         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2680         if (rc < 0)
2681                 GOTO(free, rc);
2682
2683         /* we use 2 bool because it is easier to swap than 2 bits */
2684         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2685                 llss->check_dv1 = true;
2686
2687         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2688                 llss->check_dv2 = true;
2689
2690         /* we cannot use lsl->sl_dvX directly because we may swap them */
2691         llss->dv1 = lsl->sl_dv1;
2692         llss->dv2 = lsl->sl_dv2;
2693
2694         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2695         if (rc == 0) /* same file, done! */
2696                 GOTO(free, rc);
2697
2698         if (rc < 0) { /* sequentialize it */
2699                 swap(llss->inode1, llss->inode2);
2700                 swap(file1, file2);
2701                 swap(llss->dv1, llss->dv2);
2702                 swap(llss->check_dv1, llss->check_dv2);
2703         }
2704
2705         gid = lsl->sl_gid;
2706         if (gid != 0) { /* application asks to flush dirty cache */
2707                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2708                 if (rc < 0)
2709                         GOTO(free, rc);
2710
2711                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2712                 if (rc < 0) {
2713                         ll_put_grouplock(llss->inode1, file1, gid);
2714                         GOTO(free, rc);
2715                 }
2716         }
2717
2718         /* ultimate check, before swaping the layouts we check if
2719          * dataversion has changed (if requested) */
2720         if (llss->check_dv1) {
2721                 rc = ll_data_version(llss->inode1, &dv, 0);
2722                 if (rc)
2723                         GOTO(putgl, rc);
2724                 if (dv != llss->dv1)
2725                         GOTO(putgl, rc = -EAGAIN);
2726         }
2727
2728         if (llss->check_dv2) {
2729                 rc = ll_data_version(llss->inode2, &dv, 0);
2730                 if (rc)
2731                         GOTO(putgl, rc);
2732                 if (dv != llss->dv2)
2733                         GOTO(putgl, rc = -EAGAIN);
2734         }
2735
2736         /* struct md_op_data is used to send the swap args to the mdt
2737          * only flags is missing, so we use struct mdc_swap_layouts
2738          * through the md_op_data->op_data */
2739         /* flags from user space have to be converted before they are send to
2740          * server, no flag is sent today, they are only used on the client */
2741         msl.msl_flags = 0;
2742         rc = -ENOMEM;
2743         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2744                                      0, LUSTRE_OPC_ANY, &msl);
2745         if (IS_ERR(op_data))
2746                 GOTO(free, rc = PTR_ERR(op_data));
2747
2748         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2749                            sizeof(*op_data), op_data, NULL);
2750         ll_finish_md_op_data(op_data);
2751
2752         if (rc < 0)
2753                 GOTO(putgl, rc);
2754
2755 putgl:
2756         if (gid != 0) {
2757                 ll_put_grouplock(llss->inode2, file2, gid);
2758                 ll_put_grouplock(llss->inode1, file1, gid);
2759         }
2760
2761 free:
2762         if (llss != NULL)
2763                 OBD_FREE_PTR(llss);
2764
2765         RETURN(rc);
2766 }
2767
2768 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2769 {
2770         struct obd_export *exp = ll_i2mdexp(inode);
2771         struct md_op_data *op_data;
2772         int rc;
2773         ENTRY;
2774
2775         /* Detect out-of range masks */
2776         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2777                 RETURN(-EINVAL);
2778
2779         /* Non-root users are forbidden to set or clear flags which are
2780          * NOT defined in HSM_USER_MASK. */
2781         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2782             !cfs_capable(CFS_CAP_SYS_ADMIN))
2783                 RETURN(-EPERM);
2784
2785         if (!exp_connect_archive_id_array(exp)) {
2786                 /* Detect out-of range archive id */
2787                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2788                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2789                         RETURN(-EINVAL);
2790         }
2791
2792         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2793                                      LUSTRE_OPC_ANY, hss);
2794         if (IS_ERR(op_data))
2795                 RETURN(PTR_ERR(op_data));
2796
2797         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2798                            op_data, NULL);
2799
2800         ll_finish_md_op_data(op_data);
2801
2802         RETURN(rc);
2803 }
2804
2805 static int ll_hsm_import(struct inode *inode, struct file *file,
2806                          struct hsm_user_import *hui)
2807 {
2808         struct hsm_state_set    *hss = NULL;
2809         struct iattr            *attr = NULL;
2810         int                      rc;
2811         ENTRY;
2812
2813         if (!S_ISREG(inode->i_mode))
2814                 RETURN(-EINVAL);
2815
2816         /* set HSM flags */
2817         OBD_ALLOC_PTR(hss);
2818         if (hss == NULL)
2819                 GOTO(out, rc = -ENOMEM);
2820
2821         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2822         hss->hss_archive_id = hui->hui_archive_id;
2823         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2824         rc = ll_hsm_state_set(inode, hss);
2825         if (rc != 0)
2826                 GOTO(out, rc);
2827
2828         OBD_ALLOC_PTR(attr);
2829         if (attr == NULL)
2830                 GOTO(out, rc = -ENOMEM);
2831
2832         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2833         attr->ia_mode |= S_IFREG;
2834         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2835         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2836         attr->ia_size = hui->hui_size;
2837         attr->ia_mtime.tv_sec = hui->hui_mtime;
2838         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2839         attr->ia_atime.tv_sec = hui->hui_atime;
2840         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2841
2842         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2843                          ATTR_UID | ATTR_GID |
2844                          ATTR_MTIME | ATTR_MTIME_SET |
2845                          ATTR_ATIME | ATTR_ATIME_SET;
2846
2847         inode_lock(inode);
2848
2849         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2850         if (rc == -ENODATA)
2851                 rc = 0;
2852
2853         inode_unlock(inode);
2854
2855 out:
2856         if (hss != NULL)
2857                 OBD_FREE_PTR(hss);
2858
2859         if (attr != NULL)
2860                 OBD_FREE_PTR(attr);
2861
2862         RETURN(rc);
2863 }
2864
2865 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2866 {
2867         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2868                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2869 }
2870
2871 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2872 {
2873         struct inode *inode = file_inode(file);
2874         struct iattr ia = {
2875                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2876                             ATTR_MTIME | ATTR_MTIME_SET |
2877                             ATTR_CTIME,
2878                 .ia_atime = {
2879                         .tv_sec = lfu->lfu_atime_sec,
2880                         .tv_nsec = lfu->lfu_atime_nsec,
2881                 },
2882                 .ia_mtime = {
2883                         .tv_sec = lfu->lfu_mtime_sec,
2884                         .tv_nsec = lfu->lfu_mtime_nsec,
2885                 },
2886                 .ia_ctime = {
2887                         .tv_sec = lfu->lfu_ctime_sec,
2888                         .tv_nsec = lfu->lfu_ctime_nsec,
2889                 },
2890         };
2891         int rc;
2892         ENTRY;
2893
2894         if (!capable(CAP_SYS_ADMIN))
2895                 RETURN(-EPERM);
2896
2897         if (!S_ISREG(inode->i_mode))
2898                 RETURN(-EINVAL);
2899
2900         inode_lock(inode);
2901         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2902                             false);
2903         inode_unlock(inode);
2904
2905         RETURN(rc);
2906 }
2907
2908 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2909 {
2910         switch (mode) {
2911         case MODE_READ_USER:
2912                 return CLM_READ;
2913         case MODE_WRITE_USER:
2914                 return CLM_WRITE;
2915         default:
2916                 return -EINVAL;
2917         }
2918 }
2919
/* Printable lock mode names, indexed by the user-supplied lockahead mode;
 * used for debug messages only. */
static const char *const user_lockname[] = LOCK_MODE_NAMES;
2921
2922 /* Used to allow the upper layers of the client to request an LDLM lock
2923  * without doing an actual read or write.
2924  *
2925  * Used for ladvise lockahead to manually request specific locks.
2926  *
2927  * \param[in] file      file this ladvise lock request is on
2928  * \param[in] ladvise   ladvise struct describing this lock request
2929  *
2930  * \retval 0            success, no detailed result available (sync requests
2931  *                      and requests sent to the server [not handled locally]
2932  *                      cannot return detailed results)
2933  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2934  *                                       see definitions for details.
2935  * \retval negative     negative errno on error
2936  */
2937 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2938 {
2939         struct lu_env *env = NULL;
2940         struct cl_io *io  = NULL;
2941         struct cl_lock *lock = NULL;
2942         struct cl_lock_descr *descr = NULL;
2943         struct dentry *dentry = file->f_path.dentry;
2944         struct inode *inode = dentry->d_inode;
2945         enum cl_lock_mode cl_mode;
2946         off_t start = ladvise->lla_start;
2947         off_t end = ladvise->lla_end;
2948         int result;
2949         __u16 refcheck;
2950
2951         ENTRY;
2952
2953         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2954                "start=%llu, end=%llu\n", dentry->d_name.len,
2955                dentry->d_name.name, dentry->d_inode,
2956                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2957                (__u64) end);
2958
2959         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2960         if (cl_mode < 0)
2961                 GOTO(out, result = cl_mode);
2962
2963         /* Get IO environment */
2964         result = cl_io_get(inode, &env, &io, &refcheck);
2965         if (result <= 0)
2966                 GOTO(out, result);
2967
2968         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2969         if (result > 0) {
2970                 /*
2971                  * nothing to do for this io. This currently happens when
2972                  * stripe sub-object's are not yet created.
2973                  */
2974                 result = io->ci_result;
2975         } else if (result == 0) {
2976                 lock = vvp_env_lock(env);
2977                 descr = &lock->cll_descr;
2978
2979                 descr->cld_obj   = io->ci_obj;
2980                 /* Convert byte offsets to pages */
2981                 descr->cld_start = cl_index(io->ci_obj, start);
2982                 descr->cld_end   = cl_index(io->ci_obj, end);
2983                 descr->cld_mode  = cl_mode;
2984                 /* CEF_MUST is used because we do not want to convert a
2985                  * lockahead request to a lockless lock */
2986                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2987                                        CEF_NONBLOCK;
2988
2989                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2990                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2991
2992                 result = cl_lock_request(env, io, lock);
2993
2994                 /* On success, we need to release the lock */
2995                 if (result >= 0)
2996                         cl_lock_release(env, lock);
2997         }
2998         cl_io_fini(env, io);
2999         cl_env_put(env, &refcheck);
3000
3001         /* -ECANCELED indicates a matching lock with a different extent
3002          * was already present, and -EEXIST indicates a matching lock
3003          * on exactly the same extent was already present.
3004          * We convert them to positive values for userspace to make
3005          * recognizing true errors easier.
3006          * Note we can only return these detailed results on async requests,
3007          * as sync requests look the same as i/o requests for locking. */
3008         if (result == -ECANCELED)
3009                 result = LLA_RESULT_DIFFERENT;
3010         else if (result == -EEXIST)
3011                 result = LLA_RESULT_SAME;
3012
3013 out:
3014         RETURN(result);
3015 }
/* Printable advice names, indexed by the advice value; used for debug
 * messages only. */
static const char *const ladvise_names[] = LU_LADVISE_NAMES;
3017
3018 static int ll_ladvise_sanity(struct inode *inode,
3019                              struct llapi_lu_ladvise *ladvise)
3020 {
3021         struct ll_sb_info *sbi = ll_i2sbi(inode);
3022         enum lu_ladvise_type advice = ladvise->lla_advice;
3023         /* Note the peradvice flags is a 32 bit field, so per advice flags must
3024          * be in the first 32 bits of enum ladvise_flags */
3025         __u32 flags = ladvise->lla_peradvice_flags;
3026         /* 3 lines at 80 characters per line, should be plenty */
3027         int rc = 0;
3028
3029         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3030                 rc = -EINVAL;
3031                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3032                        "last supported advice is %s (value '%d'): rc = %d\n",
3033                        sbi->ll_fsname, advice,
3034                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3035                 GOTO(out, rc);
3036         }
3037
3038         /* Per-advice checks */
3039         switch (advice) {
3040         case LU_LADVISE_LOCKNOEXPAND:
3041                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3042                         rc = -EINVAL;
3043                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3044                                "rc = %d\n", sbi->ll_fsname, flags,
3045                                ladvise_names[advice], rc);
3046                         GOTO(out, rc);
3047                 }
3048                 break;
3049         case LU_LADVISE_LOCKAHEAD:
3050                 /* Currently only READ and WRITE modes can be requested */
3051                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3052                     ladvise->lla_lockahead_mode == 0) {
3053                         rc = -EINVAL;
3054                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3055                                "rc = %d\n", sbi->ll_fsname,
3056                                ladvise->lla_lockahead_mode,
3057                                ladvise_names[advice], rc);
3058                         GOTO(out, rc);
3059                 }
3060                 /* fallthrough */
3061         case LU_LADVISE_WILLREAD:
3062         case LU_LADVISE_DONTNEED:
3063         default:
3064                 /* Note fall through above - These checks apply to all advices
3065                  * except LOCKNOEXPAND */
3066                 if (flags & ~LF_DEFAULT_MASK) {
3067                         rc = -EINVAL;
3068                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3069                                "rc = %d\n", sbi->ll_fsname, flags,
3070                                ladvise_names[advice], rc);
3071                         GOTO(out, rc);
3072                 }
3073                 if (ladvise->lla_start >= ladvise->lla_end) {
3074                         rc = -EINVAL;
3075                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3076                                "for %s: rc = %d\n", sbi->ll_fsname,
3077                                ladvise->lla_start, ladvise->lla_end,
3078                                ladvise_names[advice], rc);
3079                         GOTO(out, rc);
3080                 }
3081                 break;
3082         }
3083
3084 out:
3085         return rc;
3086 }
3087 #undef ERRSIZE
3088
3089 /*
3090  * Give file access advices
3091  *
3092  * The ladvise interface is similar to Linux fadvise() system call, except it
3093  * forwards the advices directly from Lustre client to server. The server side
3094  * codes will apply appropriate read-ahead and caching techniques for the
3095  * corresponding files.
3096  *
3097  * A typical workload for ladvise is e.g. a bunch of different clients are
3098  * doing small random reads of a file, so prefetching pages into OSS cache
3099  * with big linear reads before the random IO is a net benefit. Fetching
3100  * all that data into each client cache with fadvise() may not be, due to
3101  * much more data being sent to the client.
3102  */
3103 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3104                       struct llapi_lu_ladvise *ladvise)
3105 {
3106         struct lu_env *env;
3107         struct cl_io *io;
3108         struct cl_ladvise_io *lio;
3109         int rc;
3110         __u16 refcheck;
3111         ENTRY;
3112
3113         env = cl_env_get(&refcheck);
3114         if (IS_ERR(env))
3115                 RETURN(PTR_ERR(env));
3116
3117         io = vvp_env_thread_io(env);
3118         io->ci_obj = ll_i2info(inode)->lli_clob;
3119
3120         /* initialize parameters for ladvise */
3121         lio = &io->u.ci_ladvise;
3122         lio->li_start = ladvise->lla_start;
3123         lio->li_end = ladvise->lla_end;
3124         lio->li_fid = ll_inode2fid(inode);
3125         lio->li_advice = ladvise->lla_advice;
3126         lio->li_flags = flags;
3127
3128         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3129                 rc = cl_io_loop(env, io);
3130         else
3131                 rc = io->ci_result;
3132
3133         cl_io_fini(env, io);
3134         cl_env_put(env, &refcheck);
3135         RETURN(rc);
3136 }
3137
3138 static int ll_lock_noexpand(struct file *file, int flags)
3139 {
3140         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3141
3142         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3143
3144         return 0;
3145 }
3146
3147 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3148                         unsigned long arg)
3149 {
3150         struct fsxattr fsxattr;
3151
3152         if (copy_from_user(&fsxattr,
3153                            (const struct fsxattr __user *)arg,
3154                            sizeof(fsxattr)))
3155                 RETURN(-EFAULT);
3156
3157         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3158         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3159                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3160         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3161         if (copy_to_user((struct fsxattr __user *)arg,
3162                          &fsxattr, sizeof(fsxattr)))
3163                 RETURN(-EFAULT);
3164
3165         RETURN(0);
3166 }
3167
3168 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3169 {
3170         /*
3171          * Project Quota ID state is only allowed to change from within the init
3172          * namespace. Enforce that restriction only if we are trying to change
3173          * the quota ID state. Everything else is allowed in user namespaces.
3174          */
3175         if (current_user_ns() == &init_user_ns)
3176                 return 0;
3177
3178         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3179                 return -EINVAL;
3180
3181         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3182                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3183                         return -EINVAL;
3184         } else {
3185                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3186                         return -EINVAL;
3187         }
3188
3189         return 0;
3190 }
3191
3192 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3193                         unsigned long arg)
3194 {
3195
3196         struct md_op_data *op_data;
3197         struct ptlrpc_request *req = NULL;
3198         int rc = 0;
3199         struct fsxattr fsxattr;
3200         struct cl_object *obj;
3201         struct iattr *attr;
3202         int flags;
3203
3204         if (copy_from_user(&fsxattr,
3205                            (const struct fsxattr __user *)arg,
3206                            sizeof(fsxattr)))
3207                 RETURN(-EFAULT);
3208
3209         rc = ll_ioctl_check_project(inode, &fsxattr);
3210         if (rc)
3211                 RETURN(rc);
3212
3213         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3214                                      LUSTRE_OPC_ANY, NULL);
3215         if (IS_ERR(op_data))
3216                 RETURN(PTR_ERR(op_data));
3217
3218         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3219         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3220         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3221                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3222         op_data->op_projid = fsxattr.fsx_projid;
3223         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3224         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3225                         0, &req);
3226         ptlrpc_req_finished(req);
3227         if (rc)
3228                 GOTO(out_fsxattr, rc);
3229         ll_update_inode_flags(inode, op_data->op_attr_flags);
3230         obj = ll_i2info(inode)->lli_clob;
3231         if (obj == NULL)
3232                 GOTO(out_fsxattr, rc);
3233
3234         OBD_ALLOC_PTR(attr);
3235         if (attr == NULL)
3236                 GOTO(out_fsxattr, rc = -ENOMEM);
3237
3238         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3239                             fsxattr.fsx_xflags);
3240         OBD_FREE_PTR(attr);
3241 out_fsxattr:
3242         ll_finish_md_op_data(op_data);
3243         RETURN(rc);
3244 }
3245
3246 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3247                                  unsigned long arg)
3248 {
3249         struct inode            *inode = file_inode(file);
3250         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3251         struct ll_inode_info    *lli = ll_i2info(inode);
3252         struct obd_client_handle *och = NULL;
3253         struct split_param sp;
3254         struct pcc_param param;
3255         bool lease_broken = false;
3256         fmode_t fmode = 0;
3257         enum mds_op_bias bias = 0;
3258         struct file *layout_file = NULL;
3259         void *data = NULL;
3260         size_t data_size = 0;
3261         bool attached = false;
3262         long rc, rc2 = 0;
3263
3264         ENTRY;
3265
3266         mutex_lock(&lli->lli_och_mutex);
3267         if (fd->fd_lease_och != NULL) {
3268                 och = fd->fd_lease_och;
3269                 fd->fd_lease_och = NULL;
3270         }
3271         mutex_unlock(&lli->lli_och_mutex);
3272
3273         if (och == NULL)
3274                 RETURN(-ENOLCK);
3275
3276         fmode = och->och_flags;
3277
3278         switch (ioc->lil_flags) {
3279         case LL_LEASE_RESYNC_DONE:
3280                 if (ioc->lil_count > IOC_IDS_MAX)
3281                         GOTO(out_lease_close, rc = -EINVAL);
3282
3283                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3284                 OBD_ALLOC(data, data_size);
3285                 if (!data)
3286                         GOTO(out_lease_close, rc = -ENOMEM);
3287
3288                 if (copy_from_user(data, (void __user *)arg, data_size))
3289                         GOTO(out_lease_close, rc = -EFAULT);
3290
3291                 bias = MDS_CLOSE_RESYNC_DONE;
3292                 break;
3293         case LL_LEASE_LAYOUT_MERGE: {
3294                 int fd;
3295
3296                 if (ioc->lil_count != 1)
3297                         GOTO(out_lease_close, rc = -EINVAL);
3298
3299                 arg += sizeof(*ioc);
3300                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3301                         GOTO(out_lease_close, rc = -EFAULT);
3302
3303                 layout_file = fget(fd);
3304                 if (!layout_file)
3305                         GOTO(out_lease_close, rc = -EBADF);
3306
3307                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3308                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3309                         GOTO(out_lease_close, rc = -EPERM);
3310
3311                 data = file_inode(layout_file);
3312                 bias = MDS_CLOSE_LAYOUT_MERGE;
3313                 break;
3314         }
3315         case LL_LEASE_LAYOUT_SPLIT: {
3316                 int fdv;
3317                 int mirror_id;
3318
3319                 if (ioc->lil_count != 2)
3320                         GOTO(out_lease_close, rc = -EINVAL);
3321
3322                 arg += sizeof(*ioc);
3323                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3324                         GOTO(out_lease_close, rc = -EFAULT);
3325
3326                 arg += sizeof(__u32);
3327                 if (copy_from_user(&mirror_id, (void __user *)arg,
3328                                    sizeof(__u32)))
3329                         GOTO(out_lease_close, rc = -EFAULT);
3330
3331                 layout_file = fget(fdv);
3332                 if (!layout_file)
3333                         GOTO(out_lease_close, rc = -EBADF);
3334
3335                 sp.sp_inode = file_inode(layout_file);
3336                 sp.sp_mirror_id = (__u16)mirror_id;
3337                 data = &sp;
3338                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3339                 break;
3340         }
3341         case LL_LEASE_PCC_ATTACH:
3342                 if (ioc->lil_count != 1)
3343                         RETURN(-EINVAL);
3344
3345                 arg += sizeof(*ioc);
3346                 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3347                                    sizeof(__u32)))
3348                         GOTO(out_lease_close, rc2 = -EFAULT);
3349
3350                 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3351                 if (rc2)
3352                         GOTO(out_lease_close, rc2);
3353
3354                 attached = true;
3355                 /* Grab latest data version */
3356                 rc2 = ll_data_version(inode, &param.pa_data_version,
3357                                      LL_DV_WR_FLUSH);
3358                 if (rc2)
3359                         GOTO(out_lease_close, rc2);
3360
3361                 data = &param;
3362                 bias = MDS_PCC_ATTACH;
3363                 break;
3364         default:
3365                 /* without close intent */
3366                 break;
3367         }
3368
3369 out_lease_close:
3370         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3371         if (rc < 0)
3372                 GOTO(out, rc);
3373
3374         rc = ll_lease_och_release(inode, file);
3375         if (rc < 0)
3376                 GOTO(out, rc);
3377
3378         if (lease_broken)
3379                 fmode = 0;
3380         EXIT;
3381
3382 out:
3383         switch (ioc->lil_flags) {
3384         case LL_LEASE_RESYNC_DONE:
3385                 if (data)
3386                         OBD_FREE(data, data_size);
3387                 break;
3388         case LL_LEASE_LAYOUT_MERGE:
3389         case LL_LEASE_LAYOUT_SPLIT:
3390                 if (layout_file)
3391                         fput(layout_file);
3392                 break;
3393         case LL_LEASE_PCC_ATTACH:
3394                 if (!rc)
3395                         rc = rc2;
3396                 rc = pcc_readwrite_attach_fini(file, inode,
3397                                                param.pa_layout_gen,
3398                                                lease_broken, rc,
3399                                                attached);
3400                 break;
3401         }
3402
3403         if (!rc)
3404                 rc = ll_lease_type_from_fmode(fmode);
3405         RETURN(rc);
3406 }
3407
3408 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3409                               unsigned long arg)
3410 {
3411         struct inode *inode = file_inode(file);
3412         struct ll_inode_info *lli = ll_i2info(inode);
3413         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3414         struct obd_client_handle *och = NULL;
3415         __u64 open_flags = 0;
3416         bool lease_broken;
3417         fmode_t fmode;
3418         long rc;
3419         ENTRY;
3420
3421         switch (ioc->lil_mode) {
3422         case LL_LEASE_WRLCK:
3423                 if (!(file->f_mode & FMODE_WRITE))
3424                         RETURN(-EPERM);
3425                 fmode = FMODE_WRITE;
3426                 break;
3427         case LL_LEASE_RDLCK:
3428                 if (!(file->f_mode & FMODE_READ))
3429                         RETURN(-EPERM);
3430                 fmode = FMODE_READ;
3431                 break;
3432         case LL_LEASE_UNLCK:
3433                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3434         default:
3435                 RETURN(-EINVAL);
3436         }
3437
3438         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3439
3440         /* apply for lease */
3441         if (ioc->lil_flags & LL_LEASE_RESYNC)
3442                 open_flags = MDS_OPEN_RESYNC;
3443         och = ll_lease_open(inode, file, fmode, open_flags);
3444         if (IS_ERR(och))
3445                 RETURN(PTR_ERR(och));
3446
3447         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3448                 rc = ll_lease_file_resync(och, inode, arg);
3449                 if (rc) {
3450                         ll_lease_close(och, inode, NULL);
3451                         RETURN(rc);
3452                 }
3453                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3454                 if (rc) {
3455                         ll_lease_close(och, inode, NULL);
3456                         RETURN(rc);
3457                 }
3458         }
3459
3460         rc = 0;
3461         mutex_lock(&lli->lli_och_mutex);
3462         if (fd->fd_lease_och == NULL) {
3463                 fd->fd_lease_och = och;
3464                 och = NULL;
3465         }
3466         mutex_unlock(&lli->lli_och_mutex);
3467         if (och != NULL) {
3468                 /* impossible now that only excl is supported for now */
3469                 ll_lease_close(och, inode, &lease_broken);
3470                 rc = -EBUSY;
3471         }
3472         RETURN(rc);
3473 }
3474
3475 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3476 {
3477         struct ll_inode_info *lli = ll_i2info(inode);
3478         struct ll_sb_info *sbi = ll_i2sbi(inode);
3479         __u64 now = ktime_get_real_seconds();
3480         int i;
3481
3482         spin_lock(&lli->lli_heat_lock);
3483         heat->lh_flags = lli->lli_heat_flags;
3484         for (i = 0; i < heat->lh_count; i++)
3485                 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3486                                                 now, sbi->ll_heat_decay_weight,
3487                                                 sbi->ll_heat_period_second);
3488         spin_unlock(&lli->lli_heat_lock);
3489 }
3490
3491 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3492 {
3493         struct ll_inode_info *lli = ll_i2info(inode);
3494         int rc = 0;
3495
3496         spin_lock(&lli->lli_heat_lock);
3497         if (flags & LU_HEAT_FLAG_CLEAR)
3498                 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3499
3500         if (flags & LU_HEAT_FLAG_OFF)
3501                 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3502         else
3503                 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3504
3505         spin_unlock(&lli->lli_heat_lock);
3506
3507         RETURN(rc);
3508 }
3509
3510 static long
3511 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3512 {
3513         struct inode            *inode = file_inode(file);
3514         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3515         int                      flags, rc;
3516         ENTRY;
3517
3518         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3519                PFID(ll_inode2fid(inode)), inode, cmd);
3520         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3521
3522         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3523         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3524                 RETURN(-ENOTTY);
3525
3526         switch (cmd) {
3527         case LL_IOC_GETFLAGS:
3528                 /* Get the current value of the file flags */
3529                 return put_user(fd->fd_flags, (int __user *)arg);
3530         case LL_IOC_SETFLAGS:
3531         case LL_IOC_CLRFLAGS:
3532                 /* Set or clear specific file flags */
3533                 /* XXX This probably needs checks to ensure the flags are
3534                  *     not abused, and to handle any flag side effects.
3535                  */
3536                 if (get_user(flags, (int __user *) arg))
3537                         RETURN(-EFAULT);
3538
3539                 if (cmd == LL_IOC_SETFLAGS) {
3540                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3541                             !(file->f_flags & O_DIRECT)) {
3542                                 CERROR("%s: unable to disable locking on "
3543                                        "non-O_DIRECT file\n", current->comm);
3544                                 RETURN(-EINVAL);
3545                         }
3546
3547                         fd->fd_flags |= flags;
3548                 } else {
3549                         fd->fd_flags &= ~flags;
3550                 }
3551                 RETURN(0);
3552         case LL_IOC_LOV_SETSTRIPE:
3553         case LL_IOC_LOV_SETSTRIPE_NEW:
3554                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3555         case LL_IOC_LOV_SETEA:
3556                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3557         case LL_IOC_LOV_SWAP_LAYOUTS: {
3558                 struct file *file2;
3559                 struct lustre_swap_layouts lsl;
3560
3561                 if (copy_from_user(&lsl, (char __user *)arg,
3562                                    sizeof(struct lustre_swap_layouts)))
3563                         RETURN(-EFAULT);
3564
3565                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3566                         RETURN(-EPERM);
3567
3568                 file2 = fget(lsl.sl_fd);
3569                 if (file2 == NULL)
3570                         RETURN(-EBADF);
3571
3572                 /* O_WRONLY or O_RDWR */
3573                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3574                         GOTO(out, rc = -EPERM);
3575
3576                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3577                         struct inode                    *inode2;
3578                         struct ll_inode_info            *lli;
3579                         struct obd_client_handle        *och = NULL;
3580
3581                         lli = ll_i2info(inode);
3582                         mutex_lock(&lli->lli_och_mutex);
3583                         if (fd->fd_lease_och != NULL) {
3584                                 och = fd->fd_lease_och;
3585                                 fd->fd_lease_och = NULL;
3586                         }
3587                         mutex_unlock(&lli->lli_och_mutex);
3588                         if (och == NULL)
3589                                 GOTO(out, rc = -ENOLCK);
3590                         inode2 = file_inode(file2);
3591                         rc = ll_swap_layouts_close(och, inode, inode2);
3592                 } else {
3593                         rc = ll_swap_layouts(file, file2, &lsl);
3594                 }
3595 out:
3596                 fput(file2);
3597                 RETURN(rc);
3598         }
3599         case LL_IOC_LOV_GETSTRIPE:
3600         case LL_IOC_LOV_GETSTRIPE_NEW:
3601                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3602         case FS_IOC_GETFLAGS:
3603         case FS_IOC_SETFLAGS:
3604                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3605         case FSFILT_IOC_GETVERSION:
3606         case FS_IOC_GETVERSION:
3607                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3608         /* We need to special case any other ioctls we want to handle,
3609          * to send them to the MDS/OST as appropriate and to properly
3610          * network encode the arg field. */
3611         case FS_IOC_SETVERSION:
3612                 RETURN(-ENOTSUPP);
3613
3614         case LL_IOC_GROUP_LOCK:
3615                 RETURN(ll_get_grouplock(inode, file, arg));
3616         case LL_IOC_GROUP_UNLOCK:
3617                 RETURN(ll_put_grouplock(inode, file, arg));
3618         case IOC_OBD_STATFS:
3619                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3620
3621         case LL_IOC_FLUSHCTX:
3622                 RETURN(ll_flush_ctx(inode));
3623         case LL_IOC_PATH2FID: {
3624                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3625                                  sizeof(struct lu_fid)))
3626                         RETURN(-EFAULT);
3627
3628                 RETURN(0);
3629         }
3630         case LL_IOC_GETPARENT:
3631                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3632
3633         case OBD_IOC_FID2PATH:
3634                 RETURN(ll_fid2path(inode, (void __user *)arg));
3635         case LL_IOC_DATA_VERSION: {
3636                 struct ioc_data_version idv;
3637                 int rc;
3638
3639                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3640                         RETURN(-EFAULT);
3641
3642                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3643                 rc = ll_ioc_data_version(inode, &idv);
3644
3645                 if (rc == 0 &&
3646                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3647                         RETURN(-EFAULT);
3648
3649                 RETURN(rc);
3650         }
3651
3652         case LL_IOC_GET_MDTIDX: {
3653                 int mdtidx;
3654
3655                 mdtidx = ll_get_mdt_idx(inode);
3656                 if (mdtidx < 0)
3657                         RETURN(mdtidx);
3658
3659                 if (put_user((int)mdtidx, (int __user *)arg))
3660                         RETURN(-EFAULT);
3661
3662                 RETURN(0);
3663         }
3664         case OBD_IOC_GETDTNAME:
3665         case OBD_IOC_GETMDNAME:
3666                 RETURN(ll_get_obd_name(inode, cmd, arg));
3667         case LL_IOC_HSM_STATE_GET: {
3668                 struct md_op_data       *op_data;
3669                 struct hsm_user_state   *hus;
3670                 int                      rc;
3671
3672                 OBD_ALLOC_PTR(hus);
3673                 if (hus == NULL)
3674                         RETURN(-ENOMEM);
3675
3676                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3677                                              LUSTRE_OPC_ANY, hus);
3678                 if (IS_ERR(op_data)) {
3679                         OBD_FREE_PTR(hus);
3680                         RETURN(PTR_ERR(op_data));
3681                 }
3682
3683                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3684                                    op_data, NULL);
3685
3686                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3687                         rc = -EFAULT;
3688
3689                 ll_finish_md_op_data(op_data);
3690                 OBD_FREE_PTR(hus);
3691                 RETURN(rc);
3692         }
3693         case LL_IOC_HSM_STATE_SET: {
3694                 struct hsm_state_set    *hss;
3695                 int                      rc;
3696
3697                 OBD_ALLOC_PTR(hss);
3698                 if (hss == NULL)
3699                         RETURN(-ENOMEM);
3700
3701                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3702                         OBD_FREE_PTR(hss);
3703                         RETURN(-EFAULT);
3704                 }
3705
3706                 rc = ll_hsm_state_set(inode, hss);
3707
3708                 OBD_FREE_PTR(hss);
3709                 RETURN(rc);
3710         }
3711         case LL_IOC_HSM_ACTION: {
3712                 struct md_op_data               *op_data;
3713                 struct hsm_current_action       *hca;
3714                 int                              rc;
3715
3716                 OBD_ALLOC_PTR(hca);
3717                 if (hca == NULL)
3718                         RETURN(-ENOMEM);
3719
3720                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3721                                              LUSTRE_OPC_ANY, hca);
3722                 if (IS_ERR(op_data)) {
3723                         OBD_FREE_PTR(hca);
3724                         RETURN(PTR_ERR(op_data));
3725                 }
3726
3727                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3728                                    op_data, NULL);
3729
3730                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3731                         rc = -EFAULT;
3732
3733                 ll_finish_md_op_data(op_data);
3734                 OBD_FREE_PTR(hca);
3735                 RETURN(rc);
3736         }
3737         case LL_IOC_SET_LEASE_OLD: {
3738                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3739
3740                 RETURN(ll_file_set_lease(file, &ioc, 0));
3741         }
3742         case LL_IOC_SET_LEASE: {
3743                 struct ll_ioc_lease ioc;
3744
3745                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3746                         RETURN(-EFAULT);
3747
3748                 RETURN(ll_file_set_lease(file, &ioc, arg));
3749         }
3750         case LL_IOC_GET_LEASE: {
3751                 struct ll_inode_info *lli = ll_i2info(inode);
3752                 struct ldlm_lock *lock = NULL;
3753                 fmode_t fmode = 0;
3754
3755                 mutex_lock(&lli->lli_och_mutex);
3756                 if (fd->fd_lease_och != NULL) {
3757                         struct obd_client_handle *och = fd->fd_lease_och;
3758
3759                         lock = ldlm_handle2lock(&och->och_lease_handle);
3760                         if (lock != NULL) {
3761                                 lock_res_and_lock(lock);
3762                                 if (!ldlm_is_cancel(lock))
3763                                         fmode = och->och_flags;
3764
3765                                 unlock_res_and_lock(lock);
3766                                 LDLM_LOCK_PUT(lock);
3767                         }
3768                 }
3769                 mutex_unlock(&lli->lli_och_mutex);
3770
3771                 RETURN(ll_lease_type_from_fmode(fmode));
3772         }
3773         case LL_IOC_HSM_IMPORT: {
3774                 struct hsm_user_import *hui;
3775
3776                 OBD_ALLOC_PTR(hui);
3777                 if (hui == NULL)
3778                         RETURN(-ENOMEM);
3779
3780                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3781                         OBD_FREE_PTR(hui);
3782                         RETURN(-EFAULT);
3783                 }
3784
3785                 rc = ll_hsm_import(inode, file, hui);
3786
3787                 OBD_FREE_PTR(hui);
3788                 RETURN(rc);
3789         }
3790         case LL_IOC_FUTIMES_3: {
3791                 struct ll_futimes_3 lfu;
3792
3793                 if (copy_from_user(&lfu,
3794                                    (const struct ll_futimes_3 __user *)arg,
3795                                    sizeof(lfu)))
3796                         RETURN(-EFAULT);
3797
3798                 RETURN(ll_file_futimes_3(file, &lfu));
3799         }
3800         case LL_IOC_LADVISE: {
3801                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3802                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3803                 int i;
3804                 int num_advise;
3805                 int alloc_size = sizeof(*k_ladvise_hdr);
3806
3807                 rc = 0;
3808                 u_ladvise_hdr = (void __user *)arg;
3809                 OBD_ALLOC_PTR(k_ladvise_hdr);
3810                 if (k_ladvise_hdr == NULL)
3811                         RETURN(-ENOMEM);
3812
3813                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3814                         GOTO(out_ladvise, rc = -EFAULT);
3815
3816                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3817                     k_ladvise_hdr->lah_count < 1)
3818                         GOTO(out_ladvise, rc = -EINVAL);
3819
3820                 num_advise = k_ladvise_hdr->lah_count;
3821                 if (num_advise >= LAH_COUNT_MAX)
3822                         GOTO(out_ladvise, rc = -EFBIG);
3823
3824                 OBD_FREE_PTR(k_ladvise_hdr);
3825                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3826                                       lah_advise[num_advise]);
3827                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3828                 if (k_ladvise_hdr == NULL)
3829                         RETURN(-ENOMEM);
3830
3831                 /*
3832                  * TODO: submit multiple advices to one server in a single RPC
3833                  */
3834                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3835                         GOTO(out_ladvise, rc = -EFAULT);
3836
3837                 for (i = 0; i < num_advise; i++) {
3838                         struct llapi_lu_ladvise *k_ladvise =
3839                                         &k_ladvise_hdr->lah_advise[i];
3840                         struct llapi_lu_ladvise __user *u_ladvise =
3841                                         &u_ladvise_hdr->lah_advise[i];
3842
3843                         rc = ll_ladvise_sanity(inode, k_ladvise);
3844                         if (rc)
3845                                 GOTO(out_ladvise, rc);
3846
3847                         switch (k_ladvise->lla_advice) {
3848                         case LU_LADVISE_LOCKNOEXPAND:
3849                                 rc = ll_lock_noexpand(file,
3850                                                k_ladvise->lla_peradvice_flags);
3851                                 GOTO(out_ladvise, rc);
3852                         case LU_LADVISE_LOCKAHEAD:
3853
3854                                 rc = ll_file_lock_ahead(file, k_ladvise);
3855
3856                                 if (rc < 0)
3857                                         GOTO(out_ladvise, rc);
3858
3859                                 if (put_user(rc,
3860                                              &u_ladvise->lla_lockahead_result))
3861                                         GOTO(out_ladvise, rc = -EFAULT);
3862                                 break;
3863                         default:
3864                                 rc = ll_ladvise(inode, file,
3865                                                 k_ladvise_hdr->lah_flags,
3866                                                 k_ladvise);
3867                                 if (rc)
3868                                         GOTO(out_ladvise, rc);
3869                                 break;
3870                         }
3871
3872                 }
3873
3874 out_ladvise:
3875                 OBD_FREE(k_ladvise_hdr, alloc_size);
3876                 RETURN(rc);
3877         }
3878         case LL_IOC_FLR_SET_MIRROR: {
3879                 /* mirror I/O must be direct to avoid polluting page cache
3880                  * by stale data. */
3881                 if (!(file->f_flags & O_DIRECT))
3882                         RETURN(-EINVAL);
3883
3884                 fd->fd_designated_mirror = (__u32)arg;
3885                 RETURN(0);
3886         }
3887         case LL_IOC_FSGETXATTR:
3888                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3889         case LL_IOC_FSSETXATTR:
3890                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3891         case BLKSSZGET:
3892                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3893         case LL_IOC_HEAT_GET: {
3894                 struct lu_heat uheat;
3895                 struct lu_heat *heat;
3896                 int size;
3897
3898                 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3899                         RETURN(-EFAULT);
3900
3901                 if (uheat.lh_count > OBD_HEAT_COUNT)
3902                         uheat.lh_count = OBD_HEAT_COUNT;
3903
3904                 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3905                 OBD_ALLOC(heat, size);
3906                 if (heat == NULL)
3907                         RETURN(-ENOMEM);
3908
3909                 heat->lh_count = uheat.lh_count;
3910                 ll_heat_get(inode, heat);
3911                 rc = copy_to_user((char __user *)arg, heat, size);
3912                 OBD_FREE(heat, size);
3913                 RETURN(rc ? -EFAULT : 0);
3914         }
3915         case LL_IOC_HEAT_SET: {
3916                 __u64 flags;
3917
3918                 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3919                         RETURN(-EFAULT);
3920
3921                 rc = ll_heat_set(inode, flags);
3922                 RETURN(rc);
3923         }
3924         case LL_IOC_PCC_DETACH: {
3925                 struct lu_pcc_detach *detach;
3926
3927                 OBD_ALLOC_PTR(detach);
3928                 if (detach == NULL)
3929                         RETURN(-ENOMEM);
3930
3931                 if (copy_from_user(detach,
3932                                    (const struct lu_pcc_detach __user *)arg,
3933                                    sizeof(*detach)))
3934                         GOTO(out_detach_free, rc = -EFAULT);
3935
3936                 if (!S_ISREG(inode->i_mode))
3937                         GOTO(out_detach_free, rc = -EINVAL);
3938
3939                 if (!inode_owner_or_capable(inode))
3940                         GOTO(out_detach_free, rc = -EPERM);
3941
3942                 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3943 out_detach_free:
3944                 OBD_FREE_PTR(detach);
3945                 RETURN(rc);
3946         }
3947         case LL_IOC_PCC_STATE: {
3948                 struct lu_pcc_state __user *ustate =
3949                         (struct lu_pcc_state __user *)arg;
3950                 struct lu_pcc_state *state;
3951
3952                 OBD_ALLOC_PTR(state);
3953                 if (state == NULL)
3954                         RETURN(-ENOMEM);
3955
3956                 if (copy_from_user(state, ustate, sizeof(*state)))
3957                         GOTO(out_state, rc = -EFAULT);
3958
3959                 rc = pcc_ioctl_state(file, inode, state);
3960                 if (rc)
3961                         GOTO(out_state, rc);
3962
3963                 if (copy_to_user(ustate, state, sizeof(*state)))
3964                         GOTO(out_state, rc = -EFAULT);
3965
3966 out_state:
3967                 OBD_FREE_PTR(state);
3968                 RETURN(rc);
3969         }
3970         default:
3971                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3972                                      (void __user *)arg));
3973         }
3974 }
3975
3976 #ifndef HAVE_FILE_LLSEEK_SIZE
3977 static inline loff_t
3978 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3979 {
3980         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3981                 return -EINVAL;
3982         if (offset > maxsize)
3983                 return -EINVAL;
3984
3985         if (offset != file->f_pos) {
3986                 file->f_pos = offset;
3987                 file->f_version = 0;
3988         }
3989         return offset;
3990 }
3991
3992 static loff_t
3993 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3994                 loff_t maxsize, loff_t eof)
3995 {
3996         struct inode *inode = file_inode(file);
3997
3998         switch (origin) {
3999         case SEEK_END:
4000                 offset += eof;
4001                 break;
4002         case SEEK_CUR:
4003                 /*
4004                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
4005                  * position-querying operation.  Avoid rewriting the "same"
4006                  * f_pos value back to the file because a concurrent read(),
4007                  * write() or lseek() might have altered it
4008                  */
4009                 if (offset == 0)
4010                         return file->f_pos;
4011                 /*
4012                  * f_lock protects against read/modify/write race with other
4013                  * SEEK_CURs. Note that parallel writes and reads behave
4014                  * like SEEK_SET.
4015                  */
4016                 inode_lock(inode);
4017                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4018                 inode_unlock(inode);
4019                 return offset;
4020         case SEEK_DATA:
4021                 /*
4022                  * In the generic case the entire file is data, so as long as
4023                  * offset isn't at the end of the file then the offset is data.
4024                  */
4025                 if (offset >= eof)
4026                         return -ENXIO;
4027                 break;
4028         case SEEK_HOLE:
4029                 /*
4030                  * There is a virtual hole at the end of the file, so as long as
4031                  * offset isn't i_size or larger, return i_size.
4032                  */
4033                 if (offset >= eof)
4034                         return -ENXIO;
4035                 offset = eof;
4036                 break;
4037         }
4038
4039         return llseek_execute(file, offset, maxsize);
4040 }
4041 #endif
4042
4043 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4044 {
4045         struct inode *inode = file_inode(file);
4046         loff_t retval, eof = 0;
4047         ktime_t kstart = ktime_get();
4048
4049         ENTRY;
4050         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4051                            (origin == SEEK_CUR) ? file->f_pos : 0);
4052         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4053                PFID(ll_inode2fid(inode)), inode, retval, retval,
4054                origin);
4055
4056         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4057                 retval = ll_glimpse_size(inode);
4058                 if (retval != 0)
4059                         RETURN(retval);
4060                 eof = i_size_read(inode);
4061         }
4062
4063         retval = ll_generic_file_llseek_size(file, offset, origin,
4064                                              ll_file_maxbytes(inode), eof);
4065         if (retval >= 0)
4066                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4067                                    ktime_us_delta(ktime_get(), kstart));
4068         RETURN(retval);
4069 }
4070
4071 static int ll_flush(struct file *file, fl_owner_t id)
4072 {
4073         struct inode *inode = file_inode(file);
4074         struct ll_inode_info *lli = ll_i2info(inode);
4075         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4076         int rc, err;
4077
4078         LASSERT(!S_ISDIR(inode->i_mode));
4079
4080         /* catch async errors that were recorded back when async writeback
4081          * failed for pages in this mapping. */
4082         rc = lli->lli_async_rc;
4083         lli->lli_async_rc = 0;
4084         if (lli->lli_clob != NULL) {
4085                 err = lov_read_and_clear_async_rc(lli->lli_clob);
4086                 if (rc == 0)
4087                         rc = err;
4088         }
4089
4090         /* The application has been told write failure already.
4091          * Do not report failure again. */
4092         if (fd->fd_write_failed)
4093                 return 0;
4094         return rc ? -EIO : 0;
4095 }
4096
4097 /**
4098  * Called to make sure a portion of file has been written out.
4099  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4100  *
4101  * Return how many pages have been written.
4102  */
4103 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4104                        enum cl_fsync_mode mode, int ignore_layout)
4105 {
4106         struct lu_env *env;
4107         struct cl_io *io;
4108         struct cl_fsync_io *fio;
4109         int result;
4110         __u16 refcheck;
4111         ENTRY;
4112
4113         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4114             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4115                 RETURN(-EINVAL);
4116
4117         env = cl_env_get(&refcheck);
4118         if (IS_ERR(env))
4119                 RETURN(PTR_ERR(env));
4120
4121         io = vvp_env_thread_io(env);
4122         io->ci_obj = ll_i2info(inode)->lli_clob;
4123         io->ci_ignore_layout = ignore_layout;
4124
4125         /* initialize parameters for sync */
4126         fio = &io->u.ci_fsync;
4127         fio->fi_start = start;
4128         fio->fi_end = end;
4129         fio->fi_fid = ll_inode2fid(inode);
4130         fio->fi_mode = mode;
4131         fio->fi_nr_written = 0;
4132
4133         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4134                 result = cl_io_loop(env, io);
4135         else
4136                 result = io->ci_result;
4137         if (result == 0)
4138                 result = fio->fi_nr_written;
4139         cl_io_fini(env, io);
4140         cl_env_put(env, &refcheck);
4141
4142         RETURN(result);
4143 }
4144
4145 /*
4146  * When dentry is provided (the 'else' case), file_dentry() may be
4147  * null and dentry must be used directly rather than pulled from
4148  * file_dentry() as is done otherwise.
4149  */
4150
4151 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4152 {
4153         struct dentry *dentry = file_dentry(file);
4154         struct inode *inode = dentry->d_inode;
4155         struct ll_inode_info *lli = ll_i2info(inode);
4156         struct ptlrpc_request *req;
4157         ktime_t kstart = ktime_get();
4158         int rc, err;
4159
4160         ENTRY;
4161
4162         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4163                "datasync %d\n",
4164                PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4165
4166         /* fsync's caller has already called _fdata{sync,write}, we want
4167          * that IO to finish before calling the osc and mdc sync methods */
4168         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4169         inode_lock(inode);
4170
4171         /* catch async errors that were recorded back when async writeback
4172          * failed for pages in this mapping. */
4173         if (!S_ISDIR(inode->i_mode)) {
4174                 err = lli->lli_async_rc;
4175                 lli->lli_async_rc = 0;
4176                 if (rc == 0)
4177                         rc = err;
4178                 if (lli->lli_clob != NULL) {
4179                         err = lov_read_and_clear_async_rc(lli->lli_clob);
4180                         if (rc == 0)
4181                                 rc = err;
4182                 }
4183         }
4184
4185         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4186         if (!rc)
4187                 rc = err;
4188         if (!err)
4189                 ptlrpc_req_finished(req);
4190
4191         if (S_ISREG(inode->i_mode)) {
4192                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4193                 bool cached;
4194
4195                 /* Sync metadata on MDT first, and then sync the cached data
4196                  * on PCC.
4197                  */
4198                 err = pcc_fsync(file, start, end, datasync, &cached);
4199                 if (!cached)
4200                         err = cl_sync_file_range(inode, start, end,
4201                                                  CL_FSYNC_ALL, 0);
4202                 if (rc == 0 && err < 0)
4203                         rc = err;
4204                 if (rc < 0)
4205                         fd->fd_write_failed = true;
4206                 else
4207                         fd->fd_write_failed = false;
4208         }
4209
4210         inode_unlock(inode);
4211
4212         if (!rc)
4213                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4214                                    ktime_us_delta(ktime_get(), kstart));
4215         RETURN(rc);
4216 }
4217
4218 static int
4219 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4220 {
4221         struct inode *inode = file_inode(file);
4222         struct ll_sb_info *sbi = ll_i2sbi(inode);
4223         struct ldlm_enqueue_info einfo = {
4224                 .ei_type        = LDLM_FLOCK,
4225                 .ei_cb_cp       = ldlm_flock_completion_ast,
4226                 .ei_cbdata      = file_lock,
4227         };
4228         struct md_op_data *op_data;
4229         struct lustre_handle lockh = { 0 };
4230         union ldlm_policy_data flock = { { 0 } };
4231         int fl_type = file_lock->fl_type;
4232         ktime_t kstart = ktime_get();
4233         __u64 flags = 0;
4234         int rc;
4235         int rc2 = 0;
4236         ENTRY;
4237
4238         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4239                PFID(ll_inode2fid(inode)), file_lock);
4240
4241         if (file_lock->fl_flags & FL_FLOCK) {
4242                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4243                 /* flocks are whole-file locks */
4244                 flock.l_flock.end = OFFSET_MAX;
4245                 /* For flocks owner is determined by the local file desctiptor*/
4246                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4247         } else if (file_lock->fl_flags & FL_POSIX) {
4248                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4249                 flock.l_flock.start = file_lock->fl_start;
4250                 flock.l_flock.end = file_lock->fl_end;
4251         } else {
4252                 RETURN(-EINVAL);
4253         }
4254         flock.l_flock.pid = file_lock->fl_pid;
4255
4256 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4257         /* Somewhat ugly workaround for svc lockd.
4258          * lockd installs custom fl_lmops->lm_compare_owner that checks
4259          * for the fl_owner to be the same (which it always is on local node
4260          * I guess between lockd processes) and then compares pid.
4261          * As such we assign pid to the owner field to make it all work,
4262          * conflict with normal locks is unlikely since pid space and
4263          * pointer space for current->files are not intersecting */
4264         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4265                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4266 #endif
4267
4268         switch (fl_type) {
4269         case F_RDLCK:
4270                 einfo.ei_mode = LCK_PR;
4271                 break;
4272         case F_UNLCK:
4273                 /* An unlock request may or may not have any relation to
4274                  * existing locks so we may not be able to pass a lock handle
4275                  * via a normal ldlm_lock_cancel() request. The request may even
4276                  * unlock a byte range in the middle of an existing lock. In
4277                  * order to process an unlock request we need all of the same
4278                  * information that is given with a normal read or write record
4279                  * lock request. To avoid creating another ldlm unlock (cancel)
4280                  * message we'll treat a LCK_NL flock request as an unlock. */
4281                 einfo.ei_mode = LCK_NL;
4282                 break;
4283         case F_WRLCK:
4284                 einfo.ei_mode = LCK_PW;
4285                 break;
4286         default:
4287                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4288                 RETURN (-ENOTSUPP);
4289         }
4290
4291         switch (cmd) {
4292         case F_SETLKW:
4293 #ifdef F_SETLKW64
4294         case F_SETLKW64:
4295 #endif
4296                 flags = 0;
4297                 break;
4298         case F_SETLK:
4299 #ifdef F_SETLK64
4300         case F_SETLK64:
4301 #endif
4302                 flags = LDLM_FL_BLOCK_NOWAIT;
4303                 break;
4304         case F_GETLK:
4305 #ifdef F_GETLK64
4306         case F_GETLK64:
4307 #endif
4308                 flags = LDLM_FL_TEST_LOCK;
4309                 break;
4310         default:
4311                 CERROR("unknown fcntl lock command: %d\n", cmd);
4312                 RETURN (-EINVAL);
4313         }
4314
4315         /* Save the old mode so that if the mode in the lock changes we
4316          * can decrement the appropriate reader or writer refcount. */
4317         file_lock->fl_type = einfo.ei_mode;
4318
4319         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4320                                      LUSTRE_OPC_ANY, NULL);
4321         if (IS_ERR(op_data))
4322                 RETURN(PTR_ERR(op_data));
4323
4324         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4325                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4326                flock.l_flock.pid, flags, einfo.ei_mode,
4327                flock.l_flock.start, flock.l_flock.end);
4328
4329         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4330                         flags);
4331
4332         /* Restore the file lock type if not TEST lock. */
4333         if (!(flags & LDLM_FL_TEST_LOCK))
4334                 file_lock->fl_type = fl_type;
4335
4336 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4337         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4338             !(flags & LDLM_FL_TEST_LOCK))
4339                 rc2  = locks_lock_file_wait(file, file_lock);
4340 #else
4341         if ((file_lock->fl_flags & FL_FLOCK) &&
4342             (rc == 0 || file_lock->fl_type == F_UNLCK))
4343                 rc2  = flock_lock_file_wait(file, file_lock);
4344         if ((file_lock->fl_flags & FL_POSIX) &&
4345             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4346             !(flags & LDLM_FL_TEST_LOCK))
4347                 rc2  = posix_lock_file_wait(file, file_lock);
4348 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4349
4350         if (rc2 && file_lock->fl_type != F_UNLCK) {
4351                 einfo.ei_mode = LCK_NL;
4352                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4353                            &lockh, flags);
4354                 rc = rc2;
4355         }
4356
4357         ll_finish_md_op_data(op_data);
4358
4359         if (!rc)
4360                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4361                                    ktime_us_delta(ktime_get(), kstart));
4362         RETURN(rc);
4363 }
4364
4365 int ll_get_fid_by_name(struct inode *parent, const char *name,
4366                        int namelen, struct lu_fid *fid,
4367                        struct inode **inode)
4368 {
4369         struct md_op_data       *op_data = NULL;
4370         struct mdt_body         *body;
4371         struct ptlrpc_request   *req;
4372         int                     rc;
4373         ENTRY;
4374
4375         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4376                                      LUSTRE_OPC_ANY, NULL);
4377         if (IS_ERR(op_data))
4378                 RETURN(PTR_ERR(op_data));
4379
4380         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4381         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4382         ll_finish_md_op_data(op_data);
4383         if (rc < 0)
4384                 RETURN(rc);
4385
4386         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4387         if (body == NULL)
4388                 GOTO(out_req, rc = -EFAULT);
4389         if (fid != NULL)
4390                 *fid = body->mbo_fid1;
4391
4392         if (inode != NULL)
4393                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4394 out_req:
4395         ptlrpc_req_finished(req);
4396         RETURN(rc);
4397 }
4398
4399 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4400                const char *name)
4401 {
4402         struct dentry *dchild = NULL;
4403         struct inode *child_inode = NULL;
4404         struct md_op_data *op_data;
4405         struct ptlrpc_request *request = NULL;
4406         struct obd_client_handle *och = NULL;
4407         struct qstr qstr;
4408         struct mdt_body *body;
4409         __u64 data_version = 0;
4410         size_t namelen = strlen(name);
4411         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4412         int rc;
4413         ENTRY;
4414
4415         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4416                PFID(ll_inode2fid(parent)), name,
4417                lum->lum_stripe_offset, lum->lum_stripe_count);
4418
4419         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4420             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4421                 lustre_swab_lmv_user_md(lum);
4422
4423         /* Get child FID first */
4424         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4425         qstr.name = name;
4426         qstr.len = namelen;
4427         dchild = d_lookup(file_dentry(file), &qstr);
4428         if (dchild) {
4429                 if (dchild->d_inode)
4430                         child_inode = igrab(dchild->d_inode);
4431                 dput(dchild);
4432         }
4433
4434         if (!child_inode) {
4435                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4436                                         &child_inode);
4437                 if (rc)
4438                         RETURN(rc);
4439         }
4440
4441         if (!child_inode)
4442                 RETURN(-ENOENT);
4443
4444         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4445               OBD_CONNECT2_DIR_MIGRATE)) {
4446                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4447                     ll_dir_striped(child_inode)) {
4448                         CERROR("%s: MDT doesn't support stripe directory "
4449                                "migration!\n", ll_i2sbi(parent)->ll_fsname);
4450                         GOTO(out_iput, rc = -EOPNOTSUPP);
4451                 }
4452         }
4453
4454         /*
4455          * lfs migrate command needs to be blocked on the client
4456          * by checking the migrate FID against the FID of the
4457          * filesystem root.
4458          */
4459         if (child_inode == parent->i_sb->s_root->d_inode)
4460                 GOTO(out_iput, rc = -EINVAL);
4461
4462         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4463                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4464         if (IS_ERR(op_data))
4465                 GOTO(out_iput, rc = PTR_ERR(op_data));
4466
4467         inode_lock(child_inode);
4468         op_data->op_fid3 = *ll_inode2fid(child_inode);
4469         if (!fid_is_sane(&op_data->op_fid3)) {
4470                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4471                        ll_i2sbi(parent)->ll_fsname, name,
4472                        PFID(&op_data->op_fid3));
4473                 GOTO(out_unlock, rc = -EINVAL);
4474         }
4475
4476         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4477         op_data->op_data = lum;
4478         op_data->op_data_size = lumlen;
4479
4480 again:
4481         if (S_ISREG(child_inode->i_mode)) {
4482                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4483                 if (IS_ERR(och)) {
4484                         rc = PTR_ERR(och);
4485                         och = NULL;
4486                         GOTO(out_unlock, rc);
4487                 }
4488
4489                 rc = ll_data_version(child_inode, &data_version,
4490                                      LL_DV_WR_FLUSH);
4491                 if (rc != 0)
4492                         GOTO(out_close, rc);
4493
4494                 op_data->op_open_handle = och->och_open_handle;
4495                 op_data->op_data_version = data_version;
4496                 op_data->op_lease_handle = och->och_lease_handle;
4497                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4498
4499                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4500                 och->och_mod->mod_open_req->rq_replay = 0;
4501                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4502         }
4503
4504         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4505                        name, namelen, &request);
4506         if (rc == 0) {
4507                 LASSERT(request != NULL);
4508                 ll_update_times(request, parent);
4509         }
4510
4511         if (rc == 0 || rc == -EAGAIN) {
4512                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4513                 LASSERT(body != NULL);
4514
4515                 /* If the server does release layout lock, then we cleanup
4516                  * the client och here, otherwise release it in out_close: */
4517                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4518                         obd_mod_put(och->och_mod);
4519                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4520                                                   och);
4521                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4522                         OBD_FREE_PTR(och);
4523                         och = NULL;
4524                 }
4525         }
4526
4527         if (request != NULL) {
4528                 ptlrpc_req_finished(request);
4529                 request = NULL;
4530         }
4531
4532         /* Try again if the lease has cancelled. */
4533         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4534                 goto again;
4535
4536 out_close:
4537         if (och)
4538                 ll_lease_close(och, child_inode, NULL);
4539         if (!rc)
4540                 clear_nlink(child_inode);
4541 out_unlock:
4542         inode_unlock(child_inode);
4543         ll_finish_md_op_data(op_data);
4544 out_iput:
4545         iput(child_inode);
4546         RETURN(rc);
4547 }
4548
4549 static int
4550 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4551 {
4552         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4553         ENTRY;
4554
4555         /*
4556          * In order to avoid flood of warning messages, only print one message
4557          * for one file. And the entire message rate on the client is limited
4558          * by CDEBUG_LIMIT too.
4559          */
4560         if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4561                 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4562                 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4563                              "flock disabled, mount with '-o [local]flock' to enable\r\n");
4564         }
4565         RETURN(-ENOSYS);
4566 }
4567
4568 /**
4569  * test if some locks matching bits and l_req_mode are acquired
4570  * - bits can be in different locks
4571  * - if found clear the common lock bits in *bits
4572  * - the bits not found, are kept in *bits
4573  * \param inode [IN]
4574  * \param bits [IN] searched lock bits [IN]
4575  * \param l_req_mode [IN] searched lock mode
4576  * \retval boolean, true iff all bits are found
4577  */
4578 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4579 {
4580         struct lustre_handle lockh;
4581         union ldlm_policy_data policy;
4582         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4583                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4584         struct lu_fid *fid;
4585         __u64 flags;
4586         int i;
4587         ENTRY;
4588
4589         if (!inode)
4590                RETURN(0);
4591
4592         fid = &ll_i2info(inode)->lli_fid;
4593         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4594                ldlm_lockname[mode]);
4595
4596         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4597         for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4598                 policy.l_inodebits.bits = *bits & (1 << i);
4599                 if (policy.l_inodebits.bits == 0)
4600                         continue;
4601
4602                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4603                                   &policy, mode, &lockh)) {
4604                         struct ldlm_lock *lock;
4605
4606                         lock = ldlm_handle2lock(&lockh);
4607                         if (lock) {
4608                                 *bits &=
4609                                       ~(lock->l_policy_data.l_inodebits.bits);
4610                                 LDLM_LOCK_PUT(lock);
4611                         } else {
4612                                 *bits &= ~policy.l_inodebits.bits;
4613                         }
4614                 }
4615         }
4616         RETURN(*bits == 0);
4617 }
4618
4619 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4620                                struct lustre_handle *lockh, __u64 flags,
4621                                enum ldlm_mode mode)
4622 {
4623         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4624         struct lu_fid *fid;
4625         enum ldlm_mode rc;
4626         ENTRY;
4627
4628         fid = &ll_i2info(inode)->lli_fid;
4629         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4630
4631         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4632                            fid, LDLM_IBITS, &policy, mode, lockh);
4633
4634         RETURN(rc);
4635 }
4636
4637 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4638 {
4639         /* Already unlinked. Just update nlink and return success */
4640         if (rc == -ENOENT) {
4641                 clear_nlink(inode);
4642                 /* If it is striped directory, and there is bad stripe
4643                  * Let's revalidate the dentry again, instead of returning
4644                  * error */
4645                 if (ll_dir_striped(inode))
4646                         return 0;
4647
4648                 /* This path cannot be hit for regular files unless in
4649                  * case of obscure races, so no need to to validate
4650                  * size. */
4651                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4652                         return 0;
4653         } else if (rc != 0) {
4654                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4655                              "%s: revalidate FID "DFID" error: rc = %d\n",
4656                              ll_i2sbi(inode)->ll_fsname,
4657                              PFID(ll_inode2fid(inode)), rc);
4658         }
4659
4660         return rc;
4661 }
4662
4663 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4664 {
4665         struct inode *inode = dentry->d_inode;
4666         struct obd_export *exp = ll_i2mdexp(inode);
4667         struct lookup_intent oit = {
4668                 .it_op = op,
4669         };
4670         struct ptlrpc_request *req = NULL;
4671         struct md_op_data *op_data;
4672         int rc = 0;
4673         ENTRY;
4674
4675         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4676                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4677
4678         /* Call getattr by fid, so do not provide name at all. */
4679         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4680                                      LUSTRE_OPC_ANY, NULL);
4681         if (IS_ERR(op_data))
4682                 RETURN(PTR_ERR(op_data));
4683
4684         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4685         ll_finish_md_op_data(op_data);
4686         if (rc < 0) {
4687                 rc = ll_inode_revalidate_fini(inode, rc);
4688                 GOTO(out, rc);
4689         }
4690
4691         rc = ll_revalidate_it_finish(req, &oit, dentry);
4692         if (rc != 0) {
4693                 ll_intent_release(&oit);
4694                 GOTO(out, rc);
4695         }
4696
4697         /* Unlinked? Unhash dentry, so it is not picked up later by
4698          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4699          * here to preserve get_cwd functionality on 2.6.
4700          * Bug 10503 */
4701         if (!dentry->d_inode->i_nlink) {
4702                 spin_lock(&inode->i_lock);
4703                 d_lustre_invalidate(dentry, 0);
4704                 spin_unlock(&inode->i_lock);
4705         }
4706
4707         ll_lookup_finish_locks(&oit, dentry);
4708 out:
4709         ptlrpc_req_finished(req);
4710
4711         return rc;
4712 }
4713
4714 static int ll_merge_md_attr(struct inode *inode)
4715 {
4716         struct ll_inode_info *lli = ll_i2info(inode);
4717         struct cl_attr attr = { 0 };
4718         int rc;
4719
4720         LASSERT(lli->lli_lsm_md != NULL);
4721
4722         if (!lmv_dir_striped(lli->lli_lsm_md))
4723                 RETURN(0);
4724
4725         down_read(&lli->lli_lsm_sem);
4726         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4727                            &attr, ll_md_blocking_ast);
4728         up_read(&lli->lli_lsm_sem);
4729         if (rc != 0)
4730                 RETURN(rc);
4731
4732         set_nlink(inode, attr.cat_nlink);
4733         inode->i_blocks = attr.cat_blocks;
4734         i_size_write(inode, attr.cat_size);
4735
4736         ll_i2info(inode)->lli_atime = attr.cat_atime;
4737         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4738         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4739
4740         RETURN(0);
4741 }
4742
4743 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4744 {
4745         struct inode *inode = de->d_inode;
4746         struct ll_sb_info *sbi = ll_i2sbi(inode);
4747         struct ll_inode_info *lli = ll_i2info(inode);
4748         ktime_t kstart = ktime_get();
4749         int rc;
4750
4751         rc = ll_inode_revalidate(de, IT_GETATTR);
4752         if (rc < 0)
4753                 RETURN(rc);
4754
4755         if (S_ISREG(inode->i_mode)) {
4756                 bool cached;
4757
4758                 rc = pcc_inode_getattr(inode, &cached);
4759                 if (cached && rc < 0)
4760                         RETURN(rc);
4761
4762                 /* In case of restore, the MDT has the right size and has
4763                  * already send it back without granting the layout lock,
4764                  * inode is up-to-date so glimpse is useless.
4765                  * Also to glimpse we need the layout, in case of a running
4766                  * restore the MDT holds the layout lock so the glimpse will
4767                  * block up to the end of restore (getattr will block)
4768                  */
4769                 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4770                         rc = ll_glimpse_size(inode);
4771                         if (rc < 0)
4772                                 RETURN(rc);
4773                 }
4774         } else {
4775                 /* If object isn't regular a file then don't validate size. */
4776                 if (ll_dir_striped(inode)) {
4777                         rc = ll_merge_md_attr(inode);
4778                         if (rc < 0)
4779                                 RETURN(rc);
4780                 }
4781
4782                 inode->i_atime.tv_sec = lli->lli_atime;
4783                 inode->i_mtime.tv_sec = lli->lli_mtime;
4784                 inode->i_ctime.tv_sec = lli->lli_ctime;
4785         }
4786
4787         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4788
4789         if (ll_need_32bit_api(sbi)) {
4790                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4791                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4792                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4793         } else {
4794                 stat->ino = inode->i_ino;
4795                 stat->dev = inode->i_sb->s_dev;
4796                 stat->rdev = inode->i_rdev;
4797         }
4798
4799         stat->mode = inode->i_mode;
4800         stat->uid = inode->i_uid;
4801         stat->gid = inode->i_gid;
4802         stat->atime = inode->i_atime;
4803         stat->mtime = inode->i_mtime;
4804         stat->ctime = inode->i_ctime;
4805         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4806
4807         stat->nlink = inode->i_nlink;
4808         stat->size = i_size_read(inode);
4809         stat->blocks = inode->i_blocks;
4810
4811         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4812                            ktime_us_delta(ktime_get(), kstart));
4813
4814         return 0;
4815 }
4816
/*
 * ->getattr() inode operation.  Both kernel prototypes simply forward the
 * dentry and the stat buffer to ll_getattr_dentry(); the remaining
 * arguments are not used.
 */
#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
int ll_getattr(const struct path *path, struct kstat *stat,
               u32 request_mask, unsigned int flags)
{
        return ll_getattr_dentry(path->dentry, stat);
}
#else
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
        return ll_getattr_dentry(de, stat);
}
#endif
4828
4829 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4830                      __u64 start, __u64 len)
4831 {
4832         int             rc;
4833         size_t          num_bytes;
4834         struct fiemap   *fiemap;
4835         unsigned int    extent_count = fieinfo->fi_extents_max;
4836
4837         num_bytes = sizeof(*fiemap) + (extent_count *
4838                                        sizeof(struct fiemap_extent));
4839         OBD_ALLOC_LARGE(fiemap, num_bytes);
4840
4841         if (fiemap == NULL)
4842                 RETURN(-ENOMEM);
4843
4844         fiemap->fm_flags = fieinfo->fi_flags;
4845         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4846         fiemap->fm_start = start;
4847         fiemap->fm_length = len;
4848         if (extent_count > 0 &&
4849             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4850                            sizeof(struct fiemap_extent)) != 0)
4851                 GOTO(out, rc = -EFAULT);
4852
4853         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4854
4855         fieinfo->fi_flags = fiemap->fm_flags;
4856         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4857         if (extent_count > 0 &&
4858             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4859                          fiemap->fm_mapped_extents *
4860                          sizeof(struct fiemap_extent)) != 0)
4861                 GOTO(out, rc = -EFAULT);
4862 out:
4863         OBD_FREE_LARGE(fiemap, num_bytes);
4864         return rc;
4865 }
4866
4867 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4868 {
4869         struct ll_inode_info *lli = ll_i2info(inode);
4870         struct posix_acl *acl = NULL;
4871         ENTRY;
4872
4873         spin_lock(&lli->lli_lock);
4874         /* VFS' acl_permission_check->check_acl will release the refcount */
4875         acl = posix_acl_dup(lli->lli_posix_acl);
4876         spin_unlock(&lli->lli_lock);
4877
4878         RETURN(acl);
4879 }
4880
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_LUSTRE_FS_POSIX_ACL
/**
 * ->set_acl() inode operation: store or remove a POSIX ACL via an xattr
 * request to the MDT.
 *
 * For ACL_TYPE_ACCESS a non-NULL \a acl is first passed through
 * posix_acl_update_mode(), which is given the addresses of inode->i_mode and
 * of the local \a acl pointer and may therefore adjust both.  When the
 * resulting ACL is NULL the xattr is removed (OBD_MD_FLXATTRRM), otherwise it
 * is encoded with posix_acl_to_xattr() and set (OBD_MD_FLXATTR).
 *
 * \param[in] inode     inode being modified
 * \param[in] acl       new ACL, or NULL to remove it
 * \param[in] type      ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0        on success; the cached ACL is replaced with \a acl
 * \retval -EINVAL  for an unknown \a type
 * \retval -EACCES  when setting a default ACL on a non-directory
 * \retval -ENOMEM  if the xattr buffer cannot be allocated
 * \retval negative other error; the cached ACL is dropped if the failure
 *                  happened after the type checks
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req = NULL;
        const char *name = NULL;
        char *value = NULL;
        size_t value_size = 0;
        int rc = 0;
        ENTRY;

        switch (type) {
        case ACL_TYPE_ACCESS:
                name = XATTR_NAME_POSIX_ACL_ACCESS;
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
                break;

        case ACL_TYPE_DEFAULT:
                name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* removing a default ACL from a non-directory is a no-op */
                if (!S_ISDIR(inode->i_mode))
                        rc = acl ? -EACCES : 0;
                break;

        default:
                rc = -EINVAL;
                break;
        }
        /* leave through RETURN() so the ENTRY trace above stays balanced;
         * the ACL cache is deliberately left untouched on this path
         */
        if (rc)
                RETURN(rc);

        if (acl) {
                value_size = posix_acl_xattr_size(acl->a_count);
                value = kmalloc(value_size, GFP_NOFS);
                if (value == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* no encoded value means the ACL xattr is to be removed */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         name, value, value_size, 0, 0, &req);

        ptlrpc_req_finished(req);
out_value:
        kfree(value);
out:
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4940
4941 int ll_inode_permission(struct inode *inode, int mask)
4942 {
4943         int rc = 0;
4944         struct ll_sb_info *sbi;
4945         struct root_squash_info *squash;
4946         struct cred *cred = NULL;
4947         const struct cred *old_cred = NULL;
4948         cfs_cap_t cap;
4949         bool squash_id = false;
4950         ktime_t kstart = ktime_get();
4951         ENTRY;
4952
4953         if (mask & MAY_NOT_BLOCK)
4954                 return -ECHILD;
4955
4956        /* as root inode are NOT getting validated in lookup operation,
4957         * need to do it before permission check. */
4958
4959         if (inode == inode->i_sb->s_root->d_inode) {
4960                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4961                 if (rc)
4962                         RETURN(rc);
4963         }
4964
4965         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4966                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4967
4968         /* squash fsuid/fsgid if needed */
4969         sbi = ll_i2sbi(inode);
4970         squash = &sbi->ll_squash;
4971         if (unlikely(squash->rsi_uid != 0 &&
4972                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4973                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4974                         squash_id = true;
4975         }
4976         if (squash_id) {
4977                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4978                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4979                        squash->rsi_uid, squash->rsi_gid);
4980
4981                 /* update current process's credentials
4982                  * and FS capability */
4983                 cred = prepare_creds();
4984                 if (cred == NULL)
4985                         RETURN(-ENOMEM);
4986
4987                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4988                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4989                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4990                         if ((1 << cap) & CFS_CAP_FS_MASK)
4991                                 cap_lower(cred->cap_effective, cap);
4992                 }
4993                 old_cred = override_creds(cred);
4994         }
4995
4996         rc = generic_permission(inode, mask);
4997         /* restore current process's credentials and FS capability */
4998         if (squash_id) {
4999                 revert_creds(old_cred);
5000                 put_cred(cred);
5001         }
5002
5003         if (!rc)
5004                 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5005                                    ktime_us_delta(ktime_get(), kstart));
5006
5007         RETURN(rc);
5008 }
5009
5010 /* -o localflock - only provides locally consistent flock locks */
5011 struct file_operations ll_file_operations = {
5012 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5013 # ifdef HAVE_SYNC_READ_WRITE
5014         .read           = new_sync_read,
5015         .write          = new_sync_write,
5016 # endif
5017         .read_iter      = ll_file_read_iter,
5018         .write_iter     = ll_file_write_iter,
5019 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5020         .read           = ll_file_read,
5021         .aio_read       = ll_file_aio_read,
5022         .write          = ll_file_write,
5023         .aio_write      = ll_file_aio_write,
5024 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5025         .unlocked_ioctl = ll_file_ioctl,
5026         .open           = ll_file_open,
5027         .release        = ll_file_release,
5028         .mmap           = ll_file_mmap,
5029         .llseek         = ll_file_seek,
5030         .splice_read    = ll_file_splice_read,
5031         .fsync          = ll_fsync,
5032         .flush          = ll_flush
5033 };
5034
5035 struct file_operations ll_file_operations_flock = {
5036 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5037 # ifdef HAVE_SYNC_READ_WRITE
5038         .read           = new_sync_read,
5039         .write          = new_sync_write,
5040 # endif /* HAVE_SYNC_READ_WRITE */
5041         .read_iter      = ll_file_read_iter,
5042         .write_iter     = ll_file_write_iter,
5043 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5044         .read           = ll_file_read,
5045         .aio_read       = ll_file_aio_read,
5046         .write          = ll_file_write,
5047         .aio_write      = ll_file_aio_write,
5048 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5049         .unlocked_ioctl = ll_file_ioctl,
5050         .open           = ll_file_open,
5051         .release        = ll_file_release,
5052         .mmap           = ll_file_mmap,
5053         .llseek         = ll_file_seek,
5054         .splice_read    = ll_file_splice_read,
5055         .fsync          = ll_fsync,
5056         .flush          = ll_flush,
5057         .flock          = ll_file_flock,
5058         .lock           = ll_file_flock
5059 };
5060
5061 /* These are for -o noflock - to return ENOSYS on flock calls */
5062 struct file_operations ll_file_operations_noflock = {
5063 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5064 # ifdef HAVE_SYNC_READ_WRITE
5065         .read           = new_sync_read,
5066         .write          = new_sync_write,
5067 # endif /* HAVE_SYNC_READ_WRITE */
5068         .read_iter      = ll_file_read_iter,
5069         .write_iter     = ll_file_write_iter,
5070 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5071         .read           = ll_file_read,
5072         .aio_read       = ll_file_aio_read,
5073         .write          = ll_file_write,
5074         .aio_write      = ll_file_aio_write,
5075 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5076         .unlocked_ioctl = ll_file_ioctl,
5077         .open           = ll_file_open,
5078         .release        = ll_file_release,
5079         .mmap           = ll_file_mmap,
5080         .llseek         = ll_file_seek,
5081         .splice_read    = ll_file_splice_read,
5082         .fsync          = ll_fsync,
5083         .flush          = ll_flush,
5084         .flock          = ll_file_noflock,
5085         .lock           = ll_file_noflock
5086 };
5087
5088 struct inode_operations ll_file_inode_operations = {
5089         .setattr        = ll_setattr,
5090         .getattr        = ll_getattr,
5091         .permission     = ll_inode_permission,
5092 #ifdef HAVE_IOP_XATTR
5093         .setxattr       = ll_setxattr,
5094         .getxattr       = ll_getxattr,
5095         .removexattr    = ll_removexattr,
5096 #endif
5097         .listxattr      = ll_listxattr,
5098         .fiemap         = ll_fiemap,
5099 #ifdef HAVE_IOP_GET_ACL
5100         .get_acl        = ll_get_acl,
5101 #endif
5102 #ifdef HAVE_IOP_SET_ACL
5103         .set_acl        = ll_set_acl,
5104 #endif
5105 };
5106
5107 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5108 {
5109         struct ll_inode_info *lli = ll_i2info(inode);
5110         struct cl_object *obj = lli->lli_clob;
5111         struct lu_env *env;
5112         int rc;
5113         __u16 refcheck;
5114         ENTRY;
5115
5116         if (obj == NULL)
5117                 RETURN(0);
5118
5119         env = cl_env_get(&refcheck);
5120         if (IS_ERR(env))
5121                 RETURN(PTR_ERR(env));
5122
5123         rc = cl_conf_set(env, lli->lli_clob, conf);
5124         if (rc < 0)
5125                 GOTO(out, rc);
5126
5127         if (conf->coc_opc == OBJECT_CONF_SET) {
5128                 struct ldlm_lock *lock = conf->coc_lock;
5129                 struct cl_layout cl = {
5130                         .cl_layout_gen = 0,
5131                 };
5132
5133                 LASSERT(lock != NULL);
5134                 LASSERT(ldlm_has_layout(lock));
5135
5136                 /* it can only be allowed to match after layout is
5137                  * applied to inode otherwise false layout would be
5138                  * seen. Applying layout shoud happen before dropping
5139                  * the intent lock. */
5140                 ldlm_lock_allow_match(lock);
5141
5142                 rc = cl_object_layout_get(env, obj, &cl);
5143                 if (rc < 0)
5144                         GOTO(out, rc);
5145
5146                 CDEBUG(D_VFSTRACE,
5147                        DFID": layout version change: %u -> %u\n",
5148                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
5149                        cl.cl_layout_gen);
5150                 ll_layout_version_set(lli, cl.cl_layout_gen);
5151         }
5152
5153 out:
5154         cl_env_put(env, &refcheck);
5155
5156         RETURN(rc);
5157 }
5158
5159 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5160 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5161
5162 {
5163         struct ll_sb_info *sbi = ll_i2sbi(inode);
5164         struct ptlrpc_request *req;
5165         void *lvbdata;
5166         void *lmm;
5167         int lmmsize;
5168         int rc;
5169         ENTRY;
5170
5171         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5172                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5173                lock->l_lvb_data, lock->l_lvb_len);
5174
5175         if (lock->l_lvb_data != NULL)
5176                 RETURN(0);
5177
5178         /* if layout lock was granted right away, the layout is returned
5179          * within DLM_LVB of dlm reply; otherwise if the lock was ever
5180          * blocked and then granted via completion ast, we have to fetch
5181          * layout here. Please note that we can't use the LVB buffer in
5182          * completion AST because it doesn't have a large enough buffer */
5183         rc = ll_get_default_mdsize(sbi, &lmmsize);
5184         if (rc < 0)
5185                 RETURN(rc);
5186
5187         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5188                          XATTR_NAME_LOV, lmmsize, &req);
5189         if (rc < 0) {
5190                 if (rc == -ENODATA)
5191                         GOTO(out, rc = 0); /* empty layout */
5192                 else
5193                         RETURN(rc);
5194         }
5195
5196         lmmsize = rc;
5197         rc = 0;
5198         if (lmmsize == 0) /* empty layout */
5199                 GOTO(out, rc = 0);
5200
5201         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5202         if (lmm == NULL)
5203                 GOTO(out, rc = -EFAULT);
5204
5205         OBD_ALLOC_LARGE(lvbdata, lmmsize);
5206         if (lvbdata == NULL)
5207                 GOTO(out, rc = -ENOMEM);
5208
5209         memcpy(lvbdata, lmm, lmmsize);
5210         lock_res_and_lock(lock);
5211         if (unlikely(lock->l_lvb_data == NULL)) {
5212                 lock->l_lvb_type = LVB_T_LAYOUT;
5213                 lock->l_lvb_data = lvbdata;
5214                 lock->l_lvb_len = lmmsize;
5215                 lvbdata = NULL;
5216         }
5217         unlock_res_and_lock(lock);
5218
5219         if (lvbdata)
5220                 OBD_FREE_LARGE(lvbdata, lmmsize);
5221
5222         EXIT;
5223
5224 out:
5225         ptlrpc_req_finished(req);
5226         return rc;
5227 }
5228
5229 /**
5230  * Apply the layout to the inode. Layout lock is held and will be released
5231  * in this function.
5232  */
5233 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5234                               struct inode *inode)
5235 {
5236         struct ll_inode_info *lli = ll_i2info(inode);
5237         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5238         struct ldlm_lock *lock;
5239         struct cl_object_conf conf;
5240         int rc = 0;
5241         bool lvb_ready;
5242         bool wait_layout = false;
5243         ENTRY;
5244
5245         LASSERT(lustre_handle_is_used(lockh));
5246
5247         lock = ldlm_handle2lock(lockh);
5248         LASSERT(lock != NULL);
5249         LASSERT(ldlm_has_layout(lock));
5250
5251         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5252                    PFID(&lli->lli_fid), inode);
5253
5254         /* in case this is a caching lock and reinstate with new inode */
5255         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5256
5257         lock_res_and_lock(lock);
5258         lvb_ready = ldlm_is_lvb_ready(lock);
5259         unlock_res_and_lock(lock);
5260
5261         /* checking lvb_ready is racy but this is okay. The worst case is
5262          * that multi processes may configure the file on the same time. */
5263         if (lvb_ready)
5264                 GOTO(out, rc = 0);
5265
5266         rc = ll_layout_fetch(inode, lock);
5267         if (rc < 0)
5268                 GOTO(out, rc);
5269
5270         /* for layout lock, lmm is stored in lock's lvb.
5271          * lvb_data is immutable if the lock is held so it's safe to access it
5272          * without res lock.
5273          *
5274          * set layout to file. Unlikely this will fail as old layout was
5275          * surely eliminated */
5276         memset(&conf, 0, sizeof conf);
5277         conf.coc_opc = OBJECT_CONF_SET;
5278         conf.coc_inode = inode;
5279         conf.coc_lock = lock;
5280         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5281         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5282         rc = ll_layout_conf(inode, &conf);
5283
5284         /* refresh layout failed, need to wait */
5285         wait_layout = rc == -EBUSY;
5286         EXIT;
5287 out:
5288         LDLM_LOCK_PUT(lock);
5289         ldlm_lock_decref(lockh, mode);
5290
5291         /* wait for IO to complete if it's still being used. */
5292         if (wait_layout) {
5293                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5294                        sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5295
5296                 memset(&conf, 0, sizeof conf);
5297                 conf.coc_opc = OBJECT_CONF_WAIT;
5298                 conf.coc_inode = inode;
5299                 rc = ll_layout_conf(inode, &conf);
5300                 if (rc == 0)
5301                         rc = -EAGAIN;
5302
5303                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5304                        sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5305         }
5306         RETURN(rc);
5307 }
5308
5309 /**
5310  * Issue layout intent RPC to MDS.
5311  * \param inode [in]    file inode
5312  * \param intent [in]   layout intent
5313  *
5314  * \retval 0    on success
5315  * \retval < 0  error code
5316  */
5317 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5318 {
5319         struct ll_inode_info  *lli = ll_i2info(inode);
5320         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5321         struct md_op_data     *op_data;
5322         struct lookup_intent it;
5323         struct ptlrpc_request *req;
5324         int rc;
5325         ENTRY;
5326
5327         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5328                                      0, 0, LUSTRE_OPC_ANY, NULL);
5329         if (IS_ERR(op_data))
5330                 RETURN(PTR_ERR(op_data));
5331
5332         op_data->op_data = intent;
5333         op_data->op_data_size = sizeof(*intent);
5334
5335         memset(&it, 0, sizeof(it));
5336         it.it_op = IT_LAYOUT;
5337         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5338             intent->li_opc == LAYOUT_INTENT_TRUNC)
5339                 it.it_flags = FMODE_WRITE;
5340
5341         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5342                           sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5343
5344         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5345                             &ll_md_blocking_ast, 0);
5346         if (it.it_request != NULL)
5347                 ptlrpc_req_finished(it.it_request);
5348         it.it_request = NULL;
5349
5350         ll_finish_md_op_data(op_data);
5351
5352         /* set lock data in case this is a new lock */
5353         if (!rc)
5354                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5355
5356         ll_intent_drop_lock(&it);
5357
5358         RETURN(rc);
5359 }
5360
5361 /**
5362  * This function checks if there exists a LAYOUT lock on the client side,
5363  * or enqueues it if it doesn't have one in cache.
5364  *
5365  * This function will not hold layout lock so it may be revoked any time after
5366  * this function returns. Any operations depend on layout should be redone
5367  * in that case.
5368  *
5369  * This function should be called before lov_io_init() to get an uptodate
5370  * layout version, the caller should save the version number and after IO
5371  * is finished, this function should be called again to verify that layout
5372  * is not changed during IO time.
5373  */
5374 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5375 {
5376         struct ll_inode_info    *lli = ll_i2info(inode);
5377         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5378         struct lustre_handle lockh;
5379         struct layout_intent intent = {
5380                 .li_opc = LAYOUT_INTENT_ACCESS,
5381         };
5382         enum ldlm_mode mode;
5383         int rc;
5384         ENTRY;
5385
5386         *gen = ll_layout_version_get(lli);
5387         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5388                 RETURN(0);
5389
5390         /* sanity checks */
5391         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5392         LASSERT(S_ISREG(inode->i_mode));
5393
5394         /* take layout lock mutex to enqueue layout lock exclusively. */
5395         mutex_lock(&lli->lli_layout_mutex);
5396
5397         while (1) {
5398                 /* mostly layout lock is caching on the local side, so try to
5399                  * match it before grabbing layout lock mutex. */
5400                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5401                                        LCK_CR | LCK_CW | LCK_PR |
5402                                        LCK_PW | LCK_EX);
5403                 if (mode != 0) { /* hit cached lock */
5404                         rc = ll_layout_lock_set(&lockh, mode, inode);
5405                         if (rc == -EAGAIN)
5406                                 continue;
5407                         break;
5408                 }
5409
5410                 rc = ll_layout_intent(inode, &intent);
5411                 if (rc != 0)
5412                         break;
5413         }
5414
5415         if (rc == 0)
5416                 *gen = ll_layout_version_get(lli);
5417         mutex_unlock(&lli->lli_layout_mutex);
5418
5419         RETURN(rc);
5420 }
5421
5422 /**
5423  * Issue layout intent RPC indicating where in a file an IO is about to write.
5424  *
5425  * \param[in] inode     file inode.
5426  * \param[in] ext       write range with start offset of fille in bytes where
5427  *                      an IO is about to write, and exclusive end offset in
5428  *                      bytes.
5429  *
5430  * \retval 0    on success
5431  * \retval < 0  error code
5432  */
5433 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5434                            struct lu_extent *ext)
5435 {
5436         struct layout_intent intent = {
5437                 .li_opc = opc,
5438                 .li_extent.e_start = ext->e_start,
5439                 .li_extent.e_end = ext->e_end,
5440         };
5441         int rc;
5442         ENTRY;
5443
5444         rc = ll_layout_intent(inode, &intent);
5445
5446         RETURN(rc);
5447 }
5448
5449 /**
5450  *  This function send a restore request to the MDT
5451  */
5452 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5453 {
5454         struct hsm_user_request *hur;
5455         int                      len, rc;
5456         ENTRY;
5457
5458         len = sizeof(struct hsm_user_request) +
5459               sizeof(struct hsm_user_item);
5460         OBD_ALLOC(hur, len);
5461         if (hur == NULL)
5462                 RETURN(-ENOMEM);
5463
5464         hur->hur_request.hr_action = HUA_RESTORE;
5465         hur->hur_request.hr_archive_id = 0;
5466         hur->hur_request.hr_flags = 0;
5467         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5468                sizeof(hur->hur_user_item[0].hui_fid));
5469         hur->hur_user_item[0].hui_extent.offset = offset;
5470         hur->hur_user_item[0].hui_extent.length = length;
5471         hur->hur_request.hr_itemcount = 1;
5472         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5473                            len, hur, NULL);
5474         OBD_FREE(hur, len);
5475         RETURN(rc);
5476 }