lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
/* Argument handed as @data to ll_close_inode_openhandle() when the close
 * bias is MDS_CLOSE_LAYOUT_SPLIT. */
struct split_param {
        struct inode    *sp_inode;      /* its FID is packed as op_fid2 */
        __u16           sp_mirror_id;   /* copied into op_mirror_id */
};
  60
/* Argument handed as @data to ll_close_inode_openhandle() when the close
 * bias is MDS_PCC_ATTACH. */
struct pcc_param {
        __u64   pa_data_version;        /* in: copied into op_data_version */
        __u32   pa_archive_id;          /* in: copied into op_archive_id */
        __u32   pa_layout_gen;          /* out: mbo_layout_gen of the reply */
};
  66
/* Forward declarations of static helpers that ll_md_close() calls before
 * their definitions appear. */
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken);
  72
  73 static struct ll_file_data *ll_file_data_get(void)
  74 {
  75         struct ll_file_data *fd;
  76
  77         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  78         if (fd == NULL)
  79                 return NULL;
  80
  81         fd->fd_write_failed = false;
  82         pcc_file_init(&fd->fd_pcc_file);
  83
  84         return fd;
  85 }
  86
  87 static void ll_file_data_put(struct ll_file_data *fd)
  88 {
  89         if (fd != NULL)
  90                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  91 }
  92
  93 /**
  94  * Packs all the attributes into @op_data for the CLOSE rpc.
  95  */
  96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  97                              struct obd_client_handle *och)
  98 {
  99         ENTRY;
 100
 101         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 102                            0, 0, LUSTRE_OPC_ANY, NULL);
 103
 104         op_data->op_attr.ia_mode = inode->i_mode;
 105         op_data->op_attr.ia_atime = inode->i_atime;
 106         op_data->op_attr.ia_mtime = inode->i_mtime;
 107         op_data->op_attr.ia_ctime = inode->i_ctime;
 108         op_data->op_attr.ia_size = i_size_read(inode);
 109         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 110                                       ATTR_MTIME | ATTR_MTIME_SET |
 111                                       ATTR_CTIME);
 112         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 113         op_data->op_attr_blocks = inode->i_blocks;
 114         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 115         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 116                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 117         op_data->op_open_handle = och->och_open_handle;
 118
 119         if (och->och_flags & FMODE_WRITE &&
 120             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 121                 /* For HSM: if inode data has been modified, pack it so that
 122                  * MDT can set data dirty flag in the archive. */
 123                 op_data->op_bias |= MDS_DATA_MODIFIED;
 124
 125         EXIT;
 126 }
 127
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version
 * (__u64).
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP or MDS_CLOSE_LAYOUT_MERGE then \a data
 * is a pointer to the other inode (the one to swap layouts with, or the
 * victim whose blocks are merged).
 * If \a bias is MDS_CLOSE_LAYOUT_SPLIT then \a data is a struct split_param.
 * If \a bias is MDS_CLOSE_RESYNC_DONE then \a data is a struct ll_ioc_lease.
 * If \a bias is MDS_PCC_ATTACH then \a data is a struct pcc_param, and its
 * pa_layout_gen is filled in from the reply.
 * For any other \a bias, \a data must be NULL.
 *
 * \a och is consumed: on every path, including errors, its open replay data
 * is cleared and the handle is freed, so the caller must not use it again.
 *
 * \retval 0            close succeeded, or the MDC export was already invalid
 * \retval -ENOMEM      op_data could not be allocated
 * \retval -EBUSY       a bias was requested but the reply does not carry
 *                      OBD_MD_CLOSE_INTENT_EXECED
 * \retval other        error returned by md_close()
 */
static int ll_close_inode_openhandle(struct inode *inode,
                                     struct obd_client_handle *och,
                                     enum mds_op_bias bias, void *data)
{
        struct obd_export *md_exp = ll_i2mdexp(inode);
        const struct ll_inode_info *lli = ll_i2info(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        int rc;
        ENTRY;

        /* No RPC is sent in this case; only the local handle is released. */
        if (class_exp2obd(md_exp) == NULL) {
                CERROR("%s: invalid MDC connection handle closing "DFID"\n",
                       ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
                GOTO(out, rc = 0);
        }

        OBD_ALLOC_PTR(op_data);
        /* We leak openhandle and request here on error, but not much to be
         * done in OOM case since app won't retry close on error either. */
        if (op_data == NULL)
                GOTO(out, rc = -ENOMEM);

        ll_prepare_close(inode, op_data, och);
        switch (bias) {
        case MDS_CLOSE_LAYOUT_MERGE:
                /* merge blocks from the victim inode */
                op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                /* fallthrough */
        case MDS_CLOSE_LAYOUT_SPLIT:
        case MDS_CLOSE_LAYOUT_SWAP: {
                /* Only meaningful for MDS_CLOSE_LAYOUT_SPLIT; for the other
                 * two biases @data is an inode and @sp is not dereferenced. */
                struct split_param *sp = data;

                LASSERT(data != NULL);
                op_data->op_bias |= bias;
                op_data->op_data_version = 0;
                op_data->op_lease_handle = och->och_lease_handle;
                if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
                        op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
                        op_data->op_mirror_id = sp->sp_mirror_id;
                } else {
                        op_data->op_fid2 = *ll_inode2fid(data);
                }
                break;
        }

        case MDS_CLOSE_RESYNC_DONE: {
                struct ll_ioc_lease *ioc = data;

                LASSERT(data != NULL);
                /* Add lil_count times the current block count, i.e. report
                 * (lil_count + 1) * blocks.
                 * NOTE(review): presumably one copy per resynced mirror -
                 * confirm against the MDT side. */
                op_data->op_attr_blocks +=
                        ioc->lil_count * op_data->op_attr_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

                op_data->op_lease_handle = och->och_lease_handle;
                /* The lil_ids[] array is sent as the request payload. */
                op_data->op_data = &ioc->lil_ids[0];
                op_data->op_data_size =
                        ioc->lil_count * sizeof(ioc->lil_ids[0]);
                break;
        }

        case MDS_PCC_ATTACH: {
                struct pcc_param *param = data;

                LASSERT(data != NULL);
                /* PCC attach is sent with MDS_HSM_RELEASE set as well. */
                op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
                op_data->op_archive_id = param->pa_archive_id;
                op_data->op_data_version = param->pa_data_version;
                op_data->op_lease_handle = och->och_lease_handle;
                break;
        }

        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                break;

        default:
                /* Plain close: no intent, hence no intent argument. */
                LASSERT(data == NULL);
                break;
        }

        /* If no case above marked size/blocks as valid, flag them as lazy. */
        if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
                op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
        if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
                op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;

        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc != 0 && rc != -EINTR)
                CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
                       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

        /* The reply is inspected only when a bias was requested and recorded
         * in op_bias; with bias == 0 this test is always false. */
        if (rc == 0 && op_data->op_bias & bias) {
                struct mdt_body *body;

                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                /* The server did not flag the close intent as executed. */
                if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
                        rc = -EBUSY;

                if (bias & MDS_PCC_ATTACH) {
                        struct pcc_param *param = data;

                        param->pa_layout_gen = body->mbo_layout_gen;
                }
        }

        ll_finish_md_op_data(op_data);
        EXIT;
out:

        /* Reached on every path: the open handle is always released. */
        md_clear_open_replay_data(md_exp, och);
        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
        OBD_FREE_PTR(och);

        ptlrpc_req_finished(req);       /* This is close request */
        return rc;
}
 261
 262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 263 {
 264         struct ll_inode_info *lli = ll_i2info(inode);
 265         struct obd_client_handle **och_p;
 266         struct obd_client_handle *och;
 267         __u64 *och_usecount;
 268         int rc = 0;
 269         ENTRY;
 270
 271         if (fmode & FMODE_WRITE) {
 272                 och_p = &lli->lli_mds_write_och;
 273                 och_usecount = &lli->lli_open_fd_write_count;
 274         } else if (fmode & FMODE_EXEC) {
 275                 och_p = &lli->lli_mds_exec_och;
 276                 och_usecount = &lli->lli_open_fd_exec_count;
 277         } else {
 278                 LASSERT(fmode & FMODE_READ);
 279                 och_p = &lli->lli_mds_read_och;
 280                 och_usecount = &lli->lli_open_fd_read_count;
 281         }
 282
 283         mutex_lock(&lli->lli_och_mutex);
 284         if (*och_usecount > 0) {
 285                 /* There are still users of this handle, so skip
 286                  * freeing it. */
 287                 mutex_unlock(&lli->lli_och_mutex);
 288                 RETURN(0);
 289         }
 290
 291         och = *och_p;
 292         *och_p = NULL;
 293         mutex_unlock(&lli->lli_och_mutex);
 294
 295         if (och != NULL) {
 296                 /* There might be a race and this handle may already
 297                  * be closed. */
 298                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 299         }
 300
 301         RETURN(rc);
 302 }
 303
 304 static int ll_md_close(struct inode *inode, struct file *file)
 305 {
 306         union ldlm_policy_data policy = {
 307                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 308         };
 309         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 310         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 311         struct ll_inode_info *lli = ll_i2info(inode);
 312         struct lustre_handle lockh;
 313         enum ldlm_mode lockmode;
 314         int rc = 0;
 315         ENTRY;
 316
 317         /* clear group lock, if present */
 318         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 319                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 320
 321         if (fd->fd_lease_och != NULL) {
 322                 bool lease_broken;
 323
 324                 /* Usually the lease is not released when the
 325                  * application crashed, we need to release here. */
 326                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 327                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 328                         PFID(&lli->lli_fid), rc, lease_broken);
 329
 330                 fd->fd_lease_och = NULL;
 331         }
 332
 333         if (fd->fd_och != NULL) {
 334                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 335                 fd->fd_och = NULL;
 336                 GOTO(out, rc);
 337         }
 338
 339         /* Let's see if we have good enough OPEN lock on the file and if
 340            we can skip talking to MDS */
 341         mutex_lock(&lli->lli_och_mutex);
 342         if (fd->fd_omode & FMODE_WRITE) {
 343                 lockmode = LCK_CW;
 344                 LASSERT(lli->lli_open_fd_write_count);
 345                 lli->lli_open_fd_write_count--;
 346         } else if (fd->fd_omode & FMODE_EXEC) {
 347                 lockmode = LCK_PR;
 348                 LASSERT(lli->lli_open_fd_exec_count);
 349                 lli->lli_open_fd_exec_count--;
 350         } else {
 351                 lockmode = LCK_CR;
 352                 LASSERT(lli->lli_open_fd_read_count);
 353                 lli->lli_open_fd_read_count--;
 354         }
 355         mutex_unlock(&lli->lli_och_mutex);
 356
 357         /* LU-4398: do not cache write open lock if the file has exec bit */
 358         if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
 359             !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 360                            LDLM_IBITS, &policy, lockmode, &lockh))
 361                 rc = ll_md_real_close(inode, fd->fd_omode);
 362
 363 out:
 364         LUSTRE_FPRIVATE(file) = NULL;
 365         ll_file_data_put(fd);
 366
 367         RETURN(rc);
 368 }
 369
 370 /* While this returns an error code, fput() the caller does not, so we need
 371  * to make every effort to clean up all of our state here.  Also, applications
 372  * rarely check close errors and even if an error is returned they will not
 373  * re-try the close call.
 374  */
 375 int ll_file_release(struct inode *inode, struct file *file)
 376 {
 377         struct ll_file_data *fd;
 378         struct ll_sb_info *sbi = ll_i2sbi(inode);
 379         struct ll_inode_info *lli = ll_i2info(inode);
 380         ktime_t kstart = ktime_get();
 381         int rc;
 382
 383         ENTRY;
 384
 385         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 386                PFID(ll_inode2fid(inode)), inode);
 387
 388         fd = LUSTRE_FPRIVATE(file);
 389         LASSERT(fd != NULL);
 390
 391         /* The last ref on @file, maybe not the the owner pid of statahead,
 392          * because parent and child process can share the same file handle. */
 393         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 394                 ll_deauthorize_statahead(inode, fd);
 395
 396         if (inode->i_sb->s_root == file_dentry(file)) {
 397                 LUSTRE_FPRIVATE(file) = NULL;
 398                 ll_file_data_put(fd);
 399                 GOTO(out, rc = 0);
 400         }
 401
 402         pcc_file_release(inode, file);
 403
 404         if (!S_ISDIR(inode->i_mode)) {
 405                 if (lli->lli_clob != NULL)
 406                         lov_read_and_clear_async_rc(lli->lli_clob);
 407                 lli->lli_async_rc = 0;
 408         }
 409
 410         rc = ll_md_close(inode, file);
 411
 412         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 413                 libcfs_debug_dumplog();
 414
 415 out:
 416         if (!rc && inode->i_sb->s_root != file_dentry(file))
 417                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE,
 418                                    ktime_us_delta(ktime_get(), kstart));
 419         RETURN(rc);
 420 }
 421
 422 static inline int ll_dom_readpage(void *data, struct page *page)
 423 {
 424         struct niobuf_local *lnb = data;
 425         void *kaddr;
 426
 427         kaddr = ll_kmap_atomic(page, KM_USER0);
 428         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 429         if (lnb->lnb_len < PAGE_SIZE)
 430                 memset(kaddr + lnb->lnb_len, 0,
 431                        PAGE_SIZE - lnb->lnb_len);
 432         flush_dcache_page(page);
 433         SetPageUptodate(page);
 434         ll_kunmap_atomic(kaddr, KM_USER0);
 435         unlock_page(page);
 436
 437         return 0;
 438 }
 439
 440 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 441                         struct lookup_intent *it)
 442 {
 443         struct ll_inode_info *lli = ll_i2info(inode);
 444         struct cl_object *obj = lli->lli_clob;
 445         struct address_space *mapping = inode->i_mapping;
 446         struct page *vmpage;
 447         struct niobuf_remote *rnb;
 448         struct mdt_body *body;
 449         char *data;
 450         unsigned long index, start;
 451         struct niobuf_local lnb;
 452
 453         ENTRY;
 454
 455         if (obj == NULL)
 456                 RETURN_EXIT;
 457
 458         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 459                                    RCL_SERVER))
 460                 RETURN_EXIT;
 461
 462         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 463         if (rnb == NULL || rnb->rnb_len == 0)
 464                 RETURN_EXIT;
 465
 466         /* LU-11595: Server may return whole file and that is OK always or
 467          * it may return just file tail and its offset must be aligned with
 468          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 469          * smaller then offset may be not aligned and that data is just ignored.
 470          */
 471         if (rnb->rnb_offset % PAGE_SIZE)
 472                 RETURN_EXIT;
 473
 474         /* Server returns whole file or just file tail if it fills in reply
 475          * buffer, in both cases total size should be equal to the file size.
 476          */
 477         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 478         if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
 479                 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
 480                        ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
 481                        rnb->rnb_len, body->mbo_dom_size);
 482                 RETURN_EXIT;
 483         }
 484
 485         CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
 486                rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
 487
 488         data = (char *)rnb + sizeof(*rnb);
 489
 490         lnb.lnb_file_offset = rnb->rnb_offset;
 491         start = lnb.lnb_file_offset / PAGE_SIZE;
 492         index = 0;
 493         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 494         lnb.lnb_page_offset = 0;
 495         do {
 496                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 497                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 498                 if (lnb.lnb_len > PAGE_SIZE)
 499                         lnb.lnb_len = PAGE_SIZE;
 500
 501                 vmpage = read_cache_page(mapping, index + start,
 502                                          ll_dom_readpage, &lnb);
 503                 if (IS_ERR(vmpage)) {
 504                         CWARN("%s: cannot fill page %lu for "DFID
 505                               " with data: rc = %li\n",
 506                               ll_i2sbi(inode)->ll_fsname, index + start,
 507                               PFID(lu_object_fid(&obj->co_lu)),
 508                               PTR_ERR(vmpage));
 509                         break;
 510                 }
 511                 put_page(vmpage);
 512                 index++;
 513         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 514         EXIT;
 515 }
 516
/**
 * Send the open intent \a itp for dentry \a de to the MDS through
 * md_intent_lock() and apply the reply to the dentry's inode.
 *
 * \a itp must have MDS_OPEN_BY_FID set in it_flags.  \a lmm / \a lmmsize are
 * passed through as the request payload (op_data / op_data_size).  The file
 * name is packed only when the server lacks OBD_CONNECT_OPEN_BY_FID or the
 * OBD_FAIL_LLITE_OPEN_BY_NAME fail point is active.
 *
 * On every path past md_intent_lock() the request reference and the intent
 * lock are dropped before returning.
 *
 * \retval 0            success
 * \retval -ENOMEM      the name buffer could not be allocated
 * \retval -ESTALE      the name is invalid, the server returned -ESTALE, or
 *                      -ENOENT was hit while \a itp carries IT_CREAT
 * \retval -ENOENT      the reply has DISP_LOOKUP_NEG set (non-create case)
 * \retval other        error from ll_prep_md_op_data(), md_intent_lock(),
 *                      it_open_error() or ll_prep_inode()
 */
static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
                                struct lookup_intent *itp)
{
        struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
        struct dentry *parent = de->d_parent;
        char *name = NULL;
        int len = 0;
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        int rc;
        ENTRY;

        LASSERT(parent != NULL);
        LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

        /* if server supports open-by-fid, or file name is invalid, don't pack
         * name in open request */
        if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
            !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
retry:
                /* The length is sampled without d_lock because the buffer
                 * is allocated before the lock is taken. */
                len = de->d_name.len;
                name = kmalloc(len + 1, GFP_NOFS);
                if (!name)
                        RETURN(-ENOMEM);

                /* The name length may have changed since it was sampled
                 * (presumably by a concurrent rename); re-check it under
                 * d_lock and start over with a fresh buffer if it did. */
                spin_lock(&de->d_lock);
                if (len != de->d_name.len) {
                        spin_unlock(&de->d_lock);
                        kfree(name);
                        goto retry;
                }
                memcpy(name, de->d_name.name, len);
                name[len] = '\0';
                spin_unlock(&de->d_lock);

                if (!lu_name_is_valid_2(name, len)) {
                        kfree(name);
                        RETURN(-ESTALE);
                }
        }

        /* name stays NULL and len 0 when the open is done purely by FID */
        op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
                                     name, len, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data)) {
                kfree(name);
                RETURN(PTR_ERR(op_data));
        }
        op_data->op_data = lmm;
        op_data->op_data_size = lmmsize;

        rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
                            &ll_md_blocking_ast, 0);
        /* name and op_data are no longer needed once the RPC has returned */
        kfree(name);
        ll_finish_md_op_data(op_data);
        if (rc == -ESTALE) {
                /* reason for keep own exit path - don`t flood log
                 * with messages with -ESTALE errors.
                 */
                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
                     it_open_error(DISP_OPEN_OPEN, itp))
                        GOTO(out, rc);
                /* The open itself succeeded on the server, so the handle it
                 * created is released before -ESTALE is reported. */
                ll_release_openhandle(de, itp);
                GOTO(out, rc);
        }

        if (it_disposition(itp, DISP_LOOKUP_NEG))
                GOTO(out, rc = -ENOENT);

        /* An RPC error takes precedence over the open disposition error. */
        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
                GOTO(out, rc);
        }

        rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

        if (!rc && itp->it_lock_mode) {
                struct lustre_handle handle = {.cookie = itp->it_lock_handle};
                struct ldlm_lock *lock;
                bool has_dom_bit = false;

                /* If we got a lock back and it has a LOOKUP bit set,
                 * make sure the dentry is marked as valid so we can find it.
                 * We don't need to care about actual hashing since other bits
                 * of kernel will deal with that later.
                 */
                lock = ldlm_handle2lock(&handle);
                if (lock) {
                        has_dom_bit = ldlm_has_dom(lock);
                        if (lock->l_policy_data.l_inodebits.bits &
                            MDS_INODELOCK_LOOKUP)
                                d_lustre_revalidate(de);

                        LDLM_LOCK_PUT(lock);
                }
                ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
                /* Inline file data in the reply is consumed only when the
                 * granted lock reports a DoM bit via ldlm_has_dom(). */
                if (has_dom_bit)
                        ll_dom_finish_open(de->d_inode, req, itp);
        }

out:
        ptlrpc_req_finished(req);
        ll_intent_drop_lock(itp);

        /* We did open by fid, but by the time we got to the server,
         * the object disappeared. If this is a create, we cannot really
         * tell the userspace that the file it was trying to create
         * does not exist. Instead let's return -ESTALE, and the VFS will
         * retry the create with LOOKUP_REVAL that we are going to catch
         * in ll_revalidate_dentry() and use lookup then.
         */
        if (rc == -ENOENT && itp->it_op & IT_CREAT)
                rc = -ESTALE;

        RETURN(rc);
}
 634
 635 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 636                        struct obd_client_handle *och)
 637 {
 638         struct mdt_body *body;
 639
 640         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 641         och->och_open_handle = body->mbo_open_handle;
 642         och->och_fid = body->mbo_fid1;
 643         och->och_lease_handle.cookie = it->it_lock_handle;
 644         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 645         och->och_flags = it->it_flags;
 646
 647         return md_set_open_replay_data(md_exp, och, it);
 648 }
 649
 650 static int ll_local_open(struct file *file, struct lookup_intent *it,
 651                          struct ll_file_data *fd, struct obd_client_handle *och)
 652 {
 653         struct inode *inode = file_inode(file);
 654         ENTRY;
 655
 656         LASSERT(!LUSTRE_FPRIVATE(file));
 657
 658         LASSERT(fd != NULL);
 659
 660         if (och) {
 661                 int rc;
 662
 663                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 664                 if (rc != 0)
 665                         RETURN(rc);
 666         }
 667
 668         LUSTRE_FPRIVATE(file) = fd;
 669         ll_readahead_init(inode, &fd->fd_ras);
 670         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 671
 672         /* ll_cl_context initialize */
 673         rwlock_init(&fd->fd_lock);
 674         INIT_LIST_HEAD(&fd->fd_lccs);
 675
 676         RETURN(0);
 677 }
 678
 679 /* Open a file, and (for the very first open) create objects on the OSTs at
 680  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 681  * creation or open until ll_lov_setstripe() ioctl is called.
 682  *
 683  * If we already have the stripe MD locally then we don't request it in
 684  * md_open(), by passing a lmm_size = 0.
 685  *
 686  * It is up to the application to ensure no other processes open this file
 687  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 688  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 689  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 690  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 691  */
 692 int ll_file_open(struct inode *inode, struct file *file)
 693 {
 694         struct ll_inode_info *lli = ll_i2info(inode);
 695         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 696                                           .it_flags = file->f_flags };
 697         struct obd_client_handle **och_p = NULL;
 698         __u64 *och_usecount = NULL;
 699         struct ll_file_data *fd;
 700         ktime_t kstart = ktime_get();
 701         int rc = 0;
 702         ENTRY;
 703
 704         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 705                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 706
 707         it = file->private_data; /* XXX: compat macro */
 708         file->private_data = NULL; /* prevent ll_local_open assertion */
 709
 710         fd = ll_file_data_get();
 711         if (fd == NULL)
 712                 GOTO(out_nofiledata, rc = -ENOMEM);
 713
 714         fd->fd_file = file;
 715         if (S_ISDIR(inode->i_mode))
 716                 ll_authorize_statahead(inode, fd);
 717
 718         if (inode->i_sb->s_root == file_dentry(file)) {
 719                 LUSTRE_FPRIVATE(file) = fd;
 720                 RETURN(0);
 721         }
 722
 723         if (!it || !it->it_disposition) {
 724                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 725                  * because everything but O_ACCMODE mask was stripped from
 726                  * there */
 727                 if ((oit.it_flags + 1) & O_ACCMODE)
 728                         oit.it_flags++;
 729                 if (file->f_flags & O_TRUNC)
 730                         oit.it_flags |= FMODE_WRITE;
 731
 732                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 733                  * dentry_open after call to open_namei that checks permissions.
 734                  * Only nfsd_open call dentry_open directly without checking
 735                  * permissions and because of that this code below is safe.
 736                  */
 737                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 738                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 739
 740                 /* We do not want O_EXCL here, presumably we opened the file
 741                  * already? XXX - NFS implications? */
 742                 oit.it_flags &= ~O_EXCL;
 743
 744                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 745                  * created if necessary, then "IT_CREAT" should be set to keep
 746                  * consistent with it */
 747                 if (oit.it_flags & O_CREAT)
 748                         oit.it_op |= IT_CREAT;
 749
 750                 it = &oit;
 751         }
 752
 753 restart:
 754         /* Let's see if we have file open on MDS already. */
 755         if (it->it_flags & FMODE_WRITE) {
 756                 och_p = &lli->lli_mds_write_och;
 757                 och_usecount = &lli->lli_open_fd_write_count;
 758         } else if (it->it_flags & FMODE_EXEC) {
 759                 och_p = &lli->lli_mds_exec_och;
 760                 och_usecount = &lli->lli_open_fd_exec_count;
 761          } else {
 762                 och_p = &lli->lli_mds_read_och;
 763                 och_usecount = &lli->lli_open_fd_read_count;
 764         }
 765
 766         mutex_lock(&lli->lli_och_mutex);
 767         if (*och_p) { /* Open handle is present */
 768                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 769                         /* Well, there's extra open request that we do not need,
 770                            let's close it somehow. This will decref request. */
 771                         rc = it_open_error(DISP_OPEN_OPEN, it);
 772                         if (rc) {
 773                                 mutex_unlock(&lli->lli_och_mutex);
 774                                 GOTO(out_openerr, rc);
 775                         }
 776
 777                         ll_release_openhandle(file_dentry(file), it);
 778                 }
 779                 (*och_usecount)++;
 780
 781                 rc = ll_local_open(file, it, fd, NULL);
 782                 if (rc) {
 783                         (*och_usecount)--;
 784                         mutex_unlock(&lli->lli_och_mutex);
 785                         GOTO(out_openerr, rc);
 786                 }
 787         } else {
 788                 LASSERT(*och_usecount == 0);
 789                 if (!it->it_disposition) {
 790                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 791                         /* We cannot just request lock handle now, new ELC code
 792                            means that one of other OPEN locks for this file
 793                            could be cancelled, and since blocking ast handler
 794                            would attempt to grab och_mutex as well, that would
 795                            result in a deadlock */
 796                         mutex_unlock(&lli->lli_och_mutex);
 797                         /*
 798                          * Normally called under two situations:
 799                          * 1. NFS export.
 800                          * 2. A race/condition on MDS resulting in no open
 801                          *    handle to be returned from LOOKUP|OPEN request,
 802                          *    for example if the target entry was a symlink.
 803                          *
 804                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 805                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 806                          *  bit so that it's not confusing later callers.
 807                          *
 808                          *  NB; when ldd is NULL, it must have come via normal
 809                          *  lookup path only, since ll_iget_for_nfs always calls
 810                          *  ll_d_init().
 811                          */
 812                         if (ldd && ldd->lld_nfs_dentry) {
 813                                 ldd->lld_nfs_dentry = 0;
 814                                 it->it_flags |= MDS_OPEN_LOCK;
 815                         }
 816
 817                          /*
 818                          * Always specify MDS_OPEN_BY_FID because we don't want
 819                          * to get file with different fid.
 820                          */
 821                         it->it_flags |= MDS_OPEN_BY_FID;
 822                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 823                                                  it);
 824                         if (rc)
 825                                 GOTO(out_openerr, rc);
 826
 827                         goto restart;
 828                 }
 829                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 830                 if (!*och_p)
 831                         GOTO(out_och_free, rc = -ENOMEM);
 832
 833                 (*och_usecount)++;
 834
 835                 /* md_intent_lock() didn't get a request ref if there was an
 836                  * open error, so don't do cleanup on the request here
 837                  * (bug 3430) */
 838                 /* XXX (green): Should not we bail out on any error here, not
 839                  * just open error? */
 840                 rc = it_open_error(DISP_OPEN_OPEN, it);
 841                 if (rc != 0)
 842                         GOTO(out_och_free, rc);
 843
 844                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 845                          "inode %p: disposition %x, status %d\n", inode,
 846                          it_disposition(it, ~0), it->it_status);
 847
 848                 rc = ll_local_open(file, it, fd, *och_p);
 849                 if (rc)
 850                         GOTO(out_och_free, rc);
 851         }
 852
 853         rc = pcc_file_open(inode, file);
 854         if (rc)
 855                 GOTO(out_och_free, rc);
 856
 857         mutex_unlock(&lli->lli_och_mutex);
 858         fd = NULL;
 859
 860         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 861            different kind of OPEN lock for this same inode gets cancelled
 862            by ldlm_cancel_lru */
 863         if (!S_ISREG(inode->i_mode))
 864                 GOTO(out_och_free, rc);
 865
 866         cl_lov_delay_create_clear(&file->f_flags);
 867         GOTO(out_och_free, rc);
 868
 869 out_och_free:
 870         if (rc) {
 871                 if (och_p && *och_p) {
 872                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 873                         *och_p = NULL; /* OBD_FREE writes some magic there */
 874                         (*och_usecount)--;
 875                 }
 876                 mutex_unlock(&lli->lli_och_mutex);
 877
 878 out_openerr:
 879                 if (lli->lli_opendir_key == fd)
 880                         ll_deauthorize_statahead(inode, fd);
 881
 882                 if (fd != NULL)
 883                         ll_file_data_put(fd);
 884         } else {
 885                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN,
 886                                    ktime_us_delta(ktime_get(), kstart));
 887         }
 888
 889 out_nofiledata:
 890         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 891                 ptlrpc_req_finished(it->it_request);
 892                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 893         }
 894
 895         return rc;
 896 }
 897
 898 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 899                         struct ldlm_lock_desc *desc, void *data, int flag)
 900 {
 901         int rc;
 902         struct lustre_handle lockh;
 903         ENTRY;
 904
 905         switch (flag) {
 906         case LDLM_CB_BLOCKING:
 907                 ldlm_lock2handle(lock, &lockh);
 908                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 909                 if (rc < 0) {
 910                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 911                         RETURN(rc);
 912                 }
 913                 break;
 914         case LDLM_CB_CANCELING:
 915                 /* do nothing */
 916                 break;
 917         }
 918         RETURN(0);
 919 }
 920
 921 /**
 922  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 923  * and save it as fd->fd_och so as to force client to reopen the file even
 924  * if it has an open lock in cache already.
 925  */
 926 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 927                                 struct lustre_handle *old_open_handle)
 928 {
 929         struct ll_inode_info *lli = ll_i2info(inode);
 930         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 931         struct obd_client_handle **och_p;
 932         __u64 *och_usecount;
 933         int rc = 0;
 934         ENTRY;
 935
 936         /* Get the openhandle of the file */
 937         mutex_lock(&lli->lli_och_mutex);
 938         if (fd->fd_lease_och != NULL)
 939                 GOTO(out_unlock, rc = -EBUSY);
 940
 941         if (fd->fd_och == NULL) {
 942                 if (file->f_mode & FMODE_WRITE) {
 943                         LASSERT(lli->lli_mds_write_och != NULL);
 944                         och_p = &lli->lli_mds_write_och;
 945                         och_usecount = &lli->lli_open_fd_write_count;
 946                 } else {
 947                         LASSERT(lli->lli_mds_read_och != NULL);
 948                         och_p = &lli->lli_mds_read_och;
 949                         och_usecount = &lli->lli_open_fd_read_count;
 950                 }
 951
 952                 if (*och_usecount > 1)
 953                         GOTO(out_unlock, rc = -EBUSY);
 954
 955                 fd->fd_och = *och_p;
 956                 *och_usecount = 0;
 957                 *och_p = NULL;
 958         }
 959
 960         *old_open_handle = fd->fd_och->och_open_handle;
 961
 962         EXIT;
 963 out_unlock:
 964         mutex_unlock(&lli->lli_och_mutex);
 965         return rc;
 966 }
 967
 968 /**
 969  * Release ownership on lli_mds_*_och when putting back a file lease.
 970  */
 971 static int ll_lease_och_release(struct inode *inode, struct file *file)
 972 {
 973         struct ll_inode_info *lli = ll_i2info(inode);
 974         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 975         struct obd_client_handle **och_p;
 976         struct obd_client_handle *old_och = NULL;
 977         __u64 *och_usecount;
 978         int rc = 0;
 979         ENTRY;
 980
 981         mutex_lock(&lli->lli_och_mutex);
 982         if (file->f_mode & FMODE_WRITE) {
 983                 och_p = &lli->lli_mds_write_och;
 984                 och_usecount = &lli->lli_open_fd_write_count;
 985         } else {
 986                 och_p = &lli->lli_mds_read_och;
 987                 och_usecount = &lli->lli_open_fd_read_count;
 988         }
 989
 990         /* The file may have been open by another process (broken lease) so
 991          * *och_p is not NULL. In this case we should simply increase usecount
 992          * and close fd_och.
 993          */
 994         if (*och_p != NULL) {
 995                 old_och = fd->fd_och;
 996                 (*och_usecount)++;
 997         } else {
 998                 *och_p = fd->fd_och;
 999                 *och_usecount = 1;
1000         }
1001         fd->fd_och = NULL;
1002         mutex_unlock(&lli->lli_och_mutex);
1003
1004         if (old_och != NULL)
1005                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1006
1007         RETURN(rc);
1008 }
1009
1010 /**
1011  * Acquire a lease and open the file.
1012  */
1013 static struct obd_client_handle *
1014 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1015               __u64 open_flags)
1016 {
1017         struct lookup_intent it = { .it_op = IT_OPEN };
1018         struct ll_sb_info *sbi = ll_i2sbi(inode);
1019         struct md_op_data *op_data;
1020         struct ptlrpc_request *req = NULL;
1021         struct lustre_handle old_open_handle = { 0 };
1022         struct obd_client_handle *och = NULL;
1023         int rc;
1024         int rc2;
1025         ENTRY;
1026
1027         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1028                 RETURN(ERR_PTR(-EINVAL));
1029
1030         if (file != NULL) {
1031                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1032                         RETURN(ERR_PTR(-EPERM));
1033
1034                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1035                 if (rc)
1036                         RETURN(ERR_PTR(rc));
1037         }
1038
1039         OBD_ALLOC_PTR(och);
1040         if (och == NULL)
1041                 RETURN(ERR_PTR(-ENOMEM));
1042
1043         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1044                                         LUSTRE_OPC_ANY, NULL);
1045         if (IS_ERR(op_data))
1046                 GOTO(out, rc = PTR_ERR(op_data));
1047
1048         /* To tell the MDT this openhandle is from the same owner */
1049         op_data->op_open_handle = old_open_handle;
1050
1051         it.it_flags = fmode | open_flags;
1052         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1053         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1054                             &ll_md_blocking_lease_ast,
1055         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1056          * it can be cancelled which may mislead applications that the lease is
1057          * broken;
1058          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1059          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1060          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1061                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1062         ll_finish_md_op_data(op_data);
1063         ptlrpc_req_finished(req);
1064         if (rc < 0)
1065                 GOTO(out_release_it, rc);
1066
1067         if (it_disposition(&it, DISP_LOOKUP_NEG))
1068                 GOTO(out_release_it, rc = -ENOENT);
1069
1070         rc = it_open_error(DISP_OPEN_OPEN, &it);
1071         if (rc)
1072                 GOTO(out_release_it, rc);
1073
1074         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1075         rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1076         if (rc)
1077                 GOTO(out_release_it, rc);
1078
1079         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1080                 GOTO(out_close, rc = -EOPNOTSUPP);
1081
1082         /* already get lease, handle lease lock */
1083         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1084         if (it.it_lock_mode == 0 ||
1085             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1086                 /* open lock must return for lease */
1087                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1088                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1089                         it.it_lock_bits);
1090                 GOTO(out_close, rc = -EPROTO);
1091         }
1092
1093         ll_intent_release(&it);
1094         RETURN(och);
1095
1096 out_close:
1097         /* Cancel open lock */
1098         if (it.it_lock_mode != 0) {
1099                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1100                                             it.it_lock_mode);
1101                 it.it_lock_mode = 0;
1102                 och->och_lease_handle.cookie = 0ULL;
1103         }
1104         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1105         if (rc2 < 0)
1106                 CERROR("%s: error closing file "DFID": %d\n",
1107                        sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1108         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1109 out_release_it:
1110         ll_intent_release(&it);
1111 out:
1112         if (och != NULL)
1113                 OBD_FREE_PTR(och);
1114         RETURN(ERR_PTR(rc));
1115 }
1116
1117 /**
1118  * Check whether a layout swap can be done between two inodes.
1119  *
1120  * \param[in] inode1  First inode to check
1121  * \param[in] inode2  Second inode to check
1122  *
1123  * \retval 0 on success, layout swap can be performed between both inodes
1124  * \retval negative error code if requirements are not met
1125  */
1126 static int ll_check_swap_layouts_validity(struct inode *inode1,
1127                                           struct inode *inode2)
1128 {
1129         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1130                 return -EINVAL;
1131
1132         if (inode_permission(inode1, MAY_WRITE) ||
1133             inode_permission(inode2, MAY_WRITE))
1134                 return -EPERM;
1135
1136         if (inode1->i_sb != inode2->i_sb)
1137                 return -EXDEV;
1138
1139         return 0;
1140 }
1141
1142 static int ll_swap_layouts_close(struct obd_client_handle *och,
1143                                  struct inode *inode, struct inode *inode2)
1144 {
1145         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1146         const struct lu_fid     *fid2;
1147         int                      rc;
1148         ENTRY;
1149
1150         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1151                ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1152
1153         rc = ll_check_swap_layouts_validity(inode, inode2);
1154         if (rc < 0)
1155                 GOTO(out_free_och, rc);
1156
1157         /* We now know that inode2 is a lustre inode */
1158         fid2 = ll_inode2fid(inode2);
1159
1160         rc = lu_fid_cmp(fid1, fid2);
1161         if (rc == 0)
1162                 GOTO(out_free_och, rc = -EINVAL);
1163
1164         /* Close the file and {swap,merge} layouts between inode & inode2.
1165          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1166          * because we still need it to pack l_remote_handle to MDT. */
1167         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1168                                        inode2);
1169
1170         och = NULL; /* freed in ll_close_inode_openhandle() */
1171
1172 out_free_och:
1173         if (och != NULL)
1174                 OBD_FREE_PTR(och);
1175
1176         RETURN(rc);
1177 }
1178
1179 /**
1180  * Release lease and close the file.
1181  * It will check if the lease has ever broken.
1182  */
1183 static int ll_lease_close_intent(struct obd_client_handle *och,
1184                                  struct inode *inode,
1185                                  bool *lease_broken, enum mds_op_bias bias,
1186                                  void *data)
1187 {
1188         struct ldlm_lock *lock;
1189         bool cancelled = true;
1190         int rc;
1191         ENTRY;
1192
1193         lock = ldlm_handle2lock(&och->och_lease_handle);
1194         if (lock != NULL) {
1195                 lock_res_and_lock(lock);
1196                 cancelled = ldlm_is_cancel(lock);
1197                 unlock_res_and_lock(lock);
1198                 LDLM_LOCK_PUT(lock);
1199         }
1200
1201         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1202                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1203
1204         if (lease_broken != NULL)
1205                 *lease_broken = cancelled;
1206
1207         if (!cancelled && !bias)
1208                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1209
1210         if (cancelled) { /* no need to excute intent */
1211                 bias = 0;
1212                 data = NULL;
1213         }
1214
1215         rc = ll_close_inode_openhandle(inode, och, bias, data);
1216         RETURN(rc);
1217 }
1218
1219 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1220                           bool *lease_broken)
1221 {
1222         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1223 }
1224
1225 /**
1226  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1227  */
1228 static int ll_lease_file_resync(struct obd_client_handle *och,
1229                                 struct inode *inode, unsigned long arg)
1230 {
1231         struct ll_sb_info *sbi = ll_i2sbi(inode);
1232         struct md_op_data *op_data;
1233         struct ll_ioc_lease_id ioc;
1234         __u64 data_version_unused;
1235         int rc;
1236         ENTRY;
1237
1238         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1239                                      LUSTRE_OPC_ANY, NULL);
1240         if (IS_ERR(op_data))
1241                 RETURN(PTR_ERR(op_data));
1242
1243         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1244                            sizeof(ioc)))
1245                 RETURN(-EFAULT);
1246
1247         /* before starting file resync, it's necessary to clean up page cache
1248          * in client memory, otherwise once the layout version is increased,
1249          * writing back cached data will be denied the OSTs. */
1250         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1251         if (rc)
1252                 GOTO(out, rc);
1253
1254         op_data->op_lease_handle = och->och_lease_handle;
1255         op_data->op_mirror_id = ioc.lil_mirror_id;
1256         rc = md_file_resync(sbi->ll_md_exp, op_data);
1257         if (rc)
1258                 GOTO(out, rc);
1259
1260         EXIT;
1261 out:
1262         ll_finish_md_op_data(op_data);
1263         return rc;
1264 }
1265
1266 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1267 {
1268         struct ll_inode_info *lli = ll_i2info(inode);
1269         struct cl_object *obj = lli->lli_clob;
1270         struct cl_attr *attr = vvp_env_thread_attr(env);
1271         s64 atime;
1272         s64 mtime;
1273         s64 ctime;
1274         int rc = 0;
1275
1276         ENTRY;
1277
1278         ll_inode_size_lock(inode);
1279
1280         /* Merge timestamps the most recently obtained from MDS with
1281          * timestamps obtained from OSTs.
1282          *
1283          * Do not overwrite atime of inode because it may be refreshed
1284          * by file_accessed() function. If the read was served by cache
1285          * data, there is no RPC to be sent so that atime may not be
1286          * transferred to OSTs at all. MDT only updates atime at close time
1287          * if it's at least 'mdd.*.atime_diff' older.
1288          * All in all, the atime in Lustre does not strictly comply with
1289          * POSIX. Solving this problem needs to send an RPC to MDT for each
1290          * read, this will hurt performance.
1291          */
1292         if (ll_file_test_and_clear_flag(lli, LLIF_UPDATE_ATIME) ||
1293             inode->i_atime.tv_sec < lli->lli_atime)
1294                 inode->i_atime.tv_sec = lli->lli_atime;
1295
1296         inode->i_mtime.tv_sec = lli->lli_mtime;
1297         inode->i_ctime.tv_sec = lli->lli_ctime;
1298
1299         mtime = inode->i_mtime.tv_sec;
1300         atime = inode->i_atime.tv_sec;
1301         ctime = inode->i_ctime.tv_sec;
1302
1303         cl_object_attr_lock(obj);
1304         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1305                 rc = -EINVAL;
1306         else
1307                 rc = cl_object_attr_get(env, obj, attr);
1308         cl_object_attr_unlock(obj);
1309
1310         if (rc != 0)
1311                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1312
1313         if (atime < attr->cat_atime)
1314                 atime = attr->cat_atime;
1315
1316         if (ctime < attr->cat_ctime)
1317                 ctime = attr->cat_ctime;
1318
1319         if (mtime < attr->cat_mtime)
1320                 mtime = attr->cat_mtime;
1321
1322         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1323                PFID(&lli->lli_fid), attr->cat_size);
1324
1325         i_size_write(inode, attr->cat_size);
1326         inode->i_blocks = attr->cat_blocks;
1327
1328         inode->i_mtime.tv_sec = mtime;
1329         inode->i_atime.tv_sec = atime;
1330         inode->i_ctime.tv_sec = ctime;
1331
1332 out_size_unlock:
1333         ll_inode_size_unlock(inode);
1334
1335         RETURN(rc);
1336 }
1337
1338 /**
1339  * Set designated mirror for I/O.
1340  *
1341  * So far only read, write, and truncated can support to issue I/O to
1342  * designated mirror.
1343  */
1344 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1345 {
1346         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1347
1348         /* clear layout version for generic(non-resync) I/O in case it carries
1349          * stale layout version due to I/O restart */
1350         io->ci_layout_version = 0;
1351
1352         /* FLR: disable non-delay for designated mirror I/O because obviously
1353          * only one mirror is available */
1354         if (fd->fd_designated_mirror > 0) {
1355                 io->ci_ndelay = 0;
1356                 io->ci_designated_mirror = fd->fd_designated_mirror;
1357                 io->ci_layout_version = fd->fd_layout_version;
1358         }
1359
1360         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1361                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1362 }
1363
1364 static bool file_is_noatime(const struct file *file)
1365 {
1366         const struct vfsmount *mnt = file->f_path.mnt;
1367         const struct inode *inode = file_inode((struct file *)file);
1368
1369         /* Adapted from file_accessed() and touch_atime().*/
1370         if (file->f_flags & O_NOATIME)
1371                 return true;
1372
1373         if (inode->i_flags & S_NOATIME)
1374                 return true;
1375
1376         if (IS_NOATIME(inode))
1377                 return true;
1378
1379         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1380                 return true;
1381
1382         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1383                 return true;
1384
1385         if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
1386                 return true;
1387
1388         return false;
1389 }
1390
1391 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1392                 struct vvp_io_args *args)
1393 {
1394         struct inode *inode = file_inode(file);
1395         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1396
1397         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1398         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1399
1400         if (iot == CIT_WRITE) {
1401                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1402                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1403                                            file->f_flags & O_DIRECT ||
1404                                            IS_SYNC(inode));
1405 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1406                 io->u.ci_wr.wr_sync  |= !!(args &&
1407                                            args->via_io_subtype == IO_NORMAL &&
1408                                            args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1409 #endif
1410         }
1411
1412         io->ci_obj = ll_i2info(inode)->lli_clob;
1413         io->ci_lockreq = CILR_MAYBE;
1414         if (ll_file_nolock(file)) {
1415                 io->ci_lockreq = CILR_NEVER;
1416                 io->ci_no_srvlock = 1;
1417         } else if (file->f_flags & O_APPEND) {
1418                 io->ci_lockreq = CILR_MANDATORY;
1419         }
1420         io->ci_noatime = file_is_noatime(file);
1421         io->ci_async_readahead = false;
1422
1423         /* FLR: only use non-delay I/O for read as there is only one
1424          * avaliable mirror for write. */
1425         io->ci_ndelay = !(iot == CIT_WRITE);
1426
1427         ll_io_set_mirror(io, file);
1428 }
1429
1430 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1431                         __u64 count)
1432 {
1433         struct ll_inode_info *lli = ll_i2info(inode);
1434         struct ll_sb_info *sbi = ll_i2sbi(inode);
1435         enum obd_heat_type sample_type;
1436         enum obd_heat_type iobyte_type;
1437         __u64 now = ktime_get_real_seconds();
1438
1439         if (!ll_sbi_has_file_heat(sbi) ||
1440             lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1441                 return;
1442
1443         if (iot == CIT_READ) {
1444                 sample_type = OBD_HEAT_READSAMPLE;
1445                 iobyte_type = OBD_HEAT_READBYTE;
1446         } else if (iot == CIT_WRITE) {
1447                 sample_type = OBD_HEAT_WRITESAMPLE;
1448                 iobyte_type = OBD_HEAT_WRITEBYTE;
1449         } else {
1450                 return;
1451         }
1452
1453         spin_lock(&lli->lli_heat_lock);
1454         obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1455                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1456         obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1457                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1458         spin_unlock(&lli->lli_heat_lock);
1459 }
1460
1461 static ssize_t
1462 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1463                    struct file *file, enum cl_io_type iot,
1464                    loff_t *ppos, size_t count)
1465 {
1466         struct vvp_io           *vio = vvp_env_io(env);
1467         struct inode            *inode = file_inode(file);
1468         struct ll_inode_info    *lli = ll_i2info(inode);
1469         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1470         struct range_lock       range;
1471         struct cl_io            *io;
1472         ssize_t                 result = 0;
1473         int                     rc = 0;
1474         unsigned                retried = 0;
1475         bool                    restarted = false;
1476
1477         ENTRY;
1478
1479         CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1480                 file_dentry(file)->d_name.name,
1481                 iot == CIT_READ ? "read" : "write", *ppos, count);
1482
1483 restart:
1484         io = vvp_env_thread_io(env);
1485         ll_io_init(io, file, iot, args);
1486         io->ci_ndelay_tried = retried;
1487
1488         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1489                 bool range_locked = false;
1490
1491                 if (file->f_flags & O_APPEND)
1492                         range_lock_init(&range, 0, LUSTRE_EOF);
1493                 else
1494                         range_lock_init(&range, *ppos, *ppos + count - 1);
1495
1496                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1497                 vio->vui_io_subtype = args->via_io_subtype;
1498
1499                 switch (vio->vui_io_subtype) {
1500                 case IO_NORMAL:
1501                         vio->vui_iter = args->u.normal.via_iter;
1502                         vio->vui_iocb = args->u.normal.via_iocb;
1503                         /* Direct IO reads must also take range lock,
1504                          * or multiple reads will try to work on the same pages
1505                          * See LU-6227 for details. */
1506                         if (((iot == CIT_WRITE) ||
1507                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1508                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1509                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1510                                        RL_PARA(&range));
1511                                 rc = range_lock(&lli->lli_write_tree, &range);
1512                                 if (rc < 0)
1513                                         GOTO(out, rc);
1514
1515                                 range_locked = true;
1516                         }
1517                         break;
1518                 case IO_SPLICE:
1519                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1520                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1521                         break;
1522                 default:
1523                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1524                         LBUG();
1525                 }
1526
1527                 ll_cl_add(file, env, io, LCC_RW);
1528                 rc = cl_io_loop(env, io);
1529                 ll_cl_remove(file, env);
1530
1531                 if (range_locked) {
1532                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1533                                RL_PARA(&range));
1534                         range_unlock(&lli->lli_write_tree, &range);
1535                 }
1536         } else {
1537                 /* cl_io_rw_init() handled IO */
1538                 rc = io->ci_result;
1539         }
1540
1541         if (io->ci_nob > 0) {
1542                 result += io->ci_nob;
1543                 count  -= io->ci_nob;
1544                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1545
1546                 /* prepare IO restart */
1547                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1548                         args->u.normal.via_iter = vio->vui_iter;
1549         }
1550 out:
1551         cl_io_fini(env, io);
1552
1553         CDEBUG(D_VFSTRACE,
1554                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1555                file->f_path.dentry->d_name.name,
1556                iot, rc, result, io->ci_need_restart);
1557
1558         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1559                 CDEBUG(D_VFSTRACE,
1560                        "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1561                        file_dentry(file)->d_name.name,
1562                        iot == CIT_READ ? "read" : "write",
1563                        *ppos, count, result, rc);
1564                 /* preserve the tried count for FLR */
1565                 retried = io->ci_ndelay_tried;
1566                 restarted = true;
1567                 goto restart;
1568         }
1569
1570         if (iot == CIT_READ) {
1571                 if (result > 0)
1572                         ll_stats_ops_tally(ll_i2sbi(inode),
1573                                            LPROC_LL_READ_BYTES, result);
1574         } else if (iot == CIT_WRITE) {
1575                 if (result > 0) {
1576                         ll_stats_ops_tally(ll_i2sbi(inode),
1577                                            LPROC_LL_WRITE_BYTES, result);
1578                         fd->fd_write_failed = false;
1579                 } else if (result == 0 && rc == 0) {
1580                         rc = io->ci_result;
1581                         if (rc < 0)
1582                                 fd->fd_write_failed = true;
1583                         else
1584                                 fd->fd_write_failed = false;
1585                 } else if (rc != -ERESTARTSYS) {
1586                         fd->fd_write_failed = true;
1587                 }
1588         }
1589
1590         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1591         if (result > 0)
1592                 ll_heat_add(inode, iot, result);
1593
1594         RETURN(result > 0 ? result : rc);
1595 }
1596
1597 /**
1598  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1599  * especially for small I/O.
1600  *
1601  * To serve a read request, CLIO has to create and initialize a cl_io and
1602  * then request DLM lock. This has turned out to have siginificant overhead
1603  * and affects the performance of small I/O dramatically.
1604  *
1605  * It's not necessary to create a cl_io for each I/O. Under the help of read
1606  * ahead, most of the pages being read are already in memory cache and we can
1607  * read those pages directly because if the pages exist, the corresponding DLM
1608  * lock must exist so that page content must be valid.
1609  *
1610  * In fast read implementation, the llite speculatively finds and reads pages
1611  * in memory cache. There are three scenarios for fast read:
1612  *   - If the page exists and is uptodate, kernel VM will provide the data and
1613  *     CLIO won't be intervened;
1614  *   - If the page was brought into memory by read ahead, it will be exported
1615  *     and read ahead parameters will be updated;
1616  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1617  *     it will go back and invoke normal read, i.e., a cl_io will be created
1618  *     and DLM lock will be requested.
1619  *
1620  * POSIX compliance: posix standard states that read is intended to be atomic.
1621  * Lustre read implementation is in line with Linux kernel read implementation
1622  * and neither of them complies with POSIX standard in this matter. Fast read
1623  * doesn't make the situation worse on single node but it may interleave write
1624  * results from multiple nodes due to short read handling in ll_file_aio_read().
1625  *
1626  * \param env - lu_env
1627  * \param iocb - kiocb from kernel
1628  * \param iter - user space buffers where the data will be copied
1629  *
1630  * \retval - number of bytes have been read, or error code if error occurred.
1631  */
1632 static ssize_t
1633 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1634 {
1635         ssize_t result;
1636
1637         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1638                 return 0;
1639
1640         /* NB: we can't do direct IO for fast read because it will need a lock
1641          * to make IO engine happy. */
1642         if (iocb->ki_filp->f_flags & O_DIRECT)
1643                 return 0;
1644
1645         result = generic_file_read_iter(iocb, iter);
1646
1647         /* If the first page is not in cache, generic_file_aio_read() will be
1648          * returned with -ENODATA.
1649          * See corresponding code in ll_readpage(). */
1650         if (result == -ENODATA)
1651                 result = 0;
1652
1653         if (result > 0) {
1654                 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1655                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1656                                    LPROC_LL_READ_BYTES, result);
1657         }
1658
1659         return result;
1660 }
1661
1662 /*
1663  * Read from a file (through the page cache).
1664  */
1665 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1666 {
1667         struct lu_env *env;
1668         struct vvp_io_args *args;
1669         struct file *file = iocb->ki_filp;
1670         ssize_t result;
1671         ssize_t rc2;
1672         __u16 refcheck;
1673         ktime_t kstart = ktime_get();
1674         bool cached;
1675
1676         if (!iov_iter_count(to))
1677                 return 0;
1678
1679         /**
1680          * Currently when PCC read failed, we do not fall back to the
1681          * normal read path, just return the error.
1682          * The resaon is that: for RW-PCC, the file data may be modified
1683          * in the PCC and inconsistent with the data on OSTs (or file
1684          * data has been removed from the Lustre file system), at this
1685          * time, fallback to the normal read path may read the wrong
1686          * data.
1687          * TODO: for RO-PCC (readonly PCC), fall back to normal read
1688          * path: read data from data copy on OSTs.
1689          */
1690         result = pcc_file_read_iter(iocb, to, &cached);
1691         if (cached)
1692                 GOTO(out, result);
1693
1694         ll_ras_enter(file, iocb->ki_pos, iov_iter_count(to));
1695
1696         result = ll_do_fast_read(iocb, to);
1697         if (result < 0 || iov_iter_count(to) == 0)
1698                 GOTO(out, result);
1699
1700         env = cl_env_get(&refcheck);
1701         if (IS_ERR(env))
1702                 return PTR_ERR(env);
1703
1704         args = ll_env_args(env, IO_NORMAL);
1705         args->u.normal.via_iter = to;
1706         args->u.normal.via_iocb = iocb;
1707
1708         rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1709                                  &iocb->ki_pos, iov_iter_count(to));
1710         if (rc2 > 0)
1711                 result += rc2;
1712         else if (result == 0)
1713                 result = rc2;
1714
1715         cl_env_put(env, &refcheck);
1716 out:
1717         if (result > 0) {
1718                 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1719                                   LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1720                                   READ);
1721                 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_READ,
1722                                    ktime_us_delta(ktime_get(), kstart));
1723         }
1724
1725         return result;
1726 }
1727
1728 /**
1729  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1730  * If a page is already in the page cache and dirty (and some other things -
1731  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1732  * write to it without doing a full I/O, because Lustre already knows about it
1733  * and will write it out.  This saves a lot of processing time.
1734  *
1735  * All writes here are within one page, so exclusion is handled by the page
1736  * lock on the vm page.  We do not do tiny writes for writes which touch
1737  * multiple pages because it's very unlikely multiple sequential pages are
1738  * are already dirty.
1739  *
1740  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1741  * and are unlikely to be to already dirty pages.
1742  *
1743  * Attribute updates are important here, we do them in ll_tiny_write_end.
1744  */
1745 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1746 {
1747         ssize_t count = iov_iter_count(iter);
1748         struct  file *file = iocb->ki_filp;
1749         struct  inode *inode = file_inode(file);
1750         bool    lock_inode = !IS_NOSEC(inode);
1751         ssize_t result = 0;
1752
1753         ENTRY;
1754
1755         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1756          * of function for why.
1757          */
1758         if (count >= PAGE_SIZE ||
1759             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1760                 RETURN(0);
1761
1762         if (unlikely(lock_inode))
1763                 inode_lock(inode);
1764         result = __generic_file_write_iter(iocb, iter);
1765
1766         if (unlikely(lock_inode))
1767                 inode_unlock(inode);
1768
1769         /* If the page is not already dirty, ll_tiny_write_begin returns
1770          * -ENODATA.  We continue on to normal write.
1771          */
1772         if (result == -ENODATA)
1773                 result = 0;
1774
1775         if (result > 0) {
1776                 ll_heat_add(inode, CIT_WRITE, result);
1777                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1778                                    result);
1779                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1780         }
1781
1782         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1783
1784         RETURN(result);
1785 }
1786
1787 /*
1788  * Write to a file (through the page cache).
1789  */
1790 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1791 {
1792         struct vvp_io_args *args;
1793         struct lu_env *env;
1794         ssize_t rc_tiny = 0, rc_normal;
1795         struct file *file = iocb->ki_filp;
1796         __u16 refcheck;
1797         bool cached;
1798         ktime_t kstart = ktime_get();
1799         int result;
1800
1801         ENTRY;
1802
1803         if (!iov_iter_count(from))
1804                 GOTO(out, rc_normal = 0);
1805
1806         /**
1807          * When PCC write failed, we usually do not fall back to the normal
1808          * write path, just return the error. But there is a special case when
1809          * returned error code is -ENOSPC due to running out of space on PCC HSM
1810          * bakcend. At this time, it will fall back to normal I/O path and
1811          * retry the I/O. As the file is in HSM released state, it will restore
1812          * the file data to OSTs first and redo the write again. And the
1813          * restore process will revoke the layout lock and detach the file
1814          * from PCC cache automatically.
1815          */
1816         result = pcc_file_write_iter(iocb, from, &cached);
1817         if (cached && result != -ENOSPC && result != -EDQUOT)
1818                 GOTO(out, rc_normal = result);
1819
1820         /* NB: we can't do direct IO for tiny writes because they use the page
1821          * cache, we can't do sync writes because tiny writes can't flush
1822          * pages, and we can't do append writes because we can't guarantee the
1823          * required DLM locks are held to protect file size.
1824          */
1825         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1826             !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1827                 rc_tiny = ll_do_tiny_write(iocb, from);
1828
1829         /* In case of error, go on and try normal write - Only stop if tiny
1830          * write completed I/O.
1831          */
1832         if (iov_iter_count(from) == 0)
1833                 GOTO(out, rc_normal = rc_tiny);
1834
1835         env = cl_env_get(&refcheck);
1836         if (IS_ERR(env))
1837                 return PTR_ERR(env);
1838
1839         args = ll_env_args(env, IO_NORMAL);
1840         args->u.normal.via_iter = from;
1841         args->u.normal.via_iocb = iocb;
1842
1843         rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1844                                        &iocb->ki_pos, iov_iter_count(from));
1845
1846         /* On success, combine bytes written. */
1847         if (rc_tiny >= 0 && rc_normal > 0)
1848                 rc_normal += rc_tiny;
1849         /* On error, only return error from normal write if tiny write did not
1850          * write any bytes.  Otherwise return bytes written by tiny write.
1851          */
1852         else if (rc_tiny > 0)
1853                 rc_normal = rc_tiny;
1854
1855         cl_env_put(env, &refcheck);
1856 out:
1857         if (rc_normal > 0) {
1858                 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1859                                   LUSTRE_FPRIVATE(file), iocb->ki_pos,
1860                                   rc_normal, WRITE);
1861                 ll_stats_ops_tally(ll_i2sbi(file_inode(file)), LPROC_LL_WRITE,
1862                                    ktime_us_delta(ktime_get(), kstart));
1863         }
1864
1865         RETURN(rc_normal);
1866 }
1867
1868 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1869 /*
1870  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1871  */
1872 static int ll_file_get_iov_count(const struct iovec *iov,
1873                                  unsigned long *nr_segs, size_t *count)
1874 {
1875         size_t cnt = 0;
1876         unsigned long seg;
1877
1878         for (seg = 0; seg < *nr_segs; seg++) {
1879                 const struct iovec *iv = &iov[seg];
1880
1881                 /*
1882                  * If any segment has a negative length, or the cumulative
1883                  * length ever wraps negative then return -EINVAL.
1884                  */
1885                 cnt += iv->iov_len;
1886                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1887                         return -EINVAL;
1888                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1889                         continue;
1890                 if (seg == 0)
1891                         return -EFAULT;
1892                 *nr_segs = seg;
1893                 cnt -= iv->iov_len;     /* This segment is no good */
1894                 break;
1895         }
1896         *count = cnt;
1897         return 0;
1898 }
1899
1900 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1901                                 unsigned long nr_segs, loff_t pos)
1902 {
1903         struct iov_iter to;
1904         size_t iov_count;
1905         ssize_t result;
1906         ENTRY;
1907
1908         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1909         if (result)
1910                 RETURN(result);
1911
1912         if (!iov_count)
1913                 RETURN(0);
1914
1915 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1916         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1917 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1918         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1919 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1920
1921         result = ll_file_read_iter(iocb, &to);
1922
1923         RETURN(result);
1924 }
1925
1926 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1927                             loff_t *ppos)
1928 {
1929         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1930         struct kiocb   kiocb;
1931         ssize_t        result;
1932
1933         ENTRY;
1934
1935         if (!count)
1936                 RETURN(0);
1937
1938         init_sync_kiocb(&kiocb, file);
1939         kiocb.ki_pos = *ppos;
1940 #ifdef HAVE_KIOCB_KI_LEFT
1941         kiocb.ki_left = count;
1942 #elif defined(HAVE_KI_NBYTES)
1943         kiocb.i_nbytes = count;
1944 #endif
1945
1946         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1947         *ppos = kiocb.ki_pos;
1948
1949         RETURN(result);
1950 }
1951
1952 /*
1953  * Write to a file (through the page cache).
1954  * AIO stuff
1955  */
1956 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1957                                  unsigned long nr_segs, loff_t pos)
1958 {
1959         struct iov_iter from;
1960         size_t iov_count;
1961         ssize_t result;
1962         ENTRY;
1963
1964         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1965         if (result)
1966                 RETURN(result);
1967
1968         if (!iov_count)
1969                 RETURN(0);
1970
1971 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1972         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1973 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1974         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1975 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1976
1977         result = ll_file_write_iter(iocb, &from);
1978
1979         RETURN(result);
1980 }
1981
1982 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1983                              size_t count, loff_t *ppos)
1984 {
1985         struct iovec   iov = { .iov_base = (void __user *)buf,
1986                                .iov_len = count };
1987         struct kiocb   kiocb;
1988         ssize_t        result;
1989
1990         ENTRY;
1991
1992         if (!count)
1993                 RETURN(0);
1994
1995         init_sync_kiocb(&kiocb, file);
1996         kiocb.ki_pos = *ppos;
1997 #ifdef HAVE_KIOCB_KI_LEFT
1998         kiocb.ki_left = count;
1999 #elif defined(HAVE_KI_NBYTES)
2000         kiocb.ki_nbytes = count;
2001 #endif
2002
2003         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
2004         *ppos = kiocb.ki_pos;
2005
2006         RETURN(result);
2007 }
2008 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
2009
2010 /*
2011  * Send file content (through pagecache) somewhere with helper
2012  */
2013 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
2014                                    struct pipe_inode_info *pipe, size_t count,
2015                                    unsigned int flags)
2016 {
2017         struct lu_env *env;
2018         struct vvp_io_args *args;
2019         ssize_t result;
2020         __u16 refcheck;
2021         bool cached;
2022
2023         ENTRY;
2024
2025         result = pcc_file_splice_read(in_file, ppos, pipe,
2026                                       count, flags, &cached);
2027         if (cached)
2028                 RETURN(result);
2029
2030         ll_ras_enter(in_file, *ppos, count);
2031
2032         env = cl_env_get(&refcheck);
2033         if (IS_ERR(env))
2034                 RETURN(PTR_ERR(env));
2035
2036         args = ll_env_args(env, IO_SPLICE);
2037         args->u.splice.via_pipe = pipe;
2038         args->u.splice.via_flags = flags;
2039
2040         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2041         cl_env_put(env, &refcheck);
2042
2043         if (result > 0)
2044                 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2045                                   LUSTRE_FPRIVATE(in_file), *ppos, result,
2046                                   READ);
2047         RETURN(result);
2048 }
2049
2050 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2051                              __u64 flags, struct lov_user_md *lum, int lum_size)
2052 {
2053         struct lookup_intent oit = {
2054                 .it_op = IT_OPEN,
2055                 .it_flags = flags | MDS_OPEN_BY_FID,
2056         };
2057         int rc;
2058         ENTRY;
2059
2060         if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2061             le32_to_cpu(LOV_MAGIC_MAGIC)) {
2062                 /* this code will only exist for big-endian systems */
2063                 lustre_swab_lov_user_md(lum, 0);
2064         }
2065
2066         ll_inode_size_lock(inode);
2067         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2068         if (rc < 0)
2069                 GOTO(out_unlock, rc);
2070
2071         ll_release_openhandle(dentry, &oit);
2072
2073 out_unlock:
2074         ll_inode_size_unlock(inode);
2075         ll_intent_release(&oit);
2076
2077         RETURN(rc);
2078 }
2079
/*
 * Fetch the layout EA of \a filename (looked up under \a inode) with
 * md_getattr_name() and return it converted for the host.
 *
 * \param inode    - inode passed to ll_prep_md_op_data(); its superblock
 *                   info supplies the MD export
 * \param filename - name to look up
 * \param lmmp     - out: pointer to the layout inside the reply, or NULL
 * \param lmm_size - out: EA size; the value from ll_get_default_mdsize()
 *                   if md_getattr_name() failed, mbo_eadatasize otherwise
 * \param request  - out: the request returned by md_getattr_name()
 *
 * The three output parameters are only written on the paths that reach the
 * "out" label; the two early RETURNs (ll_get_default_mdsize() or
 * ll_prep_md_op_data() failure) leave them untouched.  On -EPROTO *lmmp is
 * still set to the unrecognised layout.
 * NOTE(review): *lmmp points into the reply of *request, so presumably the
 * caller must release *request when done - confirm against callers.
 *
 * \retval 0        success
 * \retval -ENODATA the reply carries no layout EA
 * \retval -EPROTO  the layout magic is not V1, V3, COMP_V1 or FOREIGN
 * \retval <0       other error from the helpers called
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body  *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct md_op_data *op_data;
        int rc, lmmsize;

        /* start from the default EA size for the request buffer */
        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc)
                RETURN(rc);

        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
                                     strlen(filename), lmmsize,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        /* ask for the layout EA of either a file or a directory */
        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
        ll_finish_md_op_data(op_data);
        if (rc < 0) {
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);
                GOTO(out, rc);
        }

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */

        /* from here on lmmsize is the actual EA size from the reply */
        lmmsize = body->mbo_eadatasize;

        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
                        lmmsize == 0) {
                GOTO(out, rc = -ENODATA);
        }

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);

        /* only these four layout magics (in little-endian form) are accepted */
        if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
                GOTO(out, rc = -EPROTO);

        /*
         * This is coming from the MDS, so is probably in
         * little endian.  We convert it to host endian before
         * passing it to userspace.
         */
        if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
            __swab32(LOV_MAGIC_MAGIC)) {
                int stripe_count = 0;

                /* read the stripe count before the header is swabbed; a
                 * released layout is treated as having no objects to swab
                 */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
                    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
                        if (le32_to_cpu(lmm->lmm_pattern) &
                            LOV_PATTERN_F_RELEASED)
                                stripe_count = 0;
                }

                lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);

                /* if function called for directory - we should
                 * avoid swab not existent lsm objects */
                if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
                        lustre_swab_lov_user_md_objects(
                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                                stripe_count);
                else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
                         S_ISREG(body->mbo_mode))
                        lustre_swab_lov_user_md_objects(
                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                                stripe_count);
        }

out:
        /* outputs are filled in on success and on failure alike */
        *lmmp = lmm;
        *lmm_size = lmmsize;
        *request = req;
        return rc;
}
2167
2168 static int ll_lov_setea(struct inode *inode, struct file *file,
2169                         void __user *arg)
2170 {
2171         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2172         struct lov_user_md      *lump;
2173         int                      lum_size = sizeof(struct lov_user_md) +
2174                                             sizeof(struct lov_user_ost_data);
2175         int                      rc;
2176         ENTRY;
2177
2178         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2179                 RETURN(-EPERM);
2180
2181         OBD_ALLOC_LARGE(lump, lum_size);
2182         if (lump == NULL)
2183                 RETURN(-ENOMEM);
2184
2185         if (copy_from_user(lump, arg, lum_size))
2186                 GOTO(out_lump, rc = -EFAULT);
2187
2188         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2189                                       lum_size);
2190         cl_lov_delay_create_clear(&file->f_flags);
2191
2192 out_lump:
2193         OBD_FREE_LARGE(lump, lum_size);
2194         RETURN(rc);
2195 }
2196
2197 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2198 {
2199         struct lu_env   *env;
2200         __u16           refcheck;
2201         int             rc;
2202         ENTRY;
2203
2204         env = cl_env_get(&refcheck);
2205         if (IS_ERR(env))
2206                 RETURN(PTR_ERR(env));
2207
2208         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2209         cl_env_put(env, &refcheck);
2210         RETURN(rc);
2211 }
2212
2213 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2214                             void __user *arg)
2215 {
2216         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2217         struct lov_user_md        *klum;
2218         int                        lum_size, rc;
2219         __u64                      flags = FMODE_WRITE;
2220         ENTRY;
2221
2222         rc = ll_copy_user_md(lum, &klum);
2223         if (rc < 0)
2224                 RETURN(rc);
2225
2226         lum_size = rc;
2227         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2228                                       lum_size);
2229         if (!rc) {
2230                 __u32 gen;
2231
2232                 rc = put_user(0, &lum->lmm_stripe_count);
2233                 if (rc)
2234                         GOTO(out, rc);
2235
2236                 rc = ll_layout_refresh(inode, &gen);
2237                 if (rc)
2238                         GOTO(out, rc);
2239
2240                 rc = ll_file_getstripe(inode, arg, lum_size);
2241         }
2242         cl_lov_delay_create_clear(&file->f_flags);
2243
2244 out:
2245         OBD_FREE_LARGE(klum, lum_size);
2246         RETURN(rc);
2247 }
2248
2249
/*
 * Take a group lock with group id \a arg on behalf of \a file.
 *
 * \param inode - inode to lock
 * \param file  - open file that will own the lock; O_NONBLOCK in f_flags
 *                selects the non-blocking behaviour described below
 * \param arg   - group id, must not be 0
 *
 * \retval 0           lock obtained and recorded in the file data
 * \retval -EINVAL     \a arg is 0, or this file already holds a group lock
 * \retval -EOPNOTSUPP ll_file_nolock() is true for \a file
 * \retval -EAGAIN     O_NONBLOCK is set and either lli_group_mutex is
 *                     contended or a different group id is in use
 * \retval <0          other error from the helpers called
 */
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_grouplock grouplock;
        int rc;
        ENTRY;

        if (arg == 0) {
                CWARN("group id for group lock must not be 0\n");
                RETURN(-EINVAL);
        }

        if (ll_file_nolock(file))
                RETURN(-EOPNOTSUPP);
retry:
        /* lli_group_mutex is held from here until "out" (or until it is
         * dropped to wait below)
         */
        if (file->f_flags & O_NONBLOCK) {
                if (!mutex_trylock(&lli->lli_group_mutex))
                        RETURN(-EAGAIN);
        } else
                mutex_lock(&lli->lli_group_mutex);

        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
                CWARN("group lock already existed with gid %lu\n",
                      fd->fd_grouplock.lg_gid);
                GOTO(out, rc = -EINVAL);
        }
        /* a different group id is in use on this inode: fail for
         * non-blocking files, otherwise drop the mutex, wait until there
         * are no group lock users left and start over
         */
        if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
                if (file->f_flags & O_NONBLOCK)
                        GOTO(out, rc = -EAGAIN);
                mutex_unlock(&lli->lli_group_mutex);
                wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
                GOTO(retry, rc = 0);
        }
        LASSERT(fd->fd_grouplock.lg_lock == NULL);

        /**
         * XXX: group lock needs to protect all OST objects while PFL
         * can add new OST objects during the IO, so we'd instantiate
         * all OST objects before getting its group lock.
         */
        if (obj) {
                struct lu_env *env;
                __u16 refcheck;
                struct cl_layout cl = {
                        .cl_is_composite = false,
                };
                /* whole-file extent for the write intent */
                struct lu_extent ext = {
                        .e_start = 0,
                        .e_end = OBD_OBJECT_EOF,
                };

                env = cl_env_get(&refcheck);
                if (IS_ERR(env))
                        GOTO(out, rc = PTR_ERR(env));

                /* the write intent is only sent for composite layouts */
                rc = cl_object_layout_get(env, obj, &cl);
                if (!rc && cl.cl_is_composite)
                        rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
                                                    &ext);

                cl_env_put(env, &refcheck);
                if (rc)
                        GOTO(out, rc);
        }

        rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
                              arg, (file->f_flags & O_NONBLOCK), &grouplock);

        if (rc)
                GOTO(out, rc);

        /* record the lock in the file data; the first user on the inode
         * also records the group id
         */
        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
        fd->fd_grouplock = grouplock;
        if (lli->lli_group_users == 0)
                lli->lli_group_gid = grouplock.lg_gid;
        lli->lli_group_users++;

        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
out:
        mutex_unlock(&lli->lli_group_mutex);

        RETURN(rc);
}
2336
2337 static int ll_put_grouplock(struct inode *inode, struct file *file,
2338                             unsigned long arg)
2339 {
2340         struct ll_inode_info   *lli = ll_i2info(inode);
2341         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2342         struct ll_grouplock     grouplock;
2343         int                     rc;
2344         ENTRY;
2345
2346         mutex_lock(&lli->lli_group_mutex);
2347         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2348                 CWARN("no group lock held\n");
2349                 GOTO(out, rc = -EINVAL);
2350         }
2351
2352         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2353
2354         if (fd->fd_grouplock.lg_gid != arg) {
2355                 CWARN("group lock %lu doesn't match current id %lu\n",
2356                       arg, fd->fd_grouplock.lg_gid);
2357                 GOTO(out, rc = -EINVAL);
2358         }
2359
2360         grouplock = fd->fd_grouplock;
2361         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2362         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2363
2364         cl_put_grouplock(&grouplock);
2365
2366         lli->lli_group_users--;
2367         if (lli->lli_group_users == 0) {
2368                 lli->lli_group_gid = 0;
2369                 wake_up_var(&lli->lli_group_users);
2370         }
2371         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2372         GOTO(out, rc = 0);
2373 out:
2374         mutex_unlock(&lli->lli_group_mutex);
2375
2376         RETURN(rc);
2377 }
2378
2379 /**
2380  * Close inode open handle
2381  *
2382  * \param dentry [in]     dentry which contains the inode
2383  * \param it     [in,out] intent which contains open info and result
2384  *
2385  * \retval 0     success
2386  * \retval <0    failure
2387  */
2388 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2389 {
2390         struct inode *inode = dentry->d_inode;
2391         struct obd_client_handle *och;
2392         int rc;
2393         ENTRY;
2394
2395         LASSERT(inode);
2396
2397         /* Root ? Do nothing. */
2398         if (dentry->d_inode->i_sb->s_root == dentry)
2399                 RETURN(0);
2400
2401         /* No open handle to close? Move away */
2402         if (!it_disposition(it, DISP_OPEN_OPEN))
2403                 RETURN(0);
2404
2405         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2406
2407         OBD_ALLOC(och, sizeof(*och));
2408         if (!och)
2409                 GOTO(out, rc = -ENOMEM);
2410
2411         rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2412         if (rc)
2413                 GOTO(out, rc);
2414
2415         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2416 out:
2417         /* this one is in place of ll_file_open */
2418         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2419                 ptlrpc_req_finished(it->it_request);
2420                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2421         }
2422         RETURN(rc);
2423 }
2424
2425 /**
2426  * Get size for inode for which FIEMAP mapping is requested.
2427  * Make the FIEMAP get_info call and returns the result.
2428  * \param fiemap        kernel buffer to hold extens
2429  * \param num_bytes     kernel buffer size
2430  */
2431 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2432                         size_t num_bytes)
2433 {
2434         struct lu_env                   *env;
2435         __u16                           refcheck;
2436         int                             rc = 0;
2437         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2438         ENTRY;
2439
2440         /* Checks for fiemap flags */
2441         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2442                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2443                 return -EBADR;
2444         }
2445
2446         /* Check for FIEMAP_FLAG_SYNC */
2447         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2448                 rc = filemap_fdatawrite(inode->i_mapping);
2449                 if (rc)
2450                         return rc;
2451         }
2452
2453         env = cl_env_get(&refcheck);
2454         if (IS_ERR(env))
2455                 RETURN(PTR_ERR(env));
2456
2457         if (i_size_read(inode) == 0) {
2458                 rc = ll_glimpse_size(inode);
2459                 if (rc)
2460                         GOTO(out, rc);
2461         }
2462
2463         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2464         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2465         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2466
2467         /* If filesize is 0, then there would be no objects for mapping */
2468         if (fmkey.lfik_oa.o_size == 0) {
2469                 fiemap->fm_mapped_extents = 0;
2470                 GOTO(out, rc = 0);
2471         }
2472
2473         fmkey.lfik_fiemap = *fiemap;
2474
2475         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2476                               &fmkey, fiemap, &num_bytes);
2477 out:
2478         cl_env_put(env, &refcheck);
2479         RETURN(rc);
2480 }
2481
2482 int ll_fid2path(struct inode *inode, void __user *arg)
2483 {
2484         struct obd_export       *exp = ll_i2mdexp(inode);
2485         const struct getinfo_fid2path __user *gfin = arg;
2486         __u32                    pathlen;
2487         struct getinfo_fid2path *gfout;
2488         size_t                   outsize;
2489         int                      rc;
2490
2491         ENTRY;
2492
2493         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2494             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2495                 RETURN(-EPERM);
2496
2497         /* Only need to get the buflen */
2498         if (get_user(pathlen, &gfin->gf_pathlen))
2499                 RETURN(-EFAULT);
2500
2501         if (pathlen > PATH_MAX)
2502                 RETURN(-EINVAL);
2503
2504         outsize = sizeof(*gfout) + pathlen;
2505         OBD_ALLOC(gfout, outsize);
2506         if (gfout == NULL)
2507                 RETURN(-ENOMEM);
2508
2509         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2510                 GOTO(gf_free, rc = -EFAULT);
2511         /* append root FID after gfout to let MDT know the root FID so that it
2512          * can lookup the correct path, this is mainly for fileset.
2513          * old server without fileset mount support will ignore this. */
2514         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2515
2516         /* Call mdc_iocontrol */
2517         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2518         if (rc != 0)
2519                 GOTO(gf_free, rc);
2520
2521         if (copy_to_user(arg, gfout, outsize))
2522                 rc = -EFAULT;
2523
2524 gf_free:
2525         OBD_FREE(gfout, outsize);
2526         RETURN(rc);
2527 }
2528
2529 static int
2530 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2531 {
2532         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2533         struct lu_env *env;
2534         struct cl_io *io;
2535         __u16  refcheck;
2536         int result;
2537
2538         ENTRY;
2539
2540         ioc->idv_version = 0;
2541         ioc->idv_layout_version = UINT_MAX;
2542
2543         /* If no file object initialized, we consider its version is 0. */
2544         if (obj == NULL)
2545                 RETURN(0);
2546
2547         env = cl_env_get(&refcheck);
2548         if (IS_ERR(env))
2549                 RETURN(PTR_ERR(env));
2550
2551         io = vvp_env_thread_io(env);
2552         io->ci_obj = obj;
2553         io->u.ci_data_version.dv_data_version = 0;
2554         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2555         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2556
2557 restart:
2558         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2559                 result = cl_io_loop(env, io);
2560         else
2561                 result = io->ci_result;
2562
2563         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2564         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2565
2566         cl_io_fini(env, io);
2567
2568         if (unlikely(io->ci_need_restart))
2569                 goto restart;
2570
2571         cl_env_put(env, &refcheck);
2572
2573         RETURN(result);
2574 }
2575
2576 /*
2577  * Read the data_version for inode.
2578  *
2579  * This value is computed using stripe object version on OST.
2580  * Version is computed using server side locking.
2581  *
2582  * @param flags if do sync on the OST side;
2583  *              0: no sync
2584  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2585  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2586  */
2587 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2588 {
2589         struct ioc_data_version ioc = { .idv_flags = flags };
2590         int rc;
2591
2592         rc = ll_ioc_data_version(inode, &ioc);
2593         if (!rc)
2594                 *data_version = ioc.idv_version;
2595
2596         return rc;
2597 }
2598
2599 /*
2600  * Trigger a HSM release request for the provided inode.
2601  */
2602 int ll_hsm_release(struct inode *inode)
2603 {
2604         struct lu_env *env;
2605         struct obd_client_handle *och = NULL;
2606         __u64 data_version = 0;
2607         int rc;
2608         __u16 refcheck;
2609         ENTRY;
2610
2611         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2612                ll_i2sbi(inode)->ll_fsname,
2613                PFID(&ll_i2info(inode)->lli_fid));
2614
2615         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2616         if (IS_ERR(och))
2617                 GOTO(out, rc = PTR_ERR(och));
2618
2619         /* Grab latest data_version and [am]time values */
2620         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2621         if (rc != 0)
2622                 GOTO(out, rc);
2623
2624         env = cl_env_get(&refcheck);
2625         if (IS_ERR(env))
2626                 GOTO(out, rc = PTR_ERR(env));
2627
2628         rc = ll_merge_attr(env, inode);
2629         cl_env_put(env, &refcheck);
2630
2631         /* If error happen, we have the wrong size for a file.
2632          * Don't release it.
2633          */
2634         if (rc != 0)
2635                 GOTO(out, rc);
2636
2637         /* Release the file.
2638          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2639          * we still need it to pack l_remote_handle to MDT. */
2640         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2641                                        &data_version);
2642         och = NULL;
2643
2644         EXIT;
2645 out:
2646         if (och != NULL && !IS_ERR(och)) /* close the file */
2647                 ll_lease_close(och, inode, NULL);
2648
2649         return rc;
2650 }
2651
2652 struct ll_swap_stack {
2653         __u64                    dv1;
2654         __u64                    dv2;
2655         struct inode            *inode1;
2656         struct inode            *inode2;
2657         bool                     check_dv1;
2658         bool                     check_dv2;
2659 };
2660
2661 static int ll_swap_layouts(struct file *file1, struct file *file2,
2662                            struct lustre_swap_layouts *lsl)
2663 {
2664         struct mdc_swap_layouts  msl;
2665         struct md_op_data       *op_data;
2666         __u32                    gid;
2667         __u64                    dv;
2668         struct ll_swap_stack    *llss = NULL;
2669         int                      rc;
2670
2671         OBD_ALLOC_PTR(llss);
2672         if (llss == NULL)
2673                 RETURN(-ENOMEM);
2674
2675         llss->inode1 = file_inode(file1);
2676         llss->inode2 = file_inode(file2);
2677
2678         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2679         if (rc < 0)
2680                 GOTO(free, rc);
2681
2682         /* we use 2 bool because it is easier to swap than 2 bits */
2683         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2684                 llss->check_dv1 = true;
2685
2686         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2687                 llss->check_dv2 = true;
2688
2689         /* we cannot use lsl->sl_dvX directly because we may swap them */
2690         llss->dv1 = lsl->sl_dv1;
2691         llss->dv2 = lsl->sl_dv2;
2692
2693         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2694         if (rc == 0) /* same file, done! */
2695                 GOTO(free, rc);
2696
2697         if (rc < 0) { /* sequentialize it */
2698                 swap(llss->inode1, llss->inode2);
2699                 swap(file1, file2);
2700                 swap(llss->dv1, llss->dv2);
2701                 swap(llss->check_dv1, llss->check_dv2);
2702         }
2703
2704         gid = lsl->sl_gid;
2705         if (gid != 0) { /* application asks to flush dirty cache */
2706                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2707                 if (rc < 0)
2708                         GOTO(free, rc);
2709
2710                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2711                 if (rc < 0) {
2712                         ll_put_grouplock(llss->inode1, file1, gid);
2713                         GOTO(free, rc);
2714                 }
2715         }
2716
2717         /* ultimate check, before swaping the layouts we check if
2718          * dataversion has changed (if requested) */
2719         if (llss->check_dv1) {
2720                 rc = ll_data_version(llss->inode1, &dv, 0);
2721                 if (rc)
2722                         GOTO(putgl, rc);
2723                 if (dv != llss->dv1)
2724                         GOTO(putgl, rc = -EAGAIN);
2725         }
2726
2727         if (llss->check_dv2) {
2728                 rc = ll_data_version(llss->inode2, &dv, 0);
2729                 if (rc)
2730                         GOTO(putgl, rc);
2731                 if (dv != llss->dv2)
2732                         GOTO(putgl, rc = -EAGAIN);
2733         }
2734
2735         /* struct md_op_data is used to send the swap args to the mdt
2736          * only flags is missing, so we use struct mdc_swap_layouts
2737          * through the md_op_data->op_data */
2738         /* flags from user space have to be converted before they are send to
2739          * server, no flag is sent today, they are only used on the client */
2740         msl.msl_flags = 0;
2741         rc = -ENOMEM;
2742         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2743                                      0, LUSTRE_OPC_ANY, &msl);
2744         if (IS_ERR(op_data))
2745                 GOTO(free, rc = PTR_ERR(op_data));
2746
2747         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2748                            sizeof(*op_data), op_data, NULL);
2749         ll_finish_md_op_data(op_data);
2750
2751         if (rc < 0)
2752                 GOTO(putgl, rc);
2753
2754 putgl:
2755         if (gid != 0) {
2756                 ll_put_grouplock(llss->inode2, file2, gid);
2757                 ll_put_grouplock(llss->inode1, file1, gid);
2758         }
2759
2760 free:
2761         if (llss != NULL)
2762                 OBD_FREE_PTR(llss);
2763
2764         RETURN(rc);
2765 }
2766
2767 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2768 {
2769         struct obd_export *exp = ll_i2mdexp(inode);
2770         struct md_op_data *op_data;
2771         int rc;
2772         ENTRY;
2773
2774         /* Detect out-of range masks */
2775         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2776                 RETURN(-EINVAL);
2777
2778         /* Non-root users are forbidden to set or clear flags which are
2779          * NOT defined in HSM_USER_MASK. */
2780         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2781             !cfs_capable(CFS_CAP_SYS_ADMIN))
2782                 RETURN(-EPERM);
2783
2784         if (!exp_connect_archive_id_array(exp)) {
2785                 /* Detect out-of range archive id */
2786                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2787                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2788                         RETURN(-EINVAL);
2789         }
2790
2791         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2792                                      LUSTRE_OPC_ANY, hss);
2793         if (IS_ERR(op_data))
2794                 RETURN(PTR_ERR(op_data));
2795
2796         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2797                            op_data, NULL);
2798
2799         ll_finish_md_op_data(op_data);
2800
2801         RETURN(rc);
2802 }
2803
2804 static int ll_hsm_import(struct inode *inode, struct file *file,
2805                          struct hsm_user_import *hui)
2806 {
2807         struct hsm_state_set    *hss = NULL;
2808         struct iattr            *attr = NULL;
2809         int                      rc;
2810         ENTRY;
2811
2812         if (!S_ISREG(inode->i_mode))
2813                 RETURN(-EINVAL);
2814
2815         /* set HSM flags */
2816         OBD_ALLOC_PTR(hss);
2817         if (hss == NULL)
2818                 GOTO(out, rc = -ENOMEM);
2819
2820         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2821         hss->hss_archive_id = hui->hui_archive_id;
2822         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2823         rc = ll_hsm_state_set(inode, hss);
2824         if (rc != 0)
2825                 GOTO(out, rc);
2826
2827         OBD_ALLOC_PTR(attr);
2828         if (attr == NULL)
2829                 GOTO(out, rc = -ENOMEM);
2830
2831         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2832         attr->ia_mode |= S_IFREG;
2833         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2834         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2835         attr->ia_size = hui->hui_size;
2836         attr->ia_mtime.tv_sec = hui->hui_mtime;
2837         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2838         attr->ia_atime.tv_sec = hui->hui_atime;
2839         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2840
2841         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2842                          ATTR_UID | ATTR_GID |
2843                          ATTR_MTIME | ATTR_MTIME_SET |
2844                          ATTR_ATIME | ATTR_ATIME_SET;
2845
2846         inode_lock(inode);
2847
2848         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2849         if (rc == -ENODATA)
2850                 rc = 0;
2851
2852         inode_unlock(inode);
2853
2854 out:
2855         if (hss != NULL)
2856                 OBD_FREE_PTR(hss);
2857
2858         if (attr != NULL)
2859                 OBD_FREE_PTR(attr);
2860
2861         RETURN(rc);
2862 }
2863
2864 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2865 {
2866         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2867                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2868 }
2869
2870 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2871 {
2872         struct inode *inode = file_inode(file);
2873         struct iattr ia = {
2874                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2875                             ATTR_MTIME | ATTR_MTIME_SET |
2876                             ATTR_CTIME,
2877                 .ia_atime = {
2878                         .tv_sec = lfu->lfu_atime_sec,
2879                         .tv_nsec = lfu->lfu_atime_nsec,
2880                 },
2881                 .ia_mtime = {
2882                         .tv_sec = lfu->lfu_mtime_sec,
2883                         .tv_nsec = lfu->lfu_mtime_nsec,
2884                 },
2885                 .ia_ctime = {
2886                         .tv_sec = lfu->lfu_ctime_sec,
2887                         .tv_nsec = lfu->lfu_ctime_nsec,
2888                 },
2889         };
2890         int rc;
2891         ENTRY;
2892
2893         if (!capable(CAP_SYS_ADMIN))
2894                 RETURN(-EPERM);
2895
2896         if (!S_ISREG(inode->i_mode))
2897                 RETURN(-EINVAL);
2898
2899         inode_lock(inode);
2900         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2901                             false);
2902         inode_unlock(inode);
2903
2904         RETURN(rc);
2905 }
2906
2907 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2908 {
2909         switch (mode) {
2910         case MODE_READ_USER:
2911                 return CLM_READ;
2912         case MODE_WRITE_USER:
2913                 return CLM_WRITE;
2914         default:
2915                 return -EINVAL;
2916         }
2917 }
2918
2919 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2920
2921 /* Used to allow the upper layers of the client to request an LDLM lock
2922  * without doing an actual read or write.
2923  *
2924  * Used for ladvise lockahead to manually request specific locks.
2925  *
2926  * \param[in] file      file this ladvise lock request is on
2927  * \param[in] ladvise   ladvise struct describing this lock request
2928  *
2929  * \retval 0            success, no detailed result available (sync requests
2930  *                      and requests sent to the server [not handled locally]
2931  *                      cannot return detailed results)
2932  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2933  *                                       see definitions for details.
2934  * \retval negative     negative errno on error
2935  */
2936 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2937 {
2938         struct lu_env *env = NULL;
2939         struct cl_io *io  = NULL;
2940         struct cl_lock *lock = NULL;
2941         struct cl_lock_descr *descr = NULL;
2942         struct dentry *dentry = file->f_path.dentry;
2943         struct inode *inode = dentry->d_inode;
2944         enum cl_lock_mode cl_mode;
2945         off_t start = ladvise->lla_start;
2946         off_t end = ladvise->lla_end;
2947         int result;
2948         __u16 refcheck;
2949
2950         ENTRY;
2951
2952         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2953                "start=%llu, end=%llu\n", dentry->d_name.len,
2954                dentry->d_name.name, dentry->d_inode,
2955                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2956                (__u64) end);
2957
2958         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2959         if (cl_mode < 0)
2960                 GOTO(out, result = cl_mode);
2961
2962         /* Get IO environment */
2963         result = cl_io_get(inode, &env, &io, &refcheck);
2964         if (result <= 0)
2965                 GOTO(out, result);
2966
2967         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2968         if (result > 0) {
2969                 /*
2970                  * nothing to do for this io. This currently happens when
2971                  * stripe sub-object's are not yet created.
2972                  */
2973                 result = io->ci_result;
2974         } else if (result == 0) {
2975                 lock = vvp_env_lock(env);
2976                 descr = &lock->cll_descr;
2977
2978                 descr->cld_obj   = io->ci_obj;
2979                 /* Convert byte offsets to pages */
2980                 descr->cld_start = cl_index(io->ci_obj, start);
2981                 descr->cld_end   = cl_index(io->ci_obj, end);
2982                 descr->cld_mode  = cl_mode;
2983                 /* CEF_MUST is used because we do not want to convert a
2984                  * lockahead request to a lockless lock */
2985                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2986                                        CEF_NONBLOCK;
2987
2988                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2989                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2990
2991                 result = cl_lock_request(env, io, lock);
2992
2993                 /* On success, we need to release the lock */
2994                 if (result >= 0)
2995                         cl_lock_release(env, lock);
2996         }
2997         cl_io_fini(env, io);
2998         cl_env_put(env, &refcheck);
2999
3000         /* -ECANCELED indicates a matching lock with a different extent
3001          * was already present, and -EEXIST indicates a matching lock
3002          * on exactly the same extent was already present.
3003          * We convert them to positive values for userspace to make
3004          * recognizing true errors easier.
3005          * Note we can only return these detailed results on async requests,
3006          * as sync requests look the same as i/o requests for locking. */
3007         if (result == -ECANCELED)
3008                 result = LLA_RESULT_DIFFERENT;
3009         else if (result == -EEXIST)
3010                 result = LLA_RESULT_SAME;
3011
3012 out:
3013         RETURN(result);
3014 }
3015 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
3016
3017 static int ll_ladvise_sanity(struct inode *inode,
3018                              struct llapi_lu_ladvise *ladvise)
3019 {
3020         struct ll_sb_info *sbi = ll_i2sbi(inode);
3021         enum lu_ladvise_type advice = ladvise->lla_advice;
3022         /* Note the peradvice flags is a 32 bit field, so per advice flags must
3023          * be in the first 32 bits of enum ladvise_flags */
3024         __u32 flags = ladvise->lla_peradvice_flags;
3025         /* 3 lines at 80 characters per line, should be plenty */
3026         int rc = 0;
3027
3028         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3029                 rc = -EINVAL;
3030                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3031                        "last supported advice is %s (value '%d'): rc = %d\n",
3032                        sbi->ll_fsname, advice,
3033                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3034                 GOTO(out, rc);
3035         }
3036
3037         /* Per-advice checks */
3038         switch (advice) {
3039         case LU_LADVISE_LOCKNOEXPAND:
3040                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3041                         rc = -EINVAL;
3042                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3043                                "rc = %d\n", sbi->ll_fsname, flags,
3044                                ladvise_names[advice], rc);
3045                         GOTO(out, rc);
3046                 }
3047                 break;
3048         case LU_LADVISE_LOCKAHEAD:
3049                 /* Currently only READ and WRITE modes can be requested */
3050                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3051                     ladvise->lla_lockahead_mode == 0) {
3052                         rc = -EINVAL;
3053                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3054                                "rc = %d\n", sbi->ll_fsname,
3055                                ladvise->lla_lockahead_mode,
3056                                ladvise_names[advice], rc);
3057                         GOTO(out, rc);
3058                 }
3059                 /* fallthrough */
3060         case LU_LADVISE_WILLREAD:
3061         case LU_LADVISE_DONTNEED:
3062         default:
3063                 /* Note fall through above - These checks apply to all advices
3064                  * except LOCKNOEXPAND */
3065                 if (flags & ~LF_DEFAULT_MASK) {
3066                         rc = -EINVAL;
3067                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3068                                "rc = %d\n", sbi->ll_fsname, flags,
3069                                ladvise_names[advice], rc);
3070                         GOTO(out, rc);
3071                 }
3072                 if (ladvise->lla_start >= ladvise->lla_end) {
3073                         rc = -EINVAL;
3074                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3075                                "for %s: rc = %d\n", sbi->ll_fsname,
3076                                ladvise->lla_start, ladvise->lla_end,
3077                                ladvise_names[advice], rc);
3078                         GOTO(out, rc);
3079                 }
3080                 break;
3081         }
3082
3083 out:
3084         return rc;
3085 }
3086 #undef ERRSIZE
3087
3088 /*
3089  * Give file access advices
3090  *
3091  * The ladvise interface is similar to Linux fadvise() system call, except it
3092  * forwards the advices directly from Lustre client to server. The server side
3093  * codes will apply appropriate read-ahead and caching techniques for the
3094  * corresponding files.
3095  *
3096  * A typical workload for ladvise is e.g. a bunch of different clients are
3097  * doing small random reads of a file, so prefetching pages into OSS cache
3098  * with big linear reads before the random IO is a net benefit. Fetching
3099  * all that data into each client cache with fadvise() may not be, due to
3100  * much more data being sent to the client.
3101  */
3102 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3103                       struct llapi_lu_ladvise *ladvise)
3104 {
3105         struct lu_env *env;
3106         struct cl_io *io;
3107         struct cl_ladvise_io *lio;
3108         int rc;
3109         __u16 refcheck;
3110         ENTRY;
3111
3112         env = cl_env_get(&refcheck);
3113         if (IS_ERR(env))
3114                 RETURN(PTR_ERR(env));
3115
3116         io = vvp_env_thread_io(env);
3117         io->ci_obj = ll_i2info(inode)->lli_clob;
3118
3119         /* initialize parameters for ladvise */
3120         lio = &io->u.ci_ladvise;
3121         lio->li_start = ladvise->lla_start;
3122         lio->li_end = ladvise->lla_end;
3123         lio->li_fid = ll_inode2fid(inode);
3124         lio->li_advice = ladvise->lla_advice;
3125         lio->li_flags = flags;
3126
3127         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3128                 rc = cl_io_loop(env, io);
3129         else
3130                 rc = io->ci_result;
3131
3132         cl_io_fini(env, io);
3133         cl_env_put(env, &refcheck);
3134         RETURN(rc);
3135 }
3136
3137 static int ll_lock_noexpand(struct file *file, int flags)
3138 {
3139         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3140
3141         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3142
3143         return 0;
3144 }
3145
3146 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3147                         unsigned long arg)
3148 {
3149         struct fsxattr fsxattr;
3150
3151         if (copy_from_user(&fsxattr,
3152                            (const struct fsxattr __user *)arg,
3153                            sizeof(fsxattr)))
3154                 RETURN(-EFAULT);
3155
3156         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3157         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3158                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3159         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3160         if (copy_to_user((struct fsxattr __user *)arg,
3161                          &fsxattr, sizeof(fsxattr)))
3162                 RETURN(-EFAULT);
3163
3164         RETURN(0);
3165 }
3166
3167 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3168 {
3169         /*
3170          * Project Quota ID state is only allowed to change from within the init
3171          * namespace. Enforce that restriction only if we are trying to change
3172          * the quota ID state. Everything else is allowed in user namespaces.
3173          */
3174         if (current_user_ns() == &init_user_ns)
3175                 return 0;
3176
3177         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3178                 return -EINVAL;
3179
3180         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3181                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3182                         return -EINVAL;
3183         } else {
3184                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3185                         return -EINVAL;
3186         }
3187
3188         return 0;
3189 }
3190
3191 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3192                         unsigned long arg)
3193 {
3194
3195         struct md_op_data *op_data;
3196         struct ptlrpc_request *req = NULL;
3197         int rc = 0;
3198         struct fsxattr fsxattr;
3199         struct cl_object *obj;
3200         struct iattr *attr;
3201         int flags;
3202
3203         if (copy_from_user(&fsxattr,
3204                            (const struct fsxattr __user *)arg,
3205                            sizeof(fsxattr)))
3206                 RETURN(-EFAULT);
3207
3208         rc = ll_ioctl_check_project(inode, &fsxattr);
3209         if (rc)
3210                 RETURN(rc);
3211
3212         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3213                                      LUSTRE_OPC_ANY, NULL);
3214         if (IS_ERR(op_data))
3215                 RETURN(PTR_ERR(op_data));
3216
3217         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3218         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3219         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3220                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3221         op_data->op_projid = fsxattr.fsx_projid;
3222         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3223         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3224                         0, &req);
3225         ptlrpc_req_finished(req);
3226         if (rc)
3227                 GOTO(out_fsxattr, rc);
3228         ll_update_inode_flags(inode, op_data->op_attr_flags);
3229         obj = ll_i2info(inode)->lli_clob;
3230         if (obj == NULL)
3231                 GOTO(out_fsxattr, rc);
3232
3233         OBD_ALLOC_PTR(attr);
3234         if (attr == NULL)
3235                 GOTO(out_fsxattr, rc = -ENOMEM);
3236
3237         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3238                             fsxattr.fsx_xflags);
3239         OBD_FREE_PTR(attr);
3240 out_fsxattr:
3241         ll_finish_md_op_data(op_data);
3242         RETURN(rc);
3243 }
3244
3245 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3246                                  unsigned long arg)
3247 {
3248         struct inode            *inode = file_inode(file);
3249         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3250         struct ll_inode_info    *lli = ll_i2info(inode);
3251         struct obd_client_handle *och = NULL;
3252         struct split_param sp;
3253         struct pcc_param param;
3254         bool lease_broken = false;
3255         fmode_t fmode = 0;
3256         enum mds_op_bias bias = 0;
3257         struct file *layout_file = NULL;
3258         void *data = NULL;
3259         size_t data_size = 0;
3260         bool attached = false;
3261         long rc, rc2 = 0;
3262
3263         ENTRY;
3264
3265         mutex_lock(&lli->lli_och_mutex);
3266         if (fd->fd_lease_och != NULL) {
3267                 och = fd->fd_lease_och;
3268                 fd->fd_lease_och = NULL;
3269         }
3270         mutex_unlock(&lli->lli_och_mutex);
3271
3272         if (och == NULL)
3273                 RETURN(-ENOLCK);
3274
3275         fmode = och->och_flags;
3276
3277         switch (ioc->lil_flags) {
3278         case LL_LEASE_RESYNC_DONE:
3279                 if (ioc->lil_count > IOC_IDS_MAX)
3280                         GOTO(out_lease_close, rc = -EINVAL);
3281
3282                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3283                 OBD_ALLOC(data, data_size);
3284                 if (!data)
3285                         GOTO(out_lease_close, rc = -ENOMEM);
3286
3287                 if (copy_from_user(data, (void __user *)arg, data_size))
3288                         GOTO(out_lease_close, rc = -EFAULT);
3289
3290                 bias = MDS_CLOSE_RESYNC_DONE;
3291                 break;
3292         case LL_LEASE_LAYOUT_MERGE: {
3293                 int fd;
3294
3295                 if (ioc->lil_count != 1)
3296                         GOTO(out_lease_close, rc = -EINVAL);
3297
3298                 arg += sizeof(*ioc);
3299                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3300                         GOTO(out_lease_close, rc = -EFAULT);
3301
3302                 layout_file = fget(fd);
3303                 if (!layout_file)
3304                         GOTO(out_lease_close, rc = -EBADF);
3305
3306                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3307                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3308                         GOTO(out_lease_close, rc = -EPERM);
3309
3310                 data = file_inode(layout_file);
3311                 bias = MDS_CLOSE_LAYOUT_MERGE;
3312                 break;
3313         }
3314         case LL_LEASE_LAYOUT_SPLIT: {
3315                 int fdv;
3316                 int mirror_id;
3317
3318                 if (ioc->lil_count != 2)
3319                         GOTO(out_lease_close, rc = -EINVAL);
3320
3321                 arg += sizeof(*ioc);
3322                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3323                         GOTO(out_lease_close, rc = -EFAULT);
3324
3325                 arg += sizeof(__u32);
3326                 if (copy_from_user(&mirror_id, (void __user *)arg,
3327                                    sizeof(__u32)))
3328                         GOTO(out_lease_close, rc = -EFAULT);
3329
3330                 layout_file = fget(fdv);
3331                 if (!layout_file)
3332                         GOTO(out_lease_close, rc = -EBADF);
3333
3334                 sp.sp_inode = file_inode(layout_file);
3335                 sp.sp_mirror_id = (__u16)mirror_id;
3336                 data = &sp;
3337                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3338                 break;
3339         }
3340         case LL_LEASE_PCC_ATTACH:
3341                 if (ioc->lil_count != 1)
3342                         RETURN(-EINVAL);
3343
3344                 arg += sizeof(*ioc);
3345                 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3346                                    sizeof(__u32)))
3347                         GOTO(out_lease_close, rc2 = -EFAULT);
3348
3349                 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3350                 if (rc2)
3351                         GOTO(out_lease_close, rc2);
3352
3353                 attached = true;
3354                 /* Grab latest data version */
3355                 rc2 = ll_data_version(inode, &param.pa_data_version,
3356                                      LL_DV_WR_FLUSH);
3357                 if (rc2)
3358                         GOTO(out_lease_close, rc2);
3359
3360                 data = &param;
3361                 bias = MDS_PCC_ATTACH;
3362                 break;
3363         default:
3364                 /* without close intent */
3365                 break;
3366         }
3367
3368 out_lease_close:
3369         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3370         if (rc < 0)
3371                 GOTO(out, rc);
3372
3373         rc = ll_lease_och_release(inode, file);
3374         if (rc < 0)
3375                 GOTO(out, rc);
3376
3377         if (lease_broken)
3378                 fmode = 0;
3379         EXIT;
3380
3381 out:
3382         switch (ioc->lil_flags) {
3383         case LL_LEASE_RESYNC_DONE:
3384                 if (data)
3385                         OBD_FREE(data, data_size);
3386                 break;
3387         case LL_LEASE_LAYOUT_MERGE:
3388         case LL_LEASE_LAYOUT_SPLIT:
3389                 if (layout_file)
3390                         fput(layout_file);
3391                 break;
3392         case LL_LEASE_PCC_ATTACH:
3393                 if (!rc)
3394                         rc = rc2;
3395                 rc = pcc_readwrite_attach_fini(file, inode,
3396                                                param.pa_layout_gen,
3397                                                lease_broken, rc,
3398                                                attached);
3399                 break;
3400         }
3401
3402         if (!rc)
3403                 rc = ll_lease_type_from_fmode(fmode);
3404         RETURN(rc);
3405 }
3406
3407 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3408                               unsigned long arg)
3409 {
3410         struct inode *inode = file_inode(file);
3411         struct ll_inode_info *lli = ll_i2info(inode);
3412         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3413         struct obd_client_handle *och = NULL;
3414         __u64 open_flags = 0;
3415         bool lease_broken;
3416         fmode_t fmode;
3417         long rc;
3418         ENTRY;
3419
3420         switch (ioc->lil_mode) {
3421         case LL_LEASE_WRLCK:
3422                 if (!(file->f_mode & FMODE_WRITE))
3423                         RETURN(-EPERM);
3424                 fmode = FMODE_WRITE;
3425                 break;
3426         case LL_LEASE_RDLCK:
3427                 if (!(file->f_mode & FMODE_READ))
3428                         RETURN(-EPERM);
3429                 fmode = FMODE_READ;
3430                 break;
3431         case LL_LEASE_UNLCK:
3432                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3433         default:
3434                 RETURN(-EINVAL);
3435         }
3436
3437         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3438
3439         /* apply for lease */
3440         if (ioc->lil_flags & LL_LEASE_RESYNC)
3441                 open_flags = MDS_OPEN_RESYNC;
3442         och = ll_lease_open(inode, file, fmode, open_flags);
3443         if (IS_ERR(och))
3444                 RETURN(PTR_ERR(och));
3445
3446         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3447                 rc = ll_lease_file_resync(och, inode, arg);
3448                 if (rc) {
3449                         ll_lease_close(och, inode, NULL);
3450                         RETURN(rc);
3451                 }
3452                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3453                 if (rc) {
3454                         ll_lease_close(och, inode, NULL);
3455                         RETURN(rc);
3456                 }
3457         }
3458
3459         rc = 0;
3460         mutex_lock(&lli->lli_och_mutex);
3461         if (fd->fd_lease_och == NULL) {
3462                 fd->fd_lease_och = och;
3463                 och = NULL;
3464         }
3465         mutex_unlock(&lli->lli_och_mutex);
3466         if (och != NULL) {
3467                 /* impossible now that only excl is supported for now */
3468                 ll_lease_close(och, inode, &lease_broken);
3469                 rc = -EBUSY;
3470         }
3471         RETURN(rc);
3472 }
3473
3474 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3475 {
3476         struct ll_inode_info *lli = ll_i2info(inode);
3477         struct ll_sb_info *sbi = ll_i2sbi(inode);
3478         __u64 now = ktime_get_real_seconds();
3479         int i;
3480
3481         spin_lock(&lli->lli_heat_lock);
3482         heat->lh_flags = lli->lli_heat_flags;
3483         for (i = 0; i < heat->lh_count; i++)
3484                 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3485                                                 now, sbi->ll_heat_decay_weight,
3486                                                 sbi->ll_heat_period_second);
3487         spin_unlock(&lli->lli_heat_lock);
3488 }
3489
3490 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3491 {
3492         struct ll_inode_info *lli = ll_i2info(inode);
3493         int rc = 0;
3494
3495         spin_lock(&lli->lli_heat_lock);
3496         if (flags & LU_HEAT_FLAG_CLEAR)
3497                 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3498
3499         if (flags & LU_HEAT_FLAG_OFF)
3500                 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3501         else
3502                 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3503
3504         spin_unlock(&lli->lli_heat_lock);
3505
3506         RETURN(rc);
3507 }
3508
3509 static long
3510 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3511 {
3512         struct inode            *inode = file_inode(file);
3513         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3514         int                      flags, rc;
3515         ENTRY;
3516
3517         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3518                PFID(ll_inode2fid(inode)), inode, cmd);
3519         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3520
3521         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3522         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3523                 RETURN(-ENOTTY);
3524
3525         switch (cmd) {
3526         case LL_IOC_GETFLAGS:
3527                 /* Get the current value of the file flags */
3528                 return put_user(fd->fd_flags, (int __user *)arg);
3529         case LL_IOC_SETFLAGS:
3530         case LL_IOC_CLRFLAGS:
3531                 /* Set or clear specific file flags */
3532                 /* XXX This probably needs checks to ensure the flags are
3533                  *     not abused, and to handle any flag side effects.
3534                  */
3535                 if (get_user(flags, (int __user *) arg))
3536                         RETURN(-EFAULT);
3537
3538                 if (cmd == LL_IOC_SETFLAGS) {
3539                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3540                             !(file->f_flags & O_DIRECT)) {
3541                                 CERROR("%s: unable to disable locking on "
3542                                        "non-O_DIRECT file\n", current->comm);
3543                                 RETURN(-EINVAL);
3544                         }
3545
3546                         fd->fd_flags |= flags;
3547                 } else {
3548                         fd->fd_flags &= ~flags;
3549                 }
3550                 RETURN(0);
3551         case LL_IOC_LOV_SETSTRIPE:
3552         case LL_IOC_LOV_SETSTRIPE_NEW:
3553                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3554         case LL_IOC_LOV_SETEA:
3555                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3556         case LL_IOC_LOV_SWAP_LAYOUTS: {
3557                 struct file *file2;
3558                 struct lustre_swap_layouts lsl;
3559
3560                 if (copy_from_user(&lsl, (char __user *)arg,
3561                                    sizeof(struct lustre_swap_layouts)))
3562                         RETURN(-EFAULT);
3563
3564                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3565                         RETURN(-EPERM);
3566
3567                 file2 = fget(lsl.sl_fd);
3568                 if (file2 == NULL)
3569                         RETURN(-EBADF);
3570
3571                 /* O_WRONLY or O_RDWR */
3572                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3573                         GOTO(out, rc = -EPERM);
3574
3575                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3576                         struct inode                    *inode2;
3577                         struct ll_inode_info            *lli;
3578                         struct obd_client_handle        *och = NULL;
3579
3580                         lli = ll_i2info(inode);
3581                         mutex_lock(&lli->lli_och_mutex);
3582                         if (fd->fd_lease_och != NULL) {
3583                                 och = fd->fd_lease_och;
3584                                 fd->fd_lease_och = NULL;
3585                         }
3586                         mutex_unlock(&lli->lli_och_mutex);
3587                         if (och == NULL)
3588                                 GOTO(out, rc = -ENOLCK);
3589                         inode2 = file_inode(file2);
3590                         rc = ll_swap_layouts_close(och, inode, inode2);
3591                 } else {
3592                         rc = ll_swap_layouts(file, file2, &lsl);
3593                 }
3594 out:
3595                 fput(file2);
3596                 RETURN(rc);
3597         }
3598         case LL_IOC_LOV_GETSTRIPE:
3599         case LL_IOC_LOV_GETSTRIPE_NEW:
3600                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3601         case FS_IOC_GETFLAGS:
3602         case FS_IOC_SETFLAGS:
3603                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3604         case FSFILT_IOC_GETVERSION:
3605         case FS_IOC_GETVERSION:
3606                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3607         /* We need to special case any other ioctls we want to handle,
3608          * to send them to the MDS/OST as appropriate and to properly
3609          * network encode the arg field. */
3610         case FS_IOC_SETVERSION:
3611                 RETURN(-ENOTSUPP);
3612
3613         case LL_IOC_GROUP_LOCK:
3614                 RETURN(ll_get_grouplock(inode, file, arg));
3615         case LL_IOC_GROUP_UNLOCK:
3616                 RETURN(ll_put_grouplock(inode, file, arg));
3617         case IOC_OBD_STATFS:
3618                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3619
3620         case LL_IOC_FLUSHCTX:
3621                 RETURN(ll_flush_ctx(inode));
3622         case LL_IOC_PATH2FID: {
3623                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3624                                  sizeof(struct lu_fid)))
3625                         RETURN(-EFAULT);
3626
3627                 RETURN(0);
3628         }
3629         case LL_IOC_GETPARENT:
3630                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3631
3632         case OBD_IOC_FID2PATH:
3633                 RETURN(ll_fid2path(inode, (void __user *)arg));
3634         case LL_IOC_DATA_VERSION: {
3635                 struct ioc_data_version idv;
3636                 int rc;
3637
3638                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3639                         RETURN(-EFAULT);
3640
3641                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3642                 rc = ll_ioc_data_version(inode, &idv);
3643
3644                 if (rc == 0 &&
3645                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3646                         RETURN(-EFAULT);
3647
3648                 RETURN(rc);
3649         }
3650
3651         case LL_IOC_GET_MDTIDX: {
3652                 int mdtidx;
3653
3654                 mdtidx = ll_get_mdt_idx(inode);
3655                 if (mdtidx < 0)
3656                         RETURN(mdtidx);
3657
3658                 if (put_user((int)mdtidx, (int __user *)arg))
3659                         RETURN(-EFAULT);
3660
3661                 RETURN(0);
3662         }
3663         case OBD_IOC_GETDTNAME:
3664         case OBD_IOC_GETMDNAME:
3665                 RETURN(ll_get_obd_name(inode, cmd, arg));
3666         case LL_IOC_HSM_STATE_GET: {
3667                 struct md_op_data       *op_data;
3668                 struct hsm_user_state   *hus;
3669                 int                      rc;
3670
3671                 OBD_ALLOC_PTR(hus);
3672                 if (hus == NULL)
3673                         RETURN(-ENOMEM);
3674
3675                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3676                                              LUSTRE_OPC_ANY, hus);
3677                 if (IS_ERR(op_data)) {
3678                         OBD_FREE_PTR(hus);
3679                         RETURN(PTR_ERR(op_data));
3680                 }
3681
3682                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3683                                    op_data, NULL);
3684
3685                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3686                         rc = -EFAULT;
3687
3688                 ll_finish_md_op_data(op_data);
3689                 OBD_FREE_PTR(hus);
3690                 RETURN(rc);
3691         }
3692         case LL_IOC_HSM_STATE_SET: {
3693                 struct hsm_state_set    *hss;
3694                 int                      rc;
3695
3696                 OBD_ALLOC_PTR(hss);
3697                 if (hss == NULL)
3698                         RETURN(-ENOMEM);
3699
3700                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3701                         OBD_FREE_PTR(hss);
3702                         RETURN(-EFAULT);
3703                 }
3704
3705                 rc = ll_hsm_state_set(inode, hss);
3706
3707                 OBD_FREE_PTR(hss);
3708                 RETURN(rc);
3709         }
3710         case LL_IOC_HSM_ACTION: {
3711                 struct md_op_data               *op_data;
3712                 struct hsm_current_action       *hca;
3713                 int                              rc;
3714
3715                 OBD_ALLOC_PTR(hca);
3716                 if (hca == NULL)
3717                         RETURN(-ENOMEM);
3718
3719                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3720                                              LUSTRE_OPC_ANY, hca);
3721                 if (IS_ERR(op_data)) {
3722                         OBD_FREE_PTR(hca);
3723                         RETURN(PTR_ERR(op_data));
3724                 }
3725
3726                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3727                                    op_data, NULL);
3728
3729                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3730                         rc = -EFAULT;
3731
3732                 ll_finish_md_op_data(op_data);
3733                 OBD_FREE_PTR(hca);
3734                 RETURN(rc);
3735         }
3736         case LL_IOC_SET_LEASE_OLD: {
3737                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3738
3739                 RETURN(ll_file_set_lease(file, &ioc, 0));
3740         }
3741         case LL_IOC_SET_LEASE: {
3742                 struct ll_ioc_lease ioc;
3743
3744                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3745                         RETURN(-EFAULT);
3746
3747                 RETURN(ll_file_set_lease(file, &ioc, arg));
3748         }
3749         case LL_IOC_GET_LEASE: {
3750                 struct ll_inode_info *lli = ll_i2info(inode);
3751                 struct ldlm_lock *lock = NULL;
3752                 fmode_t fmode = 0;
3753
3754                 mutex_lock(&lli->lli_och_mutex);
3755                 if (fd->fd_lease_och != NULL) {
3756                         struct obd_client_handle *och = fd->fd_lease_och;
3757
3758                         lock = ldlm_handle2lock(&och->och_lease_handle);
3759                         if (lock != NULL) {
3760                                 lock_res_and_lock(lock);
3761                                 if (!ldlm_is_cancel(lock))
3762                                         fmode = och->och_flags;
3763
3764                                 unlock_res_and_lock(lock);
3765                                 LDLM_LOCK_PUT(lock);
3766                         }
3767                 }
3768                 mutex_unlock(&lli->lli_och_mutex);
3769
3770                 RETURN(ll_lease_type_from_fmode(fmode));
3771         }
3772         case LL_IOC_HSM_IMPORT: {
3773                 struct hsm_user_import *hui;
3774
3775                 OBD_ALLOC_PTR(hui);
3776                 if (hui == NULL)
3777                         RETURN(-ENOMEM);
3778
3779                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3780                         OBD_FREE_PTR(hui);
3781                         RETURN(-EFAULT);
3782                 }
3783
3784                 rc = ll_hsm_import(inode, file, hui);
3785
3786                 OBD_FREE_PTR(hui);
3787                 RETURN(rc);
3788         }
3789         case LL_IOC_FUTIMES_3: {
3790                 struct ll_futimes_3 lfu;
3791
3792                 if (copy_from_user(&lfu,
3793                                    (const struct ll_futimes_3 __user *)arg,
3794                                    sizeof(lfu)))
3795                         RETURN(-EFAULT);
3796
3797                 RETURN(ll_file_futimes_3(file, &lfu));
3798         }
3799         case LL_IOC_LADVISE: {
3800                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3801                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3802                 int i;
3803                 int num_advise;
3804                 int alloc_size = sizeof(*k_ladvise_hdr);
3805
3806                 rc = 0;
3807                 u_ladvise_hdr = (void __user *)arg;
3808                 OBD_ALLOC_PTR(k_ladvise_hdr);
3809                 if (k_ladvise_hdr == NULL)
3810                         RETURN(-ENOMEM);
3811
3812                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3813                         GOTO(out_ladvise, rc = -EFAULT);
3814
3815                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3816                     k_ladvise_hdr->lah_count < 1)
3817                         GOTO(out_ladvise, rc = -EINVAL);
3818
3819                 num_advise = k_ladvise_hdr->lah_count;
3820                 if (num_advise >= LAH_COUNT_MAX)
3821                         GOTO(out_ladvise, rc = -EFBIG);
3822
3823                 OBD_FREE_PTR(k_ladvise_hdr);
3824                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3825                                       lah_advise[num_advise]);
3826                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3827                 if (k_ladvise_hdr == NULL)
3828                         RETURN(-ENOMEM);
3829
3830                 /*
3831                  * TODO: submit multiple advices to one server in a single RPC
3832                  */
3833                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3834                         GOTO(out_ladvise, rc = -EFAULT);
3835
3836                 for (i = 0; i < num_advise; i++) {
3837                         struct llapi_lu_ladvise *k_ladvise =
3838                                         &k_ladvise_hdr->lah_advise[i];
3839                         struct llapi_lu_ladvise __user *u_ladvise =
3840                                         &u_ladvise_hdr->lah_advise[i];
3841
3842                         rc = ll_ladvise_sanity(inode, k_ladvise);
3843                         if (rc)
3844                                 GOTO(out_ladvise, rc);
3845
3846                         switch (k_ladvise->lla_advice) {
3847                         case LU_LADVISE_LOCKNOEXPAND:
3848                                 rc = ll_lock_noexpand(file,
3849                                                k_ladvise->lla_peradvice_flags);
3850                                 GOTO(out_ladvise, rc);
3851                         case LU_LADVISE_LOCKAHEAD:
3852
3853                                 rc = ll_file_lock_ahead(file, k_ladvise);
3854
3855                                 if (rc < 0)
3856                                         GOTO(out_ladvise, rc);
3857
3858                                 if (put_user(rc,
3859                                              &u_ladvise->lla_lockahead_result))
3860                                         GOTO(out_ladvise, rc = -EFAULT);
3861                                 break;
3862                         default:
3863                                 rc = ll_ladvise(inode, file,
3864                                                 k_ladvise_hdr->lah_flags,
3865                                                 k_ladvise);
3866                                 if (rc)
3867                                         GOTO(out_ladvise, rc);
3868                                 break;
3869                         }
3870
3871                 }
3872
3873 out_ladvise:
3874                 OBD_FREE(k_ladvise_hdr, alloc_size);
3875                 RETURN(rc);
3876         }
3877         case LL_IOC_FLR_SET_MIRROR: {
3878                 /* mirror I/O must be direct to avoid polluting page cache
3879                  * by stale data. */
3880                 if (!(file->f_flags & O_DIRECT))
3881                         RETURN(-EINVAL);
3882
3883                 fd->fd_designated_mirror = (__u32)arg;
3884                 RETURN(0);
3885         }
3886         case LL_IOC_FSGETXATTR:
3887                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3888         case LL_IOC_FSSETXATTR:
3889                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3890         case BLKSSZGET:
3891                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3892         case LL_IOC_HEAT_GET: {
3893                 struct lu_heat uheat;
3894                 struct lu_heat *heat;
3895                 int size;
3896
3897                 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3898                         RETURN(-EFAULT);
3899
3900                 if (uheat.lh_count > OBD_HEAT_COUNT)
3901                         uheat.lh_count = OBD_HEAT_COUNT;
3902
3903                 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3904                 OBD_ALLOC(heat, size);
3905                 if (heat == NULL)
3906                         RETURN(-ENOMEM);
3907
3908                 heat->lh_count = uheat.lh_count;
3909                 ll_heat_get(inode, heat);
3910                 rc = copy_to_user((char __user *)arg, heat, size);
3911                 OBD_FREE(heat, size);
3912                 RETURN(rc ? -EFAULT : 0);
3913         }
3914         case LL_IOC_HEAT_SET: {
3915                 __u64 flags;
3916
3917                 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3918                         RETURN(-EFAULT);
3919
3920                 rc = ll_heat_set(inode, flags);
3921                 RETURN(rc);
3922         }
3923         case LL_IOC_PCC_DETACH: {
3924                 struct lu_pcc_detach *detach;
3925
3926                 OBD_ALLOC_PTR(detach);
3927                 if (detach == NULL)
3928                         RETURN(-ENOMEM);
3929
3930                 if (copy_from_user(detach,
3931                                    (const struct lu_pcc_detach __user *)arg,
3932                                    sizeof(*detach)))
3933                         GOTO(out_detach_free, rc = -EFAULT);
3934
3935                 if (!S_ISREG(inode->i_mode))
3936                         GOTO(out_detach_free, rc = -EINVAL);
3937
3938                 if (!inode_owner_or_capable(inode))
3939                         GOTO(out_detach_free, rc = -EPERM);
3940
3941                 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3942 out_detach_free:
3943                 OBD_FREE_PTR(detach);
3944                 RETURN(rc);
3945         }
3946         case LL_IOC_PCC_STATE: {
3947                 struct lu_pcc_state __user *ustate =
3948                         (struct lu_pcc_state __user *)arg;
3949                 struct lu_pcc_state *state;
3950
3951                 OBD_ALLOC_PTR(state);
3952                 if (state == NULL)
3953                         RETURN(-ENOMEM);
3954
3955                 if (copy_from_user(state, ustate, sizeof(*state)))
3956                         GOTO(out_state, rc = -EFAULT);
3957
3958                 rc = pcc_ioctl_state(file, inode, state);
3959                 if (rc)
3960                         GOTO(out_state, rc);
3961
3962                 if (copy_to_user(ustate, state, sizeof(*state)))
3963                         GOTO(out_state, rc = -EFAULT);
3964
3965 out_state:
3966                 OBD_FREE_PTR(state);
3967                 RETURN(rc);
3968         }
3969         default:
3970                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3971                                      (void __user *)arg));
3972         }
3973 }
3974
3975 #ifndef HAVE_FILE_LLSEEK_SIZE
3976 static inline loff_t
3977 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3978 {
3979         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3980                 return -EINVAL;
3981         if (offset > maxsize)
3982                 return -EINVAL;
3983
3984         if (offset != file->f_pos) {
3985                 file->f_pos = offset;
3986                 file->f_version = 0;
3987         }
3988         return offset;
3989 }
3990
3991 static loff_t
3992 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3993                 loff_t maxsize, loff_t eof)
3994 {
3995         struct inode *inode = file_inode(file);
3996
3997         switch (origin) {
3998         case SEEK_END:
3999                 offset += eof;
4000                 break;
4001         case SEEK_CUR:
4002                 /*
4003                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
4004                  * position-querying operation.  Avoid rewriting the "same"
4005                  * f_pos value back to the file because a concurrent read(),
4006                  * write() or lseek() might have altered it
4007                  */
4008                 if (offset == 0)
4009                         return file->f_pos;
4010                 /*
4011                  * f_lock protects against read/modify/write race with other
4012                  * SEEK_CURs. Note that parallel writes and reads behave
4013                  * like SEEK_SET.
4014                  */
4015                 inode_lock(inode);
4016                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
4017                 inode_unlock(inode);
4018                 return offset;
4019         case SEEK_DATA:
4020                 /*
4021                  * In the generic case the entire file is data, so as long as
4022                  * offset isn't at the end of the file then the offset is data.
4023                  */
4024                 if (offset >= eof)
4025                         return -ENXIO;
4026                 break;
4027         case SEEK_HOLE:
4028                 /*
4029                  * There is a virtual hole at the end of the file, so as long as
4030                  * offset isn't i_size or larger, return i_size.
4031                  */
4032                 if (offset >= eof)
4033                         return -ENXIO;
4034                 offset = eof;
4035                 break;
4036         }
4037
4038         return llseek_execute(file, offset, maxsize);
4039 }
4040 #endif
4041
4042 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4043 {
4044         struct inode *inode = file_inode(file);
4045         loff_t retval, eof = 0;
4046         ktime_t kstart = ktime_get();
4047
4048         ENTRY;
4049         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4050                            (origin == SEEK_CUR) ? file->f_pos : 0);
4051         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4052                PFID(ll_inode2fid(inode)), inode, retval, retval,
4053                origin);
4054
4055         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4056                 retval = ll_glimpse_size(inode);
4057                 if (retval != 0)
4058                         RETURN(retval);
4059                 eof = i_size_read(inode);
4060         }
4061
4062         retval = ll_generic_file_llseek_size(file, offset, origin,
4063                                              ll_file_maxbytes(inode), eof);
4064         if (retval >= 0)
4065                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK,
4066                                    ktime_us_delta(ktime_get(), kstart));
4067         RETURN(retval);
4068 }
4069
4070 static int ll_flush(struct file *file, fl_owner_t id)
4071 {
4072         struct inode *inode = file_inode(file);
4073         struct ll_inode_info *lli = ll_i2info(inode);
4074         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4075         int rc, err;
4076
4077         LASSERT(!S_ISDIR(inode->i_mode));
4078
4079         /* catch async errors that were recorded back when async writeback
4080          * failed for pages in this mapping. */
4081         rc = lli->lli_async_rc;
4082         lli->lli_async_rc = 0;
4083         if (lli->lli_clob != NULL) {
4084                 err = lov_read_and_clear_async_rc(lli->lli_clob);
4085                 if (rc == 0)
4086                         rc = err;
4087         }
4088
4089         /* The application has been told write failure already.
4090          * Do not report failure again. */
4091         if (fd->fd_write_failed)
4092                 return 0;
4093         return rc ? -EIO : 0;
4094 }
4095
4096 /**
4097  * Called to make sure a portion of file has been written out.
4098  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4099  *
4100  * Return how many pages have been written.
4101  */
4102 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4103                        enum cl_fsync_mode mode, int ignore_layout)
4104 {
4105         struct lu_env *env;
4106         struct cl_io *io;
4107         struct cl_fsync_io *fio;
4108         int result;
4109         __u16 refcheck;
4110         ENTRY;
4111
4112         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4113             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4114                 RETURN(-EINVAL);
4115
4116         env = cl_env_get(&refcheck);
4117         if (IS_ERR(env))
4118                 RETURN(PTR_ERR(env));
4119
4120         io = vvp_env_thread_io(env);
4121         io->ci_obj = ll_i2info(inode)->lli_clob;
4122         io->ci_ignore_layout = ignore_layout;
4123
4124         /* initialize parameters for sync */
4125         fio = &io->u.ci_fsync;
4126         fio->fi_start = start;
4127         fio->fi_end = end;
4128         fio->fi_fid = ll_inode2fid(inode);
4129         fio->fi_mode = mode;
4130         fio->fi_nr_written = 0;
4131
4132         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4133                 result = cl_io_loop(env, io);
4134         else
4135                 result = io->ci_result;
4136         if (result == 0)
4137                 result = fio->fi_nr_written;
4138         cl_io_fini(env, io);
4139         cl_env_put(env, &refcheck);
4140
4141         RETURN(result);
4142 }
4143
4144 /*
4145  * When dentry is provided (the 'else' case), file_dentry() may be
4146  * null and dentry must be used directly rather than pulled from
4147  * file_dentry() as is done otherwise.
4148  */
4149
4150 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4151 {
4152         struct dentry *dentry = file_dentry(file);
4153         struct inode *inode = dentry->d_inode;
4154         struct ll_inode_info *lli = ll_i2info(inode);
4155         struct ptlrpc_request *req;
4156         ktime_t kstart = ktime_get();
4157         int rc, err;
4158
4159         ENTRY;
4160
4161         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4162                "datasync %d\n",
4163                PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4164
4165         /* fsync's caller has already called _fdata{sync,write}, we want
4166          * that IO to finish before calling the osc and mdc sync methods */
4167         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4168         inode_lock(inode);
4169
4170         /* catch async errors that were recorded back when async writeback
4171          * failed for pages in this mapping. */
4172         if (!S_ISDIR(inode->i_mode)) {
4173                 err = lli->lli_async_rc;
4174                 lli->lli_async_rc = 0;
4175                 if (rc == 0)
4176                         rc = err;
4177                 if (lli->lli_clob != NULL) {
4178                         err = lov_read_and_clear_async_rc(lli->lli_clob);
4179                         if (rc == 0)
4180                                 rc = err;
4181                 }
4182         }
4183
4184         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4185         if (!rc)
4186                 rc = err;
4187         if (!err)
4188                 ptlrpc_req_finished(req);
4189
4190         if (S_ISREG(inode->i_mode)) {
4191                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4192                 bool cached;
4193
4194                 /* Sync metadata on MDT first, and then sync the cached data
4195                  * on PCC.
4196                  */
4197                 err = pcc_fsync(file, start, end, datasync, &cached);
4198                 if (!cached)
4199                         err = cl_sync_file_range(inode, start, end,
4200                                                  CL_FSYNC_ALL, 0);
4201                 if (rc == 0 && err < 0)
4202                         rc = err;
4203                 if (rc < 0)
4204                         fd->fd_write_failed = true;
4205                 else
4206                         fd->fd_write_failed = false;
4207         }
4208
4209         inode_unlock(inode);
4210
4211         if (!rc)
4212                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC,
4213                                    ktime_us_delta(ktime_get(), kstart));
4214         RETURN(rc);
4215 }
4216
4217 static int
4218 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4219 {
4220         struct inode *inode = file_inode(file);
4221         struct ll_sb_info *sbi = ll_i2sbi(inode);
4222         struct ldlm_enqueue_info einfo = {
4223                 .ei_type        = LDLM_FLOCK,
4224                 .ei_cb_cp       = ldlm_flock_completion_ast,
4225                 .ei_cbdata      = file_lock,
4226         };
4227         struct md_op_data *op_data;
4228         struct lustre_handle lockh = { 0 };
4229         union ldlm_policy_data flock = { { 0 } };
4230         int fl_type = file_lock->fl_type;
4231         ktime_t kstart = ktime_get();
4232         __u64 flags = 0;
4233         int rc;
4234         int rc2 = 0;
4235         ENTRY;
4236
4237         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4238                PFID(ll_inode2fid(inode)), file_lock);
4239
4240         if (file_lock->fl_flags & FL_FLOCK) {
4241                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4242                 /* flocks are whole-file locks */
4243                 flock.l_flock.end = OFFSET_MAX;
4244                 /* For flocks owner is determined by the local file desctiptor*/
4245                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4246         } else if (file_lock->fl_flags & FL_POSIX) {
4247                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4248                 flock.l_flock.start = file_lock->fl_start;
4249                 flock.l_flock.end = file_lock->fl_end;
4250         } else {
4251                 RETURN(-EINVAL);
4252         }
4253         flock.l_flock.pid = file_lock->fl_pid;
4254
4255 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4256         /* Somewhat ugly workaround for svc lockd.
4257          * lockd installs custom fl_lmops->lm_compare_owner that checks
4258          * for the fl_owner to be the same (which it always is on local node
4259          * I guess between lockd processes) and then compares pid.
4260          * As such we assign pid to the owner field to make it all work,
4261          * conflict with normal locks is unlikely since pid space and
4262          * pointer space for current->files are not intersecting */
4263         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4264                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4265 #endif
4266
4267         switch (fl_type) {
4268         case F_RDLCK:
4269                 einfo.ei_mode = LCK_PR;
4270                 break;
4271         case F_UNLCK:
4272                 /* An unlock request may or may not have any relation to
4273                  * existing locks so we may not be able to pass a lock handle
4274                  * via a normal ldlm_lock_cancel() request. The request may even
4275                  * unlock a byte range in the middle of an existing lock. In
4276                  * order to process an unlock request we need all of the same
4277                  * information that is given with a normal read or write record
4278                  * lock request. To avoid creating another ldlm unlock (cancel)
4279                  * message we'll treat a LCK_NL flock request as an unlock. */
4280                 einfo.ei_mode = LCK_NL;
4281                 break;
4282         case F_WRLCK:
4283                 einfo.ei_mode = LCK_PW;
4284                 break;
4285         default:
4286                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4287                 RETURN (-ENOTSUPP);
4288         }
4289
4290         switch (cmd) {
4291         case F_SETLKW:
4292 #ifdef F_SETLKW64
4293         case F_SETLKW64:
4294 #endif
4295                 flags = 0;
4296                 break;
4297         case F_SETLK:
4298 #ifdef F_SETLK64
4299         case F_SETLK64:
4300 #endif
4301                 flags = LDLM_FL_BLOCK_NOWAIT;
4302                 break;
4303         case F_GETLK:
4304 #ifdef F_GETLK64
4305         case F_GETLK64:
4306 #endif
4307                 flags = LDLM_FL_TEST_LOCK;
4308                 break;
4309         default:
4310                 CERROR("unknown fcntl lock command: %d\n", cmd);
4311                 RETURN (-EINVAL);
4312         }
4313
4314         /* Save the old mode so that if the mode in the lock changes we
4315          * can decrement the appropriate reader or writer refcount. */
4316         file_lock->fl_type = einfo.ei_mode;
4317
4318         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4319                                      LUSTRE_OPC_ANY, NULL);
4320         if (IS_ERR(op_data))
4321                 RETURN(PTR_ERR(op_data));
4322
4323         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4324                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4325                flock.l_flock.pid, flags, einfo.ei_mode,
4326                flock.l_flock.start, flock.l_flock.end);
4327
4328         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4329                         flags);
4330
4331         /* Restore the file lock type if not TEST lock. */
4332         if (!(flags & LDLM_FL_TEST_LOCK))
4333                 file_lock->fl_type = fl_type;
4334
4335 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4336         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4337             !(flags & LDLM_FL_TEST_LOCK))
4338                 rc2  = locks_lock_file_wait(file, file_lock);
4339 #else
4340         if ((file_lock->fl_flags & FL_FLOCK) &&
4341             (rc == 0 || file_lock->fl_type == F_UNLCK))
4342                 rc2  = flock_lock_file_wait(file, file_lock);
4343         if ((file_lock->fl_flags & FL_POSIX) &&
4344             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4345             !(flags & LDLM_FL_TEST_LOCK))
4346                 rc2  = posix_lock_file_wait(file, file_lock);
4347 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4348
4349         if (rc2 && file_lock->fl_type != F_UNLCK) {
4350                 einfo.ei_mode = LCK_NL;
4351                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4352                            &lockh, flags);
4353                 rc = rc2;
4354         }
4355
4356         ll_finish_md_op_data(op_data);
4357
4358         if (!rc)
4359                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK,
4360                                    ktime_us_delta(ktime_get(), kstart));
4361         RETURN(rc);
4362 }
4363
4364 int ll_get_fid_by_name(struct inode *parent, const char *name,
4365                        int namelen, struct lu_fid *fid,
4366                        struct inode **inode)
4367 {
4368         struct md_op_data       *op_data = NULL;
4369         struct mdt_body         *body;
4370         struct ptlrpc_request   *req;
4371         int                     rc;
4372         ENTRY;
4373
4374         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4375                                      LUSTRE_OPC_ANY, NULL);
4376         if (IS_ERR(op_data))
4377                 RETURN(PTR_ERR(op_data));
4378
4379         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4380         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4381         ll_finish_md_op_data(op_data);
4382         if (rc < 0)
4383                 RETURN(rc);
4384
4385         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4386         if (body == NULL)
4387                 GOTO(out_req, rc = -EFAULT);
4388         if (fid != NULL)
4389                 *fid = body->mbo_fid1;
4390
4391         if (inode != NULL)
4392                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4393 out_req:
4394         ptlrpc_req_finished(req);
4395         RETURN(rc);
4396 }
4397
/**
 * Migrate the entry \a name of directory \a parent according to \a lum.
 *
 * The child inode is looked up in the dcache under file_dentry(\a file)
 * first and, failing that, fetched from the MDS with ll_get_fid_by_name().
 * The migration itself is issued as an md_rename() of \a name onto itself
 * with CLI_MIGRATE | CLI_SET_MEA set and \a lum attached as op_data.  For
 * regular files a write lease is taken and the data version is sent along
 * (MDS_CLOSE_MIGRATE); the request is retried while it fails with -EAGAIN.
 * On success the child's link count is cleared.
 *
 * \param parent  directory inode containing the entry
 * \param file    open file whose dentry is used for the dcache lookup
 * \param lum     migration target layout; converted with
 *                lustre_swab_lmv_user_md() if its magic is not already in
 *                little-endian form
 * \param name    NUL-terminated name of the entry to migrate
 *
 * \retval 0            success
 * \retval -ENOENT      no inode could be obtained for \a name
 * \retval -EOPNOTSUPP  striped directory migration requested but the MDT
 *                      lacks OBD_CONNECT2_DIR_MIGRATE
 * \retval -EINVAL      the entry is the filesystem root, or its FID is
 *                      not sane
 * \retval negative     error from the lookup, lease, data version or
 *                      rename steps
 */
int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
               const char *name)
{
        struct dentry *dchild = NULL;
        struct inode *child_inode = NULL;
        struct md_op_data *op_data;
        struct ptlrpc_request *request = NULL;
        struct obd_client_handle *och = NULL;
        struct qstr qstr;
        struct mdt_body *body;
        __u64 data_version = 0;
        size_t namelen = strlen(name);
        /* computed before the swab below, i.e. from the values as passed in */
        int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
        int rc;
        ENTRY;

        CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
               PFID(ll_inode2fid(parent)), name,
               lum->lum_stripe_offset, lum->lum_stripe_count);

        /* bring lum into little-endian form if it is not already */
        if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
            lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
                lustre_swab_lmv_user_md(lum);

        /* Get child FID first */
        qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
        qstr.name = name;
        qstr.len = namelen;
        dchild = d_lookup(file_dentry(file), &qstr);
        if (dchild) {
                /* keep our own inode reference; the dentry one is dropped */
                if (dchild->d_inode)
                        child_inode = igrab(dchild->d_inode);
                dput(dchild);
        }

        /* not in the dcache (or igrab failed): ask the MDS */
        if (!child_inode) {
                rc = ll_get_fid_by_name(parent, name, namelen, NULL,
                                        &child_inode);
                if (rc)
                        RETURN(rc);
        }

        if (!child_inode)
                RETURN(-ENOENT);

        if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
              OBD_CONNECT2_DIR_MIGRATE)) {
                if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
                    ll_dir_striped(child_inode)) {
                        CERROR("%s: MDT doesn't support stripe directory "
                               "migration!\n", ll_i2sbi(parent)->ll_fsname);
                        GOTO(out_iput, rc = -EOPNOTSUPP);
                }
        }

        /*
         * lfs migrate command needs to be blocked on the client
         * by checking the migrate FID against the FID of the
         * filesystem root.
         */
        if (child_inode == parent->i_sb->s_root->d_inode)
                GOTO(out_iput, rc = -EINVAL);

        op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
                                     child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out_iput, rc = PTR_ERR(op_data));

        /* child inode stays locked until out_unlock */
        inode_lock(child_inode);
        op_data->op_fid3 = *ll_inode2fid(child_inode);
        if (!fid_is_sane(&op_data->op_fid3)) {
                CERROR("%s: migrate %s, but FID "DFID" is insane\n",
                       ll_i2sbi(parent)->ll_fsname, name,
                       PFID(&op_data->op_fid3));
                GOTO(out_unlock, rc = -EINVAL);
        }

        op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
        op_data->op_data = lum;
        op_data->op_data_size = lumlen;

again:
        if (S_ISREG(child_inode->i_mode)) {
                /* NOTE(review): when retrying via 'again' with och still
                 * set (server did not execute the close intent), och is
                 * overwritten here — confirm this cannot leak a lease. */
                och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
                if (IS_ERR(och)) {
                        rc = PTR_ERR(och);
                        och = NULL;
                        GOTO(out_unlock, rc);
                }

                rc = ll_data_version(child_inode, &data_version,
                                     LL_DV_WR_FLUSH);
                if (rc != 0)
                        GOTO(out_close, rc);

                /* hand open/lease handles and data version to the request */
                op_data->op_open_handle = och->och_open_handle;
                op_data->op_data_version = data_version;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_bias |= MDS_CLOSE_MIGRATE;

                /* clear the replay flag on the lease open request */
                spin_lock(&och->och_mod->mod_open_req->rq_lock);
                och->och_mod->mod_open_req->rq_replay = 0;
                spin_unlock(&och->och_mod->mod_open_req->rq_lock);
        }

        /* migration is sent as a rename of the entry onto itself */
        rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
                       name, namelen, &request);
        if (rc == 0) {
                LASSERT(request != NULL);
                ll_update_times(request, parent);
        }

        if (rc == 0 || rc == -EAGAIN) {
                /* NOTE(review): request is dereferenced for -EAGAIN too;
                 * assumes md_rename() always returns a reply in that
                 * case — confirm. */
                body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
                LASSERT(body != NULL);

                /* If the server does release layout lock, then we cleanup
                 * the client och here, otherwise release it in out_close: */
                if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
                        obd_mod_put(och->och_mod);
                        md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
                                                  och);
                        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
                        OBD_FREE_PTR(och);
                        och = NULL;
                }
        }

        if (request != NULL) {
                ptlrpc_req_finished(request);
                request = NULL;
        }

        /* Try again if the lease has cancelled. */
        if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
                goto again;

out_close:
        if (och)
                ll_lease_close(och, child_inode, NULL);
        /* on success the local inode's link count is cleared */
        if (!rc)
                clear_nlink(child_inode);
out_unlock:
        inode_unlock(child_inode);
        ll_finish_md_op_data(op_data);
out_iput:
        iput(child_inode);
        RETURN(rc);
}
4547
4548 static int
4549 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4550 {
4551         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4552         ENTRY;
4553
4554         /*
4555          * In order to avoid flood of warning messages, only print one message
4556          * for one file. And the entire message rate on the client is limited
4557          * by CDEBUG_LIMIT too.
4558          */
4559         if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4560                 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4561                 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4562                              "flock disabled, mount with '-o [local]flock' to enable\r\n");
4563         }
4564         RETURN(-ENOSYS);
4565 }
4566
4567 /**
4568  * test if some locks matching bits and l_req_mode are acquired
4569  * - bits can be in different locks
4570  * - if found clear the common lock bits in *bits
4571  * - the bits not found, are kept in *bits
4572  * \param inode [IN]
4573  * \param bits [IN] searched lock bits [IN]
4574  * \param l_req_mode [IN] searched lock mode
4575  * \retval boolean, true iff all bits are found
4576  */
4577 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4578 {
4579         struct lustre_handle lockh;
4580         union ldlm_policy_data policy;
4581         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4582                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4583         struct lu_fid *fid;
4584         __u64 flags;
4585         int i;
4586         ENTRY;
4587
4588         if (!inode)
4589                RETURN(0);
4590
4591         fid = &ll_i2info(inode)->lli_fid;
4592         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4593                ldlm_lockname[mode]);
4594
4595         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4596         for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4597                 policy.l_inodebits.bits = *bits & (1 << i);
4598                 if (policy.l_inodebits.bits == 0)
4599                         continue;
4600
4601                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4602                                   &policy, mode, &lockh)) {
4603                         struct ldlm_lock *lock;
4604
4605                         lock = ldlm_handle2lock(&lockh);
4606                         if (lock) {
4607                                 *bits &=
4608                                       ~(lock->l_policy_data.l_inodebits.bits);
4609                                 LDLM_LOCK_PUT(lock);
4610                         } else {
4611                                 *bits &= ~policy.l_inodebits.bits;
4612                         }
4613                 }
4614         }
4615         RETURN(*bits == 0);
4616 }
4617
4618 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4619                                struct lustre_handle *lockh, __u64 flags,
4620                                enum ldlm_mode mode)
4621 {
4622         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4623         struct lu_fid *fid;
4624         enum ldlm_mode rc;
4625         ENTRY;
4626
4627         fid = &ll_i2info(inode)->lli_fid;
4628         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4629
4630         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4631                            fid, LDLM_IBITS, &policy, mode, lockh);
4632
4633         RETURN(rc);
4634 }
4635
4636 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4637 {
4638         /* Already unlinked. Just update nlink and return success */
4639         if (rc == -ENOENT) {
4640                 clear_nlink(inode);
4641                 /* If it is striped directory, and there is bad stripe
4642                  * Let's revalidate the dentry again, instead of returning
4643                  * error */
4644                 if (ll_dir_striped(inode))
4645                         return 0;
4646
4647                 /* This path cannot be hit for regular files unless in
4648                  * case of obscure races, so no need to to validate
4649                  * size. */
4650                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4651                         return 0;
4652         } else if (rc != 0) {
4653                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4654                              "%s: revalidate FID "DFID" error: rc = %d\n",
4655                              ll_i2sbi(inode)->ll_fsname,
4656                              PFID(ll_inode2fid(inode)), rc);
4657         }
4658
4659         return rc;
4660 }
4661
4662 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4663 {
4664         struct inode *inode = dentry->d_inode;
4665         struct obd_export *exp = ll_i2mdexp(inode);
4666         struct lookup_intent oit = {
4667                 .it_op = op,
4668         };
4669         struct ptlrpc_request *req = NULL;
4670         struct md_op_data *op_data;
4671         int rc = 0;
4672         ENTRY;
4673
4674         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4675                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4676
4677         /* Call getattr by fid, so do not provide name at all. */
4678         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4679                                      LUSTRE_OPC_ANY, NULL);
4680         if (IS_ERR(op_data))
4681                 RETURN(PTR_ERR(op_data));
4682
4683         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4684         ll_finish_md_op_data(op_data);
4685         if (rc < 0) {
4686                 rc = ll_inode_revalidate_fini(inode, rc);
4687                 GOTO(out, rc);
4688         }
4689
4690         rc = ll_revalidate_it_finish(req, &oit, dentry);
4691         if (rc != 0) {
4692                 ll_intent_release(&oit);
4693                 GOTO(out, rc);
4694         }
4695
4696         /* Unlinked? Unhash dentry, so it is not picked up later by
4697          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4698          * here to preserve get_cwd functionality on 2.6.
4699          * Bug 10503 */
4700         if (!dentry->d_inode->i_nlink) {
4701                 spin_lock(&inode->i_lock);
4702                 d_lustre_invalidate(dentry, 0);
4703                 spin_unlock(&inode->i_lock);
4704         }
4705
4706         ll_lookup_finish_locks(&oit, dentry);
4707 out:
4708         ptlrpc_req_finished(req);
4709
4710         return rc;
4711 }
4712
4713 static int ll_merge_md_attr(struct inode *inode)
4714 {
4715         struct ll_inode_info *lli = ll_i2info(inode);
4716         struct cl_attr attr = { 0 };
4717         int rc;
4718
4719         LASSERT(lli->lli_lsm_md != NULL);
4720
4721         if (!lmv_dir_striped(lli->lli_lsm_md))
4722                 RETURN(0);
4723
4724         down_read(&lli->lli_lsm_sem);
4725         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4726                            &attr, ll_md_blocking_ast);
4727         up_read(&lli->lli_lsm_sem);
4728         if (rc != 0)
4729                 RETURN(rc);
4730
4731         set_nlink(inode, attr.cat_nlink);
4732         inode->i_blocks = attr.cat_blocks;
4733         i_size_write(inode, attr.cat_size);
4734
4735         ll_i2info(inode)->lli_atime = attr.cat_atime;
4736         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4737         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4738
4739         RETURN(0);
4740 }
4741
4742 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4743 {
4744         struct inode *inode = de->d_inode;
4745         struct ll_sb_info *sbi = ll_i2sbi(inode);
4746         struct ll_inode_info *lli = ll_i2info(inode);
4747         ktime_t kstart = ktime_get();
4748         int rc;
4749
4750         rc = ll_inode_revalidate(de, IT_GETATTR);
4751         if (rc < 0)
4752                 RETURN(rc);
4753
4754         if (S_ISREG(inode->i_mode)) {
4755                 bool cached;
4756
4757                 rc = pcc_inode_getattr(inode, &cached);
4758                 if (cached && rc < 0)
4759                         RETURN(rc);
4760
4761                 /* In case of restore, the MDT has the right size and has
4762                  * already send it back without granting the layout lock,
4763                  * inode is up-to-date so glimpse is useless.
4764                  * Also to glimpse we need the layout, in case of a running
4765                  * restore the MDT holds the layout lock so the glimpse will
4766                  * block up to the end of restore (getattr will block)
4767                  */
4768                 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4769                         rc = ll_glimpse_size(inode);
4770                         if (rc < 0)
4771                                 RETURN(rc);
4772                 }
4773         } else {
4774                 /* If object isn't regular a file then don't validate size. */
4775                 if (ll_dir_striped(inode)) {
4776                         rc = ll_merge_md_attr(inode);
4777                         if (rc < 0)
4778                                 RETURN(rc);
4779                 }
4780
4781                 inode->i_atime.tv_sec = lli->lli_atime;
4782                 inode->i_mtime.tv_sec = lli->lli_mtime;
4783                 inode->i_ctime.tv_sec = lli->lli_ctime;
4784         }
4785
4786         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4787
4788         if (ll_need_32bit_api(sbi)) {
4789                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4790                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4791                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4792         } else {
4793                 stat->ino = inode->i_ino;
4794                 stat->dev = inode->i_sb->s_dev;
4795                 stat->rdev = inode->i_rdev;
4796         }
4797
4798         stat->mode = inode->i_mode;
4799         stat->uid = inode->i_uid;
4800         stat->gid = inode->i_gid;
4801         stat->atime = inode->i_atime;
4802         stat->mtime = inode->i_mtime;
4803         stat->ctime = inode->i_ctime;
4804         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4805
4806         stat->nlink = inode->i_nlink;
4807         stat->size = i_size_read(inode);
4808         stat->blocks = inode->i_blocks;
4809
4810         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR,
4811                            ktime_us_delta(ktime_get(), kstart));
4812
4813         return 0;
4814 }
4815
/*
 * ->getattr inode operation. Both prototype flavours simply forward the
 * dentry and the kstat buffer to ll_getattr_dentry(); the remaining
 * arguments are not used.
 */
#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
int ll_getattr(const struct path *path, struct kstat *stat,
	       u32 request_mask, unsigned int flags)
{
	return ll_getattr_dentry(path->dentry, stat);
}
#else /* !HAVE_INODEOPS_ENHANCED_GETATTR */
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
	return ll_getattr_dentry(de, stat);
}
#endif /* HAVE_INODEOPS_ENHANCED_GETATTR */
4827
4828 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4829                      __u64 start, __u64 len)
4830 {
4831         int             rc;
4832         size_t          num_bytes;
4833         struct fiemap   *fiemap;
4834         unsigned int    extent_count = fieinfo->fi_extents_max;
4835
4836         num_bytes = sizeof(*fiemap) + (extent_count *
4837                                        sizeof(struct fiemap_extent));
4838         OBD_ALLOC_LARGE(fiemap, num_bytes);
4839
4840         if (fiemap == NULL)
4841                 RETURN(-ENOMEM);
4842
4843         fiemap->fm_flags = fieinfo->fi_flags;
4844         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4845         fiemap->fm_start = start;
4846         fiemap->fm_length = len;
4847         if (extent_count > 0 &&
4848             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4849                            sizeof(struct fiemap_extent)) != 0)
4850                 GOTO(out, rc = -EFAULT);
4851
4852         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4853
4854         fieinfo->fi_flags = fiemap->fm_flags;
4855         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4856         if (extent_count > 0 &&
4857             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4858                          fiemap->fm_mapped_extents *
4859                          sizeof(struct fiemap_extent)) != 0)
4860                 GOTO(out, rc = -EFAULT);
4861 out:
4862         OBD_FREE_LARGE(fiemap, num_bytes);
4863         return rc;
4864 }
4865
4866 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4867 {
4868         struct ll_inode_info *lli = ll_i2info(inode);
4869         struct posix_acl *acl = NULL;
4870         ENTRY;
4871
4872         spin_lock(&lli->lli_lock);
4873         /* VFS' acl_permission_check->check_acl will release the refcount */
4874         acl = posix_acl_dup(lli->lli_posix_acl);
4875         spin_unlock(&lli->lli_lock);
4876
4877         RETURN(acl);
4878 }
4879
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_LUSTRE_FS_POSIX_ACL
/**
 * ->set_acl inode operation: store or remove a POSIX ACL.
 *
 * The ACL is converted to its xattr representation and sent with
 * md_setxattr() on the metadata export; with no ACL value left to send the
 * xattr is removed instead (OBD_MD_FLXATTRRM). Afterwards the VFS ACL
 * cache is updated on success and dropped on failure.
 *
 * \param[in] inode	inode to modify
 * \param[in] acl	ACL to store, NULL to remove it
 * \param[in] type	ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0		on success
 * \retval -EINVAL	unknown \a type
 * \retval -EACCES	default ACL given for a non-directory
 * \retval -ENOMEM	the xattr buffer could not be allocated
 * \retval < 0		other error from the ACL helpers or md_setxattr()
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req = NULL;
	const char *name = NULL;
	char *value = NULL;
	size_t value_size = 0;
	int rc = 0;
	ENTRY;

	switch (type) {
	case ACL_TYPE_ACCESS:
		name = XATTR_NAME_POSIX_ACL_ACCESS;
		/* may update inode->i_mode and replace acl (passed by
		 * address) */
		if (acl)
			rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
		break;

	case ACL_TYPE_DEFAULT:
		name = XATTR_NAME_POSIX_ACL_DEFAULT;
		/* removing a default ACL from a non-directory is a no-op,
		 * setting one is refused */
		if (!S_ISDIR(inode->i_mode))
			rc = acl ? -EACCES : 0;
		break;

	default:
		rc = -EINVAL;
		break;
	}
	/* leave through RETURN() so the ENTRY trace above is balanced */
	if (rc)
		RETURN(rc);

	if (acl) {
		value_size = posix_acl_xattr_size(acl->a_count);
		value = kmalloc(value_size, GFP_NOFS);
		if (value == NULL)
			GOTO(out, rc = -ENOMEM);

		rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
		if (rc < 0)
			GOTO(out_value, rc);
	}

	/* no value means the xattr is to be removed */
	rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
			 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
			 name, value, value_size, 0, 0, &req);

	ptlrpc_req_finished(req);
out_value:
	kfree(value);
out:
	if (rc)
		forget_cached_acl(inode, type);
	else
		set_cached_acl(inode, type, acl);
	RETURN(rc);
}
#endif /* CONFIG_LUSTRE_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4939
4940 int ll_inode_permission(struct inode *inode, int mask)
4941 {
4942         int rc = 0;
4943         struct ll_sb_info *sbi;
4944         struct root_squash_info *squash;
4945         struct cred *cred = NULL;
4946         const struct cred *old_cred = NULL;
4947         cfs_cap_t cap;
4948         bool squash_id = false;
4949         ktime_t kstart = ktime_get();
4950         ENTRY;
4951
4952         if (mask & MAY_NOT_BLOCK)
4953                 return -ECHILD;
4954
4955        /* as root inode are NOT getting validated in lookup operation,
4956         * need to do it before permission check. */
4957
4958         if (inode == inode->i_sb->s_root->d_inode) {
4959                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4960                 if (rc)
4961                         RETURN(rc);
4962         }
4963
4964         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4965                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4966
4967         /* squash fsuid/fsgid if needed */
4968         sbi = ll_i2sbi(inode);
4969         squash = &sbi->ll_squash;
4970         if (unlikely(squash->rsi_uid != 0 &&
4971                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4972                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4973                         squash_id = true;
4974         }
4975         if (squash_id) {
4976                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4977                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4978                        squash->rsi_uid, squash->rsi_gid);
4979
4980                 /* update current process's credentials
4981                  * and FS capability */
4982                 cred = prepare_creds();
4983                 if (cred == NULL)
4984                         RETURN(-ENOMEM);
4985
4986                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4987                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4988                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4989                         if ((1 << cap) & CFS_CAP_FS_MASK)
4990                                 cap_lower(cred->cap_effective, cap);
4991                 }
4992                 old_cred = override_creds(cred);
4993         }
4994
4995         rc = generic_permission(inode, mask);
4996         /* restore current process's credentials and FS capability */
4997         if (squash_id) {
4998                 revert_creds(old_cred);
4999                 put_cred(cred);
5000         }
5001
5002         if (!rc)
5003                 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM,
5004                                    ktime_us_delta(ktime_get(), kstart));
5005
5006         RETURN(rc);
5007 }
5008
5009 /* -o localflock - only provides locally consistent flock locks */
5010 struct file_operations ll_file_operations = {
5011 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5012 # ifdef HAVE_SYNC_READ_WRITE
5013         .read           = new_sync_read,
5014         .write          = new_sync_write,
5015 # endif
5016         .read_iter      = ll_file_read_iter,
5017         .write_iter     = ll_file_write_iter,
5018 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5019         .read           = ll_file_read,
5020         .aio_read       = ll_file_aio_read,
5021         .write          = ll_file_write,
5022         .aio_write      = ll_file_aio_write,
5023 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5024         .unlocked_ioctl = ll_file_ioctl,
5025         .open           = ll_file_open,
5026         .release        = ll_file_release,
5027         .mmap           = ll_file_mmap,
5028         .llseek         = ll_file_seek,
5029         .splice_read    = ll_file_splice_read,
5030         .fsync          = ll_fsync,
5031         .flush          = ll_flush
5032 };
5033
5034 struct file_operations ll_file_operations_flock = {
5035 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5036 # ifdef HAVE_SYNC_READ_WRITE
5037         .read           = new_sync_read,
5038         .write          = new_sync_write,
5039 # endif /* HAVE_SYNC_READ_WRITE */
5040         .read_iter      = ll_file_read_iter,
5041         .write_iter     = ll_file_write_iter,
5042 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5043         .read           = ll_file_read,
5044         .aio_read       = ll_file_aio_read,
5045         .write          = ll_file_write,
5046         .aio_write      = ll_file_aio_write,
5047 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5048         .unlocked_ioctl = ll_file_ioctl,
5049         .open           = ll_file_open,
5050         .release        = ll_file_release,
5051         .mmap           = ll_file_mmap,
5052         .llseek         = ll_file_seek,
5053         .splice_read    = ll_file_splice_read,
5054         .fsync          = ll_fsync,
5055         .flush          = ll_flush,
5056         .flock          = ll_file_flock,
5057         .lock           = ll_file_flock
5058 };
5059
5060 /* These are for -o noflock - to return ENOSYS on flock calls */
5061 struct file_operations ll_file_operations_noflock = {
5062 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5063 # ifdef HAVE_SYNC_READ_WRITE
5064         .read           = new_sync_read,
5065         .write          = new_sync_write,
5066 # endif /* HAVE_SYNC_READ_WRITE */
5067         .read_iter      = ll_file_read_iter,
5068         .write_iter     = ll_file_write_iter,
5069 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5070         .read           = ll_file_read,
5071         .aio_read       = ll_file_aio_read,
5072         .write          = ll_file_write,
5073         .aio_write      = ll_file_aio_write,
5074 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5075         .unlocked_ioctl = ll_file_ioctl,
5076         .open           = ll_file_open,
5077         .release        = ll_file_release,
5078         .mmap           = ll_file_mmap,
5079         .llseek         = ll_file_seek,
5080         .splice_read    = ll_file_splice_read,
5081         .fsync          = ll_fsync,
5082         .flush          = ll_flush,
5083         .flock          = ll_file_noflock,
5084         .lock           = ll_file_noflock
5085 };
5086
5087 struct inode_operations ll_file_inode_operations = {
5088         .setattr        = ll_setattr,
5089         .getattr        = ll_getattr,
5090         .permission     = ll_inode_permission,
5091 #ifdef HAVE_IOP_XATTR
5092         .setxattr       = ll_setxattr,
5093         .getxattr       = ll_getxattr,
5094         .removexattr    = ll_removexattr,
5095 #endif
5096         .listxattr      = ll_listxattr,
5097         .fiemap         = ll_fiemap,
5098 #ifdef HAVE_IOP_GET_ACL
5099         .get_acl        = ll_get_acl,
5100 #endif
5101 #ifdef HAVE_IOP_SET_ACL
5102         .set_acl        = ll_set_acl,
5103 #endif
5104 };
5105
5106 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5107 {
5108         struct ll_inode_info *lli = ll_i2info(inode);
5109         struct cl_object *obj = lli->lli_clob;
5110         struct lu_env *env;
5111         int rc;
5112         __u16 refcheck;
5113         ENTRY;
5114
5115         if (obj == NULL)
5116                 RETURN(0);
5117
5118         env = cl_env_get(&refcheck);
5119         if (IS_ERR(env))
5120                 RETURN(PTR_ERR(env));
5121
5122         rc = cl_conf_set(env, lli->lli_clob, conf);
5123         if (rc < 0)
5124                 GOTO(out, rc);
5125
5126         if (conf->coc_opc == OBJECT_CONF_SET) {
5127                 struct ldlm_lock *lock = conf->coc_lock;
5128                 struct cl_layout cl = {
5129                         .cl_layout_gen = 0,
5130                 };
5131
5132                 LASSERT(lock != NULL);
5133                 LASSERT(ldlm_has_layout(lock));
5134
5135                 /* it can only be allowed to match after layout is
5136                  * applied to inode otherwise false layout would be
5137                  * seen. Applying layout shoud happen before dropping
5138                  * the intent lock. */
5139                 ldlm_lock_allow_match(lock);
5140
5141                 rc = cl_object_layout_get(env, obj, &cl);
5142                 if (rc < 0)
5143                         GOTO(out, rc);
5144
5145                 CDEBUG(D_VFSTRACE,
5146                        DFID": layout version change: %u -> %u\n",
5147                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
5148                        cl.cl_layout_gen);
5149                 ll_layout_version_set(lli, cl.cl_layout_gen);
5150         }
5151
5152 out:
5153         cl_env_put(env, &refcheck);
5154
5155         RETURN(rc);
5156 }
5157
5158 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5159 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5160
5161 {
5162         struct ll_sb_info *sbi = ll_i2sbi(inode);
5163         struct ptlrpc_request *req;
5164         void *lvbdata;
5165         void *lmm;
5166         int lmmsize;
5167         int rc;
5168         ENTRY;
5169
5170         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5171                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5172                lock->l_lvb_data, lock->l_lvb_len);
5173
5174         if (lock->l_lvb_data != NULL)
5175                 RETURN(0);
5176
5177         /* if layout lock was granted right away, the layout is returned
5178          * within DLM_LVB of dlm reply; otherwise if the lock was ever
5179          * blocked and then granted via completion ast, we have to fetch
5180          * layout here. Please note that we can't use the LVB buffer in
5181          * completion AST because it doesn't have a large enough buffer */
5182         rc = ll_get_default_mdsize(sbi, &lmmsize);
5183         if (rc < 0)
5184                 RETURN(rc);
5185
5186         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5187                          XATTR_NAME_LOV, lmmsize, &req);
5188         if (rc < 0) {
5189                 if (rc == -ENODATA)
5190                         GOTO(out, rc = 0); /* empty layout */
5191                 else
5192                         RETURN(rc);
5193         }
5194
5195         lmmsize = rc;
5196         rc = 0;
5197         if (lmmsize == 0) /* empty layout */
5198                 GOTO(out, rc = 0);
5199
5200         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5201         if (lmm == NULL)
5202                 GOTO(out, rc = -EFAULT);
5203
5204         OBD_ALLOC_LARGE(lvbdata, lmmsize);
5205         if (lvbdata == NULL)
5206                 GOTO(out, rc = -ENOMEM);
5207
5208         memcpy(lvbdata, lmm, lmmsize);
5209         lock_res_and_lock(lock);
5210         if (unlikely(lock->l_lvb_data == NULL)) {
5211                 lock->l_lvb_type = LVB_T_LAYOUT;
5212                 lock->l_lvb_data = lvbdata;
5213                 lock->l_lvb_len = lmmsize;
5214                 lvbdata = NULL;
5215         }
5216         unlock_res_and_lock(lock);
5217
5218         if (lvbdata)
5219                 OBD_FREE_LARGE(lvbdata, lmmsize);
5220
5221         EXIT;
5222
5223 out:
5224         ptlrpc_req_finished(req);
5225         return rc;
5226 }
5227
5228 /**
5229  * Apply the layout to the inode. Layout lock is held and will be released
5230  * in this function.
5231  */
5232 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5233                               struct inode *inode)
5234 {
5235         struct ll_inode_info *lli = ll_i2info(inode);
5236         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5237         struct ldlm_lock *lock;
5238         struct cl_object_conf conf;
5239         int rc = 0;
5240         bool lvb_ready;
5241         bool wait_layout = false;
5242         ENTRY;
5243
5244         LASSERT(lustre_handle_is_used(lockh));
5245
5246         lock = ldlm_handle2lock(lockh);
5247         LASSERT(lock != NULL);
5248         LASSERT(ldlm_has_layout(lock));
5249
5250         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5251                    PFID(&lli->lli_fid), inode);
5252
5253         /* in case this is a caching lock and reinstate with new inode */
5254         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5255
5256         lock_res_and_lock(lock);
5257         lvb_ready = ldlm_is_lvb_ready(lock);
5258         unlock_res_and_lock(lock);
5259
5260         /* checking lvb_ready is racy but this is okay. The worst case is
5261          * that multi processes may configure the file on the same time. */
5262         if (lvb_ready)
5263                 GOTO(out, rc = 0);
5264
5265         rc = ll_layout_fetch(inode, lock);
5266         if (rc < 0)
5267                 GOTO(out, rc);
5268
5269         /* for layout lock, lmm is stored in lock's lvb.
5270          * lvb_data is immutable if the lock is held so it's safe to access it
5271          * without res lock.
5272          *
5273          * set layout to file. Unlikely this will fail as old layout was
5274          * surely eliminated */
5275         memset(&conf, 0, sizeof conf);
5276         conf.coc_opc = OBJECT_CONF_SET;
5277         conf.coc_inode = inode;
5278         conf.coc_lock = lock;
5279         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5280         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5281         rc = ll_layout_conf(inode, &conf);
5282
5283         /* refresh layout failed, need to wait */
5284         wait_layout = rc == -EBUSY;
5285         EXIT;
5286 out:
5287         LDLM_LOCK_PUT(lock);
5288         ldlm_lock_decref(lockh, mode);
5289
5290         /* wait for IO to complete if it's still being used. */
5291         if (wait_layout) {
5292                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5293                        sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5294
5295                 memset(&conf, 0, sizeof conf);
5296                 conf.coc_opc = OBJECT_CONF_WAIT;
5297                 conf.coc_inode = inode;
5298                 rc = ll_layout_conf(inode, &conf);
5299                 if (rc == 0)
5300                         rc = -EAGAIN;
5301
5302                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5303                        sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5304         }
5305         RETURN(rc);
5306 }
5307
5308 /**
5309  * Issue layout intent RPC to MDS.
5310  * \param inode [in]    file inode
5311  * \param intent [in]   layout intent
5312  *
5313  * \retval 0    on success
5314  * \retval < 0  error code
5315  */
5316 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5317 {
5318         struct ll_inode_info  *lli = ll_i2info(inode);
5319         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5320         struct md_op_data     *op_data;
5321         struct lookup_intent it;
5322         struct ptlrpc_request *req;
5323         int rc;
5324         ENTRY;
5325
5326         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5327                                      0, 0, LUSTRE_OPC_ANY, NULL);
5328         if (IS_ERR(op_data))
5329                 RETURN(PTR_ERR(op_data));
5330
5331         op_data->op_data = intent;
5332         op_data->op_data_size = sizeof(*intent);
5333
5334         memset(&it, 0, sizeof(it));
5335         it.it_op = IT_LAYOUT;
5336         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5337             intent->li_opc == LAYOUT_INTENT_TRUNC)
5338                 it.it_flags = FMODE_WRITE;
5339
5340         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5341                           sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5342
5343         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5344                             &ll_md_blocking_ast, 0);
5345         if (it.it_request != NULL)
5346                 ptlrpc_req_finished(it.it_request);
5347         it.it_request = NULL;
5348
5349         ll_finish_md_op_data(op_data);
5350
5351         /* set lock data in case this is a new lock */
5352         if (!rc)
5353                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5354
5355         ll_intent_drop_lock(&it);
5356
5357         RETURN(rc);
5358 }
5359
5360 /**
5361  * This function checks if there exists a LAYOUT lock on the client side,
5362  * or enqueues it if it doesn't have one in cache.
5363  *
5364  * This function will not hold layout lock so it may be revoked any time after
5365  * this function returns. Any operations depend on layout should be redone
5366  * in that case.
5367  *
5368  * This function should be called before lov_io_init() to get an uptodate
5369  * layout version, the caller should save the version number and after IO
5370  * is finished, this function should be called again to verify that layout
5371  * is not changed during IO time.
5372  */
5373 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5374 {
5375         struct ll_inode_info    *lli = ll_i2info(inode);
5376         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5377         struct lustre_handle lockh;
5378         struct layout_intent intent = {
5379                 .li_opc = LAYOUT_INTENT_ACCESS,
5380         };
5381         enum ldlm_mode mode;
5382         int rc;
5383         ENTRY;
5384
5385         *gen = ll_layout_version_get(lli);
5386         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5387                 RETURN(0);
5388
5389         /* sanity checks */
5390         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5391         LASSERT(S_ISREG(inode->i_mode));
5392
5393         /* take layout lock mutex to enqueue layout lock exclusively. */
5394         mutex_lock(&lli->lli_layout_mutex);
5395
5396         while (1) {
5397                 /* mostly layout lock is caching on the local side, so try to
5398                  * match it before grabbing layout lock mutex. */
5399                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5400                                        LCK_CR | LCK_CW | LCK_PR |
5401                                        LCK_PW | LCK_EX);
5402                 if (mode != 0) { /* hit cached lock */
5403                         rc = ll_layout_lock_set(&lockh, mode, inode);
5404                         if (rc == -EAGAIN)
5405                                 continue;
5406                         break;
5407                 }
5408
5409                 rc = ll_layout_intent(inode, &intent);
5410                 if (rc != 0)
5411                         break;
5412         }
5413
5414         if (rc == 0)
5415                 *gen = ll_layout_version_get(lli);
5416         mutex_unlock(&lli->lli_layout_mutex);
5417
5418         RETURN(rc);
5419 }
5420
5421 /**
5422  * Issue layout intent RPC indicating where in a file an IO is about to write.
5423  *
5424  * \param[in] inode     file inode.
5425  * \param[in] ext       write range with start offset of fille in bytes where
5426  *                      an IO is about to write, and exclusive end offset in
5427  *                      bytes.
5428  *
5429  * \retval 0    on success
5430  * \retval < 0  error code
5431  */
5432 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5433                            struct lu_extent *ext)
5434 {
5435         struct layout_intent intent = {
5436                 .li_opc = opc,
5437                 .li_extent.e_start = ext->e_start,
5438                 .li_extent.e_end = ext->e_end,
5439         };
5440         int rc;
5441         ENTRY;
5442
5443         rc = ll_layout_intent(inode, &intent);
5444
5445         RETURN(rc);
5446 }
5447
5448 /**
5449  *  This function send a restore request to the MDT
5450  */
5451 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5452 {
5453         struct hsm_user_request *hur;
5454         int                      len, rc;
5455         ENTRY;
5456
5457         len = sizeof(struct hsm_user_request) +
5458               sizeof(struct hsm_user_item);
5459         OBD_ALLOC(hur, len);
5460         if (hur == NULL)
5461                 RETURN(-ENOMEM);
5462
5463         hur->hur_request.hr_action = HUA_RESTORE;
5464         hur->hur_request.hr_archive_id = 0;
5465         hur->hur_request.hr_flags = 0;
5466         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5467                sizeof(hur->hur_user_item[0].hui_fid));
5468         hur->hur_user_item[0].hui_extent.offset = offset;
5469         hur->hur_user_item[0].hui_extent.length = length;
5470         hur->hur_request.hr_itemcount = 1;
5471         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5472                            len, hur, NULL);
5473         OBD_FREE(hur, len);
5474         RETURN(rc);
5475 }