lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {  /* @data of ll_close_inode_openhandle() for an MDS_CLOSE_LAYOUT_SPLIT close */
  57         struct inode    *sp_inode;      /* inode whose FID is sent as op_fid2 */
  58         __u16           sp_mirror_id;   /* sent as op_mirror_id */
  59 };
  60
  61 struct pcc_param {  /* @data of ll_close_inode_openhandle() for an MDS_PCC_ATTACH close */
  62         __u64   pa_data_version;        /* in: sent as op_data_version */
  63         __u32   pa_archive_id;          /* in: sent as op_archive_id */
  64         __u32   pa_layout_gen;          /* out: mbo_layout_gen copied from the close reply */
  65 };
  66
  67 static int  /* forward declaration: ll_md_close() calls this when LL_FILE_GROUP_LOCKED is set */
  68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  69
  70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  71                           bool *lease_broken);  /* forward declaration: ll_md_close() calls this on fd->fd_lease_och */
  72
  73 static struct ll_file_data *ll_file_data_get(void)
  74 {
  75         struct ll_file_data *fd;
  76
  77         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  78         if (fd == NULL)
  79                 return NULL;
  80
  81         fd->fd_write_failed = false;
  82         pcc_file_init(&fd->fd_pcc_file);
  83
  84         return fd;
  85 }
  86
  87 static void ll_file_data_put(struct ll_file_data *fd)
  88 {
  89         if (fd != NULL)
  90                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  91 }
  92
  93 /**
  94  * Packs all the attributes into @op_data for the CLOSE rpc.
  95  */
  96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  97                              struct obd_client_handle *och)
  98 {
  99         ENTRY;
 100
 101         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 102                            0, 0, LUSTRE_OPC_ANY, NULL);
 103
 104         op_data->op_attr.ia_mode = inode->i_mode;
 105         op_data->op_attr.ia_atime = inode->i_atime;
 106         op_data->op_attr.ia_mtime = inode->i_mtime;
 107         op_data->op_attr.ia_ctime = inode->i_ctime;
 108         op_data->op_attr.ia_size = i_size_read(inode);
 109         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 110                                       ATTR_MTIME | ATTR_MTIME_SET |
 111                                       ATTR_CTIME);
 112         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 113         op_data->op_attr_blocks = inode->i_blocks;
 114         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 115         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 116                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 117         op_data->op_open_handle = och->och_open_handle;
 118
 119         if (och->och_flags & FMODE_WRITE &&
 120             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 121                 /* For HSM: if inode data has been modified, pack it so that
 122                  * MDT can set data dirty flag in the archive. */
 123                 op_data->op_bias |= MDS_DATA_MODIFIED;
 124
 125         EXIT;
 126 }
 127
 128 /**
 129  * Perform a close, possibly with a bias.
 130  * The meaning of "data" depends on the value of "bias".
 131  *
 132  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 133  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 134  * swap layouts with.
 135  */
 136 static int ll_close_inode_openhandle(struct inode *inode,
 137                                      struct obd_client_handle *och,
 138                                      enum mds_op_bias bias, void *data)
 139 {
 140         struct obd_export *md_exp = ll_i2mdexp(inode);
 141         const struct ll_inode_info *lli = ll_i2info(inode);
 142         struct md_op_data *op_data;
 143         struct ptlrpc_request *req = NULL;
 144         int rc;
 145         ENTRY;
 146
 147         if (class_exp2obd(md_exp) == NULL) {
 148                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 149                        ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
 150                 GOTO(out, rc = 0);
 151         }
 152
 153         OBD_ALLOC_PTR(op_data);
 154         /* We leak openhandle and request here on error, but not much to be
 155          * done in OOM case since app won't retry close on error either. */
 156         if (op_data == NULL)
 157                 GOTO(out, rc = -ENOMEM);
 158
 159         ll_prepare_close(inode, op_data, och);
 160         switch (bias) {
 161         case MDS_CLOSE_LAYOUT_MERGE:
 162                 /* merge blocks from the victim inode */
 163                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 164                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 165                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 166                 /* fallthrough */
 167         case MDS_CLOSE_LAYOUT_SPLIT:
 168         case MDS_CLOSE_LAYOUT_SWAP: {
 169                 struct split_param *sp = data;
 170
 171                 LASSERT(data != NULL);
 172                 op_data->op_bias |= bias;
 173                 op_data->op_data_version = 0;
 174                 op_data->op_lease_handle = och->och_lease_handle;
 175                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 176                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 177                         op_data->op_mirror_id = sp->sp_mirror_id;
 178                 } else {
 179                         op_data->op_fid2 = *ll_inode2fid(data);
 180                 }
 181                 break;
 182         }
 183
 184         case MDS_CLOSE_RESYNC_DONE: {
 185                 struct ll_ioc_lease *ioc = data;
 186
 187                 LASSERT(data != NULL);
 188                 op_data->op_attr_blocks +=
 189                         ioc->lil_count * op_data->op_attr_blocks;
 190                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 191                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 192                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 193
 194                 op_data->op_lease_handle = och->och_lease_handle;
 195                 op_data->op_data = &ioc->lil_ids[0];
 196                 op_data->op_data_size =
 197                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 198                 break;
 199         }
 200
 201         case MDS_PCC_ATTACH: {
 202                 struct pcc_param *param = data;
 203
 204                 LASSERT(data != NULL);
 205                 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
 206                 op_data->op_archive_id = param->pa_archive_id;
 207                 op_data->op_data_version = param->pa_data_version;
 208                 op_data->op_lease_handle = och->och_lease_handle;
 209                 break;
 210         }
 211
 212         case MDS_HSM_RELEASE:
 213                 LASSERT(data != NULL);
 214                 op_data->op_bias |= MDS_HSM_RELEASE;
 215                 op_data->op_data_version = *(__u64 *)data;
 216                 op_data->op_lease_handle = och->och_lease_handle;
 217                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 218                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 219                 break;
 220
 221         default:
 222                 LASSERT(data == NULL);
 223                 break;
 224         }
 225
 226         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 227                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 228         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 229                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 230
 231         rc = md_close(md_exp, op_data, och->och_mod, &req);
 232         if (rc != 0 && rc != -EINTR)
 233                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 234                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 235
 236         if (rc == 0 && op_data->op_bias & bias) {
 237                 struct mdt_body *body;
 238
 239                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 240                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 241                         rc = -EBUSY;
 242
 243                 if (bias & MDS_PCC_ATTACH) {
 244                         struct pcc_param *param = data;
 245
 246                         param->pa_layout_gen = body->mbo_layout_gen;
 247                 }
 248         }
 249
 250         ll_finish_md_op_data(op_data);
 251         EXIT;
 252 out:
 253
 254         md_clear_open_replay_data(md_exp, och);
 255         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 256         OBD_FREE_PTR(och);
 257
 258         ptlrpc_req_finished(req);       /* This is close request */
 259         return rc;
 260 }
 261
 262 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 263 {
 264         struct ll_inode_info *lli = ll_i2info(inode);
 265         struct obd_client_handle **och_p;
 266         struct obd_client_handle *och;
 267         __u64 *och_usecount;
 268         int rc = 0;
 269         ENTRY;
 270
 271         if (fmode & FMODE_WRITE) {
 272                 och_p = &lli->lli_mds_write_och;
 273                 och_usecount = &lli->lli_open_fd_write_count;
 274         } else if (fmode & FMODE_EXEC) {
 275                 och_p = &lli->lli_mds_exec_och;
 276                 och_usecount = &lli->lli_open_fd_exec_count;
 277         } else {
 278                 LASSERT(fmode & FMODE_READ);
 279                 och_p = &lli->lli_mds_read_och;
 280                 och_usecount = &lli->lli_open_fd_read_count;
 281         }
 282
 283         mutex_lock(&lli->lli_och_mutex);
 284         if (*och_usecount > 0) {
 285                 /* There are still users of this handle, so skip
 286                  * freeing it. */
 287                 mutex_unlock(&lli->lli_och_mutex);
 288                 RETURN(0);
 289         }
 290
 291         och = *och_p;
 292         *och_p = NULL;
 293         mutex_unlock(&lli->lli_och_mutex);
 294
 295         if (och != NULL) {
 296                 /* There might be a race and this handle may already
 297                  * be closed. */
 298                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 299         }
 300
 301         RETURN(rc);
 302 }
 303
 304 static int ll_md_close(struct inode *inode, struct file *file)
 305 {
 306         union ldlm_policy_data policy = {
 307                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 308         };
 309         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 310         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 311         struct ll_inode_info *lli = ll_i2info(inode);
 312         struct lustre_handle lockh;
 313         enum ldlm_mode lockmode;
 314         int rc = 0;
 315         ENTRY;
 316
 317         /* clear group lock, if present */
 318         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 319                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 320
 321         if (fd->fd_lease_och != NULL) {
 322                 bool lease_broken;
 323
 324                 /* Usually the lease is not released when the
 325                  * application crashed, we need to release here. */
 326                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 327                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 328                         PFID(&lli->lli_fid), rc, lease_broken);
 329
 330                 fd->fd_lease_och = NULL;
 331         }
 332
 333         if (fd->fd_och != NULL) {
 334                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 335                 fd->fd_och = NULL;
 336                 GOTO(out, rc);
 337         }
 338
 339         /* Let's see if we have good enough OPEN lock on the file and if
 340            we can skip talking to MDS */
 341         mutex_lock(&lli->lli_och_mutex);
 342         if (fd->fd_omode & FMODE_WRITE) {
 343                 lockmode = LCK_CW;
 344                 LASSERT(lli->lli_open_fd_write_count);
 345                 lli->lli_open_fd_write_count--;
 346         } else if (fd->fd_omode & FMODE_EXEC) {
 347                 lockmode = LCK_PR;
 348                 LASSERT(lli->lli_open_fd_exec_count);
 349                 lli->lli_open_fd_exec_count--;
 350         } else {
 351                 lockmode = LCK_CR;
 352                 LASSERT(lli->lli_open_fd_read_count);
 353                 lli->lli_open_fd_read_count--;
 354         }
 355         mutex_unlock(&lli->lli_och_mutex);
 356
 357         /* LU-4398: do not cache write open lock if the file has exec bit */
 358         if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
 359             !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 360                            LDLM_IBITS, &policy, lockmode, &lockh))
 361                 rc = ll_md_real_close(inode, fd->fd_omode);
 362
 363 out:
 364         LUSTRE_FPRIVATE(file) = NULL;
 365         ll_file_data_put(fd);
 366
 367         RETURN(rc);
 368 }
 369
 370 /* While this returns an error code, fput() the caller does not, so we need
 371  * to make every effort to clean up all of our state here.  Also, applications
 372  * rarely check close errors and even if an error is returned they will not
 373  * re-try the close call.
 374  */
 375 int ll_file_release(struct inode *inode, struct file *file)
 376 {
 377         struct ll_file_data *fd;
 378         struct ll_sb_info *sbi = ll_i2sbi(inode);
 379         struct ll_inode_info *lli = ll_i2info(inode);
 380         int rc;
 381         ENTRY;
 382
 383         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 384                PFID(ll_inode2fid(inode)), inode);
 385
 386         if (inode->i_sb->s_root != file_dentry(file))
 387                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 388         fd = LUSTRE_FPRIVATE(file);
 389         LASSERT(fd != NULL);
 390
 391         /* The last ref on @file, maybe not the the owner pid of statahead,
 392          * because parent and child process can share the same file handle. */
 393         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 394                 ll_deauthorize_statahead(inode, fd);
 395
 396         if (inode->i_sb->s_root == file_dentry(file)) {
 397                 LUSTRE_FPRIVATE(file) = NULL;
 398                 ll_file_data_put(fd);
 399                 RETURN(0);
 400         }
 401
 402         pcc_file_release(inode, file);
 403
 404         if (!S_ISDIR(inode->i_mode)) {
 405                 if (lli->lli_clob != NULL)
 406                         lov_read_and_clear_async_rc(lli->lli_clob);
 407                 lli->lli_async_rc = 0;
 408         }
 409
 410         rc = ll_md_close(inode, file);
 411
 412         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 413                 libcfs_debug_dumplog();
 414
 415         RETURN(rc);
 416 }
 417
 418 static inline int ll_dom_readpage(void *data, struct page *page)
 419 {
 420         struct niobuf_local *lnb = data;
 421         void *kaddr;
 422
 423         kaddr = ll_kmap_atomic(page, KM_USER0);
 424         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 425         if (lnb->lnb_len < PAGE_SIZE)
 426                 memset(kaddr + lnb->lnb_len, 0,
 427                        PAGE_SIZE - lnb->lnb_len);
 428         flush_dcache_page(page);
 429         SetPageUptodate(page);
 430         ll_kunmap_atomic(kaddr, KM_USER0);
 431         unlock_page(page);
 432
 433         return 0;
 434 }
 435
 436 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 437                         struct lookup_intent *it)
 438 {
 439         struct ll_inode_info *lli = ll_i2info(inode);
 440         struct cl_object *obj = lli->lli_clob;
 441         struct address_space *mapping = inode->i_mapping;
 442         struct page *vmpage;
 443         struct niobuf_remote *rnb;
 444         struct mdt_body *body;
 445         char *data;
 446         unsigned long index, start;
 447         struct niobuf_local lnb;
 448
 449         ENTRY;
 450
 451         if (obj == NULL)
 452                 RETURN_EXIT;
 453
 454         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 455                                    RCL_SERVER))
 456                 RETURN_EXIT;
 457
 458         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 459         if (rnb == NULL || rnb->rnb_len == 0)
 460                 RETURN_EXIT;
 461
 462         /* LU-11595: Server may return whole file and that is OK always or
 463          * it may return just file tail and its offset must be aligned with
 464          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 465          * smaller then offset may be not aligned and that data is just ignored.
 466          */
 467         if (rnb->rnb_offset % PAGE_SIZE)
 468                 RETURN_EXIT;
 469
 470         /* Server returns whole file or just file tail if it fills in reply
 471          * buffer, in both cases total size should be equal to the file size.
 472          */
 473         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 474         if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
 475                 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
 476                        ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
 477                        rnb->rnb_len, body->mbo_dom_size);
 478                 RETURN_EXIT;
 479         }
 480
 481         CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
 482                rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
 483
 484         data = (char *)rnb + sizeof(*rnb);
 485
 486         lnb.lnb_file_offset = rnb->rnb_offset;
 487         start = lnb.lnb_file_offset / PAGE_SIZE;
 488         index = 0;
 489         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 490         lnb.lnb_page_offset = 0;
 491         do {
 492                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 493                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 494                 if (lnb.lnb_len > PAGE_SIZE)
 495                         lnb.lnb_len = PAGE_SIZE;
 496
 497                 vmpage = read_cache_page(mapping, index + start,
 498                                          ll_dom_readpage, &lnb);
 499                 if (IS_ERR(vmpage)) {
 500                         CWARN("%s: cannot fill page %lu for "DFID
 501                               " with data: rc = %li\n",
 502                               ll_i2sbi(inode)->ll_fsname, index + start,
 503                               PFID(lu_object_fid(&obj->co_lu)),
 504                               PTR_ERR(vmpage));
 505                         break;
 506                 }
 507                 put_page(vmpage);
 508                 index++;
 509         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 510         EXIT;
 511 }
 512
 513 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 514                                 struct lookup_intent *itp)
 515 {
 516         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 517         struct dentry *parent = de->d_parent;
 518         char *name = NULL;
 519         int len = 0;
 520         struct md_op_data *op_data;
 521         struct ptlrpc_request *req = NULL;
 522         int rc;
 523         ENTRY;
 524
 525         LASSERT(parent != NULL);
 526         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 527
 528         /* if server supports open-by-fid, or file name is invalid, don't pack
 529          * name in open request */
 530         if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
 531             !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
 532 retry:
 533                 len = de->d_name.len;
 534                 name = kmalloc(len + 1, GFP_NOFS);
 535                 if (!name)
 536                         RETURN(-ENOMEM);
 537
 538                 /* race here */
 539                 spin_lock(&de->d_lock);
 540                 if (len != de->d_name.len) {
 541                         spin_unlock(&de->d_lock);
 542                         kfree(name);
 543                         goto retry;
 544                 }
 545                 memcpy(name, de->d_name.name, len);
 546                 name[len] = '\0';
 547                 spin_unlock(&de->d_lock);
 548
 549                 if (!lu_name_is_valid_2(name, len)) {
 550                         kfree(name);
 551                         RETURN(-ESTALE);
 552                 }
 553         }
 554
 555         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 556                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 557         if (IS_ERR(op_data)) {
 558                 kfree(name);
 559                 RETURN(PTR_ERR(op_data));
 560         }
 561         op_data->op_data = lmm;
 562         op_data->op_data_size = lmmsize;
 563
 564         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 565                             &ll_md_blocking_ast, 0);
 566         kfree(name);
 567         ll_finish_md_op_data(op_data);
 568         if (rc == -ESTALE) {
 569                 /* reason for keep own exit path - don`t flood log
 570                  * with messages with -ESTALE errors.
 571                  */
 572                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 573                      it_open_error(DISP_OPEN_OPEN, itp))
 574                         GOTO(out, rc);
 575                 ll_release_openhandle(de, itp);
 576                 GOTO(out, rc);
 577         }
 578
 579         if (it_disposition(itp, DISP_LOOKUP_NEG))
 580                 GOTO(out, rc = -ENOENT);
 581
 582         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 583                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 584                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 585                 GOTO(out, rc);
 586         }
 587
 588         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 589
 590         if (!rc && itp->it_lock_mode) {
 591                 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
 592                 struct ldlm_lock *lock;
 593                 bool has_dom_bit = false;
 594
 595                 /* If we got a lock back and it has a LOOKUP bit set,
 596                  * make sure the dentry is marked as valid so we can find it.
 597                  * We don't need to care about actual hashing since other bits
 598                  * of kernel will deal with that later.
 599                  */
 600                 lock = ldlm_handle2lock(&handle);
 601                 if (lock) {
 602                         has_dom_bit = ldlm_has_dom(lock);
 603                         if (lock->l_policy_data.l_inodebits.bits &
 604                             MDS_INODELOCK_LOOKUP)
 605                                 d_lustre_revalidate(de);
 606
 607                         LDLM_LOCK_PUT(lock);
 608                 }
 609                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 610                 if (has_dom_bit)
 611                         ll_dom_finish_open(de->d_inode, req, itp);
 612         }
 613
 614 out:
 615         ptlrpc_req_finished(req);
 616         ll_intent_drop_lock(itp);
 617
 618         /* We did open by fid, but by the time we got to the server,
 619          * the object disappeared. If this is a create, we cannot really
 620          * tell the userspace that the file it was trying to create
 621          * does not exist. Instead let's return -ESTALE, and the VFS will
 622          * retry the create with LOOKUP_REVAL that we are going to catch
 623          * in ll_revalidate_dentry() and use lookup then.
 624          */
 625         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 626                 rc = -ESTALE;
 627
 628         RETURN(rc);
 629 }
 630
 631 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 632                        struct obd_client_handle *och)
 633 {
 634         struct mdt_body *body;
 635
 636         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 637         och->och_open_handle = body->mbo_open_handle;
 638         och->och_fid = body->mbo_fid1;
 639         och->och_lease_handle.cookie = it->it_lock_handle;
 640         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 641         och->och_flags = it->it_flags;
 642
 643         return md_set_open_replay_data(md_exp, och, it);
 644 }
 645
 646 static int ll_local_open(struct file *file, struct lookup_intent *it,
 647                          struct ll_file_data *fd, struct obd_client_handle *och)
 648 {
 649         struct inode *inode = file_inode(file);
 650         ENTRY;
 651
 652         LASSERT(!LUSTRE_FPRIVATE(file));
 653
 654         LASSERT(fd != NULL);
 655
 656         if (och) {
 657                 int rc;
 658
 659                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 660                 if (rc != 0)
 661                         RETURN(rc);
 662         }
 663
 664         LUSTRE_FPRIVATE(file) = fd;
 665         ll_readahead_init(inode, &fd->fd_ras);
 666         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 667
 668         /* ll_cl_context initialize */
 669         rwlock_init(&fd->fd_lock);
 670         INIT_LIST_HEAD(&fd->fd_lccs);
 671
 672         RETURN(0);
 673 }
 674
 675 /* Open a file, and (for the very first open) create objects on the OSTs at
 676  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 677  * creation or open until ll_lov_setstripe() ioctl is called.
 678  *
 679  * If we already have the stripe MD locally then we don't request it in
 680  * md_open(), by passing a lmm_size = 0.
 681  *
 682  * It is up to the application to ensure no other processes open this file
 683  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 684  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 685  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 686  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 687  */
 688 int ll_file_open(struct inode *inode, struct file *file)
 689 {
 690         struct ll_inode_info *lli = ll_i2info(inode);
 691         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 692                                           .it_flags = file->f_flags };
 693         struct obd_client_handle **och_p = NULL;
 694         __u64 *och_usecount = NULL;
 695         struct ll_file_data *fd;
 696         int rc = 0;
 697         ENTRY;
 698
 699         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 700                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 701
 702         it = file->private_data; /* XXX: compat macro */
 703         file->private_data = NULL; /* prevent ll_local_open assertion */
 704
 705         fd = ll_file_data_get();
 706         if (fd == NULL)
 707                 GOTO(out_nofiledata, rc = -ENOMEM);
 708
 709         fd->fd_file = file;
 710         if (S_ISDIR(inode->i_mode))
 711                 ll_authorize_statahead(inode, fd);
 712
 713         if (inode->i_sb->s_root == file_dentry(file)) {
 714                 LUSTRE_FPRIVATE(file) = fd;
 715                 RETURN(0);
 716         }
 717
 718         if (!it || !it->it_disposition) {
 719                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 720                  * because everything but O_ACCMODE mask was stripped from
 721                  * there */
 722                 if ((oit.it_flags + 1) & O_ACCMODE)
 723                         oit.it_flags++;
 724                 if (file->f_flags & O_TRUNC)
 725                         oit.it_flags |= FMODE_WRITE;
 726
 727                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 728                  * dentry_open after call to open_namei that checks permissions.
 729                  * Only nfsd_open call dentry_open directly without checking
 730                  * permissions and because of that this code below is safe.
 731                  */
 732                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 733                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 734
 735                 /* We do not want O_EXCL here, presumably we opened the file
 736                  * already? XXX - NFS implications? */
 737                 oit.it_flags &= ~O_EXCL;
 738
 739                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 740                  * created if necessary, then "IT_CREAT" should be set to keep
 741                  * consistent with it */
 742                 if (oit.it_flags & O_CREAT)
 743                         oit.it_op |= IT_CREAT;
 744
 745                 it = &oit;
 746         }
 747
 748 restart:
 749         /* Let's see if we have file open on MDS already. */
 750         if (it->it_flags & FMODE_WRITE) {
 751                 och_p = &lli->lli_mds_write_och;
 752                 och_usecount = &lli->lli_open_fd_write_count;
 753         } else if (it->it_flags & FMODE_EXEC) {
 754                 och_p = &lli->lli_mds_exec_och;
 755                 och_usecount = &lli->lli_open_fd_exec_count;
 756          } else {
 757                 och_p = &lli->lli_mds_read_och;
 758                 och_usecount = &lli->lli_open_fd_read_count;
 759         }
 760
 761         mutex_lock(&lli->lli_och_mutex);
 762         if (*och_p) { /* Open handle is present */
 763                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 764                         /* Well, there's extra open request that we do not need,
 765                            let's close it somehow. This will decref request. */
 766                         rc = it_open_error(DISP_OPEN_OPEN, it);
 767                         if (rc) {
 768                                 mutex_unlock(&lli->lli_och_mutex);
 769                                 GOTO(out_openerr, rc);
 770                         }
 771
 772                         ll_release_openhandle(file_dentry(file), it);
 773                 }
 774                 (*och_usecount)++;
 775
 776                 rc = ll_local_open(file, it, fd, NULL);
 777                 if (rc) {
 778                         (*och_usecount)--;
 779                         mutex_unlock(&lli->lli_och_mutex);
 780                         GOTO(out_openerr, rc);
 781                 }
 782         } else {
 783                 LASSERT(*och_usecount == 0);
 784                 if (!it->it_disposition) {
 785                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 786                         /* We cannot just request lock handle now, new ELC code
 787                            means that one of other OPEN locks for this file
 788                            could be cancelled, and since blocking ast handler
 789                            would attempt to grab och_mutex as well, that would
 790                            result in a deadlock */
 791                         mutex_unlock(&lli->lli_och_mutex);
 792                         /*
 793                          * Normally called under two situations:
 794                          * 1. NFS export.
 795                          * 2. A race/condition on MDS resulting in no open
 796                          *    handle to be returned from LOOKUP|OPEN request,
 797                          *    for example if the target entry was a symlink.
 798                          *
 799                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 800                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 801                          *  bit so that it's not confusing later callers.
 802                          *
 803                          *  NB; when ldd is NULL, it must have come via normal
 804                          *  lookup path only, since ll_iget_for_nfs always calls
 805                          *  ll_d_init().
 806                          */
 807                         if (ldd && ldd->lld_nfs_dentry) {
 808                                 ldd->lld_nfs_dentry = 0;
 809                                 it->it_flags |= MDS_OPEN_LOCK;
 810                         }
 811
 812                          /*
 813                          * Always specify MDS_OPEN_BY_FID because we don't want
 814                          * to get file with different fid.
 815                          */
 816                         it->it_flags |= MDS_OPEN_BY_FID;
 817                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 818                                                  it);
 819                         if (rc)
 820                                 GOTO(out_openerr, rc);
 821
 822                         goto restart;
 823                 }
 824                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 825                 if (!*och_p)
 826                         GOTO(out_och_free, rc = -ENOMEM);
 827
 828                 (*och_usecount)++;
 829
 830                 /* md_intent_lock() didn't get a request ref if there was an
 831                  * open error, so don't do cleanup on the request here
 832                  * (bug 3430) */
 833                 /* XXX (green): Should not we bail out on any error here, not
 834                  * just open error? */
 835                 rc = it_open_error(DISP_OPEN_OPEN, it);
 836                 if (rc != 0)
 837                         GOTO(out_och_free, rc);
 838
 839                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 840                          "inode %p: disposition %x, status %d\n", inode,
 841                          it_disposition(it, ~0), it->it_status);
 842
 843                 rc = ll_local_open(file, it, fd, *och_p);
 844                 if (rc)
 845                         GOTO(out_och_free, rc);
 846         }
 847
 848         rc = pcc_file_open(inode, file);
 849         if (rc)
 850                 GOTO(out_och_free, rc);
 851
 852         mutex_unlock(&lli->lli_och_mutex);
 853         fd = NULL;
 854
 855         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 856            different kind of OPEN lock for this same inode gets cancelled
 857            by ldlm_cancel_lru */
 858         if (!S_ISREG(inode->i_mode))
 859                 GOTO(out_och_free, rc);
 860
 861         cl_lov_delay_create_clear(&file->f_flags);
 862         GOTO(out_och_free, rc);
 863
 864 out_och_free:
 865         if (rc) {
 866                 if (och_p && *och_p) {
 867                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 868                         *och_p = NULL; /* OBD_FREE writes some magic there */
 869                         (*och_usecount)--;
 870                 }
 871                 mutex_unlock(&lli->lli_och_mutex);
 872
 873 out_openerr:
 874                 if (lli->lli_opendir_key == fd)
 875                         ll_deauthorize_statahead(inode, fd);
 876
 877                 if (fd != NULL)
 878                         ll_file_data_put(fd);
 879         } else {
 880                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 881         }
 882
 883 out_nofiledata:
 884         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 885                 ptlrpc_req_finished(it->it_request);
 886                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 887         }
 888
 889         return rc;
 890 }
 891
 892 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 893                         struct ldlm_lock_desc *desc, void *data, int flag)
 894 {
 895         int rc;
 896         struct lustre_handle lockh;
 897         ENTRY;
 898
 899         switch (flag) {
 900         case LDLM_CB_BLOCKING:
 901                 ldlm_lock2handle(lock, &lockh);
 902                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 903                 if (rc < 0) {
 904                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 905                         RETURN(rc);
 906                 }
 907                 break;
 908         case LDLM_CB_CANCELING:
 909                 /* do nothing */
 910                 break;
 911         }
 912         RETURN(0);
 913 }
 914
 915 /**
 916  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 917  * and save it as fd->fd_och so as to force client to reopen the file even
 918  * if it has an open lock in cache already.
 919  */
 920 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 921                                 struct lustre_handle *old_open_handle)
 922 {
 923         struct ll_inode_info *lli = ll_i2info(inode);
 924         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 925         struct obd_client_handle **och_p;
 926         __u64 *och_usecount;
 927         int rc = 0;
 928         ENTRY;
 929
 930         /* Get the openhandle of the file */
 931         mutex_lock(&lli->lli_och_mutex);
 932         if (fd->fd_lease_och != NULL)
 933                 GOTO(out_unlock, rc = -EBUSY);
 934
 935         if (fd->fd_och == NULL) {
 936                 if (file->f_mode & FMODE_WRITE) {
 937                         LASSERT(lli->lli_mds_write_och != NULL);
 938                         och_p = &lli->lli_mds_write_och;
 939                         och_usecount = &lli->lli_open_fd_write_count;
 940                 } else {
 941                         LASSERT(lli->lli_mds_read_och != NULL);
 942                         och_p = &lli->lli_mds_read_och;
 943                         och_usecount = &lli->lli_open_fd_read_count;
 944                 }
 945
 946                 if (*och_usecount > 1)
 947                         GOTO(out_unlock, rc = -EBUSY);
 948
 949                 fd->fd_och = *och_p;
 950                 *och_usecount = 0;
 951                 *och_p = NULL;
 952         }
 953
 954         *old_open_handle = fd->fd_och->och_open_handle;
 955
 956         EXIT;
 957 out_unlock:
 958         mutex_unlock(&lli->lli_och_mutex);
 959         return rc;
 960 }
 961
 962 /**
 963  * Release ownership on lli_mds_*_och when putting back a file lease.
 964  */
 965 static int ll_lease_och_release(struct inode *inode, struct file *file)
 966 {
 967         struct ll_inode_info *lli = ll_i2info(inode);
 968         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 969         struct obd_client_handle **och_p;
 970         struct obd_client_handle *old_och = NULL;
 971         __u64 *och_usecount;
 972         int rc = 0;
 973         ENTRY;
 974
 975         mutex_lock(&lli->lli_och_mutex);
 976         if (file->f_mode & FMODE_WRITE) {
 977                 och_p = &lli->lli_mds_write_och;
 978                 och_usecount = &lli->lli_open_fd_write_count;
 979         } else {
 980                 och_p = &lli->lli_mds_read_och;
 981                 och_usecount = &lli->lli_open_fd_read_count;
 982         }
 983
 984         /* The file may have been open by another process (broken lease) so
 985          * *och_p is not NULL. In this case we should simply increase usecount
 986          * and close fd_och.
 987          */
 988         if (*och_p != NULL) {
 989                 old_och = fd->fd_och;
 990                 (*och_usecount)++;
 991         } else {
 992                 *och_p = fd->fd_och;
 993                 *och_usecount = 1;
 994         }
 995         fd->fd_och = NULL;
 996         mutex_unlock(&lli->lli_och_mutex);
 997
 998         if (old_och != NULL)
 999                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1000
1001         RETURN(rc);
1002 }
1003
1004 /**
1005  * Acquire a lease and open the file.
1006  */
1007 static struct obd_client_handle *
1008 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1009               __u64 open_flags)
1010 {
1011         struct lookup_intent it = { .it_op = IT_OPEN };
1012         struct ll_sb_info *sbi = ll_i2sbi(inode);
1013         struct md_op_data *op_data;
1014         struct ptlrpc_request *req = NULL;
1015         struct lustre_handle old_open_handle = { 0 };
1016         struct obd_client_handle *och = NULL;
1017         int rc;
1018         int rc2;
1019         ENTRY;
1020
1021         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1022                 RETURN(ERR_PTR(-EINVAL));
1023
1024         if (file != NULL) {
1025                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1026                         RETURN(ERR_PTR(-EPERM));
1027
1028                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1029                 if (rc)
1030                         RETURN(ERR_PTR(rc));
1031         }
1032
1033         OBD_ALLOC_PTR(och);
1034         if (och == NULL)
1035                 RETURN(ERR_PTR(-ENOMEM));
1036
1037         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1038                                         LUSTRE_OPC_ANY, NULL);
1039         if (IS_ERR(op_data))
1040                 GOTO(out, rc = PTR_ERR(op_data));
1041
1042         /* To tell the MDT this openhandle is from the same owner */
1043         op_data->op_open_handle = old_open_handle;
1044
1045         it.it_flags = fmode | open_flags;
1046         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1047         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1048                             &ll_md_blocking_lease_ast,
1049         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1050          * it can be cancelled which may mislead applications that the lease is
1051          * broken;
1052          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1053          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1054          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1055                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1056         ll_finish_md_op_data(op_data);
1057         ptlrpc_req_finished(req);
1058         if (rc < 0)
1059                 GOTO(out_release_it, rc);
1060
1061         if (it_disposition(&it, DISP_LOOKUP_NEG))
1062                 GOTO(out_release_it, rc = -ENOENT);
1063
1064         rc = it_open_error(DISP_OPEN_OPEN, &it);
1065         if (rc)
1066                 GOTO(out_release_it, rc);
1067
1068         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1069         ll_och_fill(sbi->ll_md_exp, &it, och);
1070
1071         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1072                 GOTO(out_close, rc = -EOPNOTSUPP);
1073
1074         /* already get lease, handle lease lock */
1075         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1076         if (it.it_lock_mode == 0 ||
1077             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1078                 /* open lock must return for lease */
1079                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1080                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1081                         it.it_lock_bits);
1082                 GOTO(out_close, rc = -EPROTO);
1083         }
1084
1085         ll_intent_release(&it);
1086         RETURN(och);
1087
1088 out_close:
1089         /* Cancel open lock */
1090         if (it.it_lock_mode != 0) {
1091                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1092                                             it.it_lock_mode);
1093                 it.it_lock_mode = 0;
1094                 och->och_lease_handle.cookie = 0ULL;
1095         }
1096         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1097         if (rc2 < 0)
1098                 CERROR("%s: error closing file "DFID": %d\n",
1099                        sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1100         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1101 out_release_it:
1102         ll_intent_release(&it);
1103 out:
1104         if (och != NULL)
1105                 OBD_FREE_PTR(och);
1106         RETURN(ERR_PTR(rc));
1107 }
1108
1109 /**
1110  * Check whether a layout swap can be done between two inodes.
1111  *
1112  * \param[in] inode1  First inode to check
1113  * \param[in] inode2  Second inode to check
1114  *
1115  * \retval 0 on success, layout swap can be performed between both inodes
1116  * \retval negative error code if requirements are not met
1117  */
1118 static int ll_check_swap_layouts_validity(struct inode *inode1,
1119                                           struct inode *inode2)
1120 {
1121         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1122                 return -EINVAL;
1123
1124         if (inode_permission(inode1, MAY_WRITE) ||
1125             inode_permission(inode2, MAY_WRITE))
1126                 return -EPERM;
1127
1128         if (inode1->i_sb != inode2->i_sb)
1129                 return -EXDEV;
1130
1131         return 0;
1132 }
1133
1134 static int ll_swap_layouts_close(struct obd_client_handle *och,
1135                                  struct inode *inode, struct inode *inode2)
1136 {
1137         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1138         const struct lu_fid     *fid2;
1139         int                      rc;
1140         ENTRY;
1141
1142         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1143                ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1144
1145         rc = ll_check_swap_layouts_validity(inode, inode2);
1146         if (rc < 0)
1147                 GOTO(out_free_och, rc);
1148
1149         /* We now know that inode2 is a lustre inode */
1150         fid2 = ll_inode2fid(inode2);
1151
1152         rc = lu_fid_cmp(fid1, fid2);
1153         if (rc == 0)
1154                 GOTO(out_free_och, rc = -EINVAL);
1155
1156         /* Close the file and {swap,merge} layouts between inode & inode2.
1157          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1158          * because we still need it to pack l_remote_handle to MDT. */
1159         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1160                                        inode2);
1161
1162         och = NULL; /* freed in ll_close_inode_openhandle() */
1163
1164 out_free_och:
1165         if (och != NULL)
1166                 OBD_FREE_PTR(och);
1167
1168         RETURN(rc);
1169 }
1170
1171 /**
1172  * Release lease and close the file.
1173  * It will check if the lease has ever broken.
1174  */
1175 static int ll_lease_close_intent(struct obd_client_handle *och,
1176                                  struct inode *inode,
1177                                  bool *lease_broken, enum mds_op_bias bias,
1178                                  void *data)
1179 {
1180         struct ldlm_lock *lock;
1181         bool cancelled = true;
1182         int rc;
1183         ENTRY;
1184
1185         lock = ldlm_handle2lock(&och->och_lease_handle);
1186         if (lock != NULL) {
1187                 lock_res_and_lock(lock);
1188                 cancelled = ldlm_is_cancel(lock);
1189                 unlock_res_and_lock(lock);
1190                 LDLM_LOCK_PUT(lock);
1191         }
1192
1193         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1194                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1195
1196         if (lease_broken != NULL)
1197                 *lease_broken = cancelled;
1198
1199         if (!cancelled && !bias)
1200                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1201
1202         if (cancelled) { /* no need to excute intent */
1203                 bias = 0;
1204                 data = NULL;
1205         }
1206
1207         rc = ll_close_inode_openhandle(inode, och, bias, data);
1208         RETURN(rc);
1209 }
1210
1211 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1212                           bool *lease_broken)
1213 {
1214         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1215 }
1216
1217 /**
1218  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1219  */
1220 static int ll_lease_file_resync(struct obd_client_handle *och,
1221                                 struct inode *inode, unsigned long arg)
1222 {
1223         struct ll_sb_info *sbi = ll_i2sbi(inode);
1224         struct md_op_data *op_data;
1225         struct ll_ioc_lease_id ioc;
1226         __u64 data_version_unused;
1227         int rc;
1228         ENTRY;
1229
1230         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1231                                      LUSTRE_OPC_ANY, NULL);
1232         if (IS_ERR(op_data))
1233                 RETURN(PTR_ERR(op_data));
1234
1235         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1236                            sizeof(ioc)))
1237                 RETURN(-EFAULT);
1238
1239         /* before starting file resync, it's necessary to clean up page cache
1240          * in client memory, otherwise once the layout version is increased,
1241          * writing back cached data will be denied the OSTs. */
1242         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1243         if (rc)
1244                 GOTO(out, rc);
1245
1246         op_data->op_lease_handle = och->och_lease_handle;
1247         op_data->op_mirror_id = ioc.lil_mirror_id;
1248         rc = md_file_resync(sbi->ll_md_exp, op_data);
1249         if (rc)
1250                 GOTO(out, rc);
1251
1252         EXIT;
1253 out:
1254         ll_finish_md_op_data(op_data);
1255         return rc;
1256 }
1257
1258 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1259 {
1260         struct ll_inode_info *lli = ll_i2info(inode);
1261         struct cl_object *obj = lli->lli_clob;
1262         struct cl_attr *attr = vvp_env_thread_attr(env);
1263         s64 atime;
1264         s64 mtime;
1265         s64 ctime;
1266         int rc = 0;
1267
1268         ENTRY;
1269
1270         ll_inode_size_lock(inode);
1271
1272         /* Merge timestamps the most recently obtained from MDS with
1273          * timestamps obtained from OSTs.
1274          *
1275          * Do not overwrite atime of inode because it may be refreshed
1276          * by file_accessed() function. If the read was served by cache
1277          * data, there is no RPC to be sent so that atime may not be
1278          * transferred to OSTs at all. MDT only updates atime at close time
1279          * if it's at least 'mdd.*.atime_diff' older.
1280          * All in all, the atime in Lustre does not strictly comply with
1281          * POSIX. Solving this problem needs to send an RPC to MDT for each
1282          * read, this will hurt performance.
1283          */
1284         if (inode->i_atime.tv_sec < lli->lli_atime ||
1285             lli->lli_update_atime) {
1286                 inode->i_atime.tv_sec = lli->lli_atime;
1287                 lli->lli_update_atime = 0;
1288         }
1289         inode->i_mtime.tv_sec = lli->lli_mtime;
1290         inode->i_ctime.tv_sec = lli->lli_ctime;
1291
1292         mtime = inode->i_mtime.tv_sec;
1293         atime = inode->i_atime.tv_sec;
1294         ctime = inode->i_ctime.tv_sec;
1295
1296         cl_object_attr_lock(obj);
1297         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1298                 rc = -EINVAL;
1299         else
1300                 rc = cl_object_attr_get(env, obj, attr);
1301         cl_object_attr_unlock(obj);
1302
1303         if (rc != 0)
1304                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1305
1306         if (atime < attr->cat_atime)
1307                 atime = attr->cat_atime;
1308
1309         if (ctime < attr->cat_ctime)
1310                 ctime = attr->cat_ctime;
1311
1312         if (mtime < attr->cat_mtime)
1313                 mtime = attr->cat_mtime;
1314
1315         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1316                PFID(&lli->lli_fid), attr->cat_size);
1317
1318         i_size_write(inode, attr->cat_size);
1319         inode->i_blocks = attr->cat_blocks;
1320
1321         inode->i_mtime.tv_sec = mtime;
1322         inode->i_atime.tv_sec = atime;
1323         inode->i_ctime.tv_sec = ctime;
1324
1325 out_size_unlock:
1326         ll_inode_size_unlock(inode);
1327
1328         RETURN(rc);
1329 }
1330
1331 /**
1332  * Set designated mirror for I/O.
1333  *
1334  * So far only read, write, and truncated can support to issue I/O to
1335  * designated mirror.
1336  */
1337 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1338 {
1339         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1340
1341         /* clear layout version for generic(non-resync) I/O in case it carries
1342          * stale layout version due to I/O restart */
1343         io->ci_layout_version = 0;
1344
1345         /* FLR: disable non-delay for designated mirror I/O because obviously
1346          * only one mirror is available */
1347         if (fd->fd_designated_mirror > 0) {
1348                 io->ci_ndelay = 0;
1349                 io->ci_designated_mirror = fd->fd_designated_mirror;
1350                 io->ci_layout_version = fd->fd_layout_version;
1351         }
1352
1353         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1354                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1355 }
1356
1357 static bool file_is_noatime(const struct file *file)
1358 {
1359         const struct vfsmount *mnt = file->f_path.mnt;
1360         const struct inode *inode = file_inode((struct file *)file);
1361
1362         /* Adapted from file_accessed() and touch_atime().*/
1363         if (file->f_flags & O_NOATIME)
1364                 return true;
1365
1366         if (inode->i_flags & S_NOATIME)
1367                 return true;
1368
1369         if (IS_NOATIME(inode))
1370                 return true;
1371
1372         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1373                 return true;
1374
1375         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1376                 return true;
1377
1378         if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
1379                 return true;
1380
1381         return false;
1382 }
1383
1384 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1385                 struct vvp_io_args *args)
1386 {
1387         struct inode *inode = file_inode(file);
1388         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1389
1390         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1391         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1392
1393         if (iot == CIT_WRITE) {
1394                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1395                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1396                                            file->f_flags & O_DIRECT ||
1397                                            IS_SYNC(inode));
1398 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
1399                 io->u.ci_wr.wr_sync  |= !!(args &&
1400                                            args->via_io_subtype == IO_NORMAL &&
1401                                            args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1402 #endif
1403         }
1404
1405         io->ci_obj = ll_i2info(inode)->lli_clob;
1406         io->ci_lockreq = CILR_MAYBE;
1407         if (ll_file_nolock(file)) {
1408                 io->ci_lockreq = CILR_NEVER;
1409                 io->ci_no_srvlock = 1;
1410         } else if (file->f_flags & O_APPEND) {
1411                 io->ci_lockreq = CILR_MANDATORY;
1412         }
1413         io->ci_noatime = file_is_noatime(file);
1414         io->ci_async_readahead = false;
1415
1416         /* FLR: only use non-delay I/O for read as there is only one
1417          * avaliable mirror for write. */
1418         io->ci_ndelay = !(iot == CIT_WRITE);
1419
1420         ll_io_set_mirror(io, file);
1421 }
1422
1423 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1424                         __u64 count)
1425 {
1426         struct ll_inode_info *lli = ll_i2info(inode);
1427         struct ll_sb_info *sbi = ll_i2sbi(inode);
1428         enum obd_heat_type sample_type;
1429         enum obd_heat_type iobyte_type;
1430         __u64 now = ktime_get_real_seconds();
1431
1432         if (!ll_sbi_has_file_heat(sbi) ||
1433             lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1434                 return;
1435
1436         if (iot == CIT_READ) {
1437                 sample_type = OBD_HEAT_READSAMPLE;
1438                 iobyte_type = OBD_HEAT_READBYTE;
1439         } else if (iot == CIT_WRITE) {
1440                 sample_type = OBD_HEAT_WRITESAMPLE;
1441                 iobyte_type = OBD_HEAT_WRITEBYTE;
1442         } else {
1443                 return;
1444         }
1445
1446         spin_lock(&lli->lli_heat_lock);
1447         obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1448                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1449         obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1450                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1451         spin_unlock(&lli->lli_heat_lock);
1452 }
1453
1454 static ssize_t
1455 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1456                    struct file *file, enum cl_io_type iot,
1457                    loff_t *ppos, size_t count)
1458 {
1459         struct vvp_io           *vio = vvp_env_io(env);
1460         struct inode            *inode = file_inode(file);
1461         struct ll_inode_info    *lli = ll_i2info(inode);
1462         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1463         struct range_lock       range;
1464         struct cl_io            *io;
1465         ssize_t                 result = 0;
1466         int                     rc = 0;
1467         unsigned                retried = 0;
1468         bool                    restarted = false;
1469
1470         ENTRY;
1471
1472         CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1473                 file_dentry(file)->d_name.name,
1474                 iot == CIT_READ ? "read" : "write", *ppos, count);
1475
1476 restart:
1477         io = vvp_env_thread_io(env);
1478         ll_io_init(io, file, iot, args);
1479         io->ci_ndelay_tried = retried;
1480
1481         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1482                 bool range_locked = false;
1483
1484                 if (file->f_flags & O_APPEND)
1485                         range_lock_init(&range, 0, LUSTRE_EOF);
1486                 else
1487                         range_lock_init(&range, *ppos, *ppos + count - 1);
1488
1489                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1490                 vio->vui_io_subtype = args->via_io_subtype;
1491
1492                 switch (vio->vui_io_subtype) {
1493                 case IO_NORMAL:
1494                         vio->vui_iter = args->u.normal.via_iter;
1495                         vio->vui_iocb = args->u.normal.via_iocb;
1496                         /* Direct IO reads must also take range lock,
1497                          * or multiple reads will try to work on the same pages
1498                          * See LU-6227 for details. */
1499                         if (((iot == CIT_WRITE) ||
1500                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1501                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1502                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1503                                        RL_PARA(&range));
1504                                 rc = range_lock(&lli->lli_write_tree, &range);
1505                                 if (rc < 0)
1506                                         GOTO(out, rc);
1507
1508                                 range_locked = true;
1509                         }
1510                         break;
1511                 case IO_SPLICE:
1512                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1513                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1514                         break;
1515                 default:
1516                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1517                         LBUG();
1518                 }
1519
1520                 ll_cl_add(file, env, io, LCC_RW);
1521                 rc = cl_io_loop(env, io);
1522                 ll_cl_remove(file, env);
1523
1524                 if (range_locked) {
1525                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1526                                RL_PARA(&range));
1527                         range_unlock(&lli->lli_write_tree, &range);
1528                 }
1529         } else {
1530                 /* cl_io_rw_init() handled IO */
1531                 rc = io->ci_result;
1532         }
1533
1534         if (io->ci_nob > 0) {
1535                 result += io->ci_nob;
1536                 count  -= io->ci_nob;
1537                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1538
1539                 /* prepare IO restart */
1540                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1541                         args->u.normal.via_iter = vio->vui_iter;
1542         }
1543 out:
1544         cl_io_fini(env, io);
1545
1546         CDEBUG(D_VFSTRACE,
1547                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1548                file->f_path.dentry->d_name.name,
1549                iot, rc, result, io->ci_need_restart);
1550
1551         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1552                 CDEBUG(D_VFSTRACE,
1553                        "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1554                        file_dentry(file)->d_name.name,
1555                        iot == CIT_READ ? "read" : "write",
1556                        *ppos, count, result, rc);
1557                 /* preserve the tried count for FLR */
1558                 retried = io->ci_ndelay_tried;
1559                 restarted = true;
1560                 goto restart;
1561         }
1562
1563         if (iot == CIT_READ) {
1564                 if (result > 0)
1565                         ll_stats_ops_tally(ll_i2sbi(inode),
1566                                            LPROC_LL_READ_BYTES, result);
1567         } else if (iot == CIT_WRITE) {
1568                 if (result > 0) {
1569                         ll_stats_ops_tally(ll_i2sbi(inode),
1570                                            LPROC_LL_WRITE_BYTES, result);
1571                         fd->fd_write_failed = false;
1572                 } else if (result == 0 && rc == 0) {
1573                         rc = io->ci_result;
1574                         if (rc < 0)
1575                                 fd->fd_write_failed = true;
1576                         else
1577                                 fd->fd_write_failed = false;
1578                 } else if (rc != -ERESTARTSYS) {
1579                         fd->fd_write_failed = true;
1580                 }
1581         }
1582
1583         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1584         if (result > 0)
1585                 ll_heat_add(inode, iot, result);
1586
1587         RETURN(result > 0 ? result : rc);
1588 }
1589
1590 /**
1591  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1592  * especially for small I/O.
1593  *
1594  * To serve a read request, CLIO has to create and initialize a cl_io and
1595  * then request DLM lock. This has turned out to have siginificant overhead
1596  * and affects the performance of small I/O dramatically.
1597  *
1598  * It's not necessary to create a cl_io for each I/O. Under the help of read
1599  * ahead, most of the pages being read are already in memory cache and we can
1600  * read those pages directly because if the pages exist, the corresponding DLM
1601  * lock must exist so that page content must be valid.
1602  *
1603  * In fast read implementation, the llite speculatively finds and reads pages
1604  * in memory cache. There are three scenarios for fast read:
1605  *   - If the page exists and is uptodate, kernel VM will provide the data and
1606  *     CLIO won't be intervened;
1607  *   - If the page was brought into memory by read ahead, it will be exported
1608  *     and read ahead parameters will be updated;
1609  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1610  *     it will go back and invoke normal read, i.e., a cl_io will be created
1611  *     and DLM lock will be requested.
1612  *
1613  * POSIX compliance: posix standard states that read is intended to be atomic.
1614  * Lustre read implementation is in line with Linux kernel read implementation
1615  * and neither of them complies with POSIX standard in this matter. Fast read
1616  * doesn't make the situation worse on single node but it may interleave write
1617  * results from multiple nodes due to short read handling in ll_file_aio_read().
1618  *
1619  * \param env - lu_env
1620  * \param iocb - kiocb from kernel
1621  * \param iter - user space buffers where the data will be copied
1622  *
1623  * \retval - number of bytes have been read, or error code if error occurred.
1624  */
1625 static ssize_t
1626 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1627 {
1628         ssize_t result;
1629
1630         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1631                 return 0;
1632
1633         /* NB: we can't do direct IO for fast read because it will need a lock
1634          * to make IO engine happy. */
1635         if (iocb->ki_filp->f_flags & O_DIRECT)
1636                 return 0;
1637
1638         result = generic_file_read_iter(iocb, iter);
1639
1640         /* If the first page is not in cache, generic_file_aio_read() will be
1641          * returned with -ENODATA.
1642          * See corresponding code in ll_readpage(). */
1643         if (result == -ENODATA)
1644                 result = 0;
1645
1646         if (result > 0) {
1647                 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1648                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1649                                 LPROC_LL_READ_BYTES, result);
1650         }
1651
1652         return result;
1653 }
1654
1655 /*
1656  * Read from a file (through the page cache).
1657  */
1658 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1659 {
1660         struct lu_env *env;
1661         struct vvp_io_args *args;
1662         struct file *file = iocb->ki_filp;
1663         ssize_t result;
1664         ssize_t rc2;
1665         __u16 refcheck;
1666         bool cached;
1667
1668         if (!iov_iter_count(to))
1669                 return 0;
1670
1671         /**
1672          * Currently when PCC read failed, we do not fall back to the
1673          * normal read path, just return the error.
1674          * The resaon is that: for RW-PCC, the file data may be modified
1675          * in the PCC and inconsistent with the data on OSTs (or file
1676          * data has been removed from the Lustre file system), at this
1677          * time, fallback to the normal read path may read the wrong
1678          * data.
1679          * TODO: for RO-PCC (readonly PCC), fall back to normal read
1680          * path: read data from data copy on OSTs.
1681          */
1682         result = pcc_file_read_iter(iocb, to, &cached);
1683         if (cached)
1684                 return result;
1685
1686         ll_ras_enter(file);
1687
1688         result = ll_do_fast_read(iocb, to);
1689         if (result < 0 || iov_iter_count(to) == 0)
1690                 GOTO(out, result);
1691
1692         env = cl_env_get(&refcheck);
1693         if (IS_ERR(env))
1694                 return PTR_ERR(env);
1695
1696         args = ll_env_args(env, IO_NORMAL);
1697         args->u.normal.via_iter = to;
1698         args->u.normal.via_iocb = iocb;
1699
1700         rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1701                                  &iocb->ki_pos, iov_iter_count(to));
1702         if (rc2 > 0)
1703                 result += rc2;
1704         else if (result == 0)
1705                 result = rc2;
1706
1707         cl_env_put(env, &refcheck);
1708 out:
1709         if (result > 0)
1710                 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1711                                   LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1712                                   READ);
1713
1714         return result;
1715 }
1716
1717 /**
1718  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1719  * If a page is already in the page cache and dirty (and some other things -
1720  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1721  * write to it without doing a full I/O, because Lustre already knows about it
1722  * and will write it out.  This saves a lot of processing time.
1723  *
1724  * All writes here are within one page, so exclusion is handled by the page
1725  * lock on the vm page.  We do not do tiny writes for writes which touch
1726  * multiple pages because it's very unlikely multiple sequential pages are
1727  * are already dirty.
1728  *
1729  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1730  * and are unlikely to be to already dirty pages.
1731  *
1732  * Attribute updates are important here, we do them in ll_tiny_write_end.
1733  */
1734 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1735 {
1736         ssize_t count = iov_iter_count(iter);
1737         struct  file *file = iocb->ki_filp;
1738         struct  inode *inode = file_inode(file);
1739         bool    lock_inode = !IS_NOSEC(inode);
1740         ssize_t result = 0;
1741
1742         ENTRY;
1743
1744         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1745          * of function for why.
1746          */
1747         if (count >= PAGE_SIZE ||
1748             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1749                 RETURN(0);
1750
1751         if (unlikely(lock_inode))
1752                 inode_lock(inode);
1753         result = __generic_file_write_iter(iocb, iter);
1754
1755         if (unlikely(lock_inode))
1756                 inode_unlock(inode);
1757
1758         /* If the page is not already dirty, ll_tiny_write_begin returns
1759          * -ENODATA.  We continue on to normal write.
1760          */
1761         if (result == -ENODATA)
1762                 result = 0;
1763
1764         if (result > 0) {
1765                 ll_heat_add(inode, CIT_WRITE, result);
1766                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1767                                    result);
1768                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1769         }
1770
1771         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1772
1773         RETURN(result);
1774 }
1775
1776 /*
1777  * Write to a file (through the page cache).
1778  */
1779 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1780 {
1781         struct vvp_io_args *args;
1782         struct lu_env *env;
1783         ssize_t rc_tiny = 0, rc_normal;
1784         struct file *file = iocb->ki_filp;
1785         __u16 refcheck;
1786         bool cached;
1787         int result;
1788
1789         ENTRY;
1790
1791         if (!iov_iter_count(from))
1792                 GOTO(out, rc_normal = 0);
1793
1794         /**
1795          * When PCC write failed, we usually do not fall back to the normal
1796          * write path, just return the error. But there is a special case when
1797          * returned error code is -ENOSPC due to running out of space on PCC HSM
1798          * bakcend. At this time, it will fall back to normal I/O path and
1799          * retry the I/O. As the file is in HSM released state, it will restore
1800          * the file data to OSTs first and redo the write again. And the
1801          * restore process will revoke the layout lock and detach the file
1802          * from PCC cache automatically.
1803          */
1804         result = pcc_file_write_iter(iocb, from, &cached);
1805         if (cached && result != -ENOSPC && result != -EDQUOT)
1806                 return result;
1807
1808         /* NB: we can't do direct IO for tiny writes because they use the page
1809          * cache, we can't do sync writes because tiny writes can't flush
1810          * pages, and we can't do append writes because we can't guarantee the
1811          * required DLM locks are held to protect file size.
1812          */
1813         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1814             !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1815                 rc_tiny = ll_do_tiny_write(iocb, from);
1816
1817         /* In case of error, go on and try normal write - Only stop if tiny
1818          * write completed I/O.
1819          */
1820         if (iov_iter_count(from) == 0)
1821                 GOTO(out, rc_normal = rc_tiny);
1822
1823         env = cl_env_get(&refcheck);
1824         if (IS_ERR(env))
1825                 return PTR_ERR(env);
1826
1827         args = ll_env_args(env, IO_NORMAL);
1828         args->u.normal.via_iter = from;
1829         args->u.normal.via_iocb = iocb;
1830
1831         rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1832                                        &iocb->ki_pos, iov_iter_count(from));
1833
1834         /* On success, combine bytes written. */
1835         if (rc_tiny >= 0 && rc_normal > 0)
1836                 rc_normal += rc_tiny;
1837         /* On error, only return error from normal write if tiny write did not
1838          * write any bytes.  Otherwise return bytes written by tiny write.
1839          */
1840         else if (rc_tiny > 0)
1841                 rc_normal = rc_tiny;
1842
1843         cl_env_put(env, &refcheck);
1844 out:
1845         if (rc_normal > 0)
1846                 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1847                                   LUSTRE_FPRIVATE(file), iocb->ki_pos,
1848                                   rc_normal, WRITE);
1849         RETURN(rc_normal);
1850 }
1851
1852 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1853 /*
1854  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1855  */
1856 static int ll_file_get_iov_count(const struct iovec *iov,
1857                                  unsigned long *nr_segs, size_t *count)
1858 {
1859         size_t cnt = 0;
1860         unsigned long seg;
1861
1862         for (seg = 0; seg < *nr_segs; seg++) {
1863                 const struct iovec *iv = &iov[seg];
1864
1865                 /*
1866                  * If any segment has a negative length, or the cumulative
1867                  * length ever wraps negative then return -EINVAL.
1868                  */
1869                 cnt += iv->iov_len;
1870                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1871                         return -EINVAL;
1872                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1873                         continue;
1874                 if (seg == 0)
1875                         return -EFAULT;
1876                 *nr_segs = seg;
1877                 cnt -= iv->iov_len;     /* This segment is no good */
1878                 break;
1879         }
1880         *count = cnt;
1881         return 0;
1882 }
1883
1884 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1885                                 unsigned long nr_segs, loff_t pos)
1886 {
1887         struct iov_iter to;
1888         size_t iov_count;
1889         ssize_t result;
1890         ENTRY;
1891
1892         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1893         if (result)
1894                 RETURN(result);
1895
1896         if (!iov_count)
1897                 RETURN(0);
1898
1899 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1900         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1901 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1902         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1903 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1904
1905         result = ll_file_read_iter(iocb, &to);
1906
1907         RETURN(result);
1908 }
1909
1910 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1911                             loff_t *ppos)
1912 {
1913         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1914         struct kiocb   kiocb;
1915         ssize_t        result;
1916
1917         ENTRY;
1918
1919         if (!count)
1920                 RETURN(0);
1921
1922         init_sync_kiocb(&kiocb, file);
1923         kiocb.ki_pos = *ppos;
1924 #ifdef HAVE_KIOCB_KI_LEFT
1925         kiocb.ki_left = count;
1926 #elif defined(HAVE_KI_NBYTES)
1927         kiocb.i_nbytes = count;
1928 #endif
1929
1930         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1931         *ppos = kiocb.ki_pos;
1932
1933         RETURN(result);
1934 }
1935
1936 /*
1937  * Write to a file (through the page cache).
1938  * AIO stuff
1939  */
1940 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1941                                  unsigned long nr_segs, loff_t pos)
1942 {
1943         struct iov_iter from;
1944         size_t iov_count;
1945         ssize_t result;
1946         ENTRY;
1947
1948         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1949         if (result)
1950                 RETURN(result);
1951
1952         if (!iov_count)
1953                 RETURN(0);
1954
1955 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1956         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1957 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1958         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1959 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1960
1961         result = ll_file_write_iter(iocb, &from);
1962
1963         RETURN(result);
1964 }
1965
1966 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1967                              size_t count, loff_t *ppos)
1968 {
1969         struct iovec   iov = { .iov_base = (void __user *)buf,
1970                                .iov_len = count };
1971         struct kiocb   kiocb;
1972         ssize_t        result;
1973
1974         ENTRY;
1975
1976         if (!count)
1977                 RETURN(0);
1978
1979         init_sync_kiocb(&kiocb, file);
1980         kiocb.ki_pos = *ppos;
1981 #ifdef HAVE_KIOCB_KI_LEFT
1982         kiocb.ki_left = count;
1983 #elif defined(HAVE_KI_NBYTES)
1984         kiocb.ki_nbytes = count;
1985 #endif
1986
1987         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1988         *ppos = kiocb.ki_pos;
1989
1990         RETURN(result);
1991 }
1992 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1993
1994 /*
1995  * Send file content (through pagecache) somewhere with helper
1996  */
1997 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1998                                    struct pipe_inode_info *pipe, size_t count,
1999                                    unsigned int flags)
2000 {
2001         struct lu_env *env;
2002         struct vvp_io_args *args;
2003         ssize_t result;
2004         __u16 refcheck;
2005         bool cached;
2006
2007         ENTRY;
2008
2009         result = pcc_file_splice_read(in_file, ppos, pipe,
2010                                       count, flags, &cached);
2011         if (cached)
2012                 RETURN(result);
2013
2014         ll_ras_enter(in_file);
2015
2016         env = cl_env_get(&refcheck);
2017         if (IS_ERR(env))
2018                 RETURN(PTR_ERR(env));
2019
2020         args = ll_env_args(env, IO_SPLICE);
2021         args->u.splice.via_pipe = pipe;
2022         args->u.splice.via_flags = flags;
2023
2024         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2025         cl_env_put(env, &refcheck);
2026
2027         if (result > 0)
2028                 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2029                                   LUSTRE_FPRIVATE(in_file), *ppos, result,
2030                                   READ);
2031         RETURN(result);
2032 }
2033
2034 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2035                              __u64 flags, struct lov_user_md *lum, int lum_size)
2036 {
2037         struct lookup_intent oit = {
2038                 .it_op = IT_OPEN,
2039                 .it_flags = flags | MDS_OPEN_BY_FID,
2040         };
2041         int rc;
2042         ENTRY;
2043
2044         if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2045             le32_to_cpu(LOV_MAGIC_MAGIC)) {
2046                 /* this code will only exist for big-endian systems */
2047                 lustre_swab_lov_user_md(lum, 0);
2048         }
2049
2050         ll_inode_size_lock(inode);
2051         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2052         if (rc < 0)
2053                 GOTO(out_unlock, rc);
2054
2055         ll_release_openhandle(dentry, &oit);
2056
2057 out_unlock:
2058         ll_inode_size_unlock(inode);
2059         ll_intent_release(&oit);
2060
2061         RETURN(rc);
2062 }
2063
/*
 * Fetch the layout EA of \a filename with md_getattr_name() and return it,
 * converted to host endianness when needed.
 *
 * \param inode    - inode used to find the superblock info and to prepare
 *                   the metadata operation
 * \param filename - name passed to the getattr-by-name request
 * \param lmmp     - [out] layout found in the reply, or NULL
 * \param lmm_size - [out] layout size in bytes
 * \param request  - [out] the request holding the reply buffers
 *
 * The three output parameters are only written on the paths that reach the
 * "out" label; the two early RETURNs leave them untouched.
 * NOTE(review): *lmmp points into the reply of *request, so the caller
 * presumably has to release *request when done - confirm against callers.
 *
 * \retval 0         success
 * \retval -ENODATA  the reply carries no layout EA
 * \retval -EPROTO   the layout magic is not one of the known values
 * \retval <0        other failure
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body  *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct md_op_data *op_data;
        int rc, lmmsize;

        /* Start from the default EA size to size the request. */
        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc)
                RETURN(rc);

        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
                                     strlen(filename), lmmsize,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
        ll_finish_md_op_data(op_data);
        if (rc < 0) {
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);
                GOTO(out, rc);
        }

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */

        /* From here on lmmsize is the EA size reported in the reply. */
        lmmsize = body->mbo_eadatasize;

        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
                        lmmsize == 0) {
                GOTO(out, rc = -ENODATA);
        }

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);

        /* Only these layout magics (in little-endian form) are accepted. */
        if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
                GOTO(out, rc = -EPROTO);

        /*
         * This is coming from the MDS, so is probably in
         * little endian.  We convert it to host endian before
         * passing it to userspace.
         */
        if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
            __swab32(LOV_MAGIC_MAGIC)) {
                int stripe_count = 0;

                /* Read the stripe count before the header is swabbed
                 * below; a released layout is treated as having no
                 * objects to swab. */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
                    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
                        if (le32_to_cpu(lmm->lmm_pattern) &
                            LOV_PATTERN_F_RELEASED)
                                stripe_count = 0;
                }

                lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);

                /* if this function is called for a directory, we must
                 * avoid swabbing nonexistent lsm objects */
                if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
                        lustre_swab_lov_user_md_objects(
                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                                stripe_count);
                else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
                         S_ISREG(body->mbo_mode))
                        lustre_swab_lov_user_md_objects(
                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                                stripe_count);
        }

out:
        /* Outputs are filled in on success and on failure alike. */
        *lmmp = lmm;
        *lmm_size = lmmsize;
        *request = req;
        return rc;
}
2151
2152 static int ll_lov_setea(struct inode *inode, struct file *file,
2153                         void __user *arg)
2154 {
2155         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2156         struct lov_user_md      *lump;
2157         int                      lum_size = sizeof(struct lov_user_md) +
2158                                             sizeof(struct lov_user_ost_data);
2159         int                      rc;
2160         ENTRY;
2161
2162         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2163                 RETURN(-EPERM);
2164
2165         OBD_ALLOC_LARGE(lump, lum_size);
2166         if (lump == NULL)
2167                 RETURN(-ENOMEM);
2168
2169         if (copy_from_user(lump, arg, lum_size))
2170                 GOTO(out_lump, rc = -EFAULT);
2171
2172         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2173                                       lum_size);
2174         cl_lov_delay_create_clear(&file->f_flags);
2175
2176 out_lump:
2177         OBD_FREE_LARGE(lump, lum_size);
2178         RETURN(rc);
2179 }
2180
2181 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2182 {
2183         struct lu_env   *env;
2184         __u16           refcheck;
2185         int             rc;
2186         ENTRY;
2187
2188         env = cl_env_get(&refcheck);
2189         if (IS_ERR(env))
2190                 RETURN(PTR_ERR(env));
2191
2192         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2193         cl_env_put(env, &refcheck);
2194         RETURN(rc);
2195 }
2196
2197 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2198                             void __user *arg)
2199 {
2200         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2201         struct lov_user_md        *klum;
2202         int                        lum_size, rc;
2203         __u64                      flags = FMODE_WRITE;
2204         ENTRY;
2205
2206         rc = ll_copy_user_md(lum, &klum);
2207         if (rc < 0)
2208                 RETURN(rc);
2209
2210         lum_size = rc;
2211         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2212                                       lum_size);
2213         if (!rc) {
2214                 __u32 gen;
2215
2216                 rc = put_user(0, &lum->lmm_stripe_count);
2217                 if (rc)
2218                         GOTO(out, rc);
2219
2220                 rc = ll_layout_refresh(inode, &gen);
2221                 if (rc)
2222                         GOTO(out, rc);
2223
2224                 rc = ll_file_getstripe(inode, arg, lum_size);
2225         }
2226         cl_lov_delay_create_clear(&file->f_flags);
2227
2228 out:
2229         OBD_FREE_LARGE(klum, lum_size);
2230         RETURN(rc);
2231 }
2232
2233
/*
 * Take a group lock with group id \a arg on behalf of \a file.
 *
 * \param inode - inode of the file
 * \param file  - open file that will own the lock; O_NONBLOCK in its flags
 *                selects the non-blocking behaviour below
 * \param arg   - group id, must not be 0
 *
 * \retval 0            lock obtained and recorded in the file data
 * \retval -EINVAL      \a arg is 0, or this file already holds a group lock
 * \retval -EOPNOTSUPP  ll_file_nolock() is true for this file
 * \retval -EAGAIN      O_NONBLOCK is set and either the group mutex is busy
 *                      or a different group id is currently in use
 * \retval <0           other failure from layout or lock handling
 */
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_grouplock grouplock;
        int rc;
        ENTRY;

        if (arg == 0) {
                CWARN("group id for group lock must not be 0\n");
                RETURN(-EINVAL);
        }

        if (ll_file_nolock(file))
                RETURN(-EOPNOTSUPP);
retry:
        /* lli_group_mutex is held from here until the "out" label, except
         * while waiting for other group users below. */
        if (file->f_flags & O_NONBLOCK) {
                if (!mutex_trylock(&lli->lli_group_mutex))
                        RETURN(-EAGAIN);
        } else
                mutex_lock(&lli->lli_group_mutex);

        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
                CWARN("group lock already existed with gid %lu\n",
                      fd->fd_grouplock.lg_gid);
                GOTO(out, rc = -EINVAL);
        }
        /* A different group id is in use on this inode: fail right away
         * for O_NONBLOCK, otherwise drop the mutex, wait until there are no
         * group users left and start over. */
        if (arg != lli->lli_group_gid && lli->lli_group_users != 0) {
                if (file->f_flags & O_NONBLOCK)
                        GOTO(out, rc = -EAGAIN);
                mutex_unlock(&lli->lli_group_mutex);
                wait_var_event(&lli->lli_group_users, !lli->lli_group_users);
                GOTO(retry, rc = 0);
        }
        LASSERT(fd->fd_grouplock.lg_lock == NULL);

        /**
         * XXX: group lock needs to protect all OST objects while PFL
         * can add new OST objects during the IO, so we'd instantiate
         * all OST objects before getting its group lock.
         */
        if (obj) {
                struct lu_env *env;
                __u16 refcheck;
                struct cl_layout cl = {
                        .cl_is_composite = false,
                };
                struct lu_extent ext = {
                        .e_start = 0,
                        .e_end = OBD_OBJECT_EOF,
                };

                env = cl_env_get(&refcheck);
                if (IS_ERR(env))
                        GOTO(out, rc = PTR_ERR(env));

                /* Only composite layouts need the write intent, which is
                 * sent for the extent [0, OBD_OBJECT_EOF]. */
                rc = cl_object_layout_get(env, obj, &cl);
                if (!rc && cl.cl_is_composite)
                        rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
                                                    &ext);

                cl_env_put(env, &refcheck);
                if (rc)
                        GOTO(out, rc);
        }

        rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
                              arg, (file->f_flags & O_NONBLOCK), &grouplock);

        if (rc)
                GOTO(out, rc);

        /* Record the lock in the file data; the first user also sets the
         * inode's current group id. */
        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
        fd->fd_grouplock = grouplock;
        if (lli->lli_group_users == 0)
                lli->lli_group_gid = grouplock.lg_gid;
        lli->lli_group_users++;

        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
out:
        mutex_unlock(&lli->lli_group_mutex);

        RETURN(rc);
}
2320
2321 static int ll_put_grouplock(struct inode *inode, struct file *file,
2322                             unsigned long arg)
2323 {
2324         struct ll_inode_info   *lli = ll_i2info(inode);
2325         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2326         struct ll_grouplock     grouplock;
2327         int                     rc;
2328         ENTRY;
2329
2330         mutex_lock(&lli->lli_group_mutex);
2331         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2332                 CWARN("no group lock held\n");
2333                 GOTO(out, rc = -EINVAL);
2334         }
2335
2336         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2337
2338         if (fd->fd_grouplock.lg_gid != arg) {
2339                 CWARN("group lock %lu doesn't match current id %lu\n",
2340                       arg, fd->fd_grouplock.lg_gid);
2341                 GOTO(out, rc = -EINVAL);
2342         }
2343
2344         grouplock = fd->fd_grouplock;
2345         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2346         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2347
2348         cl_put_grouplock(&grouplock);
2349
2350         lli->lli_group_users--;
2351         if (lli->lli_group_users == 0) {
2352                 lli->lli_group_gid = 0;
2353                 wake_up_var(&lli->lli_group_users);
2354         }
2355         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2356         GOTO(out, rc = 0);
2357 out:
2358         mutex_unlock(&lli->lli_group_mutex);
2359
2360         RETURN(rc);
2361 }
2362
2363 /**
2364  * Close inode open handle
2365  *
2366  * \param dentry [in]     dentry which contains the inode
2367  * \param it     [in,out] intent which contains open info and result
2368  *
2369  * \retval 0     success
2370  * \retval <0    failure
2371  */
2372 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2373 {
2374         struct inode *inode = dentry->d_inode;
2375         struct obd_client_handle *och;
2376         int rc;
2377         ENTRY;
2378
2379         LASSERT(inode);
2380
2381         /* Root ? Do nothing. */
2382         if (dentry->d_inode->i_sb->s_root == dentry)
2383                 RETURN(0);
2384
2385         /* No open handle to close? Move away */
2386         if (!it_disposition(it, DISP_OPEN_OPEN))
2387                 RETURN(0);
2388
2389         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2390
2391         OBD_ALLOC(och, sizeof(*och));
2392         if (!och)
2393                 GOTO(out, rc = -ENOMEM);
2394
2395         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2396
2397         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2398 out:
2399         /* this one is in place of ll_file_open */
2400         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2401                 ptlrpc_req_finished(it->it_request);
2402                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2403         }
2404         RETURN(rc);
2405 }
2406
2407 /**
2408  * Get size for inode for which FIEMAP mapping is requested.
2409  * Make the FIEMAP get_info call and returns the result.
2410  * \param fiemap        kernel buffer to hold extens
2411  * \param num_bytes     kernel buffer size
2412  */
2413 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2414                         size_t num_bytes)
2415 {
2416         struct lu_env                   *env;
2417         __u16                           refcheck;
2418         int                             rc = 0;
2419         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2420         ENTRY;
2421
2422         /* Checks for fiemap flags */
2423         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2424                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2425                 return -EBADR;
2426         }
2427
2428         /* Check for FIEMAP_FLAG_SYNC */
2429         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2430                 rc = filemap_fdatawrite(inode->i_mapping);
2431                 if (rc)
2432                         return rc;
2433         }
2434
2435         env = cl_env_get(&refcheck);
2436         if (IS_ERR(env))
2437                 RETURN(PTR_ERR(env));
2438
2439         if (i_size_read(inode) == 0) {
2440                 rc = ll_glimpse_size(inode);
2441                 if (rc)
2442                         GOTO(out, rc);
2443         }
2444
2445         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2446         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2447         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2448
2449         /* If filesize is 0, then there would be no objects for mapping */
2450         if (fmkey.lfik_oa.o_size == 0) {
2451                 fiemap->fm_mapped_extents = 0;
2452                 GOTO(out, rc = 0);
2453         }
2454
2455         fmkey.lfik_fiemap = *fiemap;
2456
2457         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2458                               &fmkey, fiemap, &num_bytes);
2459 out:
2460         cl_env_put(env, &refcheck);
2461         RETURN(rc);
2462 }
2463
2464 int ll_fid2path(struct inode *inode, void __user *arg)
2465 {
2466         struct obd_export       *exp = ll_i2mdexp(inode);
2467         const struct getinfo_fid2path __user *gfin = arg;
2468         __u32                    pathlen;
2469         struct getinfo_fid2path *gfout;
2470         size_t                   outsize;
2471         int                      rc;
2472
2473         ENTRY;
2474
2475         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2476             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2477                 RETURN(-EPERM);
2478
2479         /* Only need to get the buflen */
2480         if (get_user(pathlen, &gfin->gf_pathlen))
2481                 RETURN(-EFAULT);
2482
2483         if (pathlen > PATH_MAX)
2484                 RETURN(-EINVAL);
2485
2486         outsize = sizeof(*gfout) + pathlen;
2487         OBD_ALLOC(gfout, outsize);
2488         if (gfout == NULL)
2489                 RETURN(-ENOMEM);
2490
2491         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2492                 GOTO(gf_free, rc = -EFAULT);
2493         /* append root FID after gfout to let MDT know the root FID so that it
2494          * can lookup the correct path, this is mainly for fileset.
2495          * old server without fileset mount support will ignore this. */
2496         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2497
2498         /* Call mdc_iocontrol */
2499         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2500         if (rc != 0)
2501                 GOTO(gf_free, rc);
2502
2503         if (copy_to_user(arg, gfout, outsize))
2504                 rc = -EFAULT;
2505
2506 gf_free:
2507         OBD_FREE(gfout, outsize);
2508         RETURN(rc);
2509 }
2510
2511 static int
2512 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2513 {
2514         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2515         struct lu_env *env;
2516         struct cl_io *io;
2517         __u16  refcheck;
2518         int result;
2519
2520         ENTRY;
2521
2522         ioc->idv_version = 0;
2523         ioc->idv_layout_version = UINT_MAX;
2524
2525         /* If no file object initialized, we consider its version is 0. */
2526         if (obj == NULL)
2527                 RETURN(0);
2528
2529         env = cl_env_get(&refcheck);
2530         if (IS_ERR(env))
2531                 RETURN(PTR_ERR(env));
2532
2533         io = vvp_env_thread_io(env);
2534         io->ci_obj = obj;
2535         io->u.ci_data_version.dv_data_version = 0;
2536         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2537         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2538
2539 restart:
2540         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2541                 result = cl_io_loop(env, io);
2542         else
2543                 result = io->ci_result;
2544
2545         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2546         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2547
2548         cl_io_fini(env, io);
2549
2550         if (unlikely(io->ci_need_restart))
2551                 goto restart;
2552
2553         cl_env_put(env, &refcheck);
2554
2555         RETURN(result);
2556 }
2557
2558 /*
2559  * Read the data_version for inode.
2560  *
2561  * This value is computed using stripe object version on OST.
2562  * Version is computed using server side locking.
2563  *
2564  * @param flags if do sync on the OST side;
2565  *              0: no sync
2566  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2567  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2568  */
2569 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2570 {
2571         struct ioc_data_version ioc = { .idv_flags = flags };
2572         int rc;
2573
2574         rc = ll_ioc_data_version(inode, &ioc);
2575         if (!rc)
2576                 *data_version = ioc.idv_version;
2577
2578         return rc;
2579 }
2580
2581 /*
2582  * Trigger a HSM release request for the provided inode.
2583  */
2584 int ll_hsm_release(struct inode *inode)
2585 {
2586         struct lu_env *env;
2587         struct obd_client_handle *och = NULL;
2588         __u64 data_version = 0;
2589         int rc;
2590         __u16 refcheck;
2591         ENTRY;
2592
2593         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2594                ll_i2sbi(inode)->ll_fsname,
2595                PFID(&ll_i2info(inode)->lli_fid));
2596
2597         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2598         if (IS_ERR(och))
2599                 GOTO(out, rc = PTR_ERR(och));
2600
2601         /* Grab latest data_version and [am]time values */
2602         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2603         if (rc != 0)
2604                 GOTO(out, rc);
2605
2606         env = cl_env_get(&refcheck);
2607         if (IS_ERR(env))
2608                 GOTO(out, rc = PTR_ERR(env));
2609
2610         rc = ll_merge_attr(env, inode);
2611         cl_env_put(env, &refcheck);
2612
2613         /* If error happen, we have the wrong size for a file.
2614          * Don't release it.
2615          */
2616         if (rc != 0)
2617                 GOTO(out, rc);
2618
2619         /* Release the file.
2620          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2621          * we still need it to pack l_remote_handle to MDT. */
2622         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2623                                        &data_version);
2624         och = NULL;
2625
2626         EXIT;
2627 out:
2628         if (och != NULL && !IS_ERR(och)) /* close the file */
2629                 ll_lease_close(och, inode, NULL);
2630
2631         return rc;
2632 }
2633
2634 struct ll_swap_stack {
2635         __u64                    dv1;
2636         __u64                    dv2;
2637         struct inode            *inode1;
2638         struct inode            *inode2;
2639         bool                     check_dv1;
2640         bool                     check_dv2;
2641 };
2642
2643 static int ll_swap_layouts(struct file *file1, struct file *file2,
2644                            struct lustre_swap_layouts *lsl)
2645 {
2646         struct mdc_swap_layouts  msl;
2647         struct md_op_data       *op_data;
2648         __u32                    gid;
2649         __u64                    dv;
2650         struct ll_swap_stack    *llss = NULL;
2651         int                      rc;
2652
2653         OBD_ALLOC_PTR(llss);
2654         if (llss == NULL)
2655                 RETURN(-ENOMEM);
2656
2657         llss->inode1 = file_inode(file1);
2658         llss->inode2 = file_inode(file2);
2659
2660         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2661         if (rc < 0)
2662                 GOTO(free, rc);
2663
2664         /* we use 2 bool because it is easier to swap than 2 bits */
2665         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2666                 llss->check_dv1 = true;
2667
2668         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2669                 llss->check_dv2 = true;
2670
2671         /* we cannot use lsl->sl_dvX directly because we may swap them */
2672         llss->dv1 = lsl->sl_dv1;
2673         llss->dv2 = lsl->sl_dv2;
2674
2675         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2676         if (rc == 0) /* same file, done! */
2677                 GOTO(free, rc);
2678
2679         if (rc < 0) { /* sequentialize it */
2680                 swap(llss->inode1, llss->inode2);
2681                 swap(file1, file2);
2682                 swap(llss->dv1, llss->dv2);
2683                 swap(llss->check_dv1, llss->check_dv2);
2684         }
2685
2686         gid = lsl->sl_gid;
2687         if (gid != 0) { /* application asks to flush dirty cache */
2688                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2689                 if (rc < 0)
2690                         GOTO(free, rc);
2691
2692                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2693                 if (rc < 0) {
2694                         ll_put_grouplock(llss->inode1, file1, gid);
2695                         GOTO(free, rc);
2696                 }
2697         }
2698
2699         /* ultimate check, before swaping the layouts we check if
2700          * dataversion has changed (if requested) */
2701         if (llss->check_dv1) {
2702                 rc = ll_data_version(llss->inode1, &dv, 0);
2703                 if (rc)
2704                         GOTO(putgl, rc);
2705                 if (dv != llss->dv1)
2706                         GOTO(putgl, rc = -EAGAIN);
2707         }
2708
2709         if (llss->check_dv2) {
2710                 rc = ll_data_version(llss->inode2, &dv, 0);
2711                 if (rc)
2712                         GOTO(putgl, rc);
2713                 if (dv != llss->dv2)
2714                         GOTO(putgl, rc = -EAGAIN);
2715         }
2716
2717         /* struct md_op_data is used to send the swap args to the mdt
2718          * only flags is missing, so we use struct mdc_swap_layouts
2719          * through the md_op_data->op_data */
2720         /* flags from user space have to be converted before they are send to
2721          * server, no flag is sent today, they are only used on the client */
2722         msl.msl_flags = 0;
2723         rc = -ENOMEM;
2724         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2725                                      0, LUSTRE_OPC_ANY, &msl);
2726         if (IS_ERR(op_data))
2727                 GOTO(free, rc = PTR_ERR(op_data));
2728
2729         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2730                            sizeof(*op_data), op_data, NULL);
2731         ll_finish_md_op_data(op_data);
2732
2733         if (rc < 0)
2734                 GOTO(putgl, rc);
2735
2736 putgl:
2737         if (gid != 0) {
2738                 ll_put_grouplock(llss->inode2, file2, gid);
2739                 ll_put_grouplock(llss->inode1, file1, gid);
2740         }
2741
2742 free:
2743         if (llss != NULL)
2744                 OBD_FREE_PTR(llss);
2745
2746         RETURN(rc);
2747 }
2748
2749 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2750 {
2751         struct obd_export *exp = ll_i2mdexp(inode);
2752         struct md_op_data *op_data;
2753         int rc;
2754         ENTRY;
2755
2756         /* Detect out-of range masks */
2757         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2758                 RETURN(-EINVAL);
2759
2760         /* Non-root users are forbidden to set or clear flags which are
2761          * NOT defined in HSM_USER_MASK. */
2762         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2763             !cfs_capable(CFS_CAP_SYS_ADMIN))
2764                 RETURN(-EPERM);
2765
2766         if (!exp_connect_archive_id_array(exp)) {
2767                 /* Detect out-of range archive id */
2768                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2769                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2770                         RETURN(-EINVAL);
2771         }
2772
2773         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2774                                      LUSTRE_OPC_ANY, hss);
2775         if (IS_ERR(op_data))
2776                 RETURN(PTR_ERR(op_data));
2777
2778         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2779                            op_data, NULL);
2780
2781         ll_finish_md_op_data(op_data);
2782
2783         RETURN(rc);
2784 }
2785
2786 static int ll_hsm_import(struct inode *inode, struct file *file,
2787                          struct hsm_user_import *hui)
2788 {
2789         struct hsm_state_set    *hss = NULL;
2790         struct iattr            *attr = NULL;
2791         int                      rc;
2792         ENTRY;
2793
2794         if (!S_ISREG(inode->i_mode))
2795                 RETURN(-EINVAL);
2796
2797         /* set HSM flags */
2798         OBD_ALLOC_PTR(hss);
2799         if (hss == NULL)
2800                 GOTO(out, rc = -ENOMEM);
2801
2802         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2803         hss->hss_archive_id = hui->hui_archive_id;
2804         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2805         rc = ll_hsm_state_set(inode, hss);
2806         if (rc != 0)
2807                 GOTO(out, rc);
2808
2809         OBD_ALLOC_PTR(attr);
2810         if (attr == NULL)
2811                 GOTO(out, rc = -ENOMEM);
2812
2813         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2814         attr->ia_mode |= S_IFREG;
2815         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2816         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2817         attr->ia_size = hui->hui_size;
2818         attr->ia_mtime.tv_sec = hui->hui_mtime;
2819         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2820         attr->ia_atime.tv_sec = hui->hui_atime;
2821         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2822
2823         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2824                          ATTR_UID | ATTR_GID |
2825                          ATTR_MTIME | ATTR_MTIME_SET |
2826                          ATTR_ATIME | ATTR_ATIME_SET;
2827
2828         inode_lock(inode);
2829
2830         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2831         if (rc == -ENODATA)
2832                 rc = 0;
2833
2834         inode_unlock(inode);
2835
2836 out:
2837         if (hss != NULL)
2838                 OBD_FREE_PTR(hss);
2839
2840         if (attr != NULL)
2841                 OBD_FREE_PTR(attr);
2842
2843         RETURN(rc);
2844 }
2845
2846 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2847 {
2848         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2849                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2850 }
2851
2852 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2853 {
2854         struct inode *inode = file_inode(file);
2855         struct iattr ia = {
2856                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2857                             ATTR_MTIME | ATTR_MTIME_SET |
2858                             ATTR_CTIME,
2859                 .ia_atime = {
2860                         .tv_sec = lfu->lfu_atime_sec,
2861                         .tv_nsec = lfu->lfu_atime_nsec,
2862                 },
2863                 .ia_mtime = {
2864                         .tv_sec = lfu->lfu_mtime_sec,
2865                         .tv_nsec = lfu->lfu_mtime_nsec,
2866                 },
2867                 .ia_ctime = {
2868                         .tv_sec = lfu->lfu_ctime_sec,
2869                         .tv_nsec = lfu->lfu_ctime_nsec,
2870                 },
2871         };
2872         int rc;
2873         ENTRY;
2874
2875         if (!capable(CAP_SYS_ADMIN))
2876                 RETURN(-EPERM);
2877
2878         if (!S_ISREG(inode->i_mode))
2879                 RETURN(-EINVAL);
2880
2881         inode_lock(inode);
2882         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2883                             false);
2884         inode_unlock(inode);
2885
2886         RETURN(rc);
2887 }
2888
2889 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2890 {
2891         switch (mode) {
2892         case MODE_READ_USER:
2893                 return CLM_READ;
2894         case MODE_WRITE_USER:
2895                 return CLM_WRITE;
2896         default:
2897                 return -EINVAL;
2898         }
2899 }
2900
2901 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2902
2903 /* Used to allow the upper layers of the client to request an LDLM lock
2904  * without doing an actual read or write.
2905  *
2906  * Used for ladvise lockahead to manually request specific locks.
2907  *
2908  * \param[in] file      file this ladvise lock request is on
2909  * \param[in] ladvise   ladvise struct describing this lock request
2910  *
2911  * \retval 0            success, no detailed result available (sync requests
2912  *                      and requests sent to the server [not handled locally]
2913  *                      cannot return detailed results)
2914  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2915  *                                       see definitions for details.
2916  * \retval negative     negative errno on error
2917  */
2918 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2919 {
2920         struct lu_env *env = NULL;
2921         struct cl_io *io  = NULL;
2922         struct cl_lock *lock = NULL;
2923         struct cl_lock_descr *descr = NULL;
2924         struct dentry *dentry = file->f_path.dentry;
2925         struct inode *inode = dentry->d_inode;
2926         enum cl_lock_mode cl_mode;
2927         off_t start = ladvise->lla_start;
2928         off_t end = ladvise->lla_end;
2929         int result;
2930         __u16 refcheck;
2931
2932         ENTRY;
2933
2934         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2935                "start=%llu, end=%llu\n", dentry->d_name.len,
2936                dentry->d_name.name, dentry->d_inode,
2937                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2938                (__u64) end);
2939
2940         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2941         if (cl_mode < 0)
2942                 GOTO(out, result = cl_mode);
2943
2944         /* Get IO environment */
2945         result = cl_io_get(inode, &env, &io, &refcheck);
2946         if (result <= 0)
2947                 GOTO(out, result);
2948
2949         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2950         if (result > 0) {
2951                 /*
2952                  * nothing to do for this io. This currently happens when
2953                  * stripe sub-object's are not yet created.
2954                  */
2955                 result = io->ci_result;
2956         } else if (result == 0) {
2957                 lock = vvp_env_lock(env);
2958                 descr = &lock->cll_descr;
2959
2960                 descr->cld_obj   = io->ci_obj;
2961                 /* Convert byte offsets to pages */
2962                 descr->cld_start = cl_index(io->ci_obj, start);
2963                 descr->cld_end   = cl_index(io->ci_obj, end);
2964                 descr->cld_mode  = cl_mode;
2965                 /* CEF_MUST is used because we do not want to convert a
2966                  * lockahead request to a lockless lock */
2967                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2968                                        CEF_NONBLOCK;
2969
2970                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2971                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2972
2973                 result = cl_lock_request(env, io, lock);
2974
2975                 /* On success, we need to release the lock */
2976                 if (result >= 0)
2977                         cl_lock_release(env, lock);
2978         }
2979         cl_io_fini(env, io);
2980         cl_env_put(env, &refcheck);
2981
2982         /* -ECANCELED indicates a matching lock with a different extent
2983          * was already present, and -EEXIST indicates a matching lock
2984          * on exactly the same extent was already present.
2985          * We convert them to positive values for userspace to make
2986          * recognizing true errors easier.
2987          * Note we can only return these detailed results on async requests,
2988          * as sync requests look the same as i/o requests for locking. */
2989         if (result == -ECANCELED)
2990                 result = LLA_RESULT_DIFFERENT;
2991         else if (result == -EEXIST)
2992                 result = LLA_RESULT_SAME;
2993
2994 out:
2995         RETURN(result);
2996 }
2997 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2998
2999 static int ll_ladvise_sanity(struct inode *inode,
3000                              struct llapi_lu_ladvise *ladvise)
3001 {
3002         struct ll_sb_info *sbi = ll_i2sbi(inode);
3003         enum lu_ladvise_type advice = ladvise->lla_advice;
3004         /* Note the peradvice flags is a 32 bit field, so per advice flags must
3005          * be in the first 32 bits of enum ladvise_flags */
3006         __u32 flags = ladvise->lla_peradvice_flags;
3007         /* 3 lines at 80 characters per line, should be plenty */
3008         int rc = 0;
3009
3010         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
3011                 rc = -EINVAL;
3012                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
3013                        "last supported advice is %s (value '%d'): rc = %d\n",
3014                        sbi->ll_fsname, advice,
3015                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3016                 GOTO(out, rc);
3017         }
3018
3019         /* Per-advice checks */
3020         switch (advice) {
3021         case LU_LADVISE_LOCKNOEXPAND:
3022                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3023                         rc = -EINVAL;
3024                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3025                                "rc = %d\n", sbi->ll_fsname, flags,
3026                                ladvise_names[advice], rc);
3027                         GOTO(out, rc);
3028                 }
3029                 break;
3030         case LU_LADVISE_LOCKAHEAD:
3031                 /* Currently only READ and WRITE modes can be requested */
3032                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3033                     ladvise->lla_lockahead_mode == 0) {
3034                         rc = -EINVAL;
3035                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3036                                "rc = %d\n", sbi->ll_fsname,
3037                                ladvise->lla_lockahead_mode,
3038                                ladvise_names[advice], rc);
3039                         GOTO(out, rc);
3040                 }
3041                 /* fallthrough */
3042         case LU_LADVISE_WILLREAD:
3043         case LU_LADVISE_DONTNEED:
3044         default:
3045                 /* Note fall through above - These checks apply to all advices
3046                  * except LOCKNOEXPAND */
3047                 if (flags & ~LF_DEFAULT_MASK) {
3048                         rc = -EINVAL;
3049                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3050                                "rc = %d\n", sbi->ll_fsname, flags,
3051                                ladvise_names[advice], rc);
3052                         GOTO(out, rc);
3053                 }
3054                 if (ladvise->lla_start >= ladvise->lla_end) {
3055                         rc = -EINVAL;
3056                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3057                                "for %s: rc = %d\n", sbi->ll_fsname,
3058                                ladvise->lla_start, ladvise->lla_end,
3059                                ladvise_names[advice], rc);
3060                         GOTO(out, rc);
3061                 }
3062                 break;
3063         }
3064
3065 out:
3066         return rc;
3067 }
3068 #undef ERRSIZE
3069
3070 /*
3071  * Give file access advices
3072  *
3073  * The ladvise interface is similar to Linux fadvise() system call, except it
3074  * forwards the advices directly from Lustre client to server. The server side
3075  * codes will apply appropriate read-ahead and caching techniques for the
3076  * corresponding files.
3077  *
3078  * A typical workload for ladvise is e.g. a bunch of different clients are
3079  * doing small random reads of a file, so prefetching pages into OSS cache
3080  * with big linear reads before the random IO is a net benefit. Fetching
3081  * all that data into each client cache with fadvise() may not be, due to
3082  * much more data being sent to the client.
3083  */
3084 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3085                       struct llapi_lu_ladvise *ladvise)
3086 {
3087         struct lu_env *env;
3088         struct cl_io *io;
3089         struct cl_ladvise_io *lio;
3090         int rc;
3091         __u16 refcheck;
3092         ENTRY;
3093
3094         env = cl_env_get(&refcheck);
3095         if (IS_ERR(env))
3096                 RETURN(PTR_ERR(env));
3097
3098         io = vvp_env_thread_io(env);
3099         io->ci_obj = ll_i2info(inode)->lli_clob;
3100
3101         /* initialize parameters for ladvise */
3102         lio = &io->u.ci_ladvise;
3103         lio->li_start = ladvise->lla_start;
3104         lio->li_end = ladvise->lla_end;
3105         lio->li_fid = ll_inode2fid(inode);
3106         lio->li_advice = ladvise->lla_advice;
3107         lio->li_flags = flags;
3108
3109         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3110                 rc = cl_io_loop(env, io);
3111         else
3112                 rc = io->ci_result;
3113
3114         cl_io_fini(env, io);
3115         cl_env_put(env, &refcheck);
3116         RETURN(rc);
3117 }
3118
3119 static int ll_lock_noexpand(struct file *file, int flags)
3120 {
3121         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3122
3123         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3124
3125         return 0;
3126 }
3127
3128 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3129                         unsigned long arg)
3130 {
3131         struct fsxattr fsxattr;
3132
3133         if (copy_from_user(&fsxattr,
3134                            (const struct fsxattr __user *)arg,
3135                            sizeof(fsxattr)))
3136                 RETURN(-EFAULT);
3137
3138         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3139         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3140                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3141         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3142         if (copy_to_user((struct fsxattr __user *)arg,
3143                          &fsxattr, sizeof(fsxattr)))
3144                 RETURN(-EFAULT);
3145
3146         RETURN(0);
3147 }
3148
3149 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3150 {
3151         /*
3152          * Project Quota ID state is only allowed to change from within the init
3153          * namespace. Enforce that restriction only if we are trying to change
3154          * the quota ID state. Everything else is allowed in user namespaces.
3155          */
3156         if (current_user_ns() == &init_user_ns)
3157                 return 0;
3158
3159         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3160                 return -EINVAL;
3161
3162         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3163                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3164                         return -EINVAL;
3165         } else {
3166                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3167                         return -EINVAL;
3168         }
3169
3170         return 0;
3171 }
3172
3173 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3174                         unsigned long arg)
3175 {
3176
3177         struct md_op_data *op_data;
3178         struct ptlrpc_request *req = NULL;
3179         int rc = 0;
3180         struct fsxattr fsxattr;
3181         struct cl_object *obj;
3182         struct iattr *attr;
3183         int flags;
3184
3185         if (copy_from_user(&fsxattr,
3186                            (const struct fsxattr __user *)arg,
3187                            sizeof(fsxattr)))
3188                 RETURN(-EFAULT);
3189
3190         rc = ll_ioctl_check_project(inode, &fsxattr);
3191         if (rc)
3192                 RETURN(rc);
3193
3194         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3195                                      LUSTRE_OPC_ANY, NULL);
3196         if (IS_ERR(op_data))
3197                 RETURN(PTR_ERR(op_data));
3198
3199         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3200         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3201         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3202                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3203         op_data->op_projid = fsxattr.fsx_projid;
3204         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3205         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3206                         0, &req);
3207         ptlrpc_req_finished(req);
3208         if (rc)
3209                 GOTO(out_fsxattr, rc);
3210         ll_update_inode_flags(inode, op_data->op_attr_flags);
3211         obj = ll_i2info(inode)->lli_clob;
3212         if (obj == NULL)
3213                 GOTO(out_fsxattr, rc);
3214
3215         OBD_ALLOC_PTR(attr);
3216         if (attr == NULL)
3217                 GOTO(out_fsxattr, rc = -ENOMEM);
3218
3219         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3220                             fsxattr.fsx_xflags);
3221         OBD_FREE_PTR(attr);
3222 out_fsxattr:
3223         ll_finish_md_op_data(op_data);
3224         RETURN(rc);
3225 }
3226
3227 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3228                                  unsigned long arg)
3229 {
3230         struct inode            *inode = file_inode(file);
3231         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3232         struct ll_inode_info    *lli = ll_i2info(inode);
3233         struct obd_client_handle *och = NULL;
3234         struct split_param sp;
3235         struct pcc_param param;
3236         bool lease_broken = false;
3237         fmode_t fmode = 0;
3238         enum mds_op_bias bias = 0;
3239         struct file *layout_file = NULL;
3240         void *data = NULL;
3241         size_t data_size = 0;
3242         bool attached = false;
3243         long rc, rc2 = 0;
3244
3245         ENTRY;
3246
3247         mutex_lock(&lli->lli_och_mutex);
3248         if (fd->fd_lease_och != NULL) {
3249                 och = fd->fd_lease_och;
3250                 fd->fd_lease_och = NULL;
3251         }
3252         mutex_unlock(&lli->lli_och_mutex);
3253
3254         if (och == NULL)
3255                 RETURN(-ENOLCK);
3256
3257         fmode = och->och_flags;
3258
3259         switch (ioc->lil_flags) {
3260         case LL_LEASE_RESYNC_DONE:
3261                 if (ioc->lil_count > IOC_IDS_MAX)
3262                         GOTO(out_lease_close, rc = -EINVAL);
3263
3264                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3265                 OBD_ALLOC(data, data_size);
3266                 if (!data)
3267                         GOTO(out_lease_close, rc = -ENOMEM);
3268
3269                 if (copy_from_user(data, (void __user *)arg, data_size))
3270                         GOTO(out_lease_close, rc = -EFAULT);
3271
3272                 bias = MDS_CLOSE_RESYNC_DONE;
3273                 break;
3274         case LL_LEASE_LAYOUT_MERGE: {
3275                 int fd;
3276
3277                 if (ioc->lil_count != 1)
3278                         GOTO(out_lease_close, rc = -EINVAL);
3279
3280                 arg += sizeof(*ioc);
3281                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3282                         GOTO(out_lease_close, rc = -EFAULT);
3283
3284                 layout_file = fget(fd);
3285                 if (!layout_file)
3286                         GOTO(out_lease_close, rc = -EBADF);
3287
3288                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3289                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3290                         GOTO(out_lease_close, rc = -EPERM);
3291
3292                 data = file_inode(layout_file);
3293                 bias = MDS_CLOSE_LAYOUT_MERGE;
3294                 break;
3295         }
3296         case LL_LEASE_LAYOUT_SPLIT: {
3297                 int fdv;
3298                 int mirror_id;
3299
3300                 if (ioc->lil_count != 2)
3301                         GOTO(out_lease_close, rc = -EINVAL);
3302
3303                 arg += sizeof(*ioc);
3304                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3305                         GOTO(out_lease_close, rc = -EFAULT);
3306
3307                 arg += sizeof(__u32);
3308                 if (copy_from_user(&mirror_id, (void __user *)arg,
3309                                    sizeof(__u32)))
3310                         GOTO(out_lease_close, rc = -EFAULT);
3311
3312                 layout_file = fget(fdv);
3313                 if (!layout_file)
3314                         GOTO(out_lease_close, rc = -EBADF);
3315
3316                 sp.sp_inode = file_inode(layout_file);
3317                 sp.sp_mirror_id = (__u16)mirror_id;
3318                 data = &sp;
3319                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3320                 break;
3321         }
3322         case LL_LEASE_PCC_ATTACH:
3323                 if (ioc->lil_count != 1)
3324                         RETURN(-EINVAL);
3325
3326                 arg += sizeof(*ioc);
3327                 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3328                                    sizeof(__u32)))
3329                         GOTO(out_lease_close, rc2 = -EFAULT);
3330
3331                 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3332                 if (rc2)
3333                         GOTO(out_lease_close, rc2);
3334
3335                 attached = true;
3336                 /* Grab latest data version */
3337                 rc2 = ll_data_version(inode, &param.pa_data_version,
3338                                      LL_DV_WR_FLUSH);
3339                 if (rc2)
3340                         GOTO(out_lease_close, rc2);
3341
3342                 data = &param;
3343                 bias = MDS_PCC_ATTACH;
3344                 break;
3345         default:
3346                 /* without close intent */
3347                 break;
3348         }
3349
3350 out_lease_close:
3351         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3352         if (rc < 0)
3353                 GOTO(out, rc);
3354
3355         rc = ll_lease_och_release(inode, file);
3356         if (rc < 0)
3357                 GOTO(out, rc);
3358
3359         if (lease_broken)
3360                 fmode = 0;
3361         EXIT;
3362
3363 out:
3364         switch (ioc->lil_flags) {
3365         case LL_LEASE_RESYNC_DONE:
3366                 if (data)
3367                         OBD_FREE(data, data_size);
3368                 break;
3369         case LL_LEASE_LAYOUT_MERGE:
3370         case LL_LEASE_LAYOUT_SPLIT:
3371                 if (layout_file)
3372                         fput(layout_file);
3373                 break;
3374         case LL_LEASE_PCC_ATTACH:
3375                 if (!rc)
3376                         rc = rc2;
3377                 rc = pcc_readwrite_attach_fini(file, inode,
3378                                                param.pa_layout_gen,
3379                                                lease_broken, rc,
3380                                                attached);
3381                 break;
3382         }
3383
3384         if (!rc)
3385                 rc = ll_lease_type_from_fmode(fmode);
3386         RETURN(rc);
3387 }
3388
3389 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3390                               unsigned long arg)
3391 {
3392         struct inode *inode = file_inode(file);
3393         struct ll_inode_info *lli = ll_i2info(inode);
3394         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3395         struct obd_client_handle *och = NULL;
3396         __u64 open_flags = 0;
3397         bool lease_broken;
3398         fmode_t fmode;
3399         long rc;
3400         ENTRY;
3401
3402         switch (ioc->lil_mode) {
3403         case LL_LEASE_WRLCK:
3404                 if (!(file->f_mode & FMODE_WRITE))
3405                         RETURN(-EPERM);
3406                 fmode = FMODE_WRITE;
3407                 break;
3408         case LL_LEASE_RDLCK:
3409                 if (!(file->f_mode & FMODE_READ))
3410                         RETURN(-EPERM);
3411                 fmode = FMODE_READ;
3412                 break;
3413         case LL_LEASE_UNLCK:
3414                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3415         default:
3416                 RETURN(-EINVAL);
3417         }
3418
3419         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3420
3421         /* apply for lease */
3422         if (ioc->lil_flags & LL_LEASE_RESYNC)
3423                 open_flags = MDS_OPEN_RESYNC;
3424         och = ll_lease_open(inode, file, fmode, open_flags);
3425         if (IS_ERR(och))
3426                 RETURN(PTR_ERR(och));
3427
3428         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3429                 rc = ll_lease_file_resync(och, inode, arg);
3430                 if (rc) {
3431                         ll_lease_close(och, inode, NULL);
3432                         RETURN(rc);
3433                 }
3434                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3435                 if (rc) {
3436                         ll_lease_close(och, inode, NULL);
3437                         RETURN(rc);
3438                 }
3439         }
3440
3441         rc = 0;
3442         mutex_lock(&lli->lli_och_mutex);
3443         if (fd->fd_lease_och == NULL) {
3444                 fd->fd_lease_och = och;
3445                 och = NULL;
3446         }
3447         mutex_unlock(&lli->lli_och_mutex);
3448         if (och != NULL) {
3449                 /* impossible now that only excl is supported for now */
3450                 ll_lease_close(och, inode, &lease_broken);
3451                 rc = -EBUSY;
3452         }
3453         RETURN(rc);
3454 }
3455
3456 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3457 {
3458         struct ll_inode_info *lli = ll_i2info(inode);
3459         struct ll_sb_info *sbi = ll_i2sbi(inode);
3460         __u64 now = ktime_get_real_seconds();
3461         int i;
3462
3463         spin_lock(&lli->lli_heat_lock);
3464         heat->lh_flags = lli->lli_heat_flags;
3465         for (i = 0; i < heat->lh_count; i++)
3466                 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3467                                                 now, sbi->ll_heat_decay_weight,
3468                                                 sbi->ll_heat_period_second);
3469         spin_unlock(&lli->lli_heat_lock);
3470 }
3471
3472 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3473 {
3474         struct ll_inode_info *lli = ll_i2info(inode);
3475         int rc = 0;
3476
3477         spin_lock(&lli->lli_heat_lock);
3478         if (flags & LU_HEAT_FLAG_CLEAR)
3479                 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3480
3481         if (flags & LU_HEAT_FLAG_OFF)
3482                 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3483         else
3484                 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3485
3486         spin_unlock(&lli->lli_heat_lock);
3487
3488         RETURN(rc);
3489 }
3490
3491 static long
3492 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3493 {
3494         struct inode            *inode = file_inode(file);
3495         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3496         int                      flags, rc;
3497         ENTRY;
3498
3499         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3500                PFID(ll_inode2fid(inode)), inode, cmd);
3501         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3502
3503         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3504         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3505                 RETURN(-ENOTTY);
3506
3507         switch (cmd) {
3508         case LL_IOC_GETFLAGS:
3509                 /* Get the current value of the file flags */
3510                 return put_user(fd->fd_flags, (int __user *)arg);
3511         case LL_IOC_SETFLAGS:
3512         case LL_IOC_CLRFLAGS:
3513                 /* Set or clear specific file flags */
3514                 /* XXX This probably needs checks to ensure the flags are
3515                  *     not abused, and to handle any flag side effects.
3516                  */
3517                 if (get_user(flags, (int __user *) arg))
3518                         RETURN(-EFAULT);
3519
3520                 if (cmd == LL_IOC_SETFLAGS) {
3521                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3522                             !(file->f_flags & O_DIRECT)) {
3523                                 CERROR("%s: unable to disable locking on "
3524                                        "non-O_DIRECT file\n", current->comm);
3525                                 RETURN(-EINVAL);
3526                         }
3527
3528                         fd->fd_flags |= flags;
3529                 } else {
3530                         fd->fd_flags &= ~flags;
3531                 }
3532                 RETURN(0);
3533         case LL_IOC_LOV_SETSTRIPE:
3534         case LL_IOC_LOV_SETSTRIPE_NEW:
3535                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3536         case LL_IOC_LOV_SETEA:
3537                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3538         case LL_IOC_LOV_SWAP_LAYOUTS: {
3539                 struct file *file2;
3540                 struct lustre_swap_layouts lsl;
3541
3542                 if (copy_from_user(&lsl, (char __user *)arg,
3543                                    sizeof(struct lustre_swap_layouts)))
3544                         RETURN(-EFAULT);
3545
3546                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3547                         RETURN(-EPERM);
3548
3549                 file2 = fget(lsl.sl_fd);
3550                 if (file2 == NULL)
3551                         RETURN(-EBADF);
3552
3553                 /* O_WRONLY or O_RDWR */
3554                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3555                         GOTO(out, rc = -EPERM);
3556
3557                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3558                         struct inode                    *inode2;
3559                         struct ll_inode_info            *lli;
3560                         struct obd_client_handle        *och = NULL;
3561
3562                         lli = ll_i2info(inode);
3563                         mutex_lock(&lli->lli_och_mutex);
3564                         if (fd->fd_lease_och != NULL) {
3565                                 och = fd->fd_lease_och;
3566                                 fd->fd_lease_och = NULL;
3567                         }
3568                         mutex_unlock(&lli->lli_och_mutex);
3569                         if (och == NULL)
3570                                 GOTO(out, rc = -ENOLCK);
3571                         inode2 = file_inode(file2);
3572                         rc = ll_swap_layouts_close(och, inode, inode2);
3573                 } else {
3574                         rc = ll_swap_layouts(file, file2, &lsl);
3575                 }
3576 out:
3577                 fput(file2);
3578                 RETURN(rc);
3579         }
3580         case LL_IOC_LOV_GETSTRIPE:
3581         case LL_IOC_LOV_GETSTRIPE_NEW:
3582                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3583         case FS_IOC_GETFLAGS:
3584         case FS_IOC_SETFLAGS:
3585                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3586         case FSFILT_IOC_GETVERSION:
3587         case FS_IOC_GETVERSION:
3588                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3589         /* We need to special case any other ioctls we want to handle,
3590          * to send them to the MDS/OST as appropriate and to properly
3591          * network encode the arg field. */
3592         case FS_IOC_SETVERSION:
3593                 RETURN(-ENOTSUPP);
3594
3595         case LL_IOC_GROUP_LOCK:
3596                 RETURN(ll_get_grouplock(inode, file, arg));
3597         case LL_IOC_GROUP_UNLOCK:
3598                 RETURN(ll_put_grouplock(inode, file, arg));
3599         case IOC_OBD_STATFS:
3600                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3601
3602         case LL_IOC_FLUSHCTX:
3603                 RETURN(ll_flush_ctx(inode));
3604         case LL_IOC_PATH2FID: {
3605                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3606                                  sizeof(struct lu_fid)))
3607                         RETURN(-EFAULT);
3608
3609                 RETURN(0);
3610         }
3611         case LL_IOC_GETPARENT:
3612                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3613
3614         case OBD_IOC_FID2PATH:
3615                 RETURN(ll_fid2path(inode, (void __user *)arg));
3616         case LL_IOC_DATA_VERSION: {
3617                 struct ioc_data_version idv;
3618                 int rc;
3619
3620                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3621                         RETURN(-EFAULT);
3622
3623                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3624                 rc = ll_ioc_data_version(inode, &idv);
3625
3626                 if (rc == 0 &&
3627                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3628                         RETURN(-EFAULT);
3629
3630                 RETURN(rc);
3631         }
3632
3633         case LL_IOC_GET_MDTIDX: {
3634                 int mdtidx;
3635
3636                 mdtidx = ll_get_mdt_idx(inode);
3637                 if (mdtidx < 0)
3638                         RETURN(mdtidx);
3639
3640                 if (put_user((int)mdtidx, (int __user *)arg))
3641                         RETURN(-EFAULT);
3642
3643                 RETURN(0);
3644         }
3645         case OBD_IOC_GETDTNAME:
3646         case OBD_IOC_GETMDNAME:
3647                 RETURN(ll_get_obd_name(inode, cmd, arg));
3648         case LL_IOC_HSM_STATE_GET: {
3649                 struct md_op_data       *op_data;
3650                 struct hsm_user_state   *hus;
3651                 int                      rc;
3652
3653                 OBD_ALLOC_PTR(hus);
3654                 if (hus == NULL)
3655                         RETURN(-ENOMEM);
3656
3657                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3658                                              LUSTRE_OPC_ANY, hus);
3659                 if (IS_ERR(op_data)) {
3660                         OBD_FREE_PTR(hus);
3661                         RETURN(PTR_ERR(op_data));
3662                 }
3663
3664                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3665                                    op_data, NULL);
3666
3667                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3668                         rc = -EFAULT;
3669
3670                 ll_finish_md_op_data(op_data);
3671                 OBD_FREE_PTR(hus);
3672                 RETURN(rc);
3673         }
3674         case LL_IOC_HSM_STATE_SET: {
3675                 struct hsm_state_set    *hss;
3676                 int                      rc;
3677
3678                 OBD_ALLOC_PTR(hss);
3679                 if (hss == NULL)
3680                         RETURN(-ENOMEM);
3681
3682                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3683                         OBD_FREE_PTR(hss);
3684                         RETURN(-EFAULT);
3685                 }
3686
3687                 rc = ll_hsm_state_set(inode, hss);
3688
3689                 OBD_FREE_PTR(hss);
3690                 RETURN(rc);
3691         }
3692         case LL_IOC_HSM_ACTION: {
3693                 struct md_op_data               *op_data;
3694                 struct hsm_current_action       *hca;
3695                 int                              rc;
3696
3697                 OBD_ALLOC_PTR(hca);
3698                 if (hca == NULL)
3699                         RETURN(-ENOMEM);
3700
3701                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3702                                              LUSTRE_OPC_ANY, hca);
3703                 if (IS_ERR(op_data)) {
3704                         OBD_FREE_PTR(hca);
3705                         RETURN(PTR_ERR(op_data));
3706                 }
3707
3708                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3709                                    op_data, NULL);
3710
3711                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3712                         rc = -EFAULT;
3713
3714                 ll_finish_md_op_data(op_data);
3715                 OBD_FREE_PTR(hca);
3716                 RETURN(rc);
3717         }
3718         case LL_IOC_SET_LEASE_OLD: {
3719                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3720
3721                 RETURN(ll_file_set_lease(file, &ioc, 0));
3722         }
3723         case LL_IOC_SET_LEASE: {
3724                 struct ll_ioc_lease ioc;
3725
3726                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3727                         RETURN(-EFAULT);
3728
3729                 RETURN(ll_file_set_lease(file, &ioc, arg));
3730         }
3731         case LL_IOC_GET_LEASE: {
3732                 struct ll_inode_info *lli = ll_i2info(inode);
3733                 struct ldlm_lock *lock = NULL;
3734                 fmode_t fmode = 0;
3735
3736                 mutex_lock(&lli->lli_och_mutex);
3737                 if (fd->fd_lease_och != NULL) {
3738                         struct obd_client_handle *och = fd->fd_lease_och;
3739
3740                         lock = ldlm_handle2lock(&och->och_lease_handle);
3741                         if (lock != NULL) {
3742                                 lock_res_and_lock(lock);
3743                                 if (!ldlm_is_cancel(lock))
3744                                         fmode = och->och_flags;
3745
3746                                 unlock_res_and_lock(lock);
3747                                 LDLM_LOCK_PUT(lock);
3748                         }
3749                 }
3750                 mutex_unlock(&lli->lli_och_mutex);
3751
3752                 RETURN(ll_lease_type_from_fmode(fmode));
3753         }
3754         case LL_IOC_HSM_IMPORT: {
3755                 struct hsm_user_import *hui;
3756
3757                 OBD_ALLOC_PTR(hui);
3758                 if (hui == NULL)
3759                         RETURN(-ENOMEM);
3760
3761                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3762                         OBD_FREE_PTR(hui);
3763                         RETURN(-EFAULT);
3764                 }
3765
3766                 rc = ll_hsm_import(inode, file, hui);
3767
3768                 OBD_FREE_PTR(hui);
3769                 RETURN(rc);
3770         }
3771         case LL_IOC_FUTIMES_3: {
3772                 struct ll_futimes_3 lfu;
3773
3774                 if (copy_from_user(&lfu,
3775                                    (const struct ll_futimes_3 __user *)arg,
3776                                    sizeof(lfu)))
3777                         RETURN(-EFAULT);
3778
3779                 RETURN(ll_file_futimes_3(file, &lfu));
3780         }
3781         case LL_IOC_LADVISE: {
3782                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3783                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3784                 int i;
3785                 int num_advise;
3786                 int alloc_size = sizeof(*k_ladvise_hdr);
3787
3788                 rc = 0;
3789                 u_ladvise_hdr = (void __user *)arg;
3790                 OBD_ALLOC_PTR(k_ladvise_hdr);
3791                 if (k_ladvise_hdr == NULL)
3792                         RETURN(-ENOMEM);
3793
3794                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3795                         GOTO(out_ladvise, rc = -EFAULT);
3796
3797                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3798                     k_ladvise_hdr->lah_count < 1)
3799                         GOTO(out_ladvise, rc = -EINVAL);
3800
3801                 num_advise = k_ladvise_hdr->lah_count;
3802                 if (num_advise >= LAH_COUNT_MAX)
3803                         GOTO(out_ladvise, rc = -EFBIG);
3804
3805                 OBD_FREE_PTR(k_ladvise_hdr);
3806                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3807                                       lah_advise[num_advise]);
3808                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3809                 if (k_ladvise_hdr == NULL)
3810                         RETURN(-ENOMEM);
3811
3812                 /*
3813                  * TODO: submit multiple advices to one server in a single RPC
3814                  */
3815                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3816                         GOTO(out_ladvise, rc = -EFAULT);
3817
3818                 for (i = 0; i < num_advise; i++) {
3819                         struct llapi_lu_ladvise *k_ladvise =
3820                                         &k_ladvise_hdr->lah_advise[i];
3821                         struct llapi_lu_ladvise __user *u_ladvise =
3822                                         &u_ladvise_hdr->lah_advise[i];
3823
3824                         rc = ll_ladvise_sanity(inode, k_ladvise);
3825                         if (rc)
3826                                 GOTO(out_ladvise, rc);
3827
3828                         switch (k_ladvise->lla_advice) {
3829                         case LU_LADVISE_LOCKNOEXPAND:
3830                                 rc = ll_lock_noexpand(file,
3831                                                k_ladvise->lla_peradvice_flags);
3832                                 GOTO(out_ladvise, rc);
3833                         case LU_LADVISE_LOCKAHEAD:
3834
3835                                 rc = ll_file_lock_ahead(file, k_ladvise);
3836
3837                                 if (rc < 0)
3838                                         GOTO(out_ladvise, rc);
3839
3840                                 if (put_user(rc,
3841                                              &u_ladvise->lla_lockahead_result))
3842                                         GOTO(out_ladvise, rc = -EFAULT);
3843                                 break;
3844                         default:
3845                                 rc = ll_ladvise(inode, file,
3846                                                 k_ladvise_hdr->lah_flags,
3847                                                 k_ladvise);
3848                                 if (rc)
3849                                         GOTO(out_ladvise, rc);
3850                                 break;
3851                         }
3852
3853                 }
3854
3855 out_ladvise:
3856                 OBD_FREE(k_ladvise_hdr, alloc_size);
3857                 RETURN(rc);
3858         }
3859         case LL_IOC_FLR_SET_MIRROR: {
3860                 /* mirror I/O must be direct to avoid polluting page cache
3861                  * by stale data. */
3862                 if (!(file->f_flags & O_DIRECT))
3863                         RETURN(-EINVAL);
3864
3865                 fd->fd_designated_mirror = (__u32)arg;
3866                 RETURN(0);
3867         }
3868         case LL_IOC_FSGETXATTR:
3869                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3870         case LL_IOC_FSSETXATTR:
3871                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3872         case BLKSSZGET:
3873                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3874         case LL_IOC_HEAT_GET: {
3875                 struct lu_heat uheat;
3876                 struct lu_heat *heat;
3877                 int size;
3878
3879                 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3880                         RETURN(-EFAULT);
3881
3882                 if (uheat.lh_count > OBD_HEAT_COUNT)
3883                         uheat.lh_count = OBD_HEAT_COUNT;
3884
3885                 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3886                 OBD_ALLOC(heat, size);
3887                 if (heat == NULL)
3888                         RETURN(-ENOMEM);
3889
3890                 heat->lh_count = uheat.lh_count;
3891                 ll_heat_get(inode, heat);
3892                 rc = copy_to_user((char __user *)arg, heat, size);
3893                 OBD_FREE(heat, size);
3894                 RETURN(rc ? -EFAULT : 0);
3895         }
3896         case LL_IOC_HEAT_SET: {
3897                 __u64 flags;
3898
3899                 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3900                         RETURN(-EFAULT);
3901
3902                 rc = ll_heat_set(inode, flags);
3903                 RETURN(rc);
3904         }
3905         case LL_IOC_PCC_DETACH: {
3906                 struct lu_pcc_detach *detach;
3907
3908                 OBD_ALLOC_PTR(detach);
3909                 if (detach == NULL)
3910                         RETURN(-ENOMEM);
3911
3912                 if (copy_from_user(detach,
3913                                    (const struct lu_pcc_detach __user *)arg,
3914                                    sizeof(*detach)))
3915                         GOTO(out_detach_free, rc = -EFAULT);
3916
3917                 if (!S_ISREG(inode->i_mode))
3918                         GOTO(out_detach_free, rc = -EINVAL);
3919
3920                 if (!inode_owner_or_capable(inode))
3921                         GOTO(out_detach_free, rc = -EPERM);
3922
3923                 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3924 out_detach_free:
3925                 OBD_FREE_PTR(detach);
3926                 RETURN(rc);
3927         }
3928         case LL_IOC_PCC_STATE: {
3929                 struct lu_pcc_state __user *ustate =
3930                         (struct lu_pcc_state __user *)arg;
3931                 struct lu_pcc_state *state;
3932
3933                 OBD_ALLOC_PTR(state);
3934                 if (state == NULL)
3935                         RETURN(-ENOMEM);
3936
3937                 if (copy_from_user(state, ustate, sizeof(*state)))
3938                         GOTO(out_state, rc = -EFAULT);
3939
3940                 rc = pcc_ioctl_state(file, inode, state);
3941                 if (rc)
3942                         GOTO(out_state, rc);
3943
3944                 if (copy_to_user(ustate, state, sizeof(*state)))
3945                         GOTO(out_state, rc = -EFAULT);
3946
3947 out_state:
3948                 OBD_FREE_PTR(state);
3949                 RETURN(rc);
3950         }
3951         default:
3952                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3953                                      (void __user *)arg));
3954         }
3955 }
3956
3957 #ifndef HAVE_FILE_LLSEEK_SIZE
3958 static inline loff_t
3959 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3960 {
3961         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3962                 return -EINVAL;
3963         if (offset > maxsize)
3964                 return -EINVAL;
3965
3966         if (offset != file->f_pos) {
3967                 file->f_pos = offset;
3968                 file->f_version = 0;
3969         }
3970         return offset;
3971 }
3972
3973 static loff_t
3974 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3975                 loff_t maxsize, loff_t eof)
3976 {
3977         struct inode *inode = file_inode(file);
3978
3979         switch (origin) {
3980         case SEEK_END:
3981                 offset += eof;
3982                 break;
3983         case SEEK_CUR:
3984                 /*
3985                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3986                  * position-querying operation.  Avoid rewriting the "same"
3987                  * f_pos value back to the file because a concurrent read(),
3988                  * write() or lseek() might have altered it
3989                  */
3990                 if (offset == 0)
3991                         return file->f_pos;
3992                 /*
3993                  * f_lock protects against read/modify/write race with other
3994                  * SEEK_CURs. Note that parallel writes and reads behave
3995                  * like SEEK_SET.
3996                  */
3997                 inode_lock(inode);
3998                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3999                 inode_unlock(inode);
4000                 return offset;
4001         case SEEK_DATA:
4002                 /*
4003                  * In the generic case the entire file is data, so as long as
4004                  * offset isn't at the end of the file then the offset is data.
4005                  */
4006                 if (offset >= eof)
4007                         return -ENXIO;
4008                 break;
4009         case SEEK_HOLE:
4010                 /*
4011                  * There is a virtual hole at the end of the file, so as long as
4012                  * offset isn't i_size or larger, return i_size.
4013                  */
4014                 if (offset >= eof)
4015                         return -ENXIO;
4016                 offset = eof;
4017                 break;
4018         }
4019
4020         return llseek_execute(file, offset, maxsize);
4021 }
4022 #endif
4023
4024 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4025 {
4026         struct inode *inode = file_inode(file);
4027         loff_t retval, eof = 0;
4028
4029         ENTRY;
4030         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4031                            (origin == SEEK_CUR) ? file->f_pos : 0);
4032         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4033                PFID(ll_inode2fid(inode)), inode, retval, retval,
4034                origin);
4035         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
4036
4037         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4038                 retval = ll_glimpse_size(inode);
4039                 if (retval != 0)
4040                         RETURN(retval);
4041                 eof = i_size_read(inode);
4042         }
4043
4044         retval = ll_generic_file_llseek_size(file, offset, origin,
4045                                           ll_file_maxbytes(inode), eof);
4046         RETURN(retval);
4047 }
4048
4049 static int ll_flush(struct file *file, fl_owner_t id)
4050 {
4051         struct inode *inode = file_inode(file);
4052         struct ll_inode_info *lli = ll_i2info(inode);
4053         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4054         int rc, err;
4055
4056         LASSERT(!S_ISDIR(inode->i_mode));
4057
4058         /* catch async errors that were recorded back when async writeback
4059          * failed for pages in this mapping. */
4060         rc = lli->lli_async_rc;
4061         lli->lli_async_rc = 0;
4062         if (lli->lli_clob != NULL) {
4063                 err = lov_read_and_clear_async_rc(lli->lli_clob);
4064                 if (rc == 0)
4065                         rc = err;
4066         }
4067
4068         /* The application has been told write failure already.
4069          * Do not report failure again. */
4070         if (fd->fd_write_failed)
4071                 return 0;
4072         return rc ? -EIO : 0;
4073 }
4074
4075 /**
4076  * Called to make sure a portion of file has been written out.
4077  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4078  *
4079  * Return how many pages have been written.
4080  */
4081 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4082                        enum cl_fsync_mode mode, int ignore_layout)
4083 {
4084         struct lu_env *env;
4085         struct cl_io *io;
4086         struct cl_fsync_io *fio;
4087         int result;
4088         __u16 refcheck;
4089         ENTRY;
4090
4091         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4092             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4093                 RETURN(-EINVAL);
4094
4095         env = cl_env_get(&refcheck);
4096         if (IS_ERR(env))
4097                 RETURN(PTR_ERR(env));
4098
4099         io = vvp_env_thread_io(env);
4100         io->ci_obj = ll_i2info(inode)->lli_clob;
4101         io->ci_ignore_layout = ignore_layout;
4102
4103         /* initialize parameters for sync */
4104         fio = &io->u.ci_fsync;
4105         fio->fi_start = start;
4106         fio->fi_end = end;
4107         fio->fi_fid = ll_inode2fid(inode);
4108         fio->fi_mode = mode;
4109         fio->fi_nr_written = 0;
4110
4111         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4112                 result = cl_io_loop(env, io);
4113         else
4114                 result = io->ci_result;
4115         if (result == 0)
4116                 result = fio->fi_nr_written;
4117         cl_io_fini(env, io);
4118         cl_env_put(env, &refcheck);
4119
4120         RETURN(result);
4121 }
4122
4123 /*
4124  * When dentry is provided (the 'else' case), file_dentry() may be
4125  * null and dentry must be used directly rather than pulled from
4126  * file_dentry() as is done otherwise.
4127  */
4128
4129 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4130 {
4131         struct dentry *dentry = file_dentry(file);
4132         struct inode *inode = dentry->d_inode;
4133         struct ll_inode_info *lli = ll_i2info(inode);
4134         struct ptlrpc_request *req;
4135         int rc, err;
4136
4137         ENTRY;
4138
4139         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4140                "datasync %d\n",
4141                PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4142
4143         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4144
4145         /* fsync's caller has already called _fdata{sync,write}, we want
4146          * that IO to finish before calling the osc and mdc sync methods */
4147         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4148         inode_lock(inode);
4149
4150         /* catch async errors that were recorded back when async writeback
4151          * failed for pages in this mapping. */
4152         if (!S_ISDIR(inode->i_mode)) {
4153                 err = lli->lli_async_rc;
4154                 lli->lli_async_rc = 0;
4155                 if (rc == 0)
4156                         rc = err;
4157                 if (lli->lli_clob != NULL) {
4158                         err = lov_read_and_clear_async_rc(lli->lli_clob);
4159                         if (rc == 0)
4160                                 rc = err;
4161                 }
4162         }
4163
4164         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4165         if (!rc)
4166                 rc = err;
4167         if (!err)
4168                 ptlrpc_req_finished(req);
4169
4170         if (S_ISREG(inode->i_mode)) {
4171                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4172                 bool cached;
4173
4174                 /* Sync metadata on MDT first, and then sync the cached data
4175                  * on PCC.
4176                  */
4177                 err = pcc_fsync(file, start, end, datasync, &cached);
4178                 if (!cached)
4179                         err = cl_sync_file_range(inode, start, end,
4180                                                  CL_FSYNC_ALL, 0);
4181                 if (rc == 0 && err < 0)
4182                         rc = err;
4183                 if (rc < 0)
4184                         fd->fd_write_failed = true;
4185                 else
4186                         fd->fd_write_failed = false;
4187         }
4188
4189         inode_unlock(inode);
4190         RETURN(rc);
4191 }
4192
4193 static int
4194 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4195 {
4196         struct inode *inode = file_inode(file);
4197         struct ll_sb_info *sbi = ll_i2sbi(inode);
4198         struct ldlm_enqueue_info einfo = {
4199                 .ei_type        = LDLM_FLOCK,
4200                 .ei_cb_cp       = ldlm_flock_completion_ast,
4201                 .ei_cbdata      = file_lock,
4202         };
4203         struct md_op_data *op_data;
4204         struct lustre_handle lockh = { 0 };
4205         union ldlm_policy_data flock = { { 0 } };
4206         int fl_type = file_lock->fl_type;
4207         __u64 flags = 0;
4208         int rc;
4209         int rc2 = 0;
4210         ENTRY;
4211
4212         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4213                PFID(ll_inode2fid(inode)), file_lock);
4214
4215         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4216
4217         if (file_lock->fl_flags & FL_FLOCK) {
4218                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4219                 /* flocks are whole-file locks */
4220                 flock.l_flock.end = OFFSET_MAX;
4221                 /* For flocks owner is determined by the local file desctiptor*/
4222                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4223         } else if (file_lock->fl_flags & FL_POSIX) {
4224                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4225                 flock.l_flock.start = file_lock->fl_start;
4226                 flock.l_flock.end = file_lock->fl_end;
4227         } else {
4228                 RETURN(-EINVAL);
4229         }
4230         flock.l_flock.pid = file_lock->fl_pid;
4231
4232 #if defined(HAVE_LM_COMPARE_OWNER) || defined(lm_compare_owner)
4233         /* Somewhat ugly workaround for svc lockd.
4234          * lockd installs custom fl_lmops->lm_compare_owner that checks
4235          * for the fl_owner to be the same (which it always is on local node
4236          * I guess between lockd processes) and then compares pid.
4237          * As such we assign pid to the owner field to make it all work,
4238          * conflict with normal locks is unlikely since pid space and
4239          * pointer space for current->files are not intersecting */
4240         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4241                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4242 #endif
4243
4244         switch (fl_type) {
4245         case F_RDLCK:
4246                 einfo.ei_mode = LCK_PR;
4247                 break;
4248         case F_UNLCK:
4249                 /* An unlock request may or may not have any relation to
4250                  * existing locks so we may not be able to pass a lock handle
4251                  * via a normal ldlm_lock_cancel() request. The request may even
4252                  * unlock a byte range in the middle of an existing lock. In
4253                  * order to process an unlock request we need all of the same
4254                  * information that is given with a normal read or write record
4255                  * lock request. To avoid creating another ldlm unlock (cancel)
4256                  * message we'll treat a LCK_NL flock request as an unlock. */
4257                 einfo.ei_mode = LCK_NL;
4258                 break;
4259         case F_WRLCK:
4260                 einfo.ei_mode = LCK_PW;
4261                 break;
4262         default:
4263                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4264                 RETURN (-ENOTSUPP);
4265         }
4266
4267         switch (cmd) {
4268         case F_SETLKW:
4269 #ifdef F_SETLKW64
4270         case F_SETLKW64:
4271 #endif
4272                 flags = 0;
4273                 break;
4274         case F_SETLK:
4275 #ifdef F_SETLK64
4276         case F_SETLK64:
4277 #endif
4278                 flags = LDLM_FL_BLOCK_NOWAIT;
4279                 break;
4280         case F_GETLK:
4281 #ifdef F_GETLK64
4282         case F_GETLK64:
4283 #endif
4284                 flags = LDLM_FL_TEST_LOCK;
4285                 break;
4286         default:
4287                 CERROR("unknown fcntl lock command: %d\n", cmd);
4288                 RETURN (-EINVAL);
4289         }
4290
4291         /* Save the old mode so that if the mode in the lock changes we
4292          * can decrement the appropriate reader or writer refcount. */
4293         file_lock->fl_type = einfo.ei_mode;
4294
4295         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4296                                      LUSTRE_OPC_ANY, NULL);
4297         if (IS_ERR(op_data))
4298                 RETURN(PTR_ERR(op_data));
4299
4300         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4301                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4302                flock.l_flock.pid, flags, einfo.ei_mode,
4303                flock.l_flock.start, flock.l_flock.end);
4304
4305         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4306                         flags);
4307
4308         /* Restore the file lock type if not TEST lock. */
4309         if (!(flags & LDLM_FL_TEST_LOCK))
4310                 file_lock->fl_type = fl_type;
4311
4312 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4313         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4314             !(flags & LDLM_FL_TEST_LOCK))
4315                 rc2  = locks_lock_file_wait(file, file_lock);
4316 #else
4317         if ((file_lock->fl_flags & FL_FLOCK) &&
4318             (rc == 0 || file_lock->fl_type == F_UNLCK))
4319                 rc2  = flock_lock_file_wait(file, file_lock);
4320         if ((file_lock->fl_flags & FL_POSIX) &&
4321             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4322             !(flags & LDLM_FL_TEST_LOCK))
4323                 rc2  = posix_lock_file_wait(file, file_lock);
4324 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4325
4326         if (rc2 && file_lock->fl_type != F_UNLCK) {
4327                 einfo.ei_mode = LCK_NL;
4328                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4329                            &lockh, flags);
4330                 rc = rc2;
4331         }
4332
4333         ll_finish_md_op_data(op_data);
4334
4335         RETURN(rc);
4336 }
4337
4338 int ll_get_fid_by_name(struct inode *parent, const char *name,
4339                        int namelen, struct lu_fid *fid,
4340                        struct inode **inode)
4341 {
4342         struct md_op_data       *op_data = NULL;
4343         struct mdt_body         *body;
4344         struct ptlrpc_request   *req;
4345         int                     rc;
4346         ENTRY;
4347
4348         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4349                                      LUSTRE_OPC_ANY, NULL);
4350         if (IS_ERR(op_data))
4351                 RETURN(PTR_ERR(op_data));
4352
4353         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4354         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4355         ll_finish_md_op_data(op_data);
4356         if (rc < 0)
4357                 RETURN(rc);
4358
4359         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4360         if (body == NULL)
4361                 GOTO(out_req, rc = -EFAULT);
4362         if (fid != NULL)
4363                 *fid = body->mbo_fid1;
4364
4365         if (inode != NULL)
4366                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4367 out_req:
4368         ptlrpc_req_finished(req);
4369         RETURN(rc);
4370 }
4371
4372 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4373                const char *name)
4374 {
4375         struct dentry *dchild = NULL;
4376         struct inode *child_inode = NULL;
4377         struct md_op_data *op_data;
4378         struct ptlrpc_request *request = NULL;
4379         struct obd_client_handle *och = NULL;
4380         struct qstr qstr;
4381         struct mdt_body *body;
4382         __u64 data_version = 0;
4383         size_t namelen = strlen(name);
4384         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4385         int rc;
4386         ENTRY;
4387
4388         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4389                PFID(ll_inode2fid(parent)), name,
4390                lum->lum_stripe_offset, lum->lum_stripe_count);
4391
4392         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4393             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4394                 lustre_swab_lmv_user_md(lum);
4395
4396         /* Get child FID first */
4397         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4398         qstr.name = name;
4399         qstr.len = namelen;
4400         dchild = d_lookup(file_dentry(file), &qstr);
4401         if (dchild) {
4402                 if (dchild->d_inode)
4403                         child_inode = igrab(dchild->d_inode);
4404                 dput(dchild);
4405         }
4406
4407         if (!child_inode) {
4408                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4409                                         &child_inode);
4410                 if (rc)
4411                         RETURN(rc);
4412         }
4413
4414         if (!child_inode)
4415                 RETURN(-ENOENT);
4416
4417         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4418               OBD_CONNECT2_DIR_MIGRATE)) {
4419                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4420                     ll_dir_striped(child_inode)) {
4421                         CERROR("%s: MDT doesn't support stripe directory "
4422                                "migration!\n", ll_i2sbi(parent)->ll_fsname);
4423                         GOTO(out_iput, rc = -EOPNOTSUPP);
4424                 }
4425         }
4426
4427         /*
4428          * lfs migrate command needs to be blocked on the client
4429          * by checking the migrate FID against the FID of the
4430          * filesystem root.
4431          */
4432         if (child_inode == parent->i_sb->s_root->d_inode)
4433                 GOTO(out_iput, rc = -EINVAL);
4434
4435         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4436                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4437         if (IS_ERR(op_data))
4438                 GOTO(out_iput, rc = PTR_ERR(op_data));
4439
4440         inode_lock(child_inode);
4441         op_data->op_fid3 = *ll_inode2fid(child_inode);
4442         if (!fid_is_sane(&op_data->op_fid3)) {
4443                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4444                        ll_i2sbi(parent)->ll_fsname, name,
4445                        PFID(&op_data->op_fid3));
4446                 GOTO(out_unlock, rc = -EINVAL);
4447         }
4448
4449         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4450         op_data->op_data = lum;
4451         op_data->op_data_size = lumlen;
4452
4453 again:
4454         if (S_ISREG(child_inode->i_mode)) {
4455                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4456                 if (IS_ERR(och)) {
4457                         rc = PTR_ERR(och);
4458                         och = NULL;
4459                         GOTO(out_unlock, rc);
4460                 }
4461
4462                 rc = ll_data_version(child_inode, &data_version,
4463                                      LL_DV_WR_FLUSH);
4464                 if (rc != 0)
4465                         GOTO(out_close, rc);
4466
4467                 op_data->op_open_handle = och->och_open_handle;
4468                 op_data->op_data_version = data_version;
4469                 op_data->op_lease_handle = och->och_lease_handle;
4470                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4471
4472                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4473                 och->och_mod->mod_open_req->rq_replay = 0;
4474                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4475         }
4476
4477         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4478                        name, namelen, &request);
4479         if (rc == 0) {
4480                 LASSERT(request != NULL);
4481                 ll_update_times(request, parent);
4482         }
4483
4484         if (rc == 0 || rc == -EAGAIN) {
4485                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4486                 LASSERT(body != NULL);
4487
4488                 /* If the server does release layout lock, then we cleanup
4489                  * the client och here, otherwise release it in out_close: */
4490                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4491                         obd_mod_put(och->och_mod);
4492                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4493                                                   och);
4494                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4495                         OBD_FREE_PTR(och);
4496                         och = NULL;
4497                 }
4498         }
4499
4500         if (request != NULL) {
4501                 ptlrpc_req_finished(request);
4502                 request = NULL;
4503         }
4504
4505         /* Try again if the lease has cancelled. */
4506         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4507                 goto again;
4508
4509 out_close:
4510         if (och)
4511                 ll_lease_close(och, child_inode, NULL);
4512         if (!rc)
4513                 clear_nlink(child_inode);
4514 out_unlock:
4515         inode_unlock(child_inode);
4516         ll_finish_md_op_data(op_data);
4517 out_iput:
4518         iput(child_inode);
4519         RETURN(rc);
4520 }
4521
4522 static int
4523 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4524 {
4525         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4526         ENTRY;
4527
4528         /*
4529          * In order to avoid flood of warning messages, only print one message
4530          * for one file. And the entire message rate on the client is limited
4531          * by CDEBUG_LIMIT too.
4532          */
4533         if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4534                 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4535                 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4536                              "flock disabled, mount with '-o [local]flock' to enable\r\n");
4537         }
4538         RETURN(-ENOSYS);
4539 }
4540
4541 /**
4542  * test if some locks matching bits and l_req_mode are acquired
4543  * - bits can be in different locks
4544  * - if found clear the common lock bits in *bits
4545  * - the bits not found, are kept in *bits
4546  * \param inode [IN]
4547  * \param bits [IN] searched lock bits [IN]
4548  * \param l_req_mode [IN] searched lock mode
4549  * \retval boolean, true iff all bits are found
4550  */
4551 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4552 {
4553         struct lustre_handle lockh;
4554         union ldlm_policy_data policy;
4555         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4556                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4557         struct lu_fid *fid;
4558         __u64 flags;
4559         int i;
4560         ENTRY;
4561
4562         if (!inode)
4563                RETURN(0);
4564
4565         fid = &ll_i2info(inode)->lli_fid;
4566         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4567                ldlm_lockname[mode]);
4568
4569         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4570         for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4571                 policy.l_inodebits.bits = *bits & (1 << i);
4572                 if (policy.l_inodebits.bits == 0)
4573                         continue;
4574
4575                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4576                                   &policy, mode, &lockh)) {
4577                         struct ldlm_lock *lock;
4578
4579                         lock = ldlm_handle2lock(&lockh);
4580                         if (lock) {
4581                                 *bits &=
4582                                       ~(lock->l_policy_data.l_inodebits.bits);
4583                                 LDLM_LOCK_PUT(lock);
4584                         } else {
4585                                 *bits &= ~policy.l_inodebits.bits;
4586                         }
4587                 }
4588         }
4589         RETURN(*bits == 0);
4590 }
4591
4592 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4593                                struct lustre_handle *lockh, __u64 flags,
4594                                enum ldlm_mode mode)
4595 {
4596         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4597         struct lu_fid *fid;
4598         enum ldlm_mode rc;
4599         ENTRY;
4600
4601         fid = &ll_i2info(inode)->lli_fid;
4602         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4603
4604         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4605                            fid, LDLM_IBITS, &policy, mode, lockh);
4606
4607         RETURN(rc);
4608 }
4609
4610 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4611 {
4612         /* Already unlinked. Just update nlink and return success */
4613         if (rc == -ENOENT) {
4614                 clear_nlink(inode);
4615                 /* If it is striped directory, and there is bad stripe
4616                  * Let's revalidate the dentry again, instead of returning
4617                  * error */
4618                 if (ll_dir_striped(inode))
4619                         return 0;
4620
4621                 /* This path cannot be hit for regular files unless in
4622                  * case of obscure races, so no need to to validate
4623                  * size. */
4624                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4625                         return 0;
4626         } else if (rc != 0) {
4627                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4628                              "%s: revalidate FID "DFID" error: rc = %d\n",
4629                              ll_i2sbi(inode)->ll_fsname,
4630                              PFID(ll_inode2fid(inode)), rc);
4631         }
4632
4633         return rc;
4634 }
4635
4636 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4637 {
4638         struct inode *inode = dentry->d_inode;
4639         struct obd_export *exp = ll_i2mdexp(inode);
4640         struct lookup_intent oit = {
4641                 .it_op = op,
4642         };
4643         struct ptlrpc_request *req = NULL;
4644         struct md_op_data *op_data;
4645         int rc = 0;
4646         ENTRY;
4647
4648         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4649                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4650
4651         /* Call getattr by fid, so do not provide name at all. */
4652         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4653                                      LUSTRE_OPC_ANY, NULL);
4654         if (IS_ERR(op_data))
4655                 RETURN(PTR_ERR(op_data));
4656
4657         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4658         ll_finish_md_op_data(op_data);
4659         if (rc < 0) {
4660                 rc = ll_inode_revalidate_fini(inode, rc);
4661                 GOTO(out, rc);
4662         }
4663
4664         rc = ll_revalidate_it_finish(req, &oit, dentry);
4665         if (rc != 0) {
4666                 ll_intent_release(&oit);
4667                 GOTO(out, rc);
4668         }
4669
4670         /* Unlinked? Unhash dentry, so it is not picked up later by
4671          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4672          * here to preserve get_cwd functionality on 2.6.
4673          * Bug 10503 */
4674         if (!dentry->d_inode->i_nlink) {
4675                 spin_lock(&inode->i_lock);
4676                 d_lustre_invalidate(dentry, 0);
4677                 spin_unlock(&inode->i_lock);
4678         }
4679
4680         ll_lookup_finish_locks(&oit, dentry);
4681 out:
4682         ptlrpc_req_finished(req);
4683
4684         return rc;
4685 }
4686
4687 static int ll_merge_md_attr(struct inode *inode)
4688 {
4689         struct ll_inode_info *lli = ll_i2info(inode);
4690         struct cl_attr attr = { 0 };
4691         int rc;
4692
4693         LASSERT(lli->lli_lsm_md != NULL);
4694
4695         if (!lmv_dir_striped(lli->lli_lsm_md))
4696                 RETURN(0);
4697
4698         down_read(&lli->lli_lsm_sem);
4699         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4700                            &attr, ll_md_blocking_ast);
4701         up_read(&lli->lli_lsm_sem);
4702         if (rc != 0)
4703                 RETURN(rc);
4704
4705         set_nlink(inode, attr.cat_nlink);
4706         inode->i_blocks = attr.cat_blocks;
4707         i_size_write(inode, attr.cat_size);
4708
4709         ll_i2info(inode)->lli_atime = attr.cat_atime;
4710         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4711         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4712
4713         RETURN(0);
4714 }
4715
4716 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4717 {
4718         struct inode *inode = de->d_inode;
4719         struct ll_sb_info *sbi = ll_i2sbi(inode);
4720         struct ll_inode_info *lli = ll_i2info(inode);
4721         int rc;
4722
4723         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4724
4725         rc = ll_inode_revalidate(de, IT_GETATTR);
4726         if (rc < 0)
4727                 RETURN(rc);
4728
4729         if (S_ISREG(inode->i_mode)) {
4730                 bool cached;
4731
4732                 rc = pcc_inode_getattr(inode, &cached);
4733                 if (cached && rc < 0)
4734                         RETURN(rc);
4735
4736                 /* In case of restore, the MDT has the right size and has
4737                  * already send it back without granting the layout lock,
4738                  * inode is up-to-date so glimpse is useless.
4739                  * Also to glimpse we need the layout, in case of a running
4740                  * restore the MDT holds the layout lock so the glimpse will
4741                  * block up to the end of restore (getattr will block)
4742                  */
4743                 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4744                         rc = ll_glimpse_size(inode);
4745                         if (rc < 0)
4746                                 RETURN(rc);
4747                 }
4748         } else {
4749                 /* If object isn't regular a file then don't validate size. */
4750                 if (ll_dir_striped(inode)) {
4751                         rc = ll_merge_md_attr(inode);
4752                         if (rc < 0)
4753                                 RETURN(rc);
4754                 }
4755
4756                 inode->i_atime.tv_sec = lli->lli_atime;
4757                 inode->i_mtime.tv_sec = lli->lli_mtime;
4758                 inode->i_ctime.tv_sec = lli->lli_ctime;
4759         }
4760
4761         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4762
4763         if (ll_need_32bit_api(sbi)) {
4764                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4765                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4766                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4767         } else {
4768                 stat->ino = inode->i_ino;
4769                 stat->dev = inode->i_sb->s_dev;
4770                 stat->rdev = inode->i_rdev;
4771         }
4772
4773         stat->mode = inode->i_mode;
4774         stat->uid = inode->i_uid;
4775         stat->gid = inode->i_gid;
4776         stat->atime = inode->i_atime;
4777         stat->mtime = inode->i_mtime;
4778         stat->ctime = inode->i_ctime;
4779         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4780
4781         stat->nlink = inode->i_nlink;
4782         stat->size = i_size_read(inode);
4783         stat->blocks = inode->i_blocks;
4784
4785         return 0;
4786 }
4787
/*
 * getattr inode operation.  The prototype depends on the kernel (selected
 * by HAVE_INODEOPS_ENHANCED_GETATTR); both variants simply forward the
 * dentry and the kstat to ll_getattr_dentry().
 */
#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
int ll_getattr(const struct path *path, struct kstat *stat,
               u32 request_mask, unsigned int flags)
{
        return ll_getattr_dentry(path->dentry, stat);
}
#else
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
        return ll_getattr_dentry(de, stat);
}
#endif
4799
4800 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4801                      __u64 start, __u64 len)
4802 {
4803         int             rc;
4804         size_t          num_bytes;
4805         struct fiemap   *fiemap;
4806         unsigned int    extent_count = fieinfo->fi_extents_max;
4807
4808         num_bytes = sizeof(*fiemap) + (extent_count *
4809                                        sizeof(struct fiemap_extent));
4810         OBD_ALLOC_LARGE(fiemap, num_bytes);
4811
4812         if (fiemap == NULL)
4813                 RETURN(-ENOMEM);
4814
4815         fiemap->fm_flags = fieinfo->fi_flags;
4816         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4817         fiemap->fm_start = start;
4818         fiemap->fm_length = len;
4819         if (extent_count > 0 &&
4820             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4821                            sizeof(struct fiemap_extent)) != 0)
4822                 GOTO(out, rc = -EFAULT);
4823
4824         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4825
4826         fieinfo->fi_flags = fiemap->fm_flags;
4827         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4828         if (extent_count > 0 &&
4829             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4830                          fiemap->fm_mapped_extents *
4831                          sizeof(struct fiemap_extent)) != 0)
4832                 GOTO(out, rc = -EFAULT);
4833 out:
4834         OBD_FREE_LARGE(fiemap, num_bytes);
4835         return rc;
4836 }
4837
4838 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4839 {
4840         struct ll_inode_info *lli = ll_i2info(inode);
4841         struct posix_acl *acl = NULL;
4842         ENTRY;
4843
4844         spin_lock(&lli->lli_lock);
4845         /* VFS' acl_permission_check->check_acl will release the refcount */
4846         acl = posix_acl_dup(lli->lli_posix_acl);
4847         spin_unlock(&lli->lli_lock);
4848
4849         RETURN(acl);
4850 }
4851
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * ->set_acl() inode operation: store a POSIX ACL as an extended attribute
 * through md_setxattr(), or remove the attribute when \a acl is NULL.
 *
 * \param[in] inode  inode whose ACL is changed
 * \param[in] acl    new ACL, or NULL to remove the existing one
 * \param[in] type   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0        on success (the VFS ACL cache is updated with \a acl)
 * \retval -EINVAL  unknown \a type
 * \retval -EACCES  a default ACL was supplied for a non-directory
 * \retval -ENOMEM  the xattr buffer could not be allocated
 * \retval < 0      other error from posix_acl_update_mode(),
 *                  posix_acl_to_xattr() or md_setxattr()
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req = NULL;
        const char *name = NULL;
        char *value = NULL;
        size_t value_size = 0;
        int rc = 0;
        ENTRY;

        switch (type) {
        case ACL_TYPE_ACCESS:
                name = XATTR_NAME_POSIX_ACL_ACCESS;
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
                break;

        case ACL_TYPE_DEFAULT:
                name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* default ACLs only make sense on directories; removing a
                 * non-existent one from another file type is a no-op */
                if (!S_ISDIR(inode->i_mode))
                        rc = acl ? -EACCES : 0;
                break;

        default:
                rc = -EINVAL;
                break;
        }
        /* nothing has been allocated or cached yet, so leave directly;
         * RETURN() keeps the exit paired with ENTRY above */
        if (rc)
                RETURN(rc);

        if (acl) {
                /* serialize the ACL into its xattr representation */
                value_size = posix_acl_xattr_size(acl->a_count);
                value = kmalloc(value_size, GFP_NOFS);
                if (value == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* a NULL value means "remove the xattr" */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         name, value, value_size, 0, 0, &req);

        ptlrpc_req_finished(req);
out_value:
        kfree(value);
out:
        /* keep the VFS ACL cache in step with the outcome of the update */
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4911
4912 int ll_inode_permission(struct inode *inode, int mask)
4913 {
4914         int rc = 0;
4915         struct ll_sb_info *sbi;
4916         struct root_squash_info *squash;
4917         struct cred *cred = NULL;
4918         const struct cred *old_cred = NULL;
4919         cfs_cap_t cap;
4920         bool squash_id = false;
4921         ENTRY;
4922
4923         if (mask & MAY_NOT_BLOCK)
4924                 return -ECHILD;
4925
4926        /* as root inode are NOT getting validated in lookup operation,
4927         * need to do it before permission check. */
4928
4929         if (inode == inode->i_sb->s_root->d_inode) {
4930                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4931                 if (rc)
4932                         RETURN(rc);
4933         }
4934
4935         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4936                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4937
4938         /* squash fsuid/fsgid if needed */
4939         sbi = ll_i2sbi(inode);
4940         squash = &sbi->ll_squash;
4941         if (unlikely(squash->rsi_uid != 0 &&
4942                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4943                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4944                         squash_id = true;
4945         }
4946         if (squash_id) {
4947                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4948                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4949                        squash->rsi_uid, squash->rsi_gid);
4950
4951                 /* update current process's credentials
4952                  * and FS capability */
4953                 cred = prepare_creds();
4954                 if (cred == NULL)
4955                         RETURN(-ENOMEM);
4956
4957                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4958                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4959                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4960                         if ((1 << cap) & CFS_CAP_FS_MASK)
4961                                 cap_lower(cred->cap_effective, cap);
4962                 }
4963                 old_cred = override_creds(cred);
4964         }
4965
4966         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4967         rc = generic_permission(inode, mask);
4968         /* restore current process's credentials and FS capability */
4969         if (squash_id) {
4970                 revert_creds(old_cred);
4971                 put_cred(cred);
4972         }
4973
4974         RETURN(rc);
4975 }
4976
4977 /* -o localflock - only provides locally consistent flock locks */
4978 struct file_operations ll_file_operations = {
4979 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4980 # ifdef HAVE_SYNC_READ_WRITE
4981         .read           = new_sync_read,
4982         .write          = new_sync_write,
4983 # endif
4984         .read_iter      = ll_file_read_iter,
4985         .write_iter     = ll_file_write_iter,
4986 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4987         .read           = ll_file_read,
4988         .aio_read       = ll_file_aio_read,
4989         .write          = ll_file_write,
4990         .aio_write      = ll_file_aio_write,
4991 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4992         .unlocked_ioctl = ll_file_ioctl,
4993         .open           = ll_file_open,
4994         .release        = ll_file_release,
4995         .mmap           = ll_file_mmap,
4996         .llseek         = ll_file_seek,
4997         .splice_read    = ll_file_splice_read,
4998         .fsync          = ll_fsync,
4999         .flush          = ll_flush
5000 };
5001
5002 struct file_operations ll_file_operations_flock = {
5003 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5004 # ifdef HAVE_SYNC_READ_WRITE
5005         .read           = new_sync_read,
5006         .write          = new_sync_write,
5007 # endif /* HAVE_SYNC_READ_WRITE */
5008         .read_iter      = ll_file_read_iter,
5009         .write_iter     = ll_file_write_iter,
5010 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5011         .read           = ll_file_read,
5012         .aio_read       = ll_file_aio_read,
5013         .write          = ll_file_write,
5014         .aio_write      = ll_file_aio_write,
5015 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5016         .unlocked_ioctl = ll_file_ioctl,
5017         .open           = ll_file_open,
5018         .release        = ll_file_release,
5019         .mmap           = ll_file_mmap,
5020         .llseek         = ll_file_seek,
5021         .splice_read    = ll_file_splice_read,
5022         .fsync          = ll_fsync,
5023         .flush          = ll_flush,
5024         .flock          = ll_file_flock,
5025         .lock           = ll_file_flock
5026 };
5027
5028 /* These are for -o noflock - to return ENOSYS on flock calls */
5029 struct file_operations ll_file_operations_noflock = {
5030 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5031 # ifdef HAVE_SYNC_READ_WRITE
5032         .read           = new_sync_read,
5033         .write          = new_sync_write,
5034 # endif /* HAVE_SYNC_READ_WRITE */
5035         .read_iter      = ll_file_read_iter,
5036         .write_iter     = ll_file_write_iter,
5037 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5038         .read           = ll_file_read,
5039         .aio_read       = ll_file_aio_read,
5040         .write          = ll_file_write,
5041         .aio_write      = ll_file_aio_write,
5042 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5043         .unlocked_ioctl = ll_file_ioctl,
5044         .open           = ll_file_open,
5045         .release        = ll_file_release,
5046         .mmap           = ll_file_mmap,
5047         .llseek         = ll_file_seek,
5048         .splice_read    = ll_file_splice_read,
5049         .fsync          = ll_fsync,
5050         .flush          = ll_flush,
5051         .flock          = ll_file_noflock,
5052         .lock           = ll_file_noflock
5053 };
5054
5055 struct inode_operations ll_file_inode_operations = {
5056         .setattr        = ll_setattr,
5057         .getattr        = ll_getattr,
5058         .permission     = ll_inode_permission,
5059 #ifdef HAVE_IOP_XATTR
5060         .setxattr       = ll_setxattr,
5061         .getxattr       = ll_getxattr,
5062         .removexattr    = ll_removexattr,
5063 #endif
5064         .listxattr      = ll_listxattr,
5065         .fiemap         = ll_fiemap,
5066 #ifdef HAVE_IOP_GET_ACL
5067         .get_acl        = ll_get_acl,
5068 #endif
5069 #ifdef HAVE_IOP_SET_ACL
5070         .set_acl        = ll_set_acl,
5071 #endif
5072 };
5073
5074 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5075 {
5076         struct ll_inode_info *lli = ll_i2info(inode);
5077         struct cl_object *obj = lli->lli_clob;
5078         struct lu_env *env;
5079         int rc;
5080         __u16 refcheck;
5081         ENTRY;
5082
5083         if (obj == NULL)
5084                 RETURN(0);
5085
5086         env = cl_env_get(&refcheck);
5087         if (IS_ERR(env))
5088                 RETURN(PTR_ERR(env));
5089
5090         rc = cl_conf_set(env, lli->lli_clob, conf);
5091         if (rc < 0)
5092                 GOTO(out, rc);
5093
5094         if (conf->coc_opc == OBJECT_CONF_SET) {
5095                 struct ldlm_lock *lock = conf->coc_lock;
5096                 struct cl_layout cl = {
5097                         .cl_layout_gen = 0,
5098                 };
5099
5100                 LASSERT(lock != NULL);
5101                 LASSERT(ldlm_has_layout(lock));
5102
5103                 /* it can only be allowed to match after layout is
5104                  * applied to inode otherwise false layout would be
5105                  * seen. Applying layout shoud happen before dropping
5106                  * the intent lock. */
5107                 ldlm_lock_allow_match(lock);
5108
5109                 rc = cl_object_layout_get(env, obj, &cl);
5110                 if (rc < 0)
5111                         GOTO(out, rc);
5112
5113                 CDEBUG(D_VFSTRACE,
5114                        DFID": layout version change: %u -> %u\n",
5115                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
5116                        cl.cl_layout_gen);
5117                 ll_layout_version_set(lli, cl.cl_layout_gen);
5118         }
5119
5120 out:
5121         cl_env_put(env, &refcheck);
5122
5123         RETURN(rc);
5124 }
5125
5126 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5127 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5128
5129 {
5130         struct ll_sb_info *sbi = ll_i2sbi(inode);
5131         struct ptlrpc_request *req;
5132         void *lvbdata;
5133         void *lmm;
5134         int lmmsize;
5135         int rc;
5136         ENTRY;
5137
5138         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5139                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5140                lock->l_lvb_data, lock->l_lvb_len);
5141
5142         if (lock->l_lvb_data != NULL)
5143                 RETURN(0);
5144
5145         /* if layout lock was granted right away, the layout is returned
5146          * within DLM_LVB of dlm reply; otherwise if the lock was ever
5147          * blocked and then granted via completion ast, we have to fetch
5148          * layout here. Please note that we can't use the LVB buffer in
5149          * completion AST because it doesn't have a large enough buffer */
5150         rc = ll_get_default_mdsize(sbi, &lmmsize);
5151         if (rc < 0)
5152                 RETURN(rc);
5153
5154         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5155                          XATTR_NAME_LOV, lmmsize, &req);
5156         if (rc < 0) {
5157                 if (rc == -ENODATA)
5158                         GOTO(out, rc = 0); /* empty layout */
5159                 else
5160                         RETURN(rc);
5161         }
5162
5163         lmmsize = rc;
5164         rc = 0;
5165         if (lmmsize == 0) /* empty layout */
5166                 GOTO(out, rc = 0);
5167
5168         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5169         if (lmm == NULL)
5170                 GOTO(out, rc = -EFAULT);
5171
5172         OBD_ALLOC_LARGE(lvbdata, lmmsize);
5173         if (lvbdata == NULL)
5174                 GOTO(out, rc = -ENOMEM);
5175
5176         memcpy(lvbdata, lmm, lmmsize);
5177         lock_res_and_lock(lock);
5178         if (unlikely(lock->l_lvb_data == NULL)) {
5179                 lock->l_lvb_type = LVB_T_LAYOUT;
5180                 lock->l_lvb_data = lvbdata;
5181                 lock->l_lvb_len = lmmsize;
5182                 lvbdata = NULL;
5183         }
5184         unlock_res_and_lock(lock);
5185
5186         if (lvbdata)
5187                 OBD_FREE_LARGE(lvbdata, lmmsize);
5188
5189         EXIT;
5190
5191 out:
5192         ptlrpc_req_finished(req);
5193         return rc;
5194 }
5195
5196 /**
5197  * Apply the layout to the inode. Layout lock is held and will be released
5198  * in this function.
5199  */
5200 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5201                               struct inode *inode)
5202 {
5203         struct ll_inode_info *lli = ll_i2info(inode);
5204         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5205         struct ldlm_lock *lock;
5206         struct cl_object_conf conf;
5207         int rc = 0;
5208         bool lvb_ready;
5209         bool wait_layout = false;
5210         ENTRY;
5211
5212         LASSERT(lustre_handle_is_used(lockh));
5213
5214         lock = ldlm_handle2lock(lockh);
5215         LASSERT(lock != NULL);
5216         LASSERT(ldlm_has_layout(lock));
5217
5218         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5219                    PFID(&lli->lli_fid), inode);
5220
5221         /* in case this is a caching lock and reinstate with new inode */
5222         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5223
5224         lock_res_and_lock(lock);
5225         lvb_ready = ldlm_is_lvb_ready(lock);
5226         unlock_res_and_lock(lock);
5227
5228         /* checking lvb_ready is racy but this is okay. The worst case is
5229          * that multi processes may configure the file on the same time. */
5230         if (lvb_ready)
5231                 GOTO(out, rc = 0);
5232
5233         rc = ll_layout_fetch(inode, lock);
5234         if (rc < 0)
5235                 GOTO(out, rc);
5236
5237         /* for layout lock, lmm is stored in lock's lvb.
5238          * lvb_data is immutable if the lock is held so it's safe to access it
5239          * without res lock.
5240          *
5241          * set layout to file. Unlikely this will fail as old layout was
5242          * surely eliminated */
5243         memset(&conf, 0, sizeof conf);
5244         conf.coc_opc = OBJECT_CONF_SET;
5245         conf.coc_inode = inode;
5246         conf.coc_lock = lock;
5247         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5248         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5249         rc = ll_layout_conf(inode, &conf);
5250
5251         /* refresh layout failed, need to wait */
5252         wait_layout = rc == -EBUSY;
5253         EXIT;
5254 out:
5255         LDLM_LOCK_PUT(lock);
5256         ldlm_lock_decref(lockh, mode);
5257
5258         /* wait for IO to complete if it's still being used. */
5259         if (wait_layout) {
5260                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5261                        sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5262
5263                 memset(&conf, 0, sizeof conf);
5264                 conf.coc_opc = OBJECT_CONF_WAIT;
5265                 conf.coc_inode = inode;
5266                 rc = ll_layout_conf(inode, &conf);
5267                 if (rc == 0)
5268                         rc = -EAGAIN;
5269
5270                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5271                        sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5272         }
5273         RETURN(rc);
5274 }
5275
5276 /**
5277  * Issue layout intent RPC to MDS.
5278  * \param inode [in]    file inode
5279  * \param intent [in]   layout intent
5280  *
5281  * \retval 0    on success
5282  * \retval < 0  error code
5283  */
5284 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5285 {
5286         struct ll_inode_info  *lli = ll_i2info(inode);
5287         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5288         struct md_op_data     *op_data;
5289         struct lookup_intent it;
5290         struct ptlrpc_request *req;
5291         int rc;
5292         ENTRY;
5293
5294         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5295                                      0, 0, LUSTRE_OPC_ANY, NULL);
5296         if (IS_ERR(op_data))
5297                 RETURN(PTR_ERR(op_data));
5298
5299         op_data->op_data = intent;
5300         op_data->op_data_size = sizeof(*intent);
5301
5302         memset(&it, 0, sizeof(it));
5303         it.it_op = IT_LAYOUT;
5304         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5305             intent->li_opc == LAYOUT_INTENT_TRUNC)
5306                 it.it_flags = FMODE_WRITE;
5307
5308         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5309                           sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5310
5311         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5312                             &ll_md_blocking_ast, 0);
5313         if (it.it_request != NULL)
5314                 ptlrpc_req_finished(it.it_request);
5315         it.it_request = NULL;
5316
5317         ll_finish_md_op_data(op_data);
5318
5319         /* set lock data in case this is a new lock */
5320         if (!rc)
5321                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5322
5323         ll_intent_drop_lock(&it);
5324
5325         RETURN(rc);
5326 }
5327
5328 /**
5329  * This function checks if there exists a LAYOUT lock on the client side,
5330  * or enqueues it if it doesn't have one in cache.
5331  *
5332  * This function will not hold layout lock so it may be revoked any time after
5333  * this function returns. Any operations depend on layout should be redone
5334  * in that case.
5335  *
5336  * This function should be called before lov_io_init() to get an uptodate
5337  * layout version, the caller should save the version number and after IO
5338  * is finished, this function should be called again to verify that layout
5339  * is not changed during IO time.
5340  */
5341 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5342 {
5343         struct ll_inode_info    *lli = ll_i2info(inode);
5344         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5345         struct lustre_handle lockh;
5346         struct layout_intent intent = {
5347                 .li_opc = LAYOUT_INTENT_ACCESS,
5348         };
5349         enum ldlm_mode mode;
5350         int rc;
5351         ENTRY;
5352
5353         *gen = ll_layout_version_get(lli);
5354         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5355                 RETURN(0);
5356
5357         /* sanity checks */
5358         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5359         LASSERT(S_ISREG(inode->i_mode));
5360
5361         /* take layout lock mutex to enqueue layout lock exclusively. */
5362         mutex_lock(&lli->lli_layout_mutex);
5363
5364         while (1) {
5365                 /* mostly layout lock is caching on the local side, so try to
5366                  * match it before grabbing layout lock mutex. */
5367                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5368                                        LCK_CR | LCK_CW | LCK_PR |
5369                                        LCK_PW | LCK_EX);
5370                 if (mode != 0) { /* hit cached lock */
5371                         rc = ll_layout_lock_set(&lockh, mode, inode);
5372                         if (rc == -EAGAIN)
5373                                 continue;
5374                         break;
5375                 }
5376
5377                 rc = ll_layout_intent(inode, &intent);
5378                 if (rc != 0)
5379                         break;
5380         }
5381
5382         if (rc == 0)
5383                 *gen = ll_layout_version_get(lli);
5384         mutex_unlock(&lli->lli_layout_mutex);
5385
5386         RETURN(rc);
5387 }
5388
5389 /**
5390  * Issue layout intent RPC indicating where in a file an IO is about to write.
5391  *
5392  * \param[in] inode     file inode.
5393  * \param[in] ext       write range with start offset of fille in bytes where
5394  *                      an IO is about to write, and exclusive end offset in
5395  *                      bytes.
5396  *
5397  * \retval 0    on success
5398  * \retval < 0  error code
5399  */
5400 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5401                            struct lu_extent *ext)
5402 {
5403         struct layout_intent intent = {
5404                 .li_opc = opc,
5405                 .li_extent.e_start = ext->e_start,
5406                 .li_extent.e_end = ext->e_end,
5407         };
5408         int rc;
5409         ENTRY;
5410
5411         rc = ll_layout_intent(inode, &intent);
5412
5413         RETURN(rc);
5414 }
5415
5416 /**
5417  *  This function send a restore request to the MDT
5418  */
5419 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5420 {
5421         struct hsm_user_request *hur;
5422         int                      len, rc;
5423         ENTRY;
5424
5425         len = sizeof(struct hsm_user_request) +
5426               sizeof(struct hsm_user_item);
5427         OBD_ALLOC(hur, len);
5428         if (hur == NULL)
5429                 RETURN(-ENOMEM);
5430
5431         hur->hur_request.hr_action = HUA_RESTORE;
5432         hur->hur_request.hr_archive_id = 0;
5433         hur->hur_request.hr_flags = 0;
5434         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5435                sizeof(hur->hur_user_item[0].hui_fid));
5436         hur->hur_user_item[0].hui_extent.offset = offset;
5437         hur->hur_user_item[0].hui_extent.length = length;
5438         hur->hur_request.hr_itemcount = 1;
5439         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5440                            len, hur, NULL);
5441         OBD_FREE(hur, len);
5442         RETURN(rc);
5443 }