lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 struct pcc_param {
  62         __u64   pa_data_version;
  63         __u32   pa_archive_id;
  64         __u32   pa_layout_gen;
  65 };
  66
  67 static int
  68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  69
  70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  71                           bool *lease_broken);
  72
  73 static struct ll_file_data *ll_file_data_get(void)
  74 {
  75         struct ll_file_data *fd;
  76
  77         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  78         if (fd == NULL)
  79                 return NULL;
  80
  81         fd->fd_write_failed = false;
  82         pcc_file_init(&fd->fd_pcc_file);
  83
  84         return fd;
  85 }
  86
  87 static void ll_file_data_put(struct ll_file_data *fd)
  88 {
  89         if (fd != NULL)
  90                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  91 }
  92
  93 /**
  94  * Packs all the attributes into @op_data for the CLOSE rpc.
  95  */
  96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  97                              struct obd_client_handle *och)
  98 {
  99         ENTRY;
 100
 101         ll_prep_md_op_data(op_data, inode, NULL, NULL,
 102                            0, 0, LUSTRE_OPC_ANY, NULL);
 103
 104         op_data->op_attr.ia_mode = inode->i_mode;
 105         op_data->op_attr.ia_atime = inode->i_atime;
 106         op_data->op_attr.ia_mtime = inode->i_mtime;
 107         op_data->op_attr.ia_ctime = inode->i_ctime;
 108         op_data->op_attr.ia_size = i_size_read(inode);
 109         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 110                                       ATTR_MTIME | ATTR_MTIME_SET |
 111                                       ATTR_CTIME);
 112         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 113         op_data->op_attr_blocks = inode->i_blocks;
 114         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 115         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 116                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 117         op_data->op_open_handle = och->och_open_handle;
 118
 119         if (och->och_flags & FMODE_WRITE &&
 120             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 121                 /* For HSM: if inode data has been modified, pack it so that
 122                  * MDT can set data dirty flag in the archive. */
 123                 op_data->op_bias |= MDS_DATA_MODIFIED;
 124
 125         EXIT;
 126 }
 127
 128 /**
 129  * Perform a close, possibly with a bias.
 130  * The meaning of "data" depends on the value of "bias".
 131  *
 132  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 133  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 134  * swap layouts with.
 135  */
 136 static int ll_close_inode_openhandle(struct inode *inode,
 137                                      struct obd_client_handle *och,
 138                                      enum mds_op_bias bias, void *data)
 139 {
 140         struct obd_export *md_exp = ll_i2mdexp(inode);
 141         const struct ll_inode_info *lli = ll_i2info(inode);
 142         struct md_op_data *op_data;
 143         struct ptlrpc_request *req = NULL;
 144         int rc;
 145         ENTRY;
 146
 147         if (class_exp2obd(md_exp) == NULL) {
 148                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 149                        ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
 150                 GOTO(out, rc = 0);
 151         }
 152
 153         OBD_ALLOC_PTR(op_data);
 154         /* We leak openhandle and request here on error, but not much to be
 155          * done in OOM case since app won't retry close on error either. */
 156         if (op_data == NULL)
 157                 GOTO(out, rc = -ENOMEM);
 158
 159         ll_prepare_close(inode, op_data, och);
 160         switch (bias) {
 161         case MDS_CLOSE_LAYOUT_MERGE:
 162                 /* merge blocks from the victim inode */
 163                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 164                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 165                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 166         case MDS_CLOSE_LAYOUT_SPLIT:
 167         case MDS_CLOSE_LAYOUT_SWAP: {
 168                 struct split_param *sp = data;
 169
 170                 LASSERT(data != NULL);
 171                 op_data->op_bias |= bias;
 172                 op_data->op_data_version = 0;
 173                 op_data->op_lease_handle = och->och_lease_handle;
 174                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 175                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 176                         op_data->op_mirror_id = sp->sp_mirror_id;
 177                 } else {
 178                         op_data->op_fid2 = *ll_inode2fid(data);
 179                 }
 180                 break;
 181         }
 182
 183         case MDS_CLOSE_RESYNC_DONE: {
 184                 struct ll_ioc_lease *ioc = data;
 185
 186                 LASSERT(data != NULL);
 187                 op_data->op_attr_blocks +=
 188                         ioc->lil_count * op_data->op_attr_blocks;
 189                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 190                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 191                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 192
 193                 op_data->op_lease_handle = och->och_lease_handle;
 194                 op_data->op_data = &ioc->lil_ids[0];
 195                 op_data->op_data_size =
 196                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 197                 break;
 198         }
 199
 200         case MDS_PCC_ATTACH: {
 201                 struct pcc_param *param = data;
 202
 203                 LASSERT(data != NULL);
 204                 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
 205                 op_data->op_archive_id = param->pa_archive_id;
 206                 op_data->op_data_version = param->pa_data_version;
 207                 op_data->op_lease_handle = och->och_lease_handle;
 208                 break;
 209         }
 210
 211         case MDS_HSM_RELEASE:
 212                 LASSERT(data != NULL);
 213                 op_data->op_bias |= MDS_HSM_RELEASE;
 214                 op_data->op_data_version = *(__u64 *)data;
 215                 op_data->op_lease_handle = och->och_lease_handle;
 216                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 217                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 218                 break;
 219
 220         default:
 221                 LASSERT(data == NULL);
 222                 break;
 223         }
 224
 225         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 226                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 227         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 228                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 229
 230         rc = md_close(md_exp, op_data, och->och_mod, &req);
 231         if (rc != 0 && rc != -EINTR)
 232                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 233                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 234
 235         if (rc == 0 && op_data->op_bias & bias) {
 236                 struct mdt_body *body;
 237
 238                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 239                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 240                         rc = -EBUSY;
 241
 242                 if (bias & MDS_PCC_ATTACH) {
 243                         struct pcc_param *param = data;
 244
 245                         param->pa_layout_gen = body->mbo_layout_gen;
 246                 }
 247         }
 248
 249         ll_finish_md_op_data(op_data);
 250         EXIT;
 251 out:
 252
 253         md_clear_open_replay_data(md_exp, och);
 254         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 255         OBD_FREE_PTR(och);
 256
 257         ptlrpc_req_finished(req);       /* This is close request */
 258         return rc;
 259 }
 260
 261 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 262 {
 263         struct ll_inode_info *lli = ll_i2info(inode);
 264         struct obd_client_handle **och_p;
 265         struct obd_client_handle *och;
 266         __u64 *och_usecount;
 267         int rc = 0;
 268         ENTRY;
 269
 270         if (fmode & FMODE_WRITE) {
 271                 och_p = &lli->lli_mds_write_och;
 272                 och_usecount = &lli->lli_open_fd_write_count;
 273         } else if (fmode & FMODE_EXEC) {
 274                 och_p = &lli->lli_mds_exec_och;
 275                 och_usecount = &lli->lli_open_fd_exec_count;
 276         } else {
 277                 LASSERT(fmode & FMODE_READ);
 278                 och_p = &lli->lli_mds_read_och;
 279                 och_usecount = &lli->lli_open_fd_read_count;
 280         }
 281
 282         mutex_lock(&lli->lli_och_mutex);
 283         if (*och_usecount > 0) {
 284                 /* There are still users of this handle, so skip
 285                  * freeing it. */
 286                 mutex_unlock(&lli->lli_och_mutex);
 287                 RETURN(0);
 288         }
 289
 290         och = *och_p;
 291         *och_p = NULL;
 292         mutex_unlock(&lli->lli_och_mutex);
 293
 294         if (och != NULL) {
 295                 /* There might be a race and this handle may already
 296                  * be closed. */
 297                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 298         }
 299
 300         RETURN(rc);
 301 }
 302
 303 static int ll_md_close(struct inode *inode, struct file *file)
 304 {
 305         union ldlm_policy_data policy = {
 306                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 307         };
 308         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 309         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 310         struct ll_inode_info *lli = ll_i2info(inode);
 311         struct lustre_handle lockh;
 312         enum ldlm_mode lockmode;
 313         int rc = 0;
 314         ENTRY;
 315
 316         /* clear group lock, if present */
 317         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 318                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 319
 320         if (fd->fd_lease_och != NULL) {
 321                 bool lease_broken;
 322
 323                 /* Usually the lease is not released when the
 324                  * application crashed, we need to release here. */
 325                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 326                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 327                         PFID(&lli->lli_fid), rc, lease_broken);
 328
 329                 fd->fd_lease_och = NULL;
 330         }
 331
 332         if (fd->fd_och != NULL) {
 333                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 334                 fd->fd_och = NULL;
 335                 GOTO(out, rc);
 336         }
 337
 338         /* Let's see if we have good enough OPEN lock on the file and if
 339            we can skip talking to MDS */
 340         mutex_lock(&lli->lli_och_mutex);
 341         if (fd->fd_omode & FMODE_WRITE) {
 342                 lockmode = LCK_CW;
 343                 LASSERT(lli->lli_open_fd_write_count);
 344                 lli->lli_open_fd_write_count--;
 345         } else if (fd->fd_omode & FMODE_EXEC) {
 346                 lockmode = LCK_PR;
 347                 LASSERT(lli->lli_open_fd_exec_count);
 348                 lli->lli_open_fd_exec_count--;
 349         } else {
 350                 lockmode = LCK_CR;
 351                 LASSERT(lli->lli_open_fd_read_count);
 352                 lli->lli_open_fd_read_count--;
 353         }
 354         mutex_unlock(&lli->lli_och_mutex);
 355
 356         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 357                            LDLM_IBITS, &policy, lockmode, &lockh))
 358                 rc = ll_md_real_close(inode, fd->fd_omode);
 359
 360 out:
 361         LUSTRE_FPRIVATE(file) = NULL;
 362         ll_file_data_put(fd);
 363
 364         RETURN(rc);
 365 }
 366
 367 /* While this returns an error code, fput() the caller does not, so we need
 368  * to make every effort to clean up all of our state here.  Also, applications
 369  * rarely check close errors and even if an error is returned they will not
 370  * re-try the close call.
 371  */
 372 int ll_file_release(struct inode *inode, struct file *file)
 373 {
 374         struct ll_file_data *fd;
 375         struct ll_sb_info *sbi = ll_i2sbi(inode);
 376         struct ll_inode_info *lli = ll_i2info(inode);
 377         int rc;
 378         ENTRY;
 379
 380         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 381                PFID(ll_inode2fid(inode)), inode);
 382
 383         if (inode->i_sb->s_root != file_dentry(file))
 384                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 385         fd = LUSTRE_FPRIVATE(file);
 386         LASSERT(fd != NULL);
 387
 388         /* The last ref on @file, maybe not the the owner pid of statahead,
 389          * because parent and child process can share the same file handle. */
 390         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 391                 ll_deauthorize_statahead(inode, fd);
 392
 393         if (inode->i_sb->s_root == file_dentry(file)) {
 394                 LUSTRE_FPRIVATE(file) = NULL;
 395                 ll_file_data_put(fd);
 396                 RETURN(0);
 397         }
 398
 399         pcc_file_release(inode, file);
 400
 401         if (!S_ISDIR(inode->i_mode)) {
 402                 if (lli->lli_clob != NULL)
 403                         lov_read_and_clear_async_rc(lli->lli_clob);
 404                 lli->lli_async_rc = 0;
 405         }
 406
 407         rc = ll_md_close(inode, file);
 408
 409         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 410                 libcfs_debug_dumplog();
 411
 412         RETURN(rc);
 413 }
 414
 415 static inline int ll_dom_readpage(void *data, struct page *page)
 416 {
 417         struct niobuf_local *lnb = data;
 418         void *kaddr;
 419
 420         kaddr = ll_kmap_atomic(page, KM_USER0);
 421         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 422         if (lnb->lnb_len < PAGE_SIZE)
 423                 memset(kaddr + lnb->lnb_len, 0,
 424                        PAGE_SIZE - lnb->lnb_len);
 425         flush_dcache_page(page);
 426         SetPageUptodate(page);
 427         ll_kunmap_atomic(kaddr, KM_USER0);
 428         unlock_page(page);
 429
 430         return 0;
 431 }
 432
 433 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 434                         struct lookup_intent *it)
 435 {
 436         struct ll_inode_info *lli = ll_i2info(inode);
 437         struct cl_object *obj = lli->lli_clob;
 438         struct address_space *mapping = inode->i_mapping;
 439         struct page *vmpage;
 440         struct niobuf_remote *rnb;
 441         struct mdt_body *body;
 442         char *data;
 443         unsigned long index, start;
 444         struct niobuf_local lnb;
 445
 446         ENTRY;
 447
 448         if (obj == NULL)
 449                 RETURN_EXIT;
 450
 451         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 452                                    RCL_SERVER))
 453                 RETURN_EXIT;
 454
 455         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 456         if (rnb == NULL || rnb->rnb_len == 0)
 457                 RETURN_EXIT;
 458
 459         /* LU-11595: Server may return whole file and that is OK always or
 460          * it may return just file tail and its offset must be aligned with
 461          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 462          * smaller then offset may be not aligned and that data is just ignored.
 463          */
 464         if (rnb->rnb_offset % PAGE_SIZE)
 465                 RETURN_EXIT;
 466
 467         /* Server returns whole file or just file tail if it fills in reply
 468          * buffer, in both cases total size should be equal to the file size.
 469          */
 470         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 471         if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
 472                 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
 473                        ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
 474                        rnb->rnb_len, body->mbo_dom_size);
 475                 RETURN_EXIT;
 476         }
 477
 478         CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
 479                rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
 480
 481         data = (char *)rnb + sizeof(*rnb);
 482
 483         lnb.lnb_file_offset = rnb->rnb_offset;
 484         start = lnb.lnb_file_offset / PAGE_SIZE;
 485         index = 0;
 486         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 487         lnb.lnb_page_offset = 0;
 488         do {
 489                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 490                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 491                 if (lnb.lnb_len > PAGE_SIZE)
 492                         lnb.lnb_len = PAGE_SIZE;
 493
 494                 vmpage = read_cache_page(mapping, index + start,
 495                                          ll_dom_readpage, &lnb);
 496                 if (IS_ERR(vmpage)) {
 497                         CWARN("%s: cannot fill page %lu for "DFID
 498                               " with data: rc = %li\n",
 499                               ll_i2sbi(inode)->ll_fsname, index + start,
 500                               PFID(lu_object_fid(&obj->co_lu)),
 501                               PTR_ERR(vmpage));
 502                         break;
 503                 }
 504                 put_page(vmpage);
 505                 index++;
 506         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 507         EXIT;
 508 }
 509
 510 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 511                                 struct lookup_intent *itp)
 512 {
 513         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 514         struct dentry *parent = de->d_parent;
 515         char *name = NULL;
 516         int len = 0;
 517         struct md_op_data *op_data;
 518         struct ptlrpc_request *req = NULL;
 519         int rc;
 520         ENTRY;
 521
 522         LASSERT(parent != NULL);
 523         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 524
 525         /* if server supports open-by-fid, or file name is invalid, don't pack
 526          * name in open request */
 527         if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
 528             !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
 529 retry:
 530                 len = de->d_name.len;
 531                 name = kmalloc(len + 1, GFP_NOFS);
 532                 if (!name)
 533                         RETURN(-ENOMEM);
 534
 535                 /* race here */
 536                 spin_lock(&de->d_lock);
 537                 if (len != de->d_name.len) {
 538                         spin_unlock(&de->d_lock);
 539                         kfree(name);
 540                         goto retry;
 541                 }
 542                 memcpy(name, de->d_name.name, len);
 543                 name[len] = '\0';
 544                 spin_unlock(&de->d_lock);
 545
 546                 if (!lu_name_is_valid_2(name, len)) {
 547                         kfree(name);
 548                         RETURN(-ESTALE);
 549                 }
 550         }
 551
 552         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 553                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 554         if (IS_ERR(op_data)) {
 555                 kfree(name);
 556                 RETURN(PTR_ERR(op_data));
 557         }
 558         op_data->op_data = lmm;
 559         op_data->op_data_size = lmmsize;
 560
 561         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 562                             &ll_md_blocking_ast, 0);
 563         kfree(name);
 564         ll_finish_md_op_data(op_data);
 565         if (rc == -ESTALE) {
 566                 /* reason for keep own exit path - don`t flood log
 567                  * with messages with -ESTALE errors.
 568                  */
 569                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 570                      it_open_error(DISP_OPEN_OPEN, itp))
 571                         GOTO(out, rc);
 572                 ll_release_openhandle(de, itp);
 573                 GOTO(out, rc);
 574         }
 575
 576         if (it_disposition(itp, DISP_LOOKUP_NEG))
 577                 GOTO(out, rc = -ENOENT);
 578
 579         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 580                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 581                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 582                 GOTO(out, rc);
 583         }
 584
 585         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 586
 587         if (!rc && itp->it_lock_mode) {
 588                 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
 589                 struct ldlm_lock *lock;
 590                 bool has_dom_bit = false;
 591
 592                 /* If we got a lock back and it has a LOOKUP bit set,
 593                  * make sure the dentry is marked as valid so we can find it.
 594                  * We don't need to care about actual hashing since other bits
 595                  * of kernel will deal with that later.
 596                  */
 597                 lock = ldlm_handle2lock(&handle);
 598                 if (lock) {
 599                         has_dom_bit = ldlm_has_dom(lock);
 600                         if (lock->l_policy_data.l_inodebits.bits &
 601                             MDS_INODELOCK_LOOKUP)
 602                                 d_lustre_revalidate(de);
 603
 604                         LDLM_LOCK_PUT(lock);
 605                 }
 606                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 607                 if (has_dom_bit)
 608                         ll_dom_finish_open(de->d_inode, req, itp);
 609         }
 610
 611 out:
 612         ptlrpc_req_finished(req);
 613         ll_intent_drop_lock(itp);
 614
 615         /* We did open by fid, but by the time we got to the server,
 616          * the object disappeared. If this is a create, we cannot really
 617          * tell the userspace that the file it was trying to create
 618          * does not exist. Instead let's return -ESTALE, and the VFS will
 619          * retry the create with LOOKUP_REVAL that we are going to catch
 620          * in ll_revalidate_dentry() and use lookup then.
 621          */
 622         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 623                 rc = -ESTALE;
 624
 625         RETURN(rc);
 626 }
 627
 628 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 629                        struct obd_client_handle *och)
 630 {
 631         struct mdt_body *body;
 632
 633         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 634         och->och_open_handle = body->mbo_open_handle;
 635         och->och_fid = body->mbo_fid1;
 636         och->och_lease_handle.cookie = it->it_lock_handle;
 637         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 638         och->och_flags = it->it_flags;
 639
 640         return md_set_open_replay_data(md_exp, och, it);
 641 }
 642
 643 static int ll_local_open(struct file *file, struct lookup_intent *it,
 644                          struct ll_file_data *fd, struct obd_client_handle *och)
 645 {
 646         struct inode *inode = file_inode(file);
 647         ENTRY;
 648
 649         LASSERT(!LUSTRE_FPRIVATE(file));
 650
 651         LASSERT(fd != NULL);
 652
 653         if (och) {
 654                 int rc;
 655
 656                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 657                 if (rc != 0)
 658                         RETURN(rc);
 659         }
 660
 661         LUSTRE_FPRIVATE(file) = fd;
 662         ll_readahead_init(inode, &fd->fd_ras);
 663         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 664
 665         /* ll_cl_context initialize */
 666         rwlock_init(&fd->fd_lock);
 667         INIT_LIST_HEAD(&fd->fd_lccs);
 668
 669         RETURN(0);
 670 }
 671
 672 /* Open a file, and (for the very first open) create objects on the OSTs at
 673  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 674  * creation or open until ll_lov_setstripe() ioctl is called.
 675  *
 676  * If we already have the stripe MD locally then we don't request it in
 677  * md_open(), by passing a lmm_size = 0.
 678  *
 679  * It is up to the application to ensure no other processes open this file
 680  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 681  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 682  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 683  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 684  */
 685 int ll_file_open(struct inode *inode, struct file *file)
 686 {
 687         struct ll_inode_info *lli = ll_i2info(inode);
 688         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 689                                           .it_flags = file->f_flags };
 690         struct obd_client_handle **och_p = NULL;
 691         __u64 *och_usecount = NULL;
 692         struct ll_file_data *fd;
 693         int rc = 0;
 694         ENTRY;
 695
 696         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 697                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 698
 699         it = file->private_data; /* XXX: compat macro */
 700         file->private_data = NULL; /* prevent ll_local_open assertion */
 701
 702         fd = ll_file_data_get();
 703         if (fd == NULL)
 704                 GOTO(out_nofiledata, rc = -ENOMEM);
 705
 706         fd->fd_file = file;
 707         if (S_ISDIR(inode->i_mode))
 708                 ll_authorize_statahead(inode, fd);
 709
 710         if (inode->i_sb->s_root == file_dentry(file)) {
 711                 LUSTRE_FPRIVATE(file) = fd;
 712                 RETURN(0);
 713         }
 714
 715         if (!it || !it->it_disposition) {
 716                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 717                  * because everything but O_ACCMODE mask was stripped from
 718                  * there */
 719                 if ((oit.it_flags + 1) & O_ACCMODE)
 720                         oit.it_flags++;
 721                 if (file->f_flags & O_TRUNC)
 722                         oit.it_flags |= FMODE_WRITE;
 723
 724                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 725                  * dentry_open after call to open_namei that checks permissions.
 726                  * Only nfsd_open call dentry_open directly without checking
 727                  * permissions and because of that this code below is safe.
 728                  */
 729                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 730                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 731
 732                 /* We do not want O_EXCL here, presumably we opened the file
 733                  * already? XXX - NFS implications? */
 734                 oit.it_flags &= ~O_EXCL;
 735
 736                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 737                  * created if necessary, then "IT_CREAT" should be set to keep
 738                  * consistent with it */
 739                 if (oit.it_flags & O_CREAT)
 740                         oit.it_op |= IT_CREAT;
 741
 742                 it = &oit;
 743         }
 744
 745 restart:
 746         /* Let's see if we have file open on MDS already. */
 747         if (it->it_flags & FMODE_WRITE) {
 748                 och_p = &lli->lli_mds_write_och;
 749                 och_usecount = &lli->lli_open_fd_write_count;
 750         } else if (it->it_flags & FMODE_EXEC) {
 751                 och_p = &lli->lli_mds_exec_och;
 752                 och_usecount = &lli->lli_open_fd_exec_count;
 753          } else {
 754                 och_p = &lli->lli_mds_read_och;
 755                 och_usecount = &lli->lli_open_fd_read_count;
 756         }
 757
 758         mutex_lock(&lli->lli_och_mutex);
 759         if (*och_p) { /* Open handle is present */
 760                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 761                         /* Well, there's extra open request that we do not need,
 762                            let's close it somehow. This will decref request. */
 763                         rc = it_open_error(DISP_OPEN_OPEN, it);
 764                         if (rc) {
 765                                 mutex_unlock(&lli->lli_och_mutex);
 766                                 GOTO(out_openerr, rc);
 767                         }
 768
 769                         ll_release_openhandle(file_dentry(file), it);
 770                 }
 771                 (*och_usecount)++;
 772
 773                 rc = ll_local_open(file, it, fd, NULL);
 774                 if (rc) {
 775                         (*och_usecount)--;
 776                         mutex_unlock(&lli->lli_och_mutex);
 777                         GOTO(out_openerr, rc);
 778                 }
 779         } else {
 780                 LASSERT(*och_usecount == 0);
 781                 if (!it->it_disposition) {
 782                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 783                         /* We cannot just request lock handle now, new ELC code
 784                            means that one of other OPEN locks for this file
 785                            could be cancelled, and since blocking ast handler
 786                            would attempt to grab och_mutex as well, that would
 787                            result in a deadlock */
 788                         mutex_unlock(&lli->lli_och_mutex);
 789                         /*
 790                          * Normally called under two situations:
 791                          * 1. NFS export.
 792                          * 2. A race/condition on MDS resulting in no open
 793                          *    handle to be returned from LOOKUP|OPEN request,
 794                          *    for example if the target entry was a symlink.
 795                          *
 796                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 797                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 798                          *  bit so that it's not confusing later callers.
 799                          *
 800                          *  NB; when ldd is NULL, it must have come via normal
 801                          *  lookup path only, since ll_iget_for_nfs always calls
 802                          *  ll_d_init().
 803                          */
 804                         if (ldd && ldd->lld_nfs_dentry) {
 805                                 ldd->lld_nfs_dentry = 0;
 806                                 it->it_flags |= MDS_OPEN_LOCK;
 807                         }
 808
 809                          /*
 810                          * Always specify MDS_OPEN_BY_FID because we don't want
 811                          * to get file with different fid.
 812                          */
 813                         it->it_flags |= MDS_OPEN_BY_FID;
 814                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 815                                                  it);
 816                         if (rc)
 817                                 GOTO(out_openerr, rc);
 818
 819                         goto restart;
 820                 }
 821                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 822                 if (!*och_p)
 823                         GOTO(out_och_free, rc = -ENOMEM);
 824
 825                 (*och_usecount)++;
 826
 827                 /* md_intent_lock() didn't get a request ref if there was an
 828                  * open error, so don't do cleanup on the request here
 829                  * (bug 3430) */
 830                 /* XXX (green): Should not we bail out on any error here, not
 831                  * just open error? */
 832                 rc = it_open_error(DISP_OPEN_OPEN, it);
 833                 if (rc != 0)
 834                         GOTO(out_och_free, rc);
 835
 836                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 837                          "inode %p: disposition %x, status %d\n", inode,
 838                          it_disposition(it, ~0), it->it_status);
 839
 840                 rc = ll_local_open(file, it, fd, *och_p);
 841                 if (rc)
 842                         GOTO(out_och_free, rc);
 843         }
 844
 845         rc = pcc_file_open(inode, file);
 846         if (rc)
 847                 GOTO(out_och_free, rc);
 848
 849         mutex_unlock(&lli->lli_och_mutex);
 850         fd = NULL;
 851
 852         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 853            different kind of OPEN lock for this same inode gets cancelled
 854            by ldlm_cancel_lru */
 855         if (!S_ISREG(inode->i_mode))
 856                 GOTO(out_och_free, rc);
 857
 858         cl_lov_delay_create_clear(&file->f_flags);
 859         GOTO(out_och_free, rc);
 860
 861 out_och_free:
 862         if (rc) {
 863                 if (och_p && *och_p) {
 864                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 865                         *och_p = NULL; /* OBD_FREE writes some magic there */
 866                         (*och_usecount)--;
 867                 }
 868                 mutex_unlock(&lli->lli_och_mutex);
 869
 870 out_openerr:
 871                 if (lli->lli_opendir_key == fd)
 872                         ll_deauthorize_statahead(inode, fd);
 873
 874                 if (fd != NULL)
 875                         ll_file_data_put(fd);
 876         } else {
 877                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 878         }
 879
 880 out_nofiledata:
 881         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 882                 ptlrpc_req_finished(it->it_request);
 883                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 884         }
 885
 886         return rc;
 887 }
 888
 889 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 890                         struct ldlm_lock_desc *desc, void *data, int flag)
 891 {
 892         int rc;
 893         struct lustre_handle lockh;
 894         ENTRY;
 895
 896         switch (flag) {
 897         case LDLM_CB_BLOCKING:
 898                 ldlm_lock2handle(lock, &lockh);
 899                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 900                 if (rc < 0) {
 901                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 902                         RETURN(rc);
 903                 }
 904                 break;
 905         case LDLM_CB_CANCELING:
 906                 /* do nothing */
 907                 break;
 908         }
 909         RETURN(0);
 910 }
 911
 912 /**
 913  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 914  * and save it as fd->fd_och so as to force client to reopen the file even
 915  * if it has an open lock in cache already.
 916  */
 917 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 918                                 struct lustre_handle *old_open_handle)
 919 {
 920         struct ll_inode_info *lli = ll_i2info(inode);
 921         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 922         struct obd_client_handle **och_p;
 923         __u64 *och_usecount;
 924         int rc = 0;
 925         ENTRY;
 926
 927         /* Get the openhandle of the file */
 928         mutex_lock(&lli->lli_och_mutex);
 929         if (fd->fd_lease_och != NULL)
 930                 GOTO(out_unlock, rc = -EBUSY);
 931
 932         if (fd->fd_och == NULL) {
 933                 if (file->f_mode & FMODE_WRITE) {
 934                         LASSERT(lli->lli_mds_write_och != NULL);
 935                         och_p = &lli->lli_mds_write_och;
 936                         och_usecount = &lli->lli_open_fd_write_count;
 937                 } else {
 938                         LASSERT(lli->lli_mds_read_och != NULL);
 939                         och_p = &lli->lli_mds_read_och;
 940                         och_usecount = &lli->lli_open_fd_read_count;
 941                 }
 942
 943                 if (*och_usecount > 1)
 944                         GOTO(out_unlock, rc = -EBUSY);
 945
 946                 fd->fd_och = *och_p;
 947                 *och_usecount = 0;
 948                 *och_p = NULL;
 949         }
 950
 951         *old_open_handle = fd->fd_och->och_open_handle;
 952
 953         EXIT;
 954 out_unlock:
 955         mutex_unlock(&lli->lli_och_mutex);
 956         return rc;
 957 }
 958
 959 /**
 960  * Release ownership on lli_mds_*_och when putting back a file lease.
 961  */
 962 static int ll_lease_och_release(struct inode *inode, struct file *file)
 963 {
 964         struct ll_inode_info *lli = ll_i2info(inode);
 965         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 966         struct obd_client_handle **och_p;
 967         struct obd_client_handle *old_och = NULL;
 968         __u64 *och_usecount;
 969         int rc = 0;
 970         ENTRY;
 971
 972         mutex_lock(&lli->lli_och_mutex);
 973         if (file->f_mode & FMODE_WRITE) {
 974                 och_p = &lli->lli_mds_write_och;
 975                 och_usecount = &lli->lli_open_fd_write_count;
 976         } else {
 977                 och_p = &lli->lli_mds_read_och;
 978                 och_usecount = &lli->lli_open_fd_read_count;
 979         }
 980
 981         /* The file may have been open by another process (broken lease) so
 982          * *och_p is not NULL. In this case we should simply increase usecount
 983          * and close fd_och.
 984          */
 985         if (*och_p != NULL) {
 986                 old_och = fd->fd_och;
 987                 (*och_usecount)++;
 988         } else {
 989                 *och_p = fd->fd_och;
 990                 *och_usecount = 1;
 991         }
 992         fd->fd_och = NULL;
 993         mutex_unlock(&lli->lli_och_mutex);
 994
 995         if (old_och != NULL)
 996                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 997
 998         RETURN(rc);
 999 }
1000
1001 /**
1002  * Acquire a lease and open the file.
1003  */
1004 static struct obd_client_handle *
1005 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1006               __u64 open_flags)
1007 {
1008         struct lookup_intent it = { .it_op = IT_OPEN };
1009         struct ll_sb_info *sbi = ll_i2sbi(inode);
1010         struct md_op_data *op_data;
1011         struct ptlrpc_request *req = NULL;
1012         struct lustre_handle old_open_handle = { 0 };
1013         struct obd_client_handle *och = NULL;
1014         int rc;
1015         int rc2;
1016         ENTRY;
1017
1018         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1019                 RETURN(ERR_PTR(-EINVAL));
1020
1021         if (file != NULL) {
1022                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1023                         RETURN(ERR_PTR(-EPERM));
1024
1025                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1026                 if (rc)
1027                         RETURN(ERR_PTR(rc));
1028         }
1029
1030         OBD_ALLOC_PTR(och);
1031         if (och == NULL)
1032                 RETURN(ERR_PTR(-ENOMEM));
1033
1034         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1035                                         LUSTRE_OPC_ANY, NULL);
1036         if (IS_ERR(op_data))
1037                 GOTO(out, rc = PTR_ERR(op_data));
1038
1039         /* To tell the MDT this openhandle is from the same owner */
1040         op_data->op_open_handle = old_open_handle;
1041
1042         it.it_flags = fmode | open_flags;
1043         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1044         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1045                             &ll_md_blocking_lease_ast,
1046         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1047          * it can be cancelled which may mislead applications that the lease is
1048          * broken;
1049          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1050          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1051          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1052                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1053         ll_finish_md_op_data(op_data);
1054         ptlrpc_req_finished(req);
1055         if (rc < 0)
1056                 GOTO(out_release_it, rc);
1057
1058         if (it_disposition(&it, DISP_LOOKUP_NEG))
1059                 GOTO(out_release_it, rc = -ENOENT);
1060
1061         rc = it_open_error(DISP_OPEN_OPEN, &it);
1062         if (rc)
1063                 GOTO(out_release_it, rc);
1064
1065         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1066         ll_och_fill(sbi->ll_md_exp, &it, och);
1067
1068         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1069                 GOTO(out_close, rc = -EOPNOTSUPP);
1070
1071         /* already get lease, handle lease lock */
1072         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1073         if (it.it_lock_mode == 0 ||
1074             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1075                 /* open lock must return for lease */
1076                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1077                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1078                         it.it_lock_bits);
1079                 GOTO(out_close, rc = -EPROTO);
1080         }
1081
1082         ll_intent_release(&it);
1083         RETURN(och);
1084
1085 out_close:
1086         /* Cancel open lock */
1087         if (it.it_lock_mode != 0) {
1088                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1089                                             it.it_lock_mode);
1090                 it.it_lock_mode = 0;
1091                 och->och_lease_handle.cookie = 0ULL;
1092         }
1093         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1094         if (rc2 < 0)
1095                 CERROR("%s: error closing file "DFID": %d\n",
1096                        sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1097         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1098 out_release_it:
1099         ll_intent_release(&it);
1100 out:
1101         if (och != NULL)
1102                 OBD_FREE_PTR(och);
1103         RETURN(ERR_PTR(rc));
1104 }
1105
1106 /**
1107  * Check whether a layout swap can be done between two inodes.
1108  *
1109  * \param[in] inode1  First inode to check
1110  * \param[in] inode2  Second inode to check
1111  *
1112  * \retval 0 on success, layout swap can be performed between both inodes
1113  * \retval negative error code if requirements are not met
1114  */
1115 static int ll_check_swap_layouts_validity(struct inode *inode1,
1116                                           struct inode *inode2)
1117 {
1118         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1119                 return -EINVAL;
1120
1121         if (inode_permission(inode1, MAY_WRITE) ||
1122             inode_permission(inode2, MAY_WRITE))
1123                 return -EPERM;
1124
1125         if (inode1->i_sb != inode2->i_sb)
1126                 return -EXDEV;
1127
1128         return 0;
1129 }
1130
1131 static int ll_swap_layouts_close(struct obd_client_handle *och,
1132                                  struct inode *inode, struct inode *inode2)
1133 {
1134         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1135         const struct lu_fid     *fid2;
1136         int                      rc;
1137         ENTRY;
1138
1139         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1140                ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1141
1142         rc = ll_check_swap_layouts_validity(inode, inode2);
1143         if (rc < 0)
1144                 GOTO(out_free_och, rc);
1145
1146         /* We now know that inode2 is a lustre inode */
1147         fid2 = ll_inode2fid(inode2);
1148
1149         rc = lu_fid_cmp(fid1, fid2);
1150         if (rc == 0)
1151                 GOTO(out_free_och, rc = -EINVAL);
1152
1153         /* Close the file and {swap,merge} layouts between inode & inode2.
1154          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1155          * because we still need it to pack l_remote_handle to MDT. */
1156         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1157                                        inode2);
1158
1159         och = NULL; /* freed in ll_close_inode_openhandle() */
1160
1161 out_free_och:
1162         if (och != NULL)
1163                 OBD_FREE_PTR(och);
1164
1165         RETURN(rc);
1166 }
1167
1168 /**
1169  * Release lease and close the file.
1170  * It will check if the lease has ever broken.
1171  */
1172 static int ll_lease_close_intent(struct obd_client_handle *och,
1173                                  struct inode *inode,
1174                                  bool *lease_broken, enum mds_op_bias bias,
1175                                  void *data)
1176 {
1177         struct ldlm_lock *lock;
1178         bool cancelled = true;
1179         int rc;
1180         ENTRY;
1181
1182         lock = ldlm_handle2lock(&och->och_lease_handle);
1183         if (lock != NULL) {
1184                 lock_res_and_lock(lock);
1185                 cancelled = ldlm_is_cancel(lock);
1186                 unlock_res_and_lock(lock);
1187                 LDLM_LOCK_PUT(lock);
1188         }
1189
1190         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1191                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1192
1193         if (lease_broken != NULL)
1194                 *lease_broken = cancelled;
1195
1196         if (!cancelled && !bias)
1197                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1198
1199         if (cancelled) { /* no need to excute intent */
1200                 bias = 0;
1201                 data = NULL;
1202         }
1203
1204         rc = ll_close_inode_openhandle(inode, och, bias, data);
1205         RETURN(rc);
1206 }
1207
1208 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1209                           bool *lease_broken)
1210 {
1211         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1212 }
1213
1214 /**
1215  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1216  */
1217 static int ll_lease_file_resync(struct obd_client_handle *och,
1218                                 struct inode *inode, unsigned long arg)
1219 {
1220         struct ll_sb_info *sbi = ll_i2sbi(inode);
1221         struct md_op_data *op_data;
1222         struct ll_ioc_lease_id ioc;
1223         __u64 data_version_unused;
1224         int rc;
1225         ENTRY;
1226
1227         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1228                                      LUSTRE_OPC_ANY, NULL);
1229         if (IS_ERR(op_data))
1230                 RETURN(PTR_ERR(op_data));
1231
1232         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1233                            sizeof(ioc)))
1234                 RETURN(-EFAULT);
1235
1236         /* before starting file resync, it's necessary to clean up page cache
1237          * in client memory, otherwise once the layout version is increased,
1238          * writing back cached data will be denied the OSTs. */
1239         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1240         if (rc)
1241                 GOTO(out, rc);
1242
1243         op_data->op_lease_handle = och->och_lease_handle;
1244         op_data->op_mirror_id = ioc.lil_mirror_id;
1245         rc = md_file_resync(sbi->ll_md_exp, op_data);
1246         if (rc)
1247                 GOTO(out, rc);
1248
1249         EXIT;
1250 out:
1251         ll_finish_md_op_data(op_data);
1252         return rc;
1253 }
1254
1255 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1256 {
1257         struct ll_inode_info *lli = ll_i2info(inode);
1258         struct cl_object *obj = lli->lli_clob;
1259         struct cl_attr *attr = vvp_env_thread_attr(env);
1260         s64 atime;
1261         s64 mtime;
1262         s64 ctime;
1263         int rc = 0;
1264
1265         ENTRY;
1266
1267         ll_inode_size_lock(inode);
1268
1269         /* Merge timestamps the most recently obtained from MDS with
1270          * timestamps obtained from OSTs.
1271          *
1272          * Do not overwrite atime of inode because it may be refreshed
1273          * by file_accessed() function. If the read was served by cache
1274          * data, there is no RPC to be sent so that atime may not be
1275          * transferred to OSTs at all. MDT only updates atime at close time
1276          * if it's at least 'mdd.*.atime_diff' older.
1277          * All in all, the atime in Lustre does not strictly comply with
1278          * POSIX. Solving this problem needs to send an RPC to MDT for each
1279          * read, this will hurt performance.
1280          */
1281         if (inode->i_atime.tv_sec < lli->lli_atime ||
1282             lli->lli_update_atime) {
1283                 inode->i_atime.tv_sec = lli->lli_atime;
1284                 lli->lli_update_atime = 0;
1285         }
1286         inode->i_mtime.tv_sec = lli->lli_mtime;
1287         inode->i_ctime.tv_sec = lli->lli_ctime;
1288
1289         mtime = inode->i_mtime.tv_sec;
1290         atime = inode->i_atime.tv_sec;
1291         ctime = inode->i_ctime.tv_sec;
1292
1293         cl_object_attr_lock(obj);
1294         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1295                 rc = -EINVAL;
1296         else
1297                 rc = cl_object_attr_get(env, obj, attr);
1298         cl_object_attr_unlock(obj);
1299
1300         if (rc != 0)
1301                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1302
1303         if (atime < attr->cat_atime)
1304                 atime = attr->cat_atime;
1305
1306         if (ctime < attr->cat_ctime)
1307                 ctime = attr->cat_ctime;
1308
1309         if (mtime < attr->cat_mtime)
1310                 mtime = attr->cat_mtime;
1311
1312         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1313                PFID(&lli->lli_fid), attr->cat_size);
1314
1315         i_size_write(inode, attr->cat_size);
1316         inode->i_blocks = attr->cat_blocks;
1317
1318         inode->i_mtime.tv_sec = mtime;
1319         inode->i_atime.tv_sec = atime;
1320         inode->i_ctime.tv_sec = ctime;
1321
1322 out_size_unlock:
1323         ll_inode_size_unlock(inode);
1324
1325         RETURN(rc);
1326 }
1327
1328 /**
1329  * Set designated mirror for I/O.
1330  *
1331  * So far only read, write, and truncated can support to issue I/O to
1332  * designated mirror.
1333  */
1334 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1335 {
1336         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1337
1338         /* clear layout version for generic(non-resync) I/O in case it carries
1339          * stale layout version due to I/O restart */
1340         io->ci_layout_version = 0;
1341
1342         /* FLR: disable non-delay for designated mirror I/O because obviously
1343          * only one mirror is available */
1344         if (fd->fd_designated_mirror > 0) {
1345                 io->ci_ndelay = 0;
1346                 io->ci_designated_mirror = fd->fd_designated_mirror;
1347                 io->ci_layout_version = fd->fd_layout_version;
1348         }
1349
1350         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1351                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1352 }
1353
1354 static bool file_is_noatime(const struct file *file)
1355 {
1356         const struct vfsmount *mnt = file->f_path.mnt;
1357         const struct inode *inode = file_inode((struct file *)file);
1358
1359         /* Adapted from file_accessed() and touch_atime().*/
1360         if (file->f_flags & O_NOATIME)
1361                 return true;
1362
1363         if (inode->i_flags & S_NOATIME)
1364                 return true;
1365
1366         if (IS_NOATIME(inode))
1367                 return true;
1368
1369         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1370                 return true;
1371
1372         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1373                 return true;
1374
1375         if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
1376                 return true;
1377
1378         return false;
1379 }
1380
1381 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1382 {
1383         struct inode *inode = file_inode(file);
1384         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1385
1386         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1387         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1388
1389         if (iot == CIT_WRITE) {
1390                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1391                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1392                                            file->f_flags & O_DIRECT ||
1393                                            IS_SYNC(inode));
1394         }
1395         io->ci_obj = ll_i2info(inode)->lli_clob;
1396         io->ci_lockreq = CILR_MAYBE;
1397         if (ll_file_nolock(file)) {
1398                 io->ci_lockreq = CILR_NEVER;
1399                 io->ci_no_srvlock = 1;
1400         } else if (file->f_flags & O_APPEND) {
1401                 io->ci_lockreq = CILR_MANDATORY;
1402         }
1403         io->ci_noatime = file_is_noatime(file);
1404         io->ci_async_readahead = false;
1405
1406         /* FLR: only use non-delay I/O for read as there is only one
1407          * avaliable mirror for write. */
1408         io->ci_ndelay = !(iot == CIT_WRITE);
1409
1410         ll_io_set_mirror(io, file);
1411 }
1412
1413 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1414                         __u64 count)
1415 {
1416         struct ll_inode_info *lli = ll_i2info(inode);
1417         struct ll_sb_info *sbi = ll_i2sbi(inode);
1418         enum obd_heat_type sample_type;
1419         enum obd_heat_type iobyte_type;
1420         __u64 now = ktime_get_real_seconds();
1421
1422         if (!ll_sbi_has_file_heat(sbi) ||
1423             lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1424                 return;
1425
1426         if (iot == CIT_READ) {
1427                 sample_type = OBD_HEAT_READSAMPLE;
1428                 iobyte_type = OBD_HEAT_READBYTE;
1429         } else if (iot == CIT_WRITE) {
1430                 sample_type = OBD_HEAT_WRITESAMPLE;
1431                 iobyte_type = OBD_HEAT_WRITEBYTE;
1432         } else {
1433                 return;
1434         }
1435
1436         spin_lock(&lli->lli_heat_lock);
1437         obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1438                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1439         obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1440                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1441         spin_unlock(&lli->lli_heat_lock);
1442 }
1443
1444 static ssize_t
1445 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1446                    struct file *file, enum cl_io_type iot,
1447                    loff_t *ppos, size_t count)
1448 {
1449         struct vvp_io           *vio = vvp_env_io(env);
1450         struct inode            *inode = file_inode(file);
1451         struct ll_inode_info    *lli = ll_i2info(inode);
1452         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1453         struct range_lock       range;
1454         struct cl_io            *io;
1455         ssize_t                 result = 0;
1456         int                     rc = 0;
1457         unsigned                retried = 0;
1458         bool                    restarted = false;
1459
1460         ENTRY;
1461
1462         CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1463                 file_dentry(file)->d_name.name,
1464                 iot == CIT_READ ? "read" : "write", *ppos, count);
1465
1466 restart:
1467         io = vvp_env_thread_io(env);
1468         ll_io_init(io, file, iot);
1469         io->ci_ndelay_tried = retried;
1470
1471         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1472                 bool range_locked = false;
1473
1474                 if (file->f_flags & O_APPEND)
1475                         range_lock_init(&range, 0, LUSTRE_EOF);
1476                 else
1477                         range_lock_init(&range, *ppos, *ppos + count - 1);
1478
1479                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1480                 vio->vui_io_subtype = args->via_io_subtype;
1481
1482                 switch (vio->vui_io_subtype) {
1483                 case IO_NORMAL:
1484                         vio->vui_iter = args->u.normal.via_iter;
1485                         vio->vui_iocb = args->u.normal.via_iocb;
1486                         /* Direct IO reads must also take range lock,
1487                          * or multiple reads will try to work on the same pages
1488                          * See LU-6227 for details. */
1489                         if (((iot == CIT_WRITE) ||
1490                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1491                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1492                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1493                                        RL_PARA(&range));
1494                                 rc = range_lock(&lli->lli_write_tree, &range);
1495                                 if (rc < 0)
1496                                         GOTO(out, rc);
1497
1498                                 range_locked = true;
1499                         }
1500                         break;
1501                 case IO_SPLICE:
1502                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1503                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1504                         break;
1505                 default:
1506                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1507                         LBUG();
1508                 }
1509
1510                 ll_cl_add(file, env, io, LCC_RW);
1511                 rc = cl_io_loop(env, io);
1512                 ll_cl_remove(file, env);
1513
1514                 if (range_locked) {
1515                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1516                                RL_PARA(&range));
1517                         range_unlock(&lli->lli_write_tree, &range);
1518                 }
1519         } else {
1520                 /* cl_io_rw_init() handled IO */
1521                 rc = io->ci_result;
1522         }
1523
1524         if (io->ci_nob > 0) {
1525                 result += io->ci_nob;
1526                 count  -= io->ci_nob;
1527                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1528
1529                 /* prepare IO restart */
1530                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1531                         args->u.normal.via_iter = vio->vui_iter;
1532         }
1533 out:
1534         cl_io_fini(env, io);
1535
1536         CDEBUG(D_VFSTRACE,
1537                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1538                file->f_path.dentry->d_name.name,
1539                iot, rc, result, io->ci_need_restart);
1540
1541         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1542                 CDEBUG(D_VFSTRACE,
1543                        "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1544                        file_dentry(file)->d_name.name,
1545                        iot == CIT_READ ? "read" : "write",
1546                        *ppos, count, result, rc);
1547                 /* preserve the tried count for FLR */
1548                 retried = io->ci_ndelay_tried;
1549                 restarted = true;
1550                 goto restart;
1551         }
1552
1553         if (iot == CIT_READ) {
1554                 if (result > 0)
1555                         ll_stats_ops_tally(ll_i2sbi(inode),
1556                                            LPROC_LL_READ_BYTES, result);
1557         } else if (iot == CIT_WRITE) {
1558                 if (result > 0) {
1559                         ll_stats_ops_tally(ll_i2sbi(inode),
1560                                            LPROC_LL_WRITE_BYTES, result);
1561                         fd->fd_write_failed = false;
1562                 } else if (result == 0 && rc == 0) {
1563                         rc = io->ci_result;
1564                         if (rc < 0)
1565                                 fd->fd_write_failed = true;
1566                         else
1567                                 fd->fd_write_failed = false;
1568                 } else if (rc != -ERESTARTSYS) {
1569                         fd->fd_write_failed = true;
1570                 }
1571         }
1572
1573         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1574         if (result > 0)
1575                 ll_heat_add(inode, iot, result);
1576
1577         RETURN(result > 0 ? result : rc);
1578 }
1579
1580 /**
1581  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1582  * especially for small I/O.
1583  *
1584  * To serve a read request, CLIO has to create and initialize a cl_io and
1585  * then request DLM lock. This has turned out to have siginificant overhead
1586  * and affects the performance of small I/O dramatically.
1587  *
1588  * It's not necessary to create a cl_io for each I/O. Under the help of read
1589  * ahead, most of the pages being read are already in memory cache and we can
1590  * read those pages directly because if the pages exist, the corresponding DLM
1591  * lock must exist so that page content must be valid.
1592  *
1593  * In fast read implementation, the llite speculatively finds and reads pages
1594  * in memory cache. There are three scenarios for fast read:
1595  *   - If the page exists and is uptodate, kernel VM will provide the data and
1596  *     CLIO won't be intervened;
1597  *   - If the page was brought into memory by read ahead, it will be exported
1598  *     and read ahead parameters will be updated;
1599  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1600  *     it will go back and invoke normal read, i.e., a cl_io will be created
1601  *     and DLM lock will be requested.
1602  *
1603  * POSIX compliance: posix standard states that read is intended to be atomic.
1604  * Lustre read implementation is in line with Linux kernel read implementation
1605  * and neither of them complies with POSIX standard in this matter. Fast read
1606  * doesn't make the situation worse on single node but it may interleave write
1607  * results from multiple nodes due to short read handling in ll_file_aio_read().
1608  *
1609  * \param env - lu_env
1610  * \param iocb - kiocb from kernel
1611  * \param iter - user space buffers where the data will be copied
1612  *
1613  * \retval - number of bytes have been read, or error code if error occurred.
1614  */
1615 static ssize_t
1616 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1617 {
1618         ssize_t result;
1619
1620         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1621                 return 0;
1622
1623         /* NB: we can't do direct IO for fast read because it will need a lock
1624          * to make IO engine happy. */
1625         if (iocb->ki_filp->f_flags & O_DIRECT)
1626                 return 0;
1627
1628         result = generic_file_read_iter(iocb, iter);
1629
1630         /* If the first page is not in cache, generic_file_aio_read() will be
1631          * returned with -ENODATA.
1632          * See corresponding code in ll_readpage(). */
1633         if (result == -ENODATA)
1634                 result = 0;
1635
1636         if (result > 0) {
1637                 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1638                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1639                                 LPROC_LL_READ_BYTES, result);
1640         }
1641
1642         return result;
1643 }
1644
1645 /*
1646  * Read from a file (through the page cache).
1647  */
1648 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1649 {
1650         struct lu_env *env;
1651         struct vvp_io_args *args;
1652         ssize_t result;
1653         ssize_t rc2;
1654         __u16 refcheck;
1655         bool cached;
1656
1657         if (!iov_iter_count(to))
1658                 return 0;
1659
1660         /**
1661          * Currently when PCC read failed, we do not fall back to the
1662          * normal read path, just return the error.
1663          * The resaon is that: for RW-PCC, the file data may be modified
1664          * in the PCC and inconsistent with the data on OSTs (or file
1665          * data has been removed from the Lustre file system), at this
1666          * time, fallback to the normal read path may read the wrong
1667          * data.
1668          * TODO: for RO-PCC (readonly PCC), fall back to normal read
1669          * path: read data from data copy on OSTs.
1670          */
1671         result = pcc_file_read_iter(iocb, to, &cached);
1672         if (cached)
1673                 return result;
1674
1675         ll_ras_enter(iocb->ki_filp);
1676
1677         result = ll_do_fast_read(iocb, to);
1678         if (result < 0 || iov_iter_count(to) == 0)
1679                 GOTO(out, result);
1680
1681         env = cl_env_get(&refcheck);
1682         if (IS_ERR(env))
1683                 return PTR_ERR(env);
1684
1685         args = ll_env_args(env, IO_NORMAL);
1686         args->u.normal.via_iter = to;
1687         args->u.normal.via_iocb = iocb;
1688
1689         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1690                                  &iocb->ki_pos, iov_iter_count(to));
1691         if (rc2 > 0)
1692                 result += rc2;
1693         else if (result == 0)
1694                 result = rc2;
1695
1696         cl_env_put(env, &refcheck);
1697 out:
1698         return result;
1699 }
1700
1701 /**
1702  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1703  * If a page is already in the page cache and dirty (and some other things -
1704  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1705  * write to it without doing a full I/O, because Lustre already knows about it
1706  * and will write it out.  This saves a lot of processing time.
1707  *
1708  * All writes here are within one page, so exclusion is handled by the page
1709  * lock on the vm page.  We do not do tiny writes for writes which touch
1710  * multiple pages because it's very unlikely multiple sequential pages are
1711  * are already dirty.
1712  *
1713  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1714  * and are unlikely to be to already dirty pages.
1715  *
1716  * Attribute updates are important here, we do them in ll_tiny_write_end.
1717  */
1718 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1719 {
1720         ssize_t count = iov_iter_count(iter);
1721         struct  file *file = iocb->ki_filp;
1722         struct  inode *inode = file_inode(file);
1723         bool    lock_inode = !IS_NOSEC(inode);
1724         ssize_t result = 0;
1725
1726         ENTRY;
1727
1728         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1729          * of function for why.
1730          */
1731         if (count >= PAGE_SIZE ||
1732             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1733                 RETURN(0);
1734
1735         if (unlikely(lock_inode))
1736                 inode_lock(inode);
1737         result = __generic_file_write_iter(iocb, iter);
1738
1739         if (unlikely(lock_inode))
1740                 inode_unlock(inode);
1741
1742         /* If the page is not already dirty, ll_tiny_write_begin returns
1743          * -ENODATA.  We continue on to normal write.
1744          */
1745         if (result == -ENODATA)
1746                 result = 0;
1747
1748         if (result > 0) {
1749                 ll_heat_add(inode, CIT_WRITE, result);
1750                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1751                                    result);
1752                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1753         }
1754
1755         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1756
1757         RETURN(result);
1758 }
1759
1760 /*
1761  * Write to a file (through the page cache).
1762  */
1763 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1764 {
1765         struct vvp_io_args *args;
1766         struct lu_env *env;
1767         ssize_t rc_tiny = 0, rc_normal;
1768         __u16 refcheck;
1769         bool cached;
1770         int result;
1771
1772         ENTRY;
1773
1774         if (!iov_iter_count(from))
1775                 GOTO(out, rc_normal = 0);
1776
1777         /**
1778          * When PCC write failed, we usually do not fall back to the normal
1779          * write path, just return the error. But there is a special case when
1780          * returned error code is -ENOSPC due to running out of space on PCC HSM
1781          * bakcend. At this time, it will fall back to normal I/O path and
1782          * retry the I/O. As the file is in HSM released state, it will restore
1783          * the file data to OSTs first and redo the write again. And the
1784          * restore process will revoke the layout lock and detach the file
1785          * from PCC cache automatically.
1786          */
1787         result = pcc_file_write_iter(iocb, from, &cached);
1788         if (cached && result != -ENOSPC && result != -EDQUOT)
1789                 return result;
1790
1791         /* NB: we can't do direct IO for tiny writes because they use the page
1792          * cache, we can't do sync writes because tiny writes can't flush
1793          * pages, and we can't do append writes because we can't guarantee the
1794          * required DLM locks are held to protect file size.
1795          */
1796         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1797             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1798                 rc_tiny = ll_do_tiny_write(iocb, from);
1799
1800         /* In case of error, go on and try normal write - Only stop if tiny
1801          * write completed I/O.
1802          */
1803         if (iov_iter_count(from) == 0)
1804                 GOTO(out, rc_normal = rc_tiny);
1805
1806         env = cl_env_get(&refcheck);
1807         if (IS_ERR(env))
1808                 return PTR_ERR(env);
1809
1810         args = ll_env_args(env, IO_NORMAL);
1811         args->u.normal.via_iter = from;
1812         args->u.normal.via_iocb = iocb;
1813
1814         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1815                                     &iocb->ki_pos, iov_iter_count(from));
1816
1817         /* On success, combine bytes written. */
1818         if (rc_tiny >= 0 && rc_normal > 0)
1819                 rc_normal += rc_tiny;
1820         /* On error, only return error from normal write if tiny write did not
1821          * write any bytes.  Otherwise return bytes written by tiny write.
1822          */
1823         else if (rc_tiny > 0)
1824                 rc_normal = rc_tiny;
1825
1826         cl_env_put(env, &refcheck);
1827 out:
1828         RETURN(rc_normal);
1829 }
1830
1831 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1832 /*
1833  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1834  */
1835 static int ll_file_get_iov_count(const struct iovec *iov,
1836                                  unsigned long *nr_segs, size_t *count)
1837 {
1838         size_t cnt = 0;
1839         unsigned long seg;
1840
1841         for (seg = 0; seg < *nr_segs; seg++) {
1842                 const struct iovec *iv = &iov[seg];
1843
1844                 /*
1845                  * If any segment has a negative length, or the cumulative
1846                  * length ever wraps negative then return -EINVAL.
1847                  */
1848                 cnt += iv->iov_len;
1849                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1850                         return -EINVAL;
1851                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1852                         continue;
1853                 if (seg == 0)
1854                         return -EFAULT;
1855                 *nr_segs = seg;
1856                 cnt -= iv->iov_len;     /* This segment is no good */
1857                 break;
1858         }
1859         *count = cnt;
1860         return 0;
1861 }
1862
1863 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1864                                 unsigned long nr_segs, loff_t pos)
1865 {
1866         struct iov_iter to;
1867         size_t iov_count;
1868         ssize_t result;
1869         ENTRY;
1870
1871         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1872         if (result)
1873                 RETURN(result);
1874
1875         if (!iov_count)
1876                 RETURN(0);
1877
1878 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1879         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1880 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1881         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1882 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1883
1884         result = ll_file_read_iter(iocb, &to);
1885
1886         RETURN(result);
1887 }
1888
1889 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1890                             loff_t *ppos)
1891 {
1892         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1893         struct kiocb   kiocb;
1894         ssize_t        result;
1895
1896         ENTRY;
1897
1898         if (!count)
1899                 RETURN(0);
1900
1901         init_sync_kiocb(&kiocb, file);
1902         kiocb.ki_pos = *ppos;
1903 #ifdef HAVE_KIOCB_KI_LEFT
1904         kiocb.ki_left = count;
1905 #elif defined(HAVE_KI_NBYTES)
1906         kiocb.i_nbytes = count;
1907 #endif
1908
1909         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1910         *ppos = kiocb.ki_pos;
1911
1912         RETURN(result);
1913 }
1914
1915 /*
1916  * Write to a file (through the page cache).
1917  * AIO stuff
1918  */
1919 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1920                                  unsigned long nr_segs, loff_t pos)
1921 {
1922         struct iov_iter from;
1923         size_t iov_count;
1924         ssize_t result;
1925         ENTRY;
1926
1927         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1928         if (result)
1929                 RETURN(result);
1930
1931         if (!iov_count)
1932                 RETURN(0);
1933
1934 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1935         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1936 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1937         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1938 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1939
1940         result = ll_file_write_iter(iocb, &from);
1941
1942         RETURN(result);
1943 }
1944
1945 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1946                              size_t count, loff_t *ppos)
1947 {
1948         struct iovec   iov = { .iov_base = (void __user *)buf,
1949                                .iov_len = count };
1950         struct kiocb   kiocb;
1951         ssize_t        result;
1952
1953         ENTRY;
1954
1955         if (!count)
1956                 RETURN(0);
1957
1958         init_sync_kiocb(&kiocb, file);
1959         kiocb.ki_pos = *ppos;
1960 #ifdef HAVE_KIOCB_KI_LEFT
1961         kiocb.ki_left = count;
1962 #elif defined(HAVE_KI_NBYTES)
1963         kiocb.ki_nbytes = count;
1964 #endif
1965
1966         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1967         *ppos = kiocb.ki_pos;
1968
1969         RETURN(result);
1970 }
1971 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1972
1973 /*
1974  * Send file content (through pagecache) somewhere with helper
1975  */
1976 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1977                                    struct pipe_inode_info *pipe, size_t count,
1978                                    unsigned int flags)
1979 {
1980         struct lu_env *env;
1981         struct vvp_io_args *args;
1982         ssize_t result;
1983         __u16 refcheck;
1984         bool cached;
1985
1986         ENTRY;
1987
1988         result = pcc_file_splice_read(in_file, ppos, pipe,
1989                                       count, flags, &cached);
1990         if (cached)
1991                 RETURN(result);
1992
1993         ll_ras_enter(in_file);
1994
1995         env = cl_env_get(&refcheck);
1996         if (IS_ERR(env))
1997                 RETURN(PTR_ERR(env));
1998
1999         args = ll_env_args(env, IO_SPLICE);
2000         args->u.splice.via_pipe = pipe;
2001         args->u.splice.via_flags = flags;
2002
2003         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2004         cl_env_put(env, &refcheck);
2005         RETURN(result);
2006 }
2007
2008 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2009                              __u64 flags, struct lov_user_md *lum, int lum_size)
2010 {
2011         struct lookup_intent oit = {
2012                 .it_op = IT_OPEN,
2013                 .it_flags = flags | MDS_OPEN_BY_FID,
2014         };
2015         int rc;
2016         ENTRY;
2017
2018         ll_inode_size_lock(inode);
2019         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2020         if (rc < 0)
2021                 GOTO(out_unlock, rc);
2022
2023         ll_release_openhandle(dentry, &oit);
2024
2025 out_unlock:
2026         ll_inode_size_unlock(inode);
2027         ll_intent_release(&oit);
2028
2029         RETURN(rc);
2030 }
2031
2032 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2033                              struct lov_mds_md **lmmp, int *lmm_size,
2034                              struct ptlrpc_request **request)
2035 {
2036         struct ll_sb_info *sbi = ll_i2sbi(inode);
2037         struct mdt_body  *body;
2038         struct lov_mds_md *lmm = NULL;
2039         struct ptlrpc_request *req = NULL;
2040         struct md_op_data *op_data;
2041         int rc, lmmsize;
2042
2043         rc = ll_get_default_mdsize(sbi, &lmmsize);
2044         if (rc)
2045                 RETURN(rc);
2046
2047         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2048                                      strlen(filename), lmmsize,
2049                                      LUSTRE_OPC_ANY, NULL);
2050         if (IS_ERR(op_data))
2051                 RETURN(PTR_ERR(op_data));
2052
2053         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2054         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2055         ll_finish_md_op_data(op_data);
2056         if (rc < 0) {
2057                 CDEBUG(D_INFO, "md_getattr_name failed "
2058                        "on %s: rc %d\n", filename, rc);
2059                 GOTO(out, rc);
2060         }
2061
2062         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2063         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2064
2065         lmmsize = body->mbo_eadatasize;
2066
2067         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2068                         lmmsize == 0) {
2069                 GOTO(out, rc = -ENODATA);
2070         }
2071
2072         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2073         LASSERT(lmm != NULL);
2074
2075         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2076             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2077             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2078             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2079                 GOTO(out, rc = -EPROTO);
2080
2081         /*
2082          * This is coming from the MDS, so is probably in
2083          * little endian.  We convert it to host endian before
2084          * passing it to userspace.
2085          */
2086         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2087                 int stripe_count;
2088
2089                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2090                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2091                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2092                         if (le32_to_cpu(lmm->lmm_pattern) &
2093                             LOV_PATTERN_F_RELEASED)
2094                                 stripe_count = 0;
2095                 }
2096
2097                 /* if function called for directory - we should
2098                  * avoid swab not existent lsm objects */
2099                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2100                         lustre_swab_lov_user_md_v1(
2101                                         (struct lov_user_md_v1 *)lmm);
2102                         if (S_ISREG(body->mbo_mode))
2103                                 lustre_swab_lov_user_md_objects(
2104                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2105                                     stripe_count);
2106                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2107                         lustre_swab_lov_user_md_v3(
2108                                         (struct lov_user_md_v3 *)lmm);
2109                         if (S_ISREG(body->mbo_mode))
2110                                 lustre_swab_lov_user_md_objects(
2111                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2112                                     stripe_count);
2113                 } else if (lmm->lmm_magic ==
2114                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2115                         lustre_swab_lov_comp_md_v1(
2116                                         (struct lov_comp_md_v1 *)lmm);
2117                 } else if (lmm->lmm_magic ==
2118                            cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2119                         struct lov_foreign_md *lfm;
2120
2121                         lfm = (struct lov_foreign_md *)lmm;
2122                         __swab32s(&lfm->lfm_magic);
2123                         __swab32s(&lfm->lfm_length);
2124                         __swab32s(&lfm->lfm_type);
2125                         __swab32s(&lfm->lfm_flags);
2126                 }
2127         }
2128
2129 out:
2130         *lmmp = lmm;
2131         *lmm_size = lmmsize;
2132         *request = req;
2133         return rc;
2134 }
2135
2136 static int ll_lov_setea(struct inode *inode, struct file *file,
2137                         void __user *arg)
2138 {
2139         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2140         struct lov_user_md      *lump;
2141         int                      lum_size = sizeof(struct lov_user_md) +
2142                                             sizeof(struct lov_user_ost_data);
2143         int                      rc;
2144         ENTRY;
2145
2146         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2147                 RETURN(-EPERM);
2148
2149         OBD_ALLOC_LARGE(lump, lum_size);
2150         if (lump == NULL)
2151                 RETURN(-ENOMEM);
2152
2153         if (copy_from_user(lump, arg, lum_size))
2154                 GOTO(out_lump, rc = -EFAULT);
2155
2156         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2157                                       lum_size);
2158         cl_lov_delay_create_clear(&file->f_flags);
2159
2160 out_lump:
2161         OBD_FREE_LARGE(lump, lum_size);
2162         RETURN(rc);
2163 }
2164
2165 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2166 {
2167         struct lu_env   *env;
2168         __u16           refcheck;
2169         int             rc;
2170         ENTRY;
2171
2172         env = cl_env_get(&refcheck);
2173         if (IS_ERR(env))
2174                 RETURN(PTR_ERR(env));
2175
2176         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2177         cl_env_put(env, &refcheck);
2178         RETURN(rc);
2179 }
2180
2181 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2182                             void __user *arg)
2183 {
2184         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2185         struct lov_user_md        *klum;
2186         int                        lum_size, rc;
2187         __u64                      flags = FMODE_WRITE;
2188         ENTRY;
2189
2190         rc = ll_copy_user_md(lum, &klum);
2191         if (rc < 0)
2192                 RETURN(rc);
2193
2194         lum_size = rc;
2195         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2196                                       lum_size);
2197         if (!rc) {
2198                 __u32 gen;
2199
2200                 rc = put_user(0, &lum->lmm_stripe_count);
2201                 if (rc)
2202                         GOTO(out, rc);
2203
2204                 rc = ll_layout_refresh(inode, &gen);
2205                 if (rc)
2206                         GOTO(out, rc);
2207
2208                 rc = ll_file_getstripe(inode, arg, lum_size);
2209         }
2210         cl_lov_delay_create_clear(&file->f_flags);
2211
2212 out:
2213         OBD_FREE(klum, lum_size);
2214         RETURN(rc);
2215 }
2216
2217 static int
2218 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2219 {
2220         struct ll_inode_info *lli = ll_i2info(inode);
2221         struct cl_object *obj = lli->lli_clob;
2222         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2223         struct ll_grouplock grouplock;
2224         int rc;
2225         ENTRY;
2226
2227         if (arg == 0) {
2228                 CWARN("group id for group lock must not be 0\n");
2229                 RETURN(-EINVAL);
2230         }
2231
2232         if (ll_file_nolock(file))
2233                 RETURN(-EOPNOTSUPP);
2234
2235         spin_lock(&lli->lli_lock);
2236         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2237                 CWARN("group lock already existed with gid %lu\n",
2238                       fd->fd_grouplock.lg_gid);
2239                 spin_unlock(&lli->lli_lock);
2240                 RETURN(-EINVAL);
2241         }
2242         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2243         spin_unlock(&lli->lli_lock);
2244
2245         /**
2246          * XXX: group lock needs to protect all OST objects while PFL
2247          * can add new OST objects during the IO, so we'd instantiate
2248          * all OST objects before getting its group lock.
2249          */
2250         if (obj) {
2251                 struct lu_env *env;
2252                 __u16 refcheck;
2253                 struct cl_layout cl = {
2254                         .cl_is_composite = false,
2255                 };
2256                 struct lu_extent ext = {
2257                         .e_start = 0,
2258                         .e_end = OBD_OBJECT_EOF,
2259                 };
2260
2261                 env = cl_env_get(&refcheck);
2262                 if (IS_ERR(env))
2263                         RETURN(PTR_ERR(env));
2264
2265                 rc = cl_object_layout_get(env, obj, &cl);
2266                 if (!rc && cl.cl_is_composite)
2267                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2268                                                     &ext);
2269
2270                 cl_env_put(env, &refcheck);
2271                 if (rc)
2272                         RETURN(rc);
2273         }
2274
2275         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2276                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2277         if (rc)
2278                 RETURN(rc);
2279
2280         spin_lock(&lli->lli_lock);
2281         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2282                 spin_unlock(&lli->lli_lock);
2283                 CERROR("another thread just won the race\n");
2284                 cl_put_grouplock(&grouplock);
2285                 RETURN(-EINVAL);
2286         }
2287
2288         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2289         fd->fd_grouplock = grouplock;
2290         spin_unlock(&lli->lli_lock);
2291
2292         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2293         RETURN(0);
2294 }
2295
2296 static int ll_put_grouplock(struct inode *inode, struct file *file,
2297                             unsigned long arg)
2298 {
2299         struct ll_inode_info   *lli = ll_i2info(inode);
2300         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2301         struct ll_grouplock     grouplock;
2302         ENTRY;
2303
2304         spin_lock(&lli->lli_lock);
2305         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2306                 spin_unlock(&lli->lli_lock);
2307                 CWARN("no group lock held\n");
2308                 RETURN(-EINVAL);
2309         }
2310
2311         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2312
2313         if (fd->fd_grouplock.lg_gid != arg) {
2314                 CWARN("group lock %lu doesn't match current id %lu\n",
2315                       arg, fd->fd_grouplock.lg_gid);
2316                 spin_unlock(&lli->lli_lock);
2317                 RETURN(-EINVAL);
2318         }
2319
2320         grouplock = fd->fd_grouplock;
2321         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2322         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2323         spin_unlock(&lli->lli_lock);
2324
2325         cl_put_grouplock(&grouplock);
2326         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2327         RETURN(0);
2328 }
2329
2330 /**
2331  * Close inode open handle
2332  *
2333  * \param dentry [in]     dentry which contains the inode
2334  * \param it     [in,out] intent which contains open info and result
2335  *
2336  * \retval 0     success
2337  * \retval <0    failure
2338  */
2339 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2340 {
2341         struct inode *inode = dentry->d_inode;
2342         struct obd_client_handle *och;
2343         int rc;
2344         ENTRY;
2345
2346         LASSERT(inode);
2347
2348         /* Root ? Do nothing. */
2349         if (dentry->d_inode->i_sb->s_root == dentry)
2350                 RETURN(0);
2351
2352         /* No open handle to close? Move away */
2353         if (!it_disposition(it, DISP_OPEN_OPEN))
2354                 RETURN(0);
2355
2356         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2357
2358         OBD_ALLOC(och, sizeof(*och));
2359         if (!och)
2360                 GOTO(out, rc = -ENOMEM);
2361
2362         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2363
2364         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2365 out:
2366         /* this one is in place of ll_file_open */
2367         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2368                 ptlrpc_req_finished(it->it_request);
2369                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2370         }
2371         RETURN(rc);
2372 }
2373
2374 /**
2375  * Get size for inode for which FIEMAP mapping is requested.
2376  * Make the FIEMAP get_info call and returns the result.
2377  * \param fiemap        kernel buffer to hold extens
2378  * \param num_bytes     kernel buffer size
2379  */
2380 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2381                         size_t num_bytes)
2382 {
2383         struct lu_env                   *env;
2384         __u16                           refcheck;
2385         int                             rc = 0;
2386         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2387         ENTRY;
2388
2389         /* Checks for fiemap flags */
2390         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2391                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2392                 return -EBADR;
2393         }
2394
2395         /* Check for FIEMAP_FLAG_SYNC */
2396         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2397                 rc = filemap_fdatawrite(inode->i_mapping);
2398                 if (rc)
2399                         return rc;
2400         }
2401
2402         env = cl_env_get(&refcheck);
2403         if (IS_ERR(env))
2404                 RETURN(PTR_ERR(env));
2405
2406         if (i_size_read(inode) == 0) {
2407                 rc = ll_glimpse_size(inode);
2408                 if (rc)
2409                         GOTO(out, rc);
2410         }
2411
2412         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2413         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2414         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2415
2416         /* If filesize is 0, then there would be no objects for mapping */
2417         if (fmkey.lfik_oa.o_size == 0) {
2418                 fiemap->fm_mapped_extents = 0;
2419                 GOTO(out, rc = 0);
2420         }
2421
2422         fmkey.lfik_fiemap = *fiemap;
2423
2424         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2425                               &fmkey, fiemap, &num_bytes);
2426 out:
2427         cl_env_put(env, &refcheck);
2428         RETURN(rc);
2429 }
2430
2431 int ll_fid2path(struct inode *inode, void __user *arg)
2432 {
2433         struct obd_export       *exp = ll_i2mdexp(inode);
2434         const struct getinfo_fid2path __user *gfin = arg;
2435         __u32                    pathlen;
2436         struct getinfo_fid2path *gfout;
2437         size_t                   outsize;
2438         int                      rc;
2439
2440         ENTRY;
2441
2442         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2443             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2444                 RETURN(-EPERM);
2445
2446         /* Only need to get the buflen */
2447         if (get_user(pathlen, &gfin->gf_pathlen))
2448                 RETURN(-EFAULT);
2449
2450         if (pathlen > PATH_MAX)
2451                 RETURN(-EINVAL);
2452
2453         outsize = sizeof(*gfout) + pathlen;
2454         OBD_ALLOC(gfout, outsize);
2455         if (gfout == NULL)
2456                 RETURN(-ENOMEM);
2457
2458         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2459                 GOTO(gf_free, rc = -EFAULT);
2460         /* append root FID after gfout to let MDT know the root FID so that it
2461          * can lookup the correct path, this is mainly for fileset.
2462          * old server without fileset mount support will ignore this. */
2463         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2464
2465         /* Call mdc_iocontrol */
2466         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2467         if (rc != 0)
2468                 GOTO(gf_free, rc);
2469
2470         if (copy_to_user(arg, gfout, outsize))
2471                 rc = -EFAULT;
2472
2473 gf_free:
2474         OBD_FREE(gfout, outsize);
2475         RETURN(rc);
2476 }
2477
2478 static int
2479 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2480 {
2481         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2482         struct lu_env *env;
2483         struct cl_io *io;
2484         __u16  refcheck;
2485         int result;
2486
2487         ENTRY;
2488
2489         ioc->idv_version = 0;
2490         ioc->idv_layout_version = UINT_MAX;
2491
2492         /* If no file object initialized, we consider its version is 0. */
2493         if (obj == NULL)
2494                 RETURN(0);
2495
2496         env = cl_env_get(&refcheck);
2497         if (IS_ERR(env))
2498                 RETURN(PTR_ERR(env));
2499
2500         io = vvp_env_thread_io(env);
2501         io->ci_obj = obj;
2502         io->u.ci_data_version.dv_data_version = 0;
2503         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2504         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2505
2506 restart:
2507         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2508                 result = cl_io_loop(env, io);
2509         else
2510                 result = io->ci_result;
2511
2512         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2513         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2514
2515         cl_io_fini(env, io);
2516
2517         if (unlikely(io->ci_need_restart))
2518                 goto restart;
2519
2520         cl_env_put(env, &refcheck);
2521
2522         RETURN(result);
2523 }
2524
2525 /*
2526  * Read the data_version for inode.
2527  *
2528  * This value is computed using stripe object version on OST.
2529  * Version is computed using server side locking.
2530  *
2531  * @param flags if do sync on the OST side;
2532  *              0: no sync
2533  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2534  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2535  */
2536 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2537 {
2538         struct ioc_data_version ioc = { .idv_flags = flags };
2539         int rc;
2540
2541         rc = ll_ioc_data_version(inode, &ioc);
2542         if (!rc)
2543                 *data_version = ioc.idv_version;
2544
2545         return rc;
2546 }
2547
2548 /*
2549  * Trigger a HSM release request for the provided inode.
2550  */
2551 int ll_hsm_release(struct inode *inode)
2552 {
2553         struct lu_env *env;
2554         struct obd_client_handle *och = NULL;
2555         __u64 data_version = 0;
2556         int rc;
2557         __u16 refcheck;
2558         ENTRY;
2559
2560         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2561                ll_i2sbi(inode)->ll_fsname,
2562                PFID(&ll_i2info(inode)->lli_fid));
2563
2564         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2565         if (IS_ERR(och))
2566                 GOTO(out, rc = PTR_ERR(och));
2567
2568         /* Grab latest data_version and [am]time values */
2569         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2570         if (rc != 0)
2571                 GOTO(out, rc);
2572
2573         env = cl_env_get(&refcheck);
2574         if (IS_ERR(env))
2575                 GOTO(out, rc = PTR_ERR(env));
2576
2577         rc = ll_merge_attr(env, inode);
2578         cl_env_put(env, &refcheck);
2579
2580         /* If error happen, we have the wrong size for a file.
2581          * Don't release it.
2582          */
2583         if (rc != 0)
2584                 GOTO(out, rc);
2585
2586         /* Release the file.
2587          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2588          * we still need it to pack l_remote_handle to MDT. */
2589         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2590                                        &data_version);
2591         och = NULL;
2592
2593         EXIT;
2594 out:
2595         if (och != NULL && !IS_ERR(och)) /* close the file */
2596                 ll_lease_close(och, inode, NULL);
2597
2598         return rc;
2599 }
2600
/*
 * Working state for ll_swap_layouts(): the two inodes being swapped together
 * with the per-file data-version expectations.  Kept as paired fields so that
 * both sides can be exchanged with swap() when the files are reordered.
 */
struct ll_swap_stack {
        /* data versions supplied by the caller (sl_dv1 / sl_dv2) */
        __u64                    dv1;
        __u64                    dv2;
        /* inodes behind file1 / file2 */
        struct inode            *inode1;
        struct inode            *inode2;
        /* true when the matching dvN must be compared before swapping */
        bool                     check_dv1;
        bool                     check_dv2;
};
2609
2610 static int ll_swap_layouts(struct file *file1, struct file *file2,
2611                            struct lustre_swap_layouts *lsl)
2612 {
2613         struct mdc_swap_layouts  msl;
2614         struct md_op_data       *op_data;
2615         __u32                    gid;
2616         __u64                    dv;
2617         struct ll_swap_stack    *llss = NULL;
2618         int                      rc;
2619
2620         OBD_ALLOC_PTR(llss);
2621         if (llss == NULL)
2622                 RETURN(-ENOMEM);
2623
2624         llss->inode1 = file_inode(file1);
2625         llss->inode2 = file_inode(file2);
2626
2627         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2628         if (rc < 0)
2629                 GOTO(free, rc);
2630
2631         /* we use 2 bool because it is easier to swap than 2 bits */
2632         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2633                 llss->check_dv1 = true;
2634
2635         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2636                 llss->check_dv2 = true;
2637
2638         /* we cannot use lsl->sl_dvX directly because we may swap them */
2639         llss->dv1 = lsl->sl_dv1;
2640         llss->dv2 = lsl->sl_dv2;
2641
2642         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2643         if (rc == 0) /* same file, done! */
2644                 GOTO(free, rc);
2645
2646         if (rc < 0) { /* sequentialize it */
2647                 swap(llss->inode1, llss->inode2);
2648                 swap(file1, file2);
2649                 swap(llss->dv1, llss->dv2);
2650                 swap(llss->check_dv1, llss->check_dv2);
2651         }
2652
2653         gid = lsl->sl_gid;
2654         if (gid != 0) { /* application asks to flush dirty cache */
2655                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2656                 if (rc < 0)
2657                         GOTO(free, rc);
2658
2659                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2660                 if (rc < 0) {
2661                         ll_put_grouplock(llss->inode1, file1, gid);
2662                         GOTO(free, rc);
2663                 }
2664         }
2665
2666         /* ultimate check, before swaping the layouts we check if
2667          * dataversion has changed (if requested) */
2668         if (llss->check_dv1) {
2669                 rc = ll_data_version(llss->inode1, &dv, 0);
2670                 if (rc)
2671                         GOTO(putgl, rc);
2672                 if (dv != llss->dv1)
2673                         GOTO(putgl, rc = -EAGAIN);
2674         }
2675
2676         if (llss->check_dv2) {
2677                 rc = ll_data_version(llss->inode2, &dv, 0);
2678                 if (rc)
2679                         GOTO(putgl, rc);
2680                 if (dv != llss->dv2)
2681                         GOTO(putgl, rc = -EAGAIN);
2682         }
2683
2684         /* struct md_op_data is used to send the swap args to the mdt
2685          * only flags is missing, so we use struct mdc_swap_layouts
2686          * through the md_op_data->op_data */
2687         /* flags from user space have to be converted before they are send to
2688          * server, no flag is sent today, they are only used on the client */
2689         msl.msl_flags = 0;
2690         rc = -ENOMEM;
2691         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2692                                      0, LUSTRE_OPC_ANY, &msl);
2693         if (IS_ERR(op_data))
2694                 GOTO(free, rc = PTR_ERR(op_data));
2695
2696         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2697                            sizeof(*op_data), op_data, NULL);
2698         ll_finish_md_op_data(op_data);
2699
2700         if (rc < 0)
2701                 GOTO(putgl, rc);
2702
2703 putgl:
2704         if (gid != 0) {
2705                 ll_put_grouplock(llss->inode2, file2, gid);
2706                 ll_put_grouplock(llss->inode1, file1, gid);
2707         }
2708
2709 free:
2710         if (llss != NULL)
2711                 OBD_FREE_PTR(llss);
2712
2713         RETURN(rc);
2714 }
2715
2716 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2717 {
2718         struct obd_export *exp = ll_i2mdexp(inode);
2719         struct md_op_data *op_data;
2720         int rc;
2721         ENTRY;
2722
2723         /* Detect out-of range masks */
2724         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2725                 RETURN(-EINVAL);
2726
2727         /* Non-root users are forbidden to set or clear flags which are
2728          * NOT defined in HSM_USER_MASK. */
2729         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2730             !cfs_capable(CFS_CAP_SYS_ADMIN))
2731                 RETURN(-EPERM);
2732
2733         if (!exp_connect_archive_id_array(exp)) {
2734                 /* Detect out-of range archive id */
2735                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2736                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2737                         RETURN(-EINVAL);
2738         }
2739
2740         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2741                                      LUSTRE_OPC_ANY, hss);
2742         if (IS_ERR(op_data))
2743                 RETURN(PTR_ERR(op_data));
2744
2745         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2746                            op_data, NULL);
2747
2748         ll_finish_md_op_data(op_data);
2749
2750         RETURN(rc);
2751 }
2752
2753 static int ll_hsm_import(struct inode *inode, struct file *file,
2754                          struct hsm_user_import *hui)
2755 {
2756         struct hsm_state_set    *hss = NULL;
2757         struct iattr            *attr = NULL;
2758         int                      rc;
2759         ENTRY;
2760
2761         if (!S_ISREG(inode->i_mode))
2762                 RETURN(-EINVAL);
2763
2764         /* set HSM flags */
2765         OBD_ALLOC_PTR(hss);
2766         if (hss == NULL)
2767                 GOTO(out, rc = -ENOMEM);
2768
2769         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2770         hss->hss_archive_id = hui->hui_archive_id;
2771         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2772         rc = ll_hsm_state_set(inode, hss);
2773         if (rc != 0)
2774                 GOTO(out, rc);
2775
2776         OBD_ALLOC_PTR(attr);
2777         if (attr == NULL)
2778                 GOTO(out, rc = -ENOMEM);
2779
2780         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2781         attr->ia_mode |= S_IFREG;
2782         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2783         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2784         attr->ia_size = hui->hui_size;
2785         attr->ia_mtime.tv_sec = hui->hui_mtime;
2786         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2787         attr->ia_atime.tv_sec = hui->hui_atime;
2788         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2789
2790         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2791                          ATTR_UID | ATTR_GID |
2792                          ATTR_MTIME | ATTR_MTIME_SET |
2793                          ATTR_ATIME | ATTR_ATIME_SET;
2794
2795         inode_lock(inode);
2796
2797         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2798         if (rc == -ENODATA)
2799                 rc = 0;
2800
2801         inode_unlock(inode);
2802
2803 out:
2804         if (hss != NULL)
2805                 OBD_FREE_PTR(hss);
2806
2807         if (attr != NULL)
2808                 OBD_FREE_PTR(attr);
2809
2810         RETURN(rc);
2811 }
2812
2813 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2814 {
2815         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2816                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2817 }
2818
2819 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2820 {
2821         struct inode *inode = file_inode(file);
2822         struct iattr ia = {
2823                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2824                             ATTR_MTIME | ATTR_MTIME_SET |
2825                             ATTR_CTIME,
2826                 .ia_atime = {
2827                         .tv_sec = lfu->lfu_atime_sec,
2828                         .tv_nsec = lfu->lfu_atime_nsec,
2829                 },
2830                 .ia_mtime = {
2831                         .tv_sec = lfu->lfu_mtime_sec,
2832                         .tv_nsec = lfu->lfu_mtime_nsec,
2833                 },
2834                 .ia_ctime = {
2835                         .tv_sec = lfu->lfu_ctime_sec,
2836                         .tv_nsec = lfu->lfu_ctime_nsec,
2837                 },
2838         };
2839         int rc;
2840         ENTRY;
2841
2842         if (!capable(CAP_SYS_ADMIN))
2843                 RETURN(-EPERM);
2844
2845         if (!S_ISREG(inode->i_mode))
2846                 RETURN(-EINVAL);
2847
2848         inode_lock(inode);
2849         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2850                             false);
2851         inode_unlock(inode);
2852
2853         RETURN(rc);
2854 }
2855
2856 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2857 {
2858         switch (mode) {
2859         case MODE_READ_USER:
2860                 return CLM_READ;
2861         case MODE_WRITE_USER:
2862                 return CLM_WRITE;
2863         default:
2864                 return -EINVAL;
2865         }
2866 }
2867
2868 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2869
2870 /* Used to allow the upper layers of the client to request an LDLM lock
2871  * without doing an actual read or write.
2872  *
2873  * Used for ladvise lockahead to manually request specific locks.
2874  *
2875  * \param[in] file      file this ladvise lock request is on
2876  * \param[in] ladvise   ladvise struct describing this lock request
2877  *
2878  * \retval 0            success, no detailed result available (sync requests
2879  *                      and requests sent to the server [not handled locally]
2880  *                      cannot return detailed results)
2881  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2882  *                                       see definitions for details.
2883  * \retval negative     negative errno on error
2884  */
2885 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2886 {
2887         struct lu_env *env = NULL;
2888         struct cl_io *io  = NULL;
2889         struct cl_lock *lock = NULL;
2890         struct cl_lock_descr *descr = NULL;
2891         struct dentry *dentry = file->f_path.dentry;
2892         struct inode *inode = dentry->d_inode;
2893         enum cl_lock_mode cl_mode;
2894         off_t start = ladvise->lla_start;
2895         off_t end = ladvise->lla_end;
2896         int result;
2897         __u16 refcheck;
2898
2899         ENTRY;
2900
2901         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2902                "start=%llu, end=%llu\n", dentry->d_name.len,
2903                dentry->d_name.name, dentry->d_inode,
2904                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2905                (__u64) end);
2906
2907         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2908         if (cl_mode < 0)
2909                 GOTO(out, result = cl_mode);
2910
2911         /* Get IO environment */
2912         result = cl_io_get(inode, &env, &io, &refcheck);
2913         if (result <= 0)
2914                 GOTO(out, result);
2915
2916         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2917         if (result > 0) {
2918                 /*
2919                  * nothing to do for this io. This currently happens when
2920                  * stripe sub-object's are not yet created.
2921                  */
2922                 result = io->ci_result;
2923         } else if (result == 0) {
2924                 lock = vvp_env_lock(env);
2925                 descr = &lock->cll_descr;
2926
2927                 descr->cld_obj   = io->ci_obj;
2928                 /* Convert byte offsets to pages */
2929                 descr->cld_start = cl_index(io->ci_obj, start);
2930                 descr->cld_end   = cl_index(io->ci_obj, end);
2931                 descr->cld_mode  = cl_mode;
2932                 /* CEF_MUST is used because we do not want to convert a
2933                  * lockahead request to a lockless lock */
2934                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2935                                        CEF_NONBLOCK;
2936
2937                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2938                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2939
2940                 result = cl_lock_request(env, io, lock);
2941
2942                 /* On success, we need to release the lock */
2943                 if (result >= 0)
2944                         cl_lock_release(env, lock);
2945         }
2946         cl_io_fini(env, io);
2947         cl_env_put(env, &refcheck);
2948
2949         /* -ECANCELED indicates a matching lock with a different extent
2950          * was already present, and -EEXIST indicates a matching lock
2951          * on exactly the same extent was already present.
2952          * We convert them to positive values for userspace to make
2953          * recognizing true errors easier.
2954          * Note we can only return these detailed results on async requests,
2955          * as sync requests look the same as i/o requests for locking. */
2956         if (result == -ECANCELED)
2957                 result = LLA_RESULT_DIFFERENT;
2958         else if (result == -EEXIST)
2959                 result = LLA_RESULT_SAME;
2960
2961 out:
2962         RETURN(result);
2963 }
2964 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2965
2966 static int ll_ladvise_sanity(struct inode *inode,
2967                              struct llapi_lu_ladvise *ladvise)
2968 {
2969         struct ll_sb_info *sbi = ll_i2sbi(inode);
2970         enum lu_ladvise_type advice = ladvise->lla_advice;
2971         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2972          * be in the first 32 bits of enum ladvise_flags */
2973         __u32 flags = ladvise->lla_peradvice_flags;
2974         /* 3 lines at 80 characters per line, should be plenty */
2975         int rc = 0;
2976
2977         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2978                 rc = -EINVAL;
2979                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2980                        "last supported advice is %s (value '%d'): rc = %d\n",
2981                        sbi->ll_fsname, advice,
2982                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2983                 GOTO(out, rc);
2984         }
2985
2986         /* Per-advice checks */
2987         switch (advice) {
2988         case LU_LADVISE_LOCKNOEXPAND:
2989                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2990                         rc = -EINVAL;
2991                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2992                                "rc = %d\n", sbi->ll_fsname, flags,
2993                                ladvise_names[advice], rc);
2994                         GOTO(out, rc);
2995                 }
2996                 break;
2997         case LU_LADVISE_LOCKAHEAD:
2998                 /* Currently only READ and WRITE modes can be requested */
2999                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3000                     ladvise->lla_lockahead_mode == 0) {
3001                         rc = -EINVAL;
3002                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3003                                "rc = %d\n", sbi->ll_fsname,
3004                                ladvise->lla_lockahead_mode,
3005                                ladvise_names[advice], rc);
3006                         GOTO(out, rc);
3007                 }
3008         case LU_LADVISE_WILLREAD:
3009         case LU_LADVISE_DONTNEED:
3010         default:
3011                 /* Note fall through above - These checks apply to all advices
3012                  * except LOCKNOEXPAND */
3013                 if (flags & ~LF_DEFAULT_MASK) {
3014                         rc = -EINVAL;
3015                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3016                                "rc = %d\n", sbi->ll_fsname, flags,
3017                                ladvise_names[advice], rc);
3018                         GOTO(out, rc);
3019                 }
3020                 if (ladvise->lla_start >= ladvise->lla_end) {
3021                         rc = -EINVAL;
3022                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3023                                "for %s: rc = %d\n", sbi->ll_fsname,
3024                                ladvise->lla_start, ladvise->lla_end,
3025                                ladvise_names[advice], rc);
3026                         GOTO(out, rc);
3027                 }
3028                 break;
3029         }
3030
3031 out:
3032         return rc;
3033 }
3034 #undef ERRSIZE
3035
3036 /*
3037  * Give file access advices
3038  *
3039  * The ladvise interface is similar to Linux fadvise() system call, except it
3040  * forwards the advices directly from Lustre client to server. The server side
3041  * codes will apply appropriate read-ahead and caching techniques for the
3042  * corresponding files.
3043  *
3044  * A typical workload for ladvise is e.g. a bunch of different clients are
3045  * doing small random reads of a file, so prefetching pages into OSS cache
3046  * with big linear reads before the random IO is a net benefit. Fetching
3047  * all that data into each client cache with fadvise() may not be, due to
3048  * much more data being sent to the client.
3049  */
3050 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3051                       struct llapi_lu_ladvise *ladvise)
3052 {
3053         struct lu_env *env;
3054         struct cl_io *io;
3055         struct cl_ladvise_io *lio;
3056         int rc;
3057         __u16 refcheck;
3058         ENTRY;
3059
3060         env = cl_env_get(&refcheck);
3061         if (IS_ERR(env))
3062                 RETURN(PTR_ERR(env));
3063
3064         io = vvp_env_thread_io(env);
3065         io->ci_obj = ll_i2info(inode)->lli_clob;
3066
3067         /* initialize parameters for ladvise */
3068         lio = &io->u.ci_ladvise;
3069         lio->li_start = ladvise->lla_start;
3070         lio->li_end = ladvise->lla_end;
3071         lio->li_fid = ll_inode2fid(inode);
3072         lio->li_advice = ladvise->lla_advice;
3073         lio->li_flags = flags;
3074
3075         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3076                 rc = cl_io_loop(env, io);
3077         else
3078                 rc = io->ci_result;
3079
3080         cl_io_fini(env, io);
3081         cl_env_put(env, &refcheck);
3082         RETURN(rc);
3083 }
3084
3085 static int ll_lock_noexpand(struct file *file, int flags)
3086 {
3087         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3088
3089         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3090
3091         return 0;
3092 }
3093
3094 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3095                         unsigned long arg)
3096 {
3097         struct fsxattr fsxattr;
3098
3099         if (copy_from_user(&fsxattr,
3100                            (const struct fsxattr __user *)arg,
3101                            sizeof(fsxattr)))
3102                 RETURN(-EFAULT);
3103
3104         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3105         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3106                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3107         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3108         if (copy_to_user((struct fsxattr __user *)arg,
3109                          &fsxattr, sizeof(fsxattr)))
3110                 RETURN(-EFAULT);
3111
3112         RETURN(0);
3113 }
3114
3115 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3116 {
3117         /*
3118          * Project Quota ID state is only allowed to change from within the init
3119          * namespace. Enforce that restriction only if we are trying to change
3120          * the quota ID state. Everything else is allowed in user namespaces.
3121          */
3122         if (current_user_ns() == &init_user_ns)
3123                 return 0;
3124
3125         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3126                 return -EINVAL;
3127
3128         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3129                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3130                         return -EINVAL;
3131         } else {
3132                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3133                         return -EINVAL;
3134         }
3135
3136         return 0;
3137 }
3138
3139 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3140                         unsigned long arg)
3141 {
3142
3143         struct md_op_data *op_data;
3144         struct ptlrpc_request *req = NULL;
3145         int rc = 0;
3146         struct fsxattr fsxattr;
3147         struct cl_object *obj;
3148         struct iattr *attr;
3149         int flags;
3150
3151         if (copy_from_user(&fsxattr,
3152                            (const struct fsxattr __user *)arg,
3153                            sizeof(fsxattr)))
3154                 RETURN(-EFAULT);
3155
3156         rc = ll_ioctl_check_project(inode, &fsxattr);
3157         if (rc)
3158                 RETURN(rc);
3159
3160         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3161                                      LUSTRE_OPC_ANY, NULL);
3162         if (IS_ERR(op_data))
3163                 RETURN(PTR_ERR(op_data));
3164
3165         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3166         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3167         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3168                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3169         op_data->op_projid = fsxattr.fsx_projid;
3170         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3171         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3172                         0, &req);
3173         ptlrpc_req_finished(req);
3174         if (rc)
3175                 GOTO(out_fsxattr, rc);
3176         ll_update_inode_flags(inode, op_data->op_attr_flags);
3177         obj = ll_i2info(inode)->lli_clob;
3178         if (obj == NULL)
3179                 GOTO(out_fsxattr, rc);
3180
3181         OBD_ALLOC_PTR(attr);
3182         if (attr == NULL)
3183                 GOTO(out_fsxattr, rc = -ENOMEM);
3184
3185         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3186                             fsxattr.fsx_xflags);
3187         OBD_FREE_PTR(attr);
3188 out_fsxattr:
3189         ll_finish_md_op_data(op_data);
3190         RETURN(rc);
3191 }
3192
3193 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3194                                  unsigned long arg)
3195 {
3196         struct inode            *inode = file_inode(file);
3197         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3198         struct ll_inode_info    *lli = ll_i2info(inode);
3199         struct obd_client_handle *och = NULL;
3200         struct split_param sp;
3201         struct pcc_param param;
3202         bool lease_broken = false;
3203         fmode_t fmode = 0;
3204         enum mds_op_bias bias = 0;
3205         struct file *layout_file = NULL;
3206         void *data = NULL;
3207         size_t data_size = 0;
3208         bool attached = false;
3209         long rc, rc2 = 0;
3210
3211         ENTRY;
3212
3213         mutex_lock(&lli->lli_och_mutex);
3214         if (fd->fd_lease_och != NULL) {
3215                 och = fd->fd_lease_och;
3216                 fd->fd_lease_och = NULL;
3217         }
3218         mutex_unlock(&lli->lli_och_mutex);
3219
3220         if (och == NULL)
3221                 RETURN(-ENOLCK);
3222
3223         fmode = och->och_flags;
3224
3225         switch (ioc->lil_flags) {
3226         case LL_LEASE_RESYNC_DONE:
3227                 if (ioc->lil_count > IOC_IDS_MAX)
3228                         GOTO(out_lease_close, rc = -EINVAL);
3229
3230                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3231                 OBD_ALLOC(data, data_size);
3232                 if (!data)
3233                         GOTO(out_lease_close, rc = -ENOMEM);
3234
3235                 if (copy_from_user(data, (void __user *)arg, data_size))
3236                         GOTO(out_lease_close, rc = -EFAULT);
3237
3238                 bias = MDS_CLOSE_RESYNC_DONE;
3239                 break;
3240         case LL_LEASE_LAYOUT_MERGE: {
3241                 int fd;
3242
3243                 if (ioc->lil_count != 1)
3244                         GOTO(out_lease_close, rc = -EINVAL);
3245
3246                 arg += sizeof(*ioc);
3247                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3248                         GOTO(out_lease_close, rc = -EFAULT);
3249
3250                 layout_file = fget(fd);
3251                 if (!layout_file)
3252                         GOTO(out_lease_close, rc = -EBADF);
3253
3254                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3255                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3256                         GOTO(out_lease_close, rc = -EPERM);
3257
3258                 data = file_inode(layout_file);
3259                 bias = MDS_CLOSE_LAYOUT_MERGE;
3260                 break;
3261         }
3262         case LL_LEASE_LAYOUT_SPLIT: {
3263                 int fdv;
3264                 int mirror_id;
3265
3266                 if (ioc->lil_count != 2)
3267                         GOTO(out_lease_close, rc = -EINVAL);
3268
3269                 arg += sizeof(*ioc);
3270                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3271                         GOTO(out_lease_close, rc = -EFAULT);
3272
3273                 arg += sizeof(__u32);
3274                 if (copy_from_user(&mirror_id, (void __user *)arg,
3275                                    sizeof(__u32)))
3276                         GOTO(out_lease_close, rc = -EFAULT);
3277
3278                 layout_file = fget(fdv);
3279                 if (!layout_file)
3280                         GOTO(out_lease_close, rc = -EBADF);
3281
3282                 sp.sp_inode = file_inode(layout_file);
3283                 sp.sp_mirror_id = (__u16)mirror_id;
3284                 data = &sp;
3285                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3286                 break;
3287         }
3288         case LL_LEASE_PCC_ATTACH:
3289                 if (ioc->lil_count != 1)
3290                         RETURN(-EINVAL);
3291
3292                 arg += sizeof(*ioc);
3293                 if (copy_from_user(&param.pa_archive_id, (void __user *)arg,
3294                                    sizeof(__u32)))
3295                         GOTO(out_lease_close, rc2 = -EFAULT);
3296
3297                 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3298                 if (rc2)
3299                         GOTO(out_lease_close, rc2);
3300
3301                 attached = true;
3302                 /* Grab latest data version */
3303                 rc2 = ll_data_version(inode, &param.pa_data_version,
3304                                      LL_DV_WR_FLUSH);
3305                 if (rc2)
3306                         GOTO(out_lease_close, rc2);
3307
3308                 data = &param;
3309                 bias = MDS_PCC_ATTACH;
3310                 break;
3311         default:
3312                 /* without close intent */
3313                 break;
3314         }
3315
3316 out_lease_close:
3317         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3318         if (rc < 0)
3319                 GOTO(out, rc);
3320
3321         rc = ll_lease_och_release(inode, file);
3322         if (rc < 0)
3323                 GOTO(out, rc);
3324
3325         if (lease_broken)
3326                 fmode = 0;
3327         EXIT;
3328
3329 out:
3330         switch (ioc->lil_flags) {
3331         case LL_LEASE_RESYNC_DONE:
3332                 if (data)
3333                         OBD_FREE(data, data_size);
3334                 break;
3335         case LL_LEASE_LAYOUT_MERGE:
3336         case LL_LEASE_LAYOUT_SPLIT:
3337                 if (layout_file)
3338                         fput(layout_file);
3339                 break;
3340         case LL_LEASE_PCC_ATTACH:
3341                 if (!rc)
3342                         rc = rc2;
3343                 rc = pcc_readwrite_attach_fini(file, inode,
3344                                                param.pa_layout_gen,
3345                                                lease_broken, rc,
3346                                                attached);
3347                 break;
3348         }
3349
3350         if (!rc)
3351                 rc = ll_lease_type_from_fmode(fmode);
3352         RETURN(rc);
3353 }
3354
3355 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3356                               unsigned long arg)
3357 {
3358         struct inode *inode = file_inode(file);
3359         struct ll_inode_info *lli = ll_i2info(inode);
3360         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3361         struct obd_client_handle *och = NULL;
3362         __u64 open_flags = 0;
3363         bool lease_broken;
3364         fmode_t fmode;
3365         long rc;
3366         ENTRY;
3367
3368         switch (ioc->lil_mode) {
3369         case LL_LEASE_WRLCK:
3370                 if (!(file->f_mode & FMODE_WRITE))
3371                         RETURN(-EPERM);
3372                 fmode = FMODE_WRITE;
3373                 break;
3374         case LL_LEASE_RDLCK:
3375                 if (!(file->f_mode & FMODE_READ))
3376                         RETURN(-EPERM);
3377                 fmode = FMODE_READ;
3378                 break;
3379         case LL_LEASE_UNLCK:
3380                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3381         default:
3382                 RETURN(-EINVAL);
3383         }
3384
3385         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3386
3387         /* apply for lease */
3388         if (ioc->lil_flags & LL_LEASE_RESYNC)
3389                 open_flags = MDS_OPEN_RESYNC;
3390         och = ll_lease_open(inode, file, fmode, open_flags);
3391         if (IS_ERR(och))
3392                 RETURN(PTR_ERR(och));
3393
3394         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3395                 rc = ll_lease_file_resync(och, inode, arg);
3396                 if (rc) {
3397                         ll_lease_close(och, inode, NULL);
3398                         RETURN(rc);
3399                 }
3400                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3401                 if (rc) {
3402                         ll_lease_close(och, inode, NULL);
3403                         RETURN(rc);
3404                 }
3405         }
3406
3407         rc = 0;
3408         mutex_lock(&lli->lli_och_mutex);
3409         if (fd->fd_lease_och == NULL) {
3410                 fd->fd_lease_och = och;
3411                 och = NULL;
3412         }
3413         mutex_unlock(&lli->lli_och_mutex);
3414         if (och != NULL) {
3415                 /* impossible now that only excl is supported for now */
3416                 ll_lease_close(och, inode, &lease_broken);
3417                 rc = -EBUSY;
3418         }
3419         RETURN(rc);
3420 }
3421
3422 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3423 {
3424         struct ll_inode_info *lli = ll_i2info(inode);
3425         struct ll_sb_info *sbi = ll_i2sbi(inode);
3426         __u64 now = ktime_get_real_seconds();
3427         int i;
3428
3429         spin_lock(&lli->lli_heat_lock);
3430         heat->lh_flags = lli->lli_heat_flags;
3431         for (i = 0; i < heat->lh_count; i++)
3432                 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3433                                                 now, sbi->ll_heat_decay_weight,
3434                                                 sbi->ll_heat_period_second);
3435         spin_unlock(&lli->lli_heat_lock);
3436 }
3437
3438 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3439 {
3440         struct ll_inode_info *lli = ll_i2info(inode);
3441         int rc = 0;
3442
3443         spin_lock(&lli->lli_heat_lock);
3444         if (flags & LU_HEAT_FLAG_CLEAR)
3445                 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3446
3447         if (flags & LU_HEAT_FLAG_OFF)
3448                 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3449         else
3450                 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3451
3452         spin_unlock(&lli->lli_heat_lock);
3453
3454         RETURN(rc);
3455 }
3456
/**
 * ioctl entry point for files: dispatch on @cmd.
 *
 * The call is tallied in the LPROC_LL_IOCTL statistic.  tty ioctls (type 'T'
 * or 't') are rejected with -ENOTTY.  Commands without a case of their own
 * are forwarded to obd_iocontrol() on the export returned by ll_i2dtexp().
 *
 * @file: open file the ioctl was issued on
 * @cmd:  ioctl command number
 * @arg:  command-specific argument; most cases treat it as a user-space
 *        pointer, LL_IOC_SET_LEASE_OLD and LL_IOC_FLR_SET_MIRROR use it as a
 *        plain value
 *
 * The return value is command-specific; errors produced directly in this
 * function are negative errno values.
 */
static long
ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct inode            *inode = file_inode(file);
        struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
        int                      flags, rc;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
               PFID(ll_inode2fid(inode)), inode, cmd);
        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);

        /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
        if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
                RETURN(-ENOTTY);

        switch (cmd) {
        case LL_IOC_GETFLAGS:
                /* Get the current value of the file flags */
                return put_user(fd->fd_flags, (int __user *)arg);
        case LL_IOC_SETFLAGS:
        case LL_IOC_CLRFLAGS:
                /* Set or clear specific file flags */
                /* XXX This probably needs checks to ensure the flags are
                 *     not abused, and to handle any flag side effects.
                 */
                if (get_user(flags, (int __user *) arg))
                        RETURN(-EFAULT);

                if (cmd == LL_IOC_SETFLAGS) {
                        /* LL_FILE_IGNORE_LOCK is only accepted on O_DIRECT
                         * files */
                        if ((flags & LL_FILE_IGNORE_LOCK) &&
                            !(file->f_flags & O_DIRECT)) {
                                CERROR("%s: unable to disable locking on "
                                       "non-O_DIRECT file\n", current->comm);
                                RETURN(-EINVAL);
                        }

                        fd->fd_flags |= flags;
                } else {
                        fd->fd_flags &= ~flags;
                }
                RETURN(0);
        case LL_IOC_LOV_SETSTRIPE:
        case LL_IOC_LOV_SETSTRIPE_NEW:
                RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
        case LL_IOC_LOV_SETEA:
                RETURN(ll_lov_setea(inode, file, (void __user *)arg));
        case LL_IOC_LOV_SWAP_LAYOUTS: {
                struct file *file2;
                struct lustre_swap_layouts lsl;

                if (copy_from_user(&lsl, (char __user *)arg,
                                   sizeof(struct lustre_swap_layouts)))
                        RETURN(-EFAULT);

                /* this file must not be opened read-only */
                if ((file->f_flags & O_ACCMODE) == O_RDONLY)
                        RETURN(-EPERM);

                /* reference on file2 is dropped at the "out" label below */
                file2 = fget(lsl.sl_fd);
                if (file2 == NULL)
                        RETURN(-EBADF);

                /* O_WRONLY or O_RDWR */
                if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
                        GOTO(out, rc = -EPERM);

                if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
                        struct inode                    *inode2;
                        struct ll_inode_info            *lli;
                        struct obd_client_handle        *och = NULL;

                        /* detach the lease handle from this descriptor and
                         * hand it to ll_swap_layouts_close(); without a
                         * lease the request fails with -ENOLCK */
                        lli = ll_i2info(inode);
                        mutex_lock(&lli->lli_och_mutex);
                        if (fd->fd_lease_och != NULL) {
                                och = fd->fd_lease_och;
                                fd->fd_lease_och = NULL;
                        }
                        mutex_unlock(&lli->lli_och_mutex);
                        if (och == NULL)
                                GOTO(out, rc = -ENOLCK);
                        inode2 = file_inode(file2);
                        rc = ll_swap_layouts_close(och, inode, inode2);
                } else {
                        rc = ll_swap_layouts(file, file2, &lsl);
                }
out:
                fput(file2);
                RETURN(rc);
        }
        case LL_IOC_LOV_GETSTRIPE:
        case LL_IOC_LOV_GETSTRIPE_NEW:
                RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
        case FS_IOC_GETFLAGS:
        case FS_IOC_SETFLAGS:
                RETURN(ll_iocontrol(inode, file, cmd, arg));
        case FSFILT_IOC_GETVERSION:
        case FS_IOC_GETVERSION:
                RETURN(put_user(inode->i_generation, (int __user *)arg));
        /* We need to special case any other ioctls we want to handle,
         * to send them to the MDS/OST as appropriate and to properly
         * network encode the arg field. */
        case FS_IOC_SETVERSION:
                RETURN(-ENOTSUPP);

        case LL_IOC_GROUP_LOCK:
                RETURN(ll_get_grouplock(inode, file, arg));
        case LL_IOC_GROUP_UNLOCK:
                RETURN(ll_put_grouplock(inode, file, arg));
        case IOC_OBD_STATFS:
                RETURN(ll_obd_statfs(inode, (void __user *)arg));

        case LL_IOC_FLUSHCTX:
                RETURN(ll_flush_ctx(inode));
        case LL_IOC_PATH2FID: {
                /* copy this inode's FID out to the user buffer */
                if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
                                 sizeof(struct lu_fid)))
                        RETURN(-EFAULT);

                RETURN(0);
        }
        case LL_IOC_GETPARENT:
                RETURN(ll_getparent(file, (struct getparent __user *)arg));

        case OBD_IOC_FID2PATH:
                RETURN(ll_fid2path(inode, (void __user *)arg));
        case LL_IOC_DATA_VERSION: {
                struct ioc_data_version idv;
                int rc;

                if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
                        RETURN(-EFAULT);

                /* only the two flush flags are honoured from user space */
                idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
                rc = ll_ioc_data_version(inode, &idv);

                if (rc == 0 &&
                    copy_to_user((char __user *)arg, &idv, sizeof(idv)))
                        RETURN(-EFAULT);

                RETURN(rc);
        }

        case LL_IOC_GET_MDTIDX: {
                int mdtidx;

                mdtidx = ll_get_mdt_idx(inode);
                if (mdtidx < 0)
                        RETURN(mdtidx);

                if (put_user((int)mdtidx, (int __user *)arg))
                        RETURN(-EFAULT);

                RETURN(0);
        }
        case OBD_IOC_GETDTNAME:
        case OBD_IOC_GETMDNAME:
                RETURN(ll_get_obd_name(inode, cmd, arg));
        case LL_IOC_HSM_STATE_GET: {
                struct md_op_data       *op_data;
                struct hsm_user_state   *hus;
                int                      rc;

                OBD_ALLOC_PTR(hus);
                if (hus == NULL)
                        RETURN(-ENOMEM);

                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                             LUSTRE_OPC_ANY, hus);
                if (IS_ERR(op_data)) {
                        OBD_FREE_PTR(hus);
                        RETURN(PTR_ERR(op_data));
                }

                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
                                   op_data, NULL);

                /* the copy-out is attempted whatever obd_iocontrol()
                 * returned; a fault here replaces rc */
                if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
                        rc = -EFAULT;

                ll_finish_md_op_data(op_data);
                OBD_FREE_PTR(hus);
                RETURN(rc);
        }
        case LL_IOC_HSM_STATE_SET: {
                struct hsm_state_set    *hss;
                int                      rc;

                OBD_ALLOC_PTR(hss);
                if (hss == NULL)
                        RETURN(-ENOMEM);

                if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
                        OBD_FREE_PTR(hss);
                        RETURN(-EFAULT);
                }

                rc = ll_hsm_state_set(inode, hss);

                OBD_FREE_PTR(hss);
                RETURN(rc);
        }
        case LL_IOC_HSM_ACTION: {
                struct md_op_data               *op_data;
                struct hsm_current_action       *hca;
                int                              rc;

                OBD_ALLOC_PTR(hca);
                if (hca == NULL)
                        RETURN(-ENOMEM);

                op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                             LUSTRE_OPC_ANY, hca);
                if (IS_ERR(op_data)) {
                        OBD_FREE_PTR(hca);
                        RETURN(PTR_ERR(op_data));
                }

                rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
                                   op_data, NULL);

                /* as for LL_IOC_HSM_STATE_GET: copy-out is unconditional and
                 * a fault replaces rc */
                if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
                        rc = -EFAULT;

                ll_finish_md_op_data(op_data);
                OBD_FREE_PTR(hca);
                RETURN(rc);
        }
        case LL_IOC_SET_LEASE_OLD: {
                /* old form: @arg itself is the lease mode, all other fields
                 * of the request are zero and no user buffer is passed on */
                struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };

                RETURN(ll_file_set_lease(file, &ioc, 0));
        }
        case LL_IOC_SET_LEASE: {
                struct ll_ioc_lease ioc;

                if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
                        RETURN(-EFAULT);

                RETURN(ll_file_set_lease(file, &ioc, arg));
        }
        case LL_IOC_GET_LEASE: {
                struct ll_inode_info *lli = ll_i2info(inode);
                struct ldlm_lock *lock = NULL;
                fmode_t fmode = 0;

                /* fmode stays 0 unless this descriptor holds a lease whose
                 * lock can still be looked up and for which ldlm_is_cancel()
                 * is false */
                mutex_lock(&lli->lli_och_mutex);
                if (fd->fd_lease_och != NULL) {
                        struct obd_client_handle *och = fd->fd_lease_och;

                        lock = ldlm_handle2lock(&och->och_lease_handle);
                        if (lock != NULL) {
                                lock_res_and_lock(lock);
                                if (!ldlm_is_cancel(lock))
                                        fmode = och->och_flags;

                                unlock_res_and_lock(lock);
                                LDLM_LOCK_PUT(lock);
                        }
                }
                mutex_unlock(&lli->lli_och_mutex);

                RETURN(ll_lease_type_from_fmode(fmode));
        }
        case LL_IOC_HSM_IMPORT: {
                struct hsm_user_import *hui;

                OBD_ALLOC_PTR(hui);
                if (hui == NULL)
                        RETURN(-ENOMEM);

                if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
                        OBD_FREE_PTR(hui);
                        RETURN(-EFAULT);
                }

                rc = ll_hsm_import(inode, file, hui);

                OBD_FREE_PTR(hui);
                RETURN(rc);
        }
        case LL_IOC_FUTIMES_3: {
                struct ll_futimes_3 lfu;

                if (copy_from_user(&lfu,
                                   (const struct ll_futimes_3 __user *)arg,
                                   sizeof(lfu)))
                        RETURN(-EFAULT);

                RETURN(ll_file_futimes_3(file, &lfu));
        }
        case LL_IOC_LADVISE: {
                struct llapi_ladvise_hdr *k_ladvise_hdr;
                struct llapi_ladvise_hdr __user *u_ladvise_hdr;
                int i;
                int num_advise;
                int alloc_size = sizeof(*k_ladvise_hdr);

                rc = 0;
                u_ladvise_hdr = (void __user *)arg;
                /* first pass: read only the fixed header to learn how many
                 * advices follow */
                OBD_ALLOC_PTR(k_ladvise_hdr);
                if (k_ladvise_hdr == NULL)
                        RETURN(-ENOMEM);

                if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
                        GOTO(out_ladvise, rc = -EFAULT);

                if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
                    k_ladvise_hdr->lah_count < 1)
                        GOTO(out_ladvise, rc = -EINVAL);

                num_advise = k_ladvise_hdr->lah_count;
                if (num_advise >= LAH_COUNT_MAX)
                        GOTO(out_ladvise, rc = -EFBIG);

                /* second pass: reallocate for header plus num_advise entries
                 * and copy the whole request; alloc_size is updated so the
                 * free at out_ladvise matches this allocation */
                OBD_FREE_PTR(k_ladvise_hdr);
                alloc_size = offsetof(typeof(*k_ladvise_hdr),
                                      lah_advise[num_advise]);
                OBD_ALLOC(k_ladvise_hdr, alloc_size);
                if (k_ladvise_hdr == NULL)
                        RETURN(-ENOMEM);

                /*
                 * TODO: submit multiple advices to one server in a single RPC
                 */
                if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
                        GOTO(out_ladvise, rc = -EFAULT);

                for (i = 0; i < num_advise; i++) {
                        struct llapi_lu_ladvise *k_ladvise =
                                        &k_ladvise_hdr->lah_advise[i];
                        struct llapi_lu_ladvise __user *u_ladvise =
                                        &u_ladvise_hdr->lah_advise[i];

                        rc = ll_ladvise_sanity(inode, k_ladvise);
                        if (rc)
                                GOTO(out_ladvise, rc);

                        switch (k_ladvise->lla_advice) {
                        case LU_LADVISE_LOCKNOEXPAND:
                                /* ends the whole request, success or not:
                                 * later entries are not processed */
                                rc = ll_lock_noexpand(file,
                                               k_ladvise->lla_peradvice_flags);
                                GOTO(out_ladvise, rc);
                        case LU_LADVISE_LOCKAHEAD:

                                rc = ll_file_lock_ahead(file, k_ladvise);

                                if (rc < 0)
                                        GOTO(out_ladvise, rc);

                                /* a non-negative result is reported back in
                                 * the user's copy of this entry */
                                if (put_user(rc,
                                             &u_ladvise->lla_lockahead_result))
                                        GOTO(out_ladvise, rc = -EFAULT);
                                break;
                        default:
                                rc = ll_ladvise(inode, file,
                                                k_ladvise_hdr->lah_flags,
                                                k_ladvise);
                                if (rc)
                                        GOTO(out_ladvise, rc);
                                break;
                        }

                }

out_ladvise:
                OBD_FREE(k_ladvise_hdr, alloc_size);
                RETURN(rc);
        }
        case LL_IOC_FLR_SET_MIRROR: {
                /* mirror I/O must be direct to avoid polluting page cache
                 * by stale data. */
                if (!(file->f_flags & O_DIRECT))
                        RETURN(-EINVAL);

                fd->fd_designated_mirror = (__u32)arg;
                RETURN(0);
        }
        case LL_IOC_FSGETXATTR:
                RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
        case LL_IOC_FSSETXATTR:
                RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
        case BLKSSZGET:
                RETURN(put_user(PAGE_SIZE, (int __user *)arg));
        case LL_IOC_HEAT_GET: {
                struct lu_heat uheat;
                struct lu_heat *heat;
                int size;

                if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
                        RETURN(-EFAULT);

                /* never report more than OBD_HEAT_COUNT values */
                if (uheat.lh_count > OBD_HEAT_COUNT)
                        uheat.lh_count = OBD_HEAT_COUNT;

                size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
                OBD_ALLOC(heat, size);
                if (heat == NULL)
                        RETURN(-ENOMEM);

                heat->lh_count = uheat.lh_count;
                ll_heat_get(inode, heat);
                rc = copy_to_user((char __user *)arg, heat, size);
                OBD_FREE(heat, size);
                RETURN(rc ? -EFAULT : 0);
        }
        case LL_IOC_HEAT_SET: {
                __u64 flags;

                if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
                        RETURN(-EFAULT);

                rc = ll_heat_set(inode, flags);
                RETURN(rc);
        }
        case LL_IOC_PCC_DETACH: {
                struct lu_pcc_detach *detach;

                OBD_ALLOC_PTR(detach);
                if (detach == NULL)
                        RETURN(-ENOMEM);

                if (copy_from_user(detach,
                                   (const struct lu_pcc_detach __user *)arg,
                                   sizeof(*detach)))
                        GOTO(out_detach_free, rc = -EFAULT);

                /* regular files only, and only for the owner or a suitably
                 * capable caller */
                if (!S_ISREG(inode->i_mode))
                        GOTO(out_detach_free, rc = -EINVAL);

                if (!inode_owner_or_capable(inode))
                        GOTO(out_detach_free, rc = -EPERM);

                rc = pcc_ioctl_detach(inode, detach->pccd_opt);
out_detach_free:
                OBD_FREE_PTR(detach);
                RETURN(rc);
        }
        case LL_IOC_PCC_STATE: {
                struct lu_pcc_state __user *ustate =
                        (struct lu_pcc_state __user *)arg;
                struct lu_pcc_state *state;

                OBD_ALLOC_PTR(state);
                if (state == NULL)
                        RETURN(-ENOMEM);

                if (copy_from_user(state, ustate, sizeof(*state)))
                        GOTO(out_state, rc = -EFAULT);

                rc = pcc_ioctl_state(file, inode, state);
                if (rc)
                        GOTO(out_state, rc);

                if (copy_to_user(ustate, state, sizeof(*state)))
                        GOTO(out_state, rc = -EFAULT);

out_state:
                OBD_FREE_PTR(state);
                RETURN(rc);
        }
        default:
                /* everything else goes to the data export untouched */
                RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
                                     (void __user *)arg));
        }
}
3922
3923 #ifndef HAVE_FILE_LLSEEK_SIZE
3924 static inline loff_t
3925 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3926 {
3927         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3928                 return -EINVAL;
3929         if (offset > maxsize)
3930                 return -EINVAL;
3931
3932         if (offset != file->f_pos) {
3933                 file->f_pos = offset;
3934                 file->f_version = 0;
3935         }
3936         return offset;
3937 }
3938
3939 static loff_t
3940 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3941                 loff_t maxsize, loff_t eof)
3942 {
3943         struct inode *inode = file_inode(file);
3944
3945         switch (origin) {
3946         case SEEK_END:
3947                 offset += eof;
3948                 break;
3949         case SEEK_CUR:
3950                 /*
3951                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3952                  * position-querying operation.  Avoid rewriting the "same"
3953                  * f_pos value back to the file because a concurrent read(),
3954                  * write() or lseek() might have altered it
3955                  */
3956                 if (offset == 0)
3957                         return file->f_pos;
3958                 /*
3959                  * f_lock protects against read/modify/write race with other
3960                  * SEEK_CURs. Note that parallel writes and reads behave
3961                  * like SEEK_SET.
3962                  */
3963                 inode_lock(inode);
3964                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3965                 inode_unlock(inode);
3966                 return offset;
3967         case SEEK_DATA:
3968                 /*
3969                  * In the generic case the entire file is data, so as long as
3970                  * offset isn't at the end of the file then the offset is data.
3971                  */
3972                 if (offset >= eof)
3973                         return -ENXIO;
3974                 break;
3975         case SEEK_HOLE:
3976                 /*
3977                  * There is a virtual hole at the end of the file, so as long as
3978                  * offset isn't i_size or larger, return i_size.
3979                  */
3980                 if (offset >= eof)
3981                         return -ENXIO;
3982                 offset = eof;
3983                 break;
3984         }
3985
3986         return llseek_execute(file, offset, maxsize);
3987 }
3988 #endif
3989
3990 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3991 {
3992         struct inode *inode = file_inode(file);
3993         loff_t retval, eof = 0;
3994
3995         ENTRY;
3996         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3997                            (origin == SEEK_CUR) ? file->f_pos : 0);
3998         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3999                PFID(ll_inode2fid(inode)), inode, retval, retval,
4000                origin);
4001         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
4002
4003         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4004                 retval = ll_glimpse_size(inode);
4005                 if (retval != 0)
4006                         RETURN(retval);
4007                 eof = i_size_read(inode);
4008         }
4009
4010         retval = ll_generic_file_llseek_size(file, offset, origin,
4011                                           ll_file_maxbytes(inode), eof);
4012         RETURN(retval);
4013 }
4014
4015 static int ll_flush(struct file *file, fl_owner_t id)
4016 {
4017         struct inode *inode = file_inode(file);
4018         struct ll_inode_info *lli = ll_i2info(inode);
4019         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4020         int rc, err;
4021
4022         LASSERT(!S_ISDIR(inode->i_mode));
4023
4024         /* catch async errors that were recorded back when async writeback
4025          * failed for pages in this mapping. */
4026         rc = lli->lli_async_rc;
4027         lli->lli_async_rc = 0;
4028         if (lli->lli_clob != NULL) {
4029                 err = lov_read_and_clear_async_rc(lli->lli_clob);
4030                 if (rc == 0)
4031                         rc = err;
4032         }
4033
4034         /* The application has been told write failure already.
4035          * Do not report failure again. */
4036         if (fd->fd_write_failed)
4037                 return 0;
4038         return rc ? -EIO : 0;
4039 }
4040
4041 /**
4042  * Called to make sure a portion of file has been written out.
4043  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4044  *
4045  * Return how many pages have been written.
4046  */
4047 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4048                        enum cl_fsync_mode mode, int ignore_layout)
4049 {
4050         struct lu_env *env;
4051         struct cl_io *io;
4052         struct cl_fsync_io *fio;
4053         int result;
4054         __u16 refcheck;
4055         ENTRY;
4056
4057         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4058             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4059                 RETURN(-EINVAL);
4060
4061         env = cl_env_get(&refcheck);
4062         if (IS_ERR(env))
4063                 RETURN(PTR_ERR(env));
4064
4065         io = vvp_env_thread_io(env);
4066         io->ci_obj = ll_i2info(inode)->lli_clob;
4067         io->ci_ignore_layout = ignore_layout;
4068
4069         /* initialize parameters for sync */
4070         fio = &io->u.ci_fsync;
4071         fio->fi_start = start;
4072         fio->fi_end = end;
4073         fio->fi_fid = ll_inode2fid(inode);
4074         fio->fi_mode = mode;
4075         fio->fi_nr_written = 0;
4076
4077         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4078                 result = cl_io_loop(env, io);
4079         else
4080                 result = io->ci_result;
4081         if (result == 0)
4082                 result = fio->fi_nr_written;
4083         cl_io_fini(env, io);
4084         cl_env_put(env, &refcheck);
4085
4086         RETURN(result);
4087 }
4088
4089 /*
4090  * When dentry is provided (the 'else' case), file_dentry() may be
4091  * null and dentry must be used directly rather than pulled from
4092  * file_dentry() as is done otherwise.
4093  */
4094
4095 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4096 {
4097         struct dentry *dentry = file_dentry(file);
4098         struct inode *inode = dentry->d_inode;
4099         struct ll_inode_info *lli = ll_i2info(inode);
4100         struct ptlrpc_request *req;
4101         int rc, err;
4102
4103         ENTRY;
4104
4105         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4106                "datasync %d\n",
4107                PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4108
4109         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4110
4111         /* fsync's caller has already called _fdata{sync,write}, we want
4112          * that IO to finish before calling the osc and mdc sync methods */
4113         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4114         inode_lock(inode);
4115
4116         /* catch async errors that were recorded back when async writeback
4117          * failed for pages in this mapping. */
4118         if (!S_ISDIR(inode->i_mode)) {
4119                 err = lli->lli_async_rc;
4120                 lli->lli_async_rc = 0;
4121                 if (rc == 0)
4122                         rc = err;
4123                 if (lli->lli_clob != NULL) {
4124                         err = lov_read_and_clear_async_rc(lli->lli_clob);
4125                         if (rc == 0)
4126                                 rc = err;
4127                 }
4128         }
4129
4130         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4131         if (!rc)
4132                 rc = err;
4133         if (!err)
4134                 ptlrpc_req_finished(req);
4135
4136         if (S_ISREG(inode->i_mode)) {
4137                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4138                 bool cached;
4139
4140                 /* Sync metadata on MDT first, and then sync the cached data
4141                  * on PCC.
4142                  */
4143                 err = pcc_fsync(file, start, end, datasync, &cached);
4144                 if (!cached)
4145                         err = cl_sync_file_range(inode, start, end,
4146                                                  CL_FSYNC_ALL, 0);
4147                 if (rc == 0 && err < 0)
4148                         rc = err;
4149                 if (rc < 0)
4150                         fd->fd_write_failed = true;
4151                 else
4152                         fd->fd_write_failed = false;
4153         }
4154
4155         inode_unlock(inode);
4156         RETURN(rc);
4157 }
4158
4159 static int
4160 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4161 {
4162         struct inode *inode = file_inode(file);
4163         struct ll_sb_info *sbi = ll_i2sbi(inode);
4164         struct ldlm_enqueue_info einfo = {
4165                 .ei_type        = LDLM_FLOCK,
4166                 .ei_cb_cp       = ldlm_flock_completion_ast,
4167                 .ei_cbdata      = file_lock,
4168         };
4169         struct md_op_data *op_data;
4170         struct lustre_handle lockh = { 0 };
4171         union ldlm_policy_data flock = { { 0 } };
4172         int fl_type = file_lock->fl_type;
4173         __u64 flags = 0;
4174         int rc;
4175         int rc2 = 0;
4176         ENTRY;
4177
4178         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4179                PFID(ll_inode2fid(inode)), file_lock);
4180
4181         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4182
4183         if (file_lock->fl_flags & FL_FLOCK) {
4184                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4185                 /* flocks are whole-file locks */
4186                 flock.l_flock.end = OFFSET_MAX;
4187                 /* For flocks owner is determined by the local file desctiptor*/
4188                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4189         } else if (file_lock->fl_flags & FL_POSIX) {
4190                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4191                 flock.l_flock.start = file_lock->fl_start;
4192                 flock.l_flock.end = file_lock->fl_end;
4193         } else {
4194                 RETURN(-EINVAL);
4195         }
4196         flock.l_flock.pid = file_lock->fl_pid;
4197
4198         /* Somewhat ugly workaround for svc lockd.
4199          * lockd installs custom fl_lmops->lm_compare_owner that checks
4200          * for the fl_owner to be the same (which it always is on local node
4201          * I guess between lockd processes) and then compares pid.
4202          * As such we assign pid to the owner field to make it all work,
4203          * conflict with normal locks is unlikely since pid space and
4204          * pointer space for current->files are not intersecting */
4205         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4206                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4207
4208         switch (fl_type) {
4209         case F_RDLCK:
4210                 einfo.ei_mode = LCK_PR;
4211                 break;
4212         case F_UNLCK:
4213                 /* An unlock request may or may not have any relation to
4214                  * existing locks so we may not be able to pass a lock handle
4215                  * via a normal ldlm_lock_cancel() request. The request may even
4216                  * unlock a byte range in the middle of an existing lock. In
4217                  * order to process an unlock request we need all of the same
4218                  * information that is given with a normal read or write record
4219                  * lock request. To avoid creating another ldlm unlock (cancel)
4220                  * message we'll treat a LCK_NL flock request as an unlock. */
4221                 einfo.ei_mode = LCK_NL;
4222                 break;
4223         case F_WRLCK:
4224                 einfo.ei_mode = LCK_PW;
4225                 break;
4226         default:
4227                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4228                 RETURN (-ENOTSUPP);
4229         }
4230
4231         switch (cmd) {
4232         case F_SETLKW:
4233 #ifdef F_SETLKW64
4234         case F_SETLKW64:
4235 #endif
4236                 flags = 0;
4237                 break;
4238         case F_SETLK:
4239 #ifdef F_SETLK64
4240         case F_SETLK64:
4241 #endif
4242                 flags = LDLM_FL_BLOCK_NOWAIT;
4243                 break;
4244         case F_GETLK:
4245 #ifdef F_GETLK64
4246         case F_GETLK64:
4247 #endif
4248                 flags = LDLM_FL_TEST_LOCK;
4249                 break;
4250         default:
4251                 CERROR("unknown fcntl lock command: %d\n", cmd);
4252                 RETURN (-EINVAL);
4253         }
4254
4255         /* Save the old mode so that if the mode in the lock changes we
4256          * can decrement the appropriate reader or writer refcount. */
4257         file_lock->fl_type = einfo.ei_mode;
4258
4259         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4260                                      LUSTRE_OPC_ANY, NULL);
4261         if (IS_ERR(op_data))
4262                 RETURN(PTR_ERR(op_data));
4263
4264         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4265                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4266                flock.l_flock.pid, flags, einfo.ei_mode,
4267                flock.l_flock.start, flock.l_flock.end);
4268
4269         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4270                         flags);
4271
4272         /* Restore the file lock type if not TEST lock. */
4273         if (!(flags & LDLM_FL_TEST_LOCK))
4274                 file_lock->fl_type = fl_type;
4275
4276 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4277         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4278             !(flags & LDLM_FL_TEST_LOCK))
4279                 rc2  = locks_lock_file_wait(file, file_lock);
4280 #else
4281         if ((file_lock->fl_flags & FL_FLOCK) &&
4282             (rc == 0 || file_lock->fl_type == F_UNLCK))
4283                 rc2  = flock_lock_file_wait(file, file_lock);
4284         if ((file_lock->fl_flags & FL_POSIX) &&
4285             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4286             !(flags & LDLM_FL_TEST_LOCK))
4287                 rc2  = posix_lock_file_wait(file, file_lock);
4288 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4289
4290         if (rc2 && file_lock->fl_type != F_UNLCK) {
4291                 einfo.ei_mode = LCK_NL;
4292                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4293                            &lockh, flags);
4294                 rc = rc2;
4295         }
4296
4297         ll_finish_md_op_data(op_data);
4298
4299         RETURN(rc);
4300 }
4301
4302 int ll_get_fid_by_name(struct inode *parent, const char *name,
4303                        int namelen, struct lu_fid *fid,
4304                        struct inode **inode)
4305 {
4306         struct md_op_data       *op_data = NULL;
4307         struct mdt_body         *body;
4308         struct ptlrpc_request   *req;
4309         int                     rc;
4310         ENTRY;
4311
4312         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4313                                      LUSTRE_OPC_ANY, NULL);
4314         if (IS_ERR(op_data))
4315                 RETURN(PTR_ERR(op_data));
4316
4317         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4318         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4319         ll_finish_md_op_data(op_data);
4320         if (rc < 0)
4321                 RETURN(rc);
4322
4323         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4324         if (body == NULL)
4325                 GOTO(out_req, rc = -EFAULT);
4326         if (fid != NULL)
4327                 *fid = body->mbo_fid1;
4328
4329         if (inode != NULL)
4330                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4331 out_req:
4332         ptlrpc_req_finished(req);
4333         RETURN(rc);
4334 }
4335
4336 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4337                const char *name)
4338 {
4339         struct dentry *dchild = NULL;
4340         struct inode *child_inode = NULL;
4341         struct md_op_data *op_data;
4342         struct ptlrpc_request *request = NULL;
4343         struct obd_client_handle *och = NULL;
4344         struct qstr qstr;
4345         struct mdt_body *body;
4346         __u64 data_version = 0;
4347         size_t namelen = strlen(name);
4348         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4349         int rc;
4350         ENTRY;
4351
4352         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4353                PFID(ll_inode2fid(parent)), name,
4354                lum->lum_stripe_offset, lum->lum_stripe_count);
4355
4356         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4357             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4358                 lustre_swab_lmv_user_md(lum);
4359
4360         /* Get child FID first */
4361         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4362         qstr.name = name;
4363         qstr.len = namelen;
4364         dchild = d_lookup(file_dentry(file), &qstr);
4365         if (dchild) {
4366                 if (dchild->d_inode)
4367                         child_inode = igrab(dchild->d_inode);
4368                 dput(dchild);
4369         }
4370
4371         if (!child_inode) {
4372                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4373                                         &child_inode);
4374                 if (rc)
4375                         RETURN(rc);
4376         }
4377
4378         if (!child_inode)
4379                 RETURN(-ENOENT);
4380
4381         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4382               OBD_CONNECT2_DIR_MIGRATE)) {
4383                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4384                     ll_dir_striped(child_inode)) {
4385                         CERROR("%s: MDT doesn't support stripe directory "
4386                                "migration!\n", ll_i2sbi(parent)->ll_fsname);
4387                         GOTO(out_iput, rc = -EOPNOTSUPP);
4388                 }
4389         }
4390
4391         /*
4392          * lfs migrate command needs to be blocked on the client
4393          * by checking the migrate FID against the FID of the
4394          * filesystem root.
4395          */
4396         if (child_inode == parent->i_sb->s_root->d_inode)
4397                 GOTO(out_iput, rc = -EINVAL);
4398
4399         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4400                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4401         if (IS_ERR(op_data))
4402                 GOTO(out_iput, rc = PTR_ERR(op_data));
4403
4404         inode_lock(child_inode);
4405         op_data->op_fid3 = *ll_inode2fid(child_inode);
4406         if (!fid_is_sane(&op_data->op_fid3)) {
4407                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4408                        ll_i2sbi(parent)->ll_fsname, name,
4409                        PFID(&op_data->op_fid3));
4410                 GOTO(out_unlock, rc = -EINVAL);
4411         }
4412
4413         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4414         op_data->op_data = lum;
4415         op_data->op_data_size = lumlen;
4416
4417 again:
4418         if (S_ISREG(child_inode->i_mode)) {
4419                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4420                 if (IS_ERR(och)) {
4421                         rc = PTR_ERR(och);
4422                         och = NULL;
4423                         GOTO(out_unlock, rc);
4424                 }
4425
4426                 rc = ll_data_version(child_inode, &data_version,
4427                                      LL_DV_WR_FLUSH);
4428                 if (rc != 0)
4429                         GOTO(out_close, rc);
4430
4431                 op_data->op_open_handle = och->och_open_handle;
4432                 op_data->op_data_version = data_version;
4433                 op_data->op_lease_handle = och->och_lease_handle;
4434                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4435
4436                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4437                 och->och_mod->mod_open_req->rq_replay = 0;
4438                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4439         }
4440
4441         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4442                        name, namelen, &request);
4443         if (rc == 0) {
4444                 LASSERT(request != NULL);
4445                 ll_update_times(request, parent);
4446         }
4447
4448         if (rc == 0 || rc == -EAGAIN) {
4449                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4450                 LASSERT(body != NULL);
4451
4452                 /* If the server does release layout lock, then we cleanup
4453                  * the client och here, otherwise release it in out_close: */
4454                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4455                         obd_mod_put(och->och_mod);
4456                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4457                                                   och);
4458                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4459                         OBD_FREE_PTR(och);
4460                         och = NULL;
4461                 }
4462         }
4463
4464         if (request != NULL) {
4465                 ptlrpc_req_finished(request);
4466                 request = NULL;
4467         }
4468
4469         /* Try again if the lease has cancelled. */
4470         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4471                 goto again;
4472
4473 out_close:
4474         if (och)
4475                 ll_lease_close(och, child_inode, NULL);
4476         if (!rc)
4477                 clear_nlink(child_inode);
4478 out_unlock:
4479         inode_unlock(child_inode);
4480         ll_finish_md_op_data(op_data);
4481 out_iput:
4482         iput(child_inode);
4483         RETURN(rc);
4484 }
4485
4486 static int
4487 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4488 {
4489         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4490         ENTRY;
4491
4492         /*
4493          * In order to avoid flood of warning messages, only print one message
4494          * for one file. And the entire message rate on the client is limited
4495          * by CDEBUG_LIMIT too.
4496          */
4497         if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4498                 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4499                 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4500                              "flock disabled, mount with '-o [local]flock' to enable\r\n");
4501         }
4502         RETURN(-ENOSYS);
4503 }
4504
4505 /**
4506  * test if some locks matching bits and l_req_mode are acquired
4507  * - bits can be in different locks
4508  * - if found clear the common lock bits in *bits
4509  * - the bits not found, are kept in *bits
4510  * \param inode [IN]
4511  * \param bits [IN] searched lock bits [IN]
4512  * \param l_req_mode [IN] searched lock mode
4513  * \retval boolean, true iff all bits are found
4514  */
4515 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4516 {
4517         struct lustre_handle lockh;
4518         union ldlm_policy_data policy;
4519         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4520                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4521         struct lu_fid *fid;
4522         __u64 flags;
4523         int i;
4524         ENTRY;
4525
4526         if (!inode)
4527                RETURN(0);
4528
4529         fid = &ll_i2info(inode)->lli_fid;
4530         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4531                ldlm_lockname[mode]);
4532
4533         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4534         for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4535                 policy.l_inodebits.bits = *bits & (1 << i);
4536                 if (policy.l_inodebits.bits == 0)
4537                         continue;
4538
4539                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4540                                   &policy, mode, &lockh)) {
4541                         struct ldlm_lock *lock;
4542
4543                         lock = ldlm_handle2lock(&lockh);
4544                         if (lock) {
4545                                 *bits &=
4546                                       ~(lock->l_policy_data.l_inodebits.bits);
4547                                 LDLM_LOCK_PUT(lock);
4548                         } else {
4549                                 *bits &= ~policy.l_inodebits.bits;
4550                         }
4551                 }
4552         }
4553         RETURN(*bits == 0);
4554 }
4555
4556 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4557                                struct lustre_handle *lockh, __u64 flags,
4558                                enum ldlm_mode mode)
4559 {
4560         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4561         struct lu_fid *fid;
4562         enum ldlm_mode rc;
4563         ENTRY;
4564
4565         fid = &ll_i2info(inode)->lli_fid;
4566         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4567
4568         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4569                            fid, LDLM_IBITS, &policy, mode, lockh);
4570
4571         RETURN(rc);
4572 }
4573
4574 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4575 {
4576         /* Already unlinked. Just update nlink and return success */
4577         if (rc == -ENOENT) {
4578                 clear_nlink(inode);
4579                 /* If it is striped directory, and there is bad stripe
4580                  * Let's revalidate the dentry again, instead of returning
4581                  * error */
4582                 if (ll_dir_striped(inode))
4583                         return 0;
4584
4585                 /* This path cannot be hit for regular files unless in
4586                  * case of obscure races, so no need to to validate
4587                  * size. */
4588                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4589                         return 0;
4590         } else if (rc != 0) {
4591                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4592                              "%s: revalidate FID "DFID" error: rc = %d\n",
4593                              ll_i2sbi(inode)->ll_fsname,
4594                              PFID(ll_inode2fid(inode)), rc);
4595         }
4596
4597         return rc;
4598 }
4599
4600 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4601 {
4602         struct inode *inode = dentry->d_inode;
4603         struct obd_export *exp = ll_i2mdexp(inode);
4604         struct lookup_intent oit = {
4605                 .it_op = op,
4606         };
4607         struct ptlrpc_request *req = NULL;
4608         struct md_op_data *op_data;
4609         int rc = 0;
4610         ENTRY;
4611
4612         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4613                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4614
4615         /* Call getattr by fid, so do not provide name at all. */
4616         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4617                                      LUSTRE_OPC_ANY, NULL);
4618         if (IS_ERR(op_data))
4619                 RETURN(PTR_ERR(op_data));
4620
4621         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4622         ll_finish_md_op_data(op_data);
4623         if (rc < 0) {
4624                 rc = ll_inode_revalidate_fini(inode, rc);
4625                 GOTO(out, rc);
4626         }
4627
4628         rc = ll_revalidate_it_finish(req, &oit, dentry);
4629         if (rc != 0) {
4630                 ll_intent_release(&oit);
4631                 GOTO(out, rc);
4632         }
4633
4634         /* Unlinked? Unhash dentry, so it is not picked up later by
4635          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4636          * here to preserve get_cwd functionality on 2.6.
4637          * Bug 10503 */
4638         if (!dentry->d_inode->i_nlink) {
4639                 ll_lock_dcache(inode);
4640                 d_lustre_invalidate(dentry, 0);
4641                 ll_unlock_dcache(inode);
4642         }
4643
4644         ll_lookup_finish_locks(&oit, dentry);
4645 out:
4646         ptlrpc_req_finished(req);
4647
4648         return rc;
4649 }
4650
4651 static int ll_merge_md_attr(struct inode *inode)
4652 {
4653         struct ll_inode_info *lli = ll_i2info(inode);
4654         struct cl_attr attr = { 0 };
4655         int rc;
4656
4657         LASSERT(lli->lli_lsm_md != NULL);
4658
4659         if (!lmv_dir_striped(lli->lli_lsm_md))
4660                 RETURN(0);
4661
4662         down_read(&lli->lli_lsm_sem);
4663         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4664                            &attr, ll_md_blocking_ast);
4665         up_read(&lli->lli_lsm_sem);
4666         if (rc != 0)
4667                 RETURN(rc);
4668
4669         set_nlink(inode, attr.cat_nlink);
4670         inode->i_blocks = attr.cat_blocks;
4671         i_size_write(inode, attr.cat_size);
4672
4673         ll_i2info(inode)->lli_atime = attr.cat_atime;
4674         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4675         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4676
4677         RETURN(0);
4678 }
4679
4680 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4681 {
4682         struct inode *inode = de->d_inode;
4683         struct ll_sb_info *sbi = ll_i2sbi(inode);
4684         struct ll_inode_info *lli = ll_i2info(inode);
4685         int rc;
4686
4687         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4688
4689         rc = ll_inode_revalidate(de, IT_GETATTR);
4690         if (rc < 0)
4691                 RETURN(rc);
4692
4693         if (S_ISREG(inode->i_mode)) {
4694                 bool cached;
4695
4696                 rc = pcc_inode_getattr(inode, &cached);
4697                 if (cached && rc < 0)
4698                         RETURN(rc);
4699
4700                 /* In case of restore, the MDT has the right size and has
4701                  * already send it back without granting the layout lock,
4702                  * inode is up-to-date so glimpse is useless.
4703                  * Also to glimpse we need the layout, in case of a running
4704                  * restore the MDT holds the layout lock so the glimpse will
4705                  * block up to the end of restore (getattr will block)
4706                  */
4707                 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4708                         rc = ll_glimpse_size(inode);
4709                         if (rc < 0)
4710                                 RETURN(rc);
4711                 }
4712         } else {
4713                 /* If object isn't regular a file then don't validate size. */
4714                 if (ll_dir_striped(inode)) {
4715                         rc = ll_merge_md_attr(inode);
4716                         if (rc < 0)
4717                                 RETURN(rc);
4718                 }
4719
4720                 inode->i_atime.tv_sec = lli->lli_atime;
4721                 inode->i_mtime.tv_sec = lli->lli_mtime;
4722                 inode->i_ctime.tv_sec = lli->lli_ctime;
4723         }
4724
4725         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4726
4727         if (ll_need_32bit_api(sbi)) {
4728                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4729                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4730                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4731         } else {
4732                 stat->ino = inode->i_ino;
4733                 stat->dev = inode->i_sb->s_dev;
4734                 stat->rdev = inode->i_rdev;
4735         }
4736
4737         stat->mode = inode->i_mode;
4738         stat->uid = inode->i_uid;
4739         stat->gid = inode->i_gid;
4740         stat->atime = inode->i_atime;
4741         stat->mtime = inode->i_mtime;
4742         stat->ctime = inode->i_ctime;
4743         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4744
4745         stat->nlink = inode->i_nlink;
4746         stat->size = i_size_read(inode);
4747         stat->blocks = inode->i_blocks;
4748
4749         return 0;
4750 }
4751
#ifdef HAVE_INODEOPS_ENHANCED_GETATTR
/*
 * getattr inode operation, path-based signature: forwards the path's dentry
 * to ll_getattr_dentry(). request_mask and flags are not used.
 */
int ll_getattr(const struct path *path, struct kstat *stat,
               u32 request_mask, unsigned int flags)
{
        return ll_getattr_dentry(path->dentry, stat);
}
#else
/*
 * getattr inode operation, vfsmount/dentry signature: forwards the dentry
 * to ll_getattr_dentry(). mnt is not used.
 */
int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
{
        return ll_getattr_dentry(de, stat);
}
#endif
4763
4764 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4765                      __u64 start, __u64 len)
4766 {
4767         int             rc;
4768         size_t          num_bytes;
4769         struct fiemap   *fiemap;
4770         unsigned int    extent_count = fieinfo->fi_extents_max;
4771
4772         num_bytes = sizeof(*fiemap) + (extent_count *
4773                                        sizeof(struct fiemap_extent));
4774         OBD_ALLOC_LARGE(fiemap, num_bytes);
4775
4776         if (fiemap == NULL)
4777                 RETURN(-ENOMEM);
4778
4779         fiemap->fm_flags = fieinfo->fi_flags;
4780         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4781         fiemap->fm_start = start;
4782         fiemap->fm_length = len;
4783         if (extent_count > 0 &&
4784             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4785                            sizeof(struct fiemap_extent)) != 0)
4786                 GOTO(out, rc = -EFAULT);
4787
4788         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4789
4790         fieinfo->fi_flags = fiemap->fm_flags;
4791         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4792         if (extent_count > 0 &&
4793             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4794                          fiemap->fm_mapped_extents *
4795                          sizeof(struct fiemap_extent)) != 0)
4796                 GOTO(out, rc = -EFAULT);
4797 out:
4798         OBD_FREE_LARGE(fiemap, num_bytes);
4799         return rc;
4800 }
4801
4802 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4803 {
4804         struct ll_inode_info *lli = ll_i2info(inode);
4805         struct posix_acl *acl = NULL;
4806         ENTRY;
4807
4808         spin_lock(&lli->lli_lock);
4809         /* VFS' acl_permission_check->check_acl will release the refcount */
4810         acl = posix_acl_dup(lli->lli_posix_acl);
4811         spin_unlock(&lli->lli_lock);
4812
4813         RETURN(acl);
4814 }
4815
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * Inode ->set_acl handler: store (or remove, when \a acl is NULL) a POSIX
 * ACL by sending the matching xattr to the MDS, then update the VFS ACL
 * cache accordingly.
 *
 * \param inode [in]    inode to modify
 * \param acl [in]      new ACL, or NULL to remove the existing one
 * \param type [in]     ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0            on success (including the no-op of clearing a default
 *                      ACL on a non-directory)
 * \retval -EACCES      a default ACL was given for a non-directory
 * \retval -EINVAL      unknown ACL type
 * \retval -ENOMEM      the xattr buffer could not be allocated
 * \retval < 0          other error from posix_acl_update_mode(),
 *                      posix_acl_to_xattr() or md_setxattr()
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req = NULL;
        const char *name = NULL;
        char *value = NULL;
        size_t value_size = 0;
        int rc = 0;
        ENTRY;

        switch (type) {
        case ACL_TYPE_ACCESS:
                name = XATTR_NAME_POSIX_ACL_ACCESS;
                /* may update i_mode and may reset acl to NULL */
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
                break;

        case ACL_TYPE_DEFAULT:
                name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* Only directories carry a default ACL: setting one is
                 * refused, clearing one is a no-op that needs no RPC. */
                if (!S_ISDIR(inode->i_mode))
                        RETURN(acl ? -EACCES : 0);
                break;

        default:
                rc = -EINVAL;
                break;
        }
        if (rc)
                RETURN(rc);

        if (acl) {
                value_size = posix_acl_xattr_size(acl->a_count);
                value = kmalloc(value_size, GFP_NOFS);
                if (value == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* no value means the xattr is removed rather than set */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         name, value, value_size, 0, 0, &req);

        ptlrpc_req_finished(req);
out_value:
        kfree(value);
out:
        /* keep the VFS ACL cache in step with the outcome */
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4875
4876 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4877 static int
4878 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4879 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4880 # else
4881 ll_check_acl(struct inode *inode, int mask)
4882 # endif
4883 {
4884 # ifdef CONFIG_FS_POSIX_ACL
4885         struct posix_acl *acl;
4886         int rc;
4887         ENTRY;
4888
4889 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4890         if (flags & IPERM_FLAG_RCU)
4891                 return -ECHILD;
4892 #  endif
4893         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4894
4895         if (!acl)
4896                 RETURN(-EAGAIN);
4897
4898         rc = posix_acl_permission(inode, acl, mask);
4899         posix_acl_release(acl);
4900
4901         RETURN(rc);
4902 # else /* !CONFIG_FS_POSIX_ACL */
4903         return -EAGAIN;
4904 # endif /* CONFIG_FS_POSIX_ACL */
4905 }
4906 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4907
4908 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4909 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4910 #else
4911 # ifdef HAVE_INODE_PERMISION_2ARGS
4912 int ll_inode_permission(struct inode *inode, int mask)
4913 # else
4914 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4915 # endif
4916 #endif
4917 {
4918         int rc = 0;
4919         struct ll_sb_info *sbi;
4920         struct root_squash_info *squash;
4921         struct cred *cred = NULL;
4922         const struct cred *old_cred = NULL;
4923         cfs_cap_t cap;
4924         bool squash_id = false;
4925         ENTRY;
4926
4927 #ifdef MAY_NOT_BLOCK
4928         if (mask & MAY_NOT_BLOCK)
4929                 return -ECHILD;
4930 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4931         if (flags & IPERM_FLAG_RCU)
4932                 return -ECHILD;
4933 #endif
4934
4935        /* as root inode are NOT getting validated in lookup operation,
4936         * need to do it before permission check. */
4937
4938         if (inode == inode->i_sb->s_root->d_inode) {
4939                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4940                 if (rc)
4941                         RETURN(rc);
4942         }
4943
4944         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4945                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4946
4947         /* squash fsuid/fsgid if needed */
4948         sbi = ll_i2sbi(inode);
4949         squash = &sbi->ll_squash;
4950         if (unlikely(squash->rsi_uid != 0 &&
4951                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4952                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4953                         squash_id = true;
4954         }
4955         if (squash_id) {
4956                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4957                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4958                        squash->rsi_uid, squash->rsi_gid);
4959
4960                 /* update current process's credentials
4961                  * and FS capability */
4962                 cred = prepare_creds();
4963                 if (cred == NULL)
4964                         RETURN(-ENOMEM);
4965
4966                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4967                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4968                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4969                         if ((1 << cap) & CFS_CAP_FS_MASK)
4970                                 cap_lower(cred->cap_effective, cap);
4971                 }
4972                 old_cred = override_creds(cred);
4973         }
4974
4975         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4976         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4977         /* restore current process's credentials and FS capability */
4978         if (squash_id) {
4979                 revert_creds(old_cred);
4980                 put_cred(cred);
4981         }
4982
4983         RETURN(rc);
4984 }
4985
4986 /* -o localflock - only provides locally consistent flock locks */
4987 struct file_operations ll_file_operations = {
4988 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4989 # ifdef HAVE_SYNC_READ_WRITE
4990         .read           = new_sync_read,
4991         .write          = new_sync_write,
4992 # endif
4993         .read_iter      = ll_file_read_iter,
4994         .write_iter     = ll_file_write_iter,
4995 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4996         .read           = ll_file_read,
4997         .aio_read       = ll_file_aio_read,
4998         .write          = ll_file_write,
4999         .aio_write      = ll_file_aio_write,
5000 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5001         .unlocked_ioctl = ll_file_ioctl,
5002         .open           = ll_file_open,
5003         .release        = ll_file_release,
5004         .mmap           = ll_file_mmap,
5005         .llseek         = ll_file_seek,
5006         .splice_read    = ll_file_splice_read,
5007         .fsync          = ll_fsync,
5008         .flush          = ll_flush
5009 };
5010
5011 struct file_operations ll_file_operations_flock = {
5012 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5013 # ifdef HAVE_SYNC_READ_WRITE
5014         .read           = new_sync_read,
5015         .write          = new_sync_write,
5016 # endif /* HAVE_SYNC_READ_WRITE */
5017         .read_iter      = ll_file_read_iter,
5018         .write_iter     = ll_file_write_iter,
5019 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5020         .read           = ll_file_read,
5021         .aio_read       = ll_file_aio_read,
5022         .write          = ll_file_write,
5023         .aio_write      = ll_file_aio_write,
5024 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5025         .unlocked_ioctl = ll_file_ioctl,
5026         .open           = ll_file_open,
5027         .release        = ll_file_release,
5028         .mmap           = ll_file_mmap,
5029         .llseek         = ll_file_seek,
5030         .splice_read    = ll_file_splice_read,
5031         .fsync          = ll_fsync,
5032         .flush          = ll_flush,
5033         .flock          = ll_file_flock,
5034         .lock           = ll_file_flock
5035 };
5036
5037 /* These are for -o noflock - to return ENOSYS on flock calls */
5038 struct file_operations ll_file_operations_noflock = {
5039 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5040 # ifdef HAVE_SYNC_READ_WRITE
5041         .read           = new_sync_read,
5042         .write          = new_sync_write,
5043 # endif /* HAVE_SYNC_READ_WRITE */
5044         .read_iter      = ll_file_read_iter,
5045         .write_iter     = ll_file_write_iter,
5046 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5047         .read           = ll_file_read,
5048         .aio_read       = ll_file_aio_read,
5049         .write          = ll_file_write,
5050         .aio_write      = ll_file_aio_write,
5051 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5052         .unlocked_ioctl = ll_file_ioctl,
5053         .open           = ll_file_open,
5054         .release        = ll_file_release,
5055         .mmap           = ll_file_mmap,
5056         .llseek         = ll_file_seek,
5057         .splice_read    = ll_file_splice_read,
5058         .fsync          = ll_fsync,
5059         .flush          = ll_flush,
5060         .flock          = ll_file_noflock,
5061         .lock           = ll_file_noflock
5062 };
5063
5064 struct inode_operations ll_file_inode_operations = {
5065         .setattr        = ll_setattr,
5066         .getattr        = ll_getattr,
5067         .permission     = ll_inode_permission,
5068 #ifdef HAVE_IOP_XATTR
5069         .setxattr       = ll_setxattr,
5070         .getxattr       = ll_getxattr,
5071         .removexattr    = ll_removexattr,
5072 #endif
5073         .listxattr      = ll_listxattr,
5074         .fiemap         = ll_fiemap,
5075 #ifdef HAVE_IOP_GET_ACL
5076         .get_acl        = ll_get_acl,
5077 #endif
5078 #ifdef HAVE_IOP_SET_ACL
5079         .set_acl        = ll_set_acl,
5080 #endif
5081 };
5082
5083 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5084 {
5085         struct ll_inode_info *lli = ll_i2info(inode);
5086         struct cl_object *obj = lli->lli_clob;
5087         struct lu_env *env;
5088         int rc;
5089         __u16 refcheck;
5090         ENTRY;
5091
5092         if (obj == NULL)
5093                 RETURN(0);
5094
5095         env = cl_env_get(&refcheck);
5096         if (IS_ERR(env))
5097                 RETURN(PTR_ERR(env));
5098
5099         rc = cl_conf_set(env, lli->lli_clob, conf);
5100         if (rc < 0)
5101                 GOTO(out, rc);
5102
5103         if (conf->coc_opc == OBJECT_CONF_SET) {
5104                 struct ldlm_lock *lock = conf->coc_lock;
5105                 struct cl_layout cl = {
5106                         .cl_layout_gen = 0,
5107                 };
5108
5109                 LASSERT(lock != NULL);
5110                 LASSERT(ldlm_has_layout(lock));
5111
5112                 /* it can only be allowed to match after layout is
5113                  * applied to inode otherwise false layout would be
5114                  * seen. Applying layout shoud happen before dropping
5115                  * the intent lock. */
5116                 ldlm_lock_allow_match(lock);
5117
5118                 rc = cl_object_layout_get(env, obj, &cl);
5119                 if (rc < 0)
5120                         GOTO(out, rc);
5121
5122                 CDEBUG(D_VFSTRACE,
5123                        DFID": layout version change: %u -> %u\n",
5124                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
5125                        cl.cl_layout_gen);
5126                 ll_layout_version_set(lli, cl.cl_layout_gen);
5127         }
5128
5129 out:
5130         cl_env_put(env, &refcheck);
5131
5132         RETURN(rc);
5133 }
5134
5135 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5136 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5137
5138 {
5139         struct ll_sb_info *sbi = ll_i2sbi(inode);
5140         struct ptlrpc_request *req;
5141         void *lvbdata;
5142         void *lmm;
5143         int lmmsize;
5144         int rc;
5145         ENTRY;
5146
5147         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5148                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5149                lock->l_lvb_data, lock->l_lvb_len);
5150
5151         if (lock->l_lvb_data != NULL)
5152                 RETURN(0);
5153
5154         /* if layout lock was granted right away, the layout is returned
5155          * within DLM_LVB of dlm reply; otherwise if the lock was ever
5156          * blocked and then granted via completion ast, we have to fetch
5157          * layout here. Please note that we can't use the LVB buffer in
5158          * completion AST because it doesn't have a large enough buffer */
5159         rc = ll_get_default_mdsize(sbi, &lmmsize);
5160         if (rc < 0)
5161                 RETURN(rc);
5162
5163         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5164                          XATTR_NAME_LOV, lmmsize, &req);
5165         if (rc < 0) {
5166                 if (rc == -ENODATA)
5167                         GOTO(out, rc = 0); /* empty layout */
5168                 else
5169                         RETURN(rc);
5170         }
5171
5172         lmmsize = rc;
5173         rc = 0;
5174         if (lmmsize == 0) /* empty layout */
5175                 GOTO(out, rc = 0);
5176
5177         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5178         if (lmm == NULL)
5179                 GOTO(out, rc = -EFAULT);
5180
5181         OBD_ALLOC_LARGE(lvbdata, lmmsize);
5182         if (lvbdata == NULL)
5183                 GOTO(out, rc = -ENOMEM);
5184
5185         memcpy(lvbdata, lmm, lmmsize);
5186         lock_res_and_lock(lock);
5187         if (unlikely(lock->l_lvb_data == NULL)) {
5188                 lock->l_lvb_type = LVB_T_LAYOUT;
5189                 lock->l_lvb_data = lvbdata;
5190                 lock->l_lvb_len = lmmsize;
5191                 lvbdata = NULL;
5192         }
5193         unlock_res_and_lock(lock);
5194
5195         if (lvbdata)
5196                 OBD_FREE_LARGE(lvbdata, lmmsize);
5197
5198         EXIT;
5199
5200 out:
5201         ptlrpc_req_finished(req);
5202         return rc;
5203 }
5204
5205 /**
5206  * Apply the layout to the inode. Layout lock is held and will be released
5207  * in this function.
5208  */
5209 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5210                               struct inode *inode)
5211 {
5212         struct ll_inode_info *lli = ll_i2info(inode);
5213         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5214         struct ldlm_lock *lock;
5215         struct cl_object_conf conf;
5216         int rc = 0;
5217         bool lvb_ready;
5218         bool wait_layout = false;
5219         ENTRY;
5220
5221         LASSERT(lustre_handle_is_used(lockh));
5222
5223         lock = ldlm_handle2lock(lockh);
5224         LASSERT(lock != NULL);
5225         LASSERT(ldlm_has_layout(lock));
5226
5227         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5228                    PFID(&lli->lli_fid), inode);
5229
5230         /* in case this is a caching lock and reinstate with new inode */
5231         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5232
5233         lock_res_and_lock(lock);
5234         lvb_ready = ldlm_is_lvb_ready(lock);
5235         unlock_res_and_lock(lock);
5236
5237         /* checking lvb_ready is racy but this is okay. The worst case is
5238          * that multi processes may configure the file on the same time. */
5239         if (lvb_ready)
5240                 GOTO(out, rc = 0);
5241
5242         rc = ll_layout_fetch(inode, lock);
5243         if (rc < 0)
5244                 GOTO(out, rc);
5245
5246         /* for layout lock, lmm is stored in lock's lvb.
5247          * lvb_data is immutable if the lock is held so it's safe to access it
5248          * without res lock.
5249          *
5250          * set layout to file. Unlikely this will fail as old layout was
5251          * surely eliminated */
5252         memset(&conf, 0, sizeof conf);
5253         conf.coc_opc = OBJECT_CONF_SET;
5254         conf.coc_inode = inode;
5255         conf.coc_lock = lock;
5256         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5257         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5258         rc = ll_layout_conf(inode, &conf);
5259
5260         /* refresh layout failed, need to wait */
5261         wait_layout = rc == -EBUSY;
5262         EXIT;
5263 out:
5264         LDLM_LOCK_PUT(lock);
5265         ldlm_lock_decref(lockh, mode);
5266
5267         /* wait for IO to complete if it's still being used. */
5268         if (wait_layout) {
5269                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5270                        sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5271
5272                 memset(&conf, 0, sizeof conf);
5273                 conf.coc_opc = OBJECT_CONF_WAIT;
5274                 conf.coc_inode = inode;
5275                 rc = ll_layout_conf(inode, &conf);
5276                 if (rc == 0)
5277                         rc = -EAGAIN;
5278
5279                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5280                        sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5281         }
5282         RETURN(rc);
5283 }
5284
5285 /**
5286  * Issue layout intent RPC to MDS.
5287  * \param inode [in]    file inode
5288  * \param intent [in]   layout intent
5289  *
5290  * \retval 0    on success
5291  * \retval < 0  error code
5292  */
5293 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5294 {
5295         struct ll_inode_info  *lli = ll_i2info(inode);
5296         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5297         struct md_op_data     *op_data;
5298         struct lookup_intent it;
5299         struct ptlrpc_request *req;
5300         int rc;
5301         ENTRY;
5302
5303         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5304                                      0, 0, LUSTRE_OPC_ANY, NULL);
5305         if (IS_ERR(op_data))
5306                 RETURN(PTR_ERR(op_data));
5307
5308         op_data->op_data = intent;
5309         op_data->op_data_size = sizeof(*intent);
5310
5311         memset(&it, 0, sizeof(it));
5312         it.it_op = IT_LAYOUT;
5313         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5314             intent->li_opc == LAYOUT_INTENT_TRUNC)
5315                 it.it_flags = FMODE_WRITE;
5316
5317         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5318                           sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5319
5320         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5321                             &ll_md_blocking_ast, 0);
5322         if (it.it_request != NULL)
5323                 ptlrpc_req_finished(it.it_request);
5324         it.it_request = NULL;
5325
5326         ll_finish_md_op_data(op_data);
5327
5328         /* set lock data in case this is a new lock */
5329         if (!rc)
5330                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5331
5332         ll_intent_drop_lock(&it);
5333
5334         RETURN(rc);
5335 }
5336
5337 /**
5338  * This function checks if there exists a LAYOUT lock on the client side,
5339  * or enqueues it if it doesn't have one in cache.
5340  *
5341  * This function will not hold layout lock so it may be revoked any time after
5342  * this function returns. Any operations depend on layout should be redone
5343  * in that case.
5344  *
5345  * This function should be called before lov_io_init() to get an uptodate
5346  * layout version, the caller should save the version number and after IO
5347  * is finished, this function should be called again to verify that layout
5348  * is not changed during IO time.
5349  */
5350 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5351 {
5352         struct ll_inode_info    *lli = ll_i2info(inode);
5353         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5354         struct lustre_handle lockh;
5355         struct layout_intent intent = {
5356                 .li_opc = LAYOUT_INTENT_ACCESS,
5357         };
5358         enum ldlm_mode mode;
5359         int rc;
5360         ENTRY;
5361
5362         *gen = ll_layout_version_get(lli);
5363         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5364                 RETURN(0);
5365
5366         /* sanity checks */
5367         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5368         LASSERT(S_ISREG(inode->i_mode));
5369
5370         /* take layout lock mutex to enqueue layout lock exclusively. */
5371         mutex_lock(&lli->lli_layout_mutex);
5372
5373         while (1) {
5374                 /* mostly layout lock is caching on the local side, so try to
5375                  * match it before grabbing layout lock mutex. */
5376                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5377                                        LCK_CR | LCK_CW | LCK_PR |
5378                                        LCK_PW | LCK_EX);
5379                 if (mode != 0) { /* hit cached lock */
5380                         rc = ll_layout_lock_set(&lockh, mode, inode);
5381                         if (rc == -EAGAIN)
5382                                 continue;
5383                         break;
5384                 }
5385
5386                 rc = ll_layout_intent(inode, &intent);
5387                 if (rc != 0)
5388                         break;
5389         }
5390
5391         if (rc == 0)
5392                 *gen = ll_layout_version_get(lli);
5393         mutex_unlock(&lli->lli_layout_mutex);
5394
5395         RETURN(rc);
5396 }
5397
5398 /**
5399  * Issue layout intent RPC indicating where in a file an IO is about to write.
5400  *
5401  * \param[in] inode     file inode.
5402  * \param[in] ext       write range with start offset of fille in bytes where
5403  *                      an IO is about to write, and exclusive end offset in
5404  *                      bytes.
5405  *
5406  * \retval 0    on success
5407  * \retval < 0  error code
5408  */
5409 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5410                            struct lu_extent *ext)
5411 {
5412         struct layout_intent intent = {
5413                 .li_opc = opc,
5414                 .li_extent.e_start = ext->e_start,
5415                 .li_extent.e_end = ext->e_end,
5416         };
5417         int rc;
5418         ENTRY;
5419
5420         rc = ll_layout_intent(inode, &intent);
5421
5422         RETURN(rc);
5423 }
5424
5425 /**
5426  *  This function send a restore request to the MDT
5427  */
5428 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5429 {
5430         struct hsm_user_request *hur;
5431         int                      len, rc;
5432         ENTRY;
5433
5434         len = sizeof(struct hsm_user_request) +
5435               sizeof(struct hsm_user_item);
5436         OBD_ALLOC(hur, len);
5437         if (hur == NULL)
5438                 RETURN(-ENOMEM);
5439
5440         hur->hur_request.hr_action = HUA_RESTORE;
5441         hur->hur_request.hr_archive_id = 0;
5442         hur->hur_request.hr_flags = 0;
5443         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5444                sizeof(hur->hur_user_item[0].hui_fid));
5445         hur->hur_user_item[0].hui_extent.offset = offset;
5446         hur->hur_user_item[0].hui_extent.length = length;
5447         hur->hur_request.hr_itemcount = 1;
5448         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5449                            len, hur, NULL);
5450         OBD_FREE(hur, len);
5451         RETURN(rc);
5452 }