lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
/*
 * Argument bundle handed to ll_close_inode_openhandle() through its opaque
 * "data" pointer when the close bias is MDS_CLOSE_LAYOUT_SPLIT.
 */
struct split_param {
        /* inode whose FID is packed into op_fid2 of the close request */
        struct inode    *sp_inode;
        /* mirror id copied into op_mirror_id of the close request */
        __u16           sp_mirror_id;
};
  60
/*
 * Forward declaration; called from ll_md_close() to drop a group lock that
 * is still held on the file being closed.
 */
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

/*
 * Forward declaration; called from ll_md_close() to release a lease handle
 * that is still attached to the file descriptor.  The callee reports through
 * \a lease_broken (ll_md_close() logs that value after the call).
 */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_get_fsname(inode->i_sb, NULL, 0),
 143                        PFID(&lli->lli_fid));
 144                 GOTO(out, rc = 0);
 145         }
 146
 147         OBD_ALLOC_PTR(op_data);
 148         /* We leak openhandle and request here on error, but not much to be
 149          * done in OOM case since app won't retry close on error either. */
 150         if (op_data == NULL)
 151                 GOTO(out, rc = -ENOMEM);
 152
 153         ll_prepare_close(inode, op_data, och);
 154         switch (bias) {
 155         case MDS_CLOSE_LAYOUT_MERGE:
 156                 /* merge blocks from the victim inode */
 157                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 158                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 159                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 160         case MDS_CLOSE_LAYOUT_SPLIT:
 161         case MDS_CLOSE_LAYOUT_SWAP: {
 162                 struct split_param *sp = data;
 163
 164                 LASSERT(data != NULL);
 165                 op_data->op_bias |= bias;
 166                 op_data->op_data_version = 0;
 167                 op_data->op_lease_handle = och->och_lease_handle;
 168                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 169                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 170                         op_data->op_mirror_id = sp->sp_mirror_id;
 171                 } else {
 172                         op_data->op_fid2 = *ll_inode2fid(data);
 173                 }
 174                 break;
 175         }
 176
 177         case MDS_CLOSE_RESYNC_DONE: {
 178                 struct ll_ioc_lease *ioc = data;
 179
 180                 LASSERT(data != NULL);
 181                 op_data->op_attr_blocks +=
 182                         ioc->lil_count * op_data->op_attr_blocks;
 183                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 184                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 185                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 186
 187                 op_data->op_lease_handle = och->och_lease_handle;
 188                 op_data->op_data = &ioc->lil_ids[0];
 189                 op_data->op_data_size =
 190                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 191                 break;
 192         }
 193
 194         case MDS_HSM_RELEASE:
 195                 LASSERT(data != NULL);
 196                 op_data->op_bias |= MDS_HSM_RELEASE;
 197                 op_data->op_data_version = *(__u64 *)data;
 198                 op_data->op_lease_handle = och->och_lease_handle;
 199                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 200                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 201                 break;
 202
 203         default:
 204                 LASSERT(data == NULL);
 205                 break;
 206         }
 207
 208         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 209                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 210         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 211                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 212
 213         rc = md_close(md_exp, op_data, och->och_mod, &req);
 214         if (rc != 0 && rc != -EINTR)
 215                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 216                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 217
 218         if (rc == 0 && op_data->op_bias & bias) {
 219                 struct mdt_body *body;
 220
 221                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 222                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 223                         rc = -EBUSY;
 224         }
 225
 226         ll_finish_md_op_data(op_data);
 227         EXIT;
 228 out:
 229
 230         md_clear_open_replay_data(md_exp, och);
 231         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 232         OBD_FREE_PTR(och);
 233
 234         ptlrpc_req_finished(req);       /* This is close request */
 235         return rc;
 236 }
 237
 238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 239 {
 240         struct ll_inode_info *lli = ll_i2info(inode);
 241         struct obd_client_handle **och_p;
 242         struct obd_client_handle *och;
 243         __u64 *och_usecount;
 244         int rc = 0;
 245         ENTRY;
 246
 247         if (fmode & FMODE_WRITE) {
 248                 och_p = &lli->lli_mds_write_och;
 249                 och_usecount = &lli->lli_open_fd_write_count;
 250         } else if (fmode & FMODE_EXEC) {
 251                 och_p = &lli->lli_mds_exec_och;
 252                 och_usecount = &lli->lli_open_fd_exec_count;
 253         } else {
 254                 LASSERT(fmode & FMODE_READ);
 255                 och_p = &lli->lli_mds_read_och;
 256                 och_usecount = &lli->lli_open_fd_read_count;
 257         }
 258
 259         mutex_lock(&lli->lli_och_mutex);
 260         if (*och_usecount > 0) {
 261                 /* There are still users of this handle, so skip
 262                  * freeing it. */
 263                 mutex_unlock(&lli->lli_och_mutex);
 264                 RETURN(0);
 265         }
 266
 267         och = *och_p;
 268         *och_p = NULL;
 269         mutex_unlock(&lli->lli_och_mutex);
 270
 271         if (och != NULL) {
 272                 /* There might be a race and this handle may already
 273                  * be closed. */
 274                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 275         }
 276
 277         RETURN(rc);
 278 }
 279
 280 static int ll_md_close(struct inode *inode, struct file *file)
 281 {
 282         union ldlm_policy_data policy = {
 283                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 284         };
 285         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 286         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 287         struct ll_inode_info *lli = ll_i2info(inode);
 288         struct lustre_handle lockh;
 289         enum ldlm_mode lockmode;
 290         int rc = 0;
 291         ENTRY;
 292
 293         /* clear group lock, if present */
 294         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 295                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 296
 297         if (fd->fd_lease_och != NULL) {
 298                 bool lease_broken;
 299
 300                 /* Usually the lease is not released when the
 301                  * application crashed, we need to release here. */
 302                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 303                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 304                         PFID(&lli->lli_fid), rc, lease_broken);
 305
 306                 fd->fd_lease_och = NULL;
 307         }
 308
 309         if (fd->fd_och != NULL) {
 310                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 311                 fd->fd_och = NULL;
 312                 GOTO(out, rc);
 313         }
 314
 315         /* Let's see if we have good enough OPEN lock on the file and if
 316            we can skip talking to MDS */
 317         mutex_lock(&lli->lli_och_mutex);
 318         if (fd->fd_omode & FMODE_WRITE) {
 319                 lockmode = LCK_CW;
 320                 LASSERT(lli->lli_open_fd_write_count);
 321                 lli->lli_open_fd_write_count--;
 322         } else if (fd->fd_omode & FMODE_EXEC) {
 323                 lockmode = LCK_PR;
 324                 LASSERT(lli->lli_open_fd_exec_count);
 325                 lli->lli_open_fd_exec_count--;
 326         } else {
 327                 lockmode = LCK_CR;
 328                 LASSERT(lli->lli_open_fd_read_count);
 329                 lli->lli_open_fd_read_count--;
 330         }
 331         mutex_unlock(&lli->lli_och_mutex);
 332
 333         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 334                            LDLM_IBITS, &policy, lockmode, &lockh))
 335                 rc = ll_md_real_close(inode, fd->fd_omode);
 336
 337 out:
 338         LUSTRE_FPRIVATE(file) = NULL;
 339         ll_file_data_put(fd);
 340
 341         RETURN(rc);
 342 }
 343
 344 /* While this returns an error code, fput() the caller does not, so we need
 345  * to make every effort to clean up all of our state here.  Also, applications
 346  * rarely check close errors and even if an error is returned they will not
 347  * re-try the close call.
 348  */
 349 int ll_file_release(struct inode *inode, struct file *file)
 350 {
 351         struct ll_file_data *fd;
 352         struct ll_sb_info *sbi = ll_i2sbi(inode);
 353         struct ll_inode_info *lli = ll_i2info(inode);
 354         int rc;
 355         ENTRY;
 356
 357         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 358                PFID(ll_inode2fid(inode)), inode);
 359
 360         if (inode->i_sb->s_root != file_dentry(file))
 361                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 362         fd = LUSTRE_FPRIVATE(file);
 363         LASSERT(fd != NULL);
 364
 365         /* The last ref on @file, maybe not the the owner pid of statahead,
 366          * because parent and child process can share the same file handle. */
 367         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 368                 ll_deauthorize_statahead(inode, fd);
 369
 370         if (inode->i_sb->s_root == file_dentry(file)) {
 371                 LUSTRE_FPRIVATE(file) = NULL;
 372                 ll_file_data_put(fd);
 373                 RETURN(0);
 374         }
 375
 376         if (!S_ISDIR(inode->i_mode)) {
 377                 if (lli->lli_clob != NULL)
 378                         lov_read_and_clear_async_rc(lli->lli_clob);
 379                 lli->lli_async_rc = 0;
 380         }
 381
 382         rc = ll_md_close(inode, file);
 383
 384         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 385                 libcfs_debug_dumplog();
 386
 387         RETURN(rc);
 388 }
 389
 390 static inline int ll_dom_readpage(void *data, struct page *page)
 391 {
 392         struct niobuf_local *lnb = data;
 393         void *kaddr;
 394
 395         kaddr = ll_kmap_atomic(page, KM_USER0);
 396         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 397         if (lnb->lnb_len < PAGE_SIZE)
 398                 memset(kaddr + lnb->lnb_len, 0,
 399                        PAGE_SIZE - lnb->lnb_len);
 400         flush_dcache_page(page);
 401         SetPageUptodate(page);
 402         ll_kunmap_atomic(kaddr, KM_USER0);
 403         unlock_page(page);
 404
 405         return 0;
 406 }
 407
 408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 409                         struct lookup_intent *it)
 410 {
 411         struct ll_inode_info *lli = ll_i2info(inode);
 412         struct cl_object *obj = lli->lli_clob;
 413         struct address_space *mapping = inode->i_mapping;
 414         struct page *vmpage;
 415         struct niobuf_remote *rnb;
 416         char *data;
 417         struct lustre_handle lockh;
 418         struct ldlm_lock *lock;
 419         unsigned long index, start;
 420         struct niobuf_local lnb;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
 431                 if (lock != NULL)
 432                         dom_lock = ldlm_has_dom(lock);
 433                 LDLM_LOCK_PUT(lock);
 434         }
 435         if (!dom_lock)
 436                 RETURN_EXIT;
 437
 438         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 439                                    RCL_SERVER))
 440                 RETURN_EXIT;
 441
 442         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 443         if (rnb == NULL || rnb->rnb_len == 0)
 444                 RETURN_EXIT;
 445
 446         /* LU-11595: Server may return whole file and that is OK always or
 447          * it may return just file tail and its offset must be aligned with
 448          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 449          * smaller then offset may be not aligned and that data is just ignored.
 450          */
 451         if (rnb->rnb_offset % PAGE_SIZE)
 452                 RETURN_EXIT;
 453
 454         /* Server returns whole file or just file tail if it fills in
 455          * reply buffer, in both cases total size should be inode size.
 456          */
 457         if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
 458                 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
 459                        ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
 460                        rnb->rnb_len, i_size_read(inode));
 461                 RETURN_EXIT;
 462         }
 463
 464         CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
 465                rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
 466
 467         data = (char *)rnb + sizeof(*rnb);
 468
 469         lnb.lnb_file_offset = rnb->rnb_offset;
 470         start = lnb.lnb_file_offset / PAGE_SIZE;
 471         index = 0;
 472         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 473         lnb.lnb_page_offset = 0;
 474         do {
 475                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 476                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 477                 if (lnb.lnb_len > PAGE_SIZE)
 478                         lnb.lnb_len = PAGE_SIZE;
 479
 480                 vmpage = read_cache_page(mapping, index + start,
 481                                          ll_dom_readpage, &lnb);
 482                 if (IS_ERR(vmpage)) {
 483                         CWARN("%s: cannot fill page %lu for "DFID
 484                               " with data: rc = %li\n",
 485                               ll_get_fsname(inode->i_sb, NULL, 0),
 486                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 487                               PTR_ERR(vmpage));
 488                         break;
 489                 }
 490                 put_page(vmpage);
 491                 index++;
 492         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 493         EXIT;
 494 }
 495
 496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 497                                 struct lookup_intent *itp)
 498 {
 499         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 500         struct dentry *parent = de->d_parent;
 501         char *name = NULL;
 502         int len = 0;
 503         struct md_op_data *op_data;
 504         struct ptlrpc_request *req = NULL;
 505         int rc;
 506         ENTRY;
 507
 508         LASSERT(parent != NULL);
 509         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 510
 511         /* if server supports open-by-fid, or file name is invalid, don't pack
 512          * name in open request */
 513         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
 514 retry:
 515                 len = de->d_name.len;
 516                 name = kmalloc(len, GFP_NOFS);
 517                 if (!name)
 518                         RETURN(-ENOMEM);
 519                 /* race here */
 520                 spin_lock(&de->d_lock);
 521                 if (len != de->d_name.len) {
 522                         spin_unlock(&de->d_lock);
 523                         kfree(name);
 524                         goto retry;
 525                 }
 526                 memcpy(name, de->d_name.name, len);
 527                 spin_unlock(&de->d_lock);
 528
 529                 if (!lu_name_is_valid_2(name, len)) {
 530                         kfree(name);
 531                         name = NULL;
 532                         len = 0;
 533                 }
 534         }
 535
 536         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 537                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 538         if (IS_ERR(op_data)) {
 539                 kfree(name);
 540                 RETURN(PTR_ERR(op_data));
 541         }
 542         op_data->op_data = lmm;
 543         op_data->op_data_size = lmmsize;
 544
 545         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 546                             &ll_md_blocking_ast, 0);
 547         kfree(name);
 548         ll_finish_md_op_data(op_data);
 549         if (rc == -ESTALE) {
 550                 /* reason for keep own exit path - don`t flood log
 551                  * with messages with -ESTALE errors.
 552                  */
 553                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 554                      it_open_error(DISP_OPEN_OPEN, itp))
 555                         GOTO(out, rc);
 556                 ll_release_openhandle(de, itp);
 557                 GOTO(out, rc);
 558         }
 559
 560         if (it_disposition(itp, DISP_LOOKUP_NEG))
 561                 GOTO(out, rc = -ENOENT);
 562
 563         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 564                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 565                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 566                 GOTO(out, rc);
 567         }
 568
 569         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 570
 571         if (!rc && itp->it_lock_mode) {
 572                 ll_dom_finish_open(de->d_inode, req, itp);
 573                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 574         }
 575
 576 out:
 577         ptlrpc_req_finished(req);
 578         ll_intent_drop_lock(itp);
 579
 580         /* We did open by fid, but by the time we got to the server,
 581          * the object disappeared. If this is a create, we cannot really
 582          * tell the userspace that the file it was trying to create
 583          * does not exist. Instead let's return -ESTALE, and the VFS will
 584          * retry the create with LOOKUP_REVAL that we are going to catch
 585          * in ll_revalidate_dentry() and use lookup then.
 586          */
 587         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 588                 rc = -ESTALE;
 589
 590         RETURN(rc);
 591 }
 592
 593 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 594                        struct obd_client_handle *och)
 595 {
 596         struct mdt_body *body;
 597
 598         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 599         och->och_open_handle = body->mbo_open_handle;
 600         och->och_fid = body->mbo_fid1;
 601         och->och_lease_handle.cookie = it->it_lock_handle;
 602         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 603         och->och_flags = it->it_flags;
 604
 605         return md_set_open_replay_data(md_exp, och, it);
 606 }
 607
 608 static int ll_local_open(struct file *file, struct lookup_intent *it,
 609                          struct ll_file_data *fd, struct obd_client_handle *och)
 610 {
 611         struct inode *inode = file_inode(file);
 612         ENTRY;
 613
 614         LASSERT(!LUSTRE_FPRIVATE(file));
 615
 616         LASSERT(fd != NULL);
 617
 618         if (och) {
 619                 int rc;
 620
 621                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 622                 if (rc != 0)
 623                         RETURN(rc);
 624         }
 625
 626         LUSTRE_FPRIVATE(file) = fd;
 627         ll_readahead_init(inode, &fd->fd_ras);
 628         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 629
 630         /* ll_cl_context initialize */
 631         rwlock_init(&fd->fd_lock);
 632         INIT_LIST_HEAD(&fd->fd_lccs);
 633
 634         RETURN(0);
 635 }
 636
 637 /* Open a file, and (for the very first open) create objects on the OSTs at
 638  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 639  * creation or open until ll_lov_setstripe() ioctl is called.
 640  *
 641  * If we already have the stripe MD locally then we don't request it in
 642  * md_open(), by passing a lmm_size = 0.
 643  *
 644  * It is up to the application to ensure no other processes open this file
 645  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 646  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 647  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 648  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 649  */
 650 int ll_file_open(struct inode *inode, struct file *file)
 651 {
 652         struct ll_inode_info *lli = ll_i2info(inode);
 653         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 654                                           .it_flags = file->f_flags };
 655         struct obd_client_handle **och_p = NULL;
 656         __u64 *och_usecount = NULL;
 657         struct ll_file_data *fd;
 658         int rc = 0;
 659         ENTRY;
 660
 661         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 662                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 663
 664         it = file->private_data; /* XXX: compat macro */
 665         file->private_data = NULL; /* prevent ll_local_open assertion */
 666
 667         fd = ll_file_data_get();
 668         if (fd == NULL)
 669                 GOTO(out_nofiledata, rc = -ENOMEM);
 670
 671         fd->fd_file = file;
 672         if (S_ISDIR(inode->i_mode))
 673                 ll_authorize_statahead(inode, fd);
 674
 675         if (inode->i_sb->s_root == file_dentry(file)) {
 676                 LUSTRE_FPRIVATE(file) = fd;
 677                 RETURN(0);
 678         }
 679
 680         if (!it || !it->it_disposition) {
 681                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 682                  * because everything but O_ACCMODE mask was stripped from
 683                  * there */
 684                 if ((oit.it_flags + 1) & O_ACCMODE)
 685                         oit.it_flags++;
 686                 if (file->f_flags & O_TRUNC)
 687                         oit.it_flags |= FMODE_WRITE;
 688
 689                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 690                  * dentry_open after call to open_namei that checks permissions.
 691                  * Only nfsd_open call dentry_open directly without checking
 692                  * permissions and because of that this code below is safe.
 693                  */
 694                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 695                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 696
 697                 /* We do not want O_EXCL here, presumably we opened the file
 698                  * already? XXX - NFS implications? */
 699                 oit.it_flags &= ~O_EXCL;
 700
 701                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 702                  * created if necessary, then "IT_CREAT" should be set to keep
 703                  * consistent with it */
 704                 if (oit.it_flags & O_CREAT)
 705                         oit.it_op |= IT_CREAT;
 706
 707                 it = &oit;
 708         }
 709
 710 restart:
 711         /* Let's see if we have file open on MDS already. */
 712         if (it->it_flags & FMODE_WRITE) {
 713                 och_p = &lli->lli_mds_write_och;
 714                 och_usecount = &lli->lli_open_fd_write_count;
 715         } else if (it->it_flags & FMODE_EXEC) {
 716                 och_p = &lli->lli_mds_exec_och;
 717                 och_usecount = &lli->lli_open_fd_exec_count;
 718          } else {
 719                 och_p = &lli->lli_mds_read_och;
 720                 och_usecount = &lli->lli_open_fd_read_count;
 721         }
 722
 723         mutex_lock(&lli->lli_och_mutex);
 724         if (*och_p) { /* Open handle is present */
 725                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 726                         /* Well, there's extra open request that we do not need,
 727                            let's close it somehow. This will decref request. */
 728                         rc = it_open_error(DISP_OPEN_OPEN, it);
 729                         if (rc) {
 730                                 mutex_unlock(&lli->lli_och_mutex);
 731                                 GOTO(out_openerr, rc);
 732                         }
 733
 734                         ll_release_openhandle(file_dentry(file), it);
 735                 }
 736                 (*och_usecount)++;
 737
 738                 rc = ll_local_open(file, it, fd, NULL);
 739                 if (rc) {
 740                         (*och_usecount)--;
 741                         mutex_unlock(&lli->lli_och_mutex);
 742                         GOTO(out_openerr, rc);
 743                 }
 744         } else {
 745                 LASSERT(*och_usecount == 0);
 746                 if (!it->it_disposition) {
 747                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 748                         /* We cannot just request lock handle now, new ELC code
 749                            means that one of other OPEN locks for this file
 750                            could be cancelled, and since blocking ast handler
 751                            would attempt to grab och_mutex as well, that would
 752                            result in a deadlock */
 753                         mutex_unlock(&lli->lli_och_mutex);
 754                         /*
 755                          * Normally called under two situations:
 756                          * 1. NFS export.
 757                          * 2. A race/condition on MDS resulting in no open
 758                          *    handle to be returned from LOOKUP|OPEN request,
 759                          *    for example if the target entry was a symlink.
 760                          *
 761                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 762                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 763                          *  bit so that it's not confusing later callers.
 764                          *
 765                          *  NB; when ldd is NULL, it must have come via normal
 766                          *  lookup path only, since ll_iget_for_nfs always calls
 767                          *  ll_d_init().
 768                          */
 769                         if (ldd && ldd->lld_nfs_dentry) {
 770                                 ldd->lld_nfs_dentry = 0;
 771                                 it->it_flags |= MDS_OPEN_LOCK;
 772                         }
 773
 774                          /*
 775                          * Always specify MDS_OPEN_BY_FID because we don't want
 776                          * to get file with different fid.
 777                          */
 778                         it->it_flags |= MDS_OPEN_BY_FID;
 779                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 780                                                  it);
 781                         if (rc)
 782                                 GOTO(out_openerr, rc);
 783
 784                         goto restart;
 785                 }
 786                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 787                 if (!*och_p)
 788                         GOTO(out_och_free, rc = -ENOMEM);
 789
 790                 (*och_usecount)++;
 791
 792                 /* md_intent_lock() didn't get a request ref if there was an
 793                  * open error, so don't do cleanup on the request here
 794                  * (bug 3430) */
 795                 /* XXX (green): Should not we bail out on any error here, not
 796                  * just open error? */
 797                 rc = it_open_error(DISP_OPEN_OPEN, it);
 798                 if (rc != 0)
 799                         GOTO(out_och_free, rc);
 800
 801                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 802                          "inode %p: disposition %x, status %d\n", inode,
 803                          it_disposition(it, ~0), it->it_status);
 804
 805                 rc = ll_local_open(file, it, fd, *och_p);
 806                 if (rc)
 807                         GOTO(out_och_free, rc);
 808         }
 809         mutex_unlock(&lli->lli_och_mutex);
 810         fd = NULL;
 811
 812         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 813            different kind of OPEN lock for this same inode gets cancelled
 814            by ldlm_cancel_lru */
 815         if (!S_ISREG(inode->i_mode))
 816                 GOTO(out_och_free, rc);
 817
 818         cl_lov_delay_create_clear(&file->f_flags);
 819         GOTO(out_och_free, rc);
 820
 821 out_och_free:
 822         if (rc) {
 823                 if (och_p && *och_p) {
 824                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 825                         *och_p = NULL; /* OBD_FREE writes some magic there */
 826                         (*och_usecount)--;
 827                 }
 828                 mutex_unlock(&lli->lli_och_mutex);
 829
 830 out_openerr:
 831                 if (lli->lli_opendir_key == fd)
 832                         ll_deauthorize_statahead(inode, fd);
 833                 if (fd != NULL)
 834                         ll_file_data_put(fd);
 835         } else {
 836                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 837         }
 838
 839 out_nofiledata:
 840         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 841                 ptlrpc_req_finished(it->it_request);
 842                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 843         }
 844
 845         return rc;
 846 }
 847
 848 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 849                         struct ldlm_lock_desc *desc, void *data, int flag)
 850 {
 851         int rc;
 852         struct lustre_handle lockh;
 853         ENTRY;
 854
 855         switch (flag) {
 856         case LDLM_CB_BLOCKING:
 857                 ldlm_lock2handle(lock, &lockh);
 858                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 859                 if (rc < 0) {
 860                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 861                         RETURN(rc);
 862                 }
 863                 break;
 864         case LDLM_CB_CANCELING:
 865                 /* do nothing */
 866                 break;
 867         }
 868         RETURN(0);
 869 }
 870
 871 /**
 872  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 873  * and save it as fd->fd_och so as to force client to reopen the file even
 874  * if it has an open lock in cache already.
 875  */
 876 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 877                                 struct lustre_handle *old_open_handle)
 878 {
 879         struct ll_inode_info *lli = ll_i2info(inode);
 880         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 881         struct obd_client_handle **och_p;
 882         __u64 *och_usecount;
 883         int rc = 0;
 884         ENTRY;
 885
 886         /* Get the openhandle of the file */
 887         mutex_lock(&lli->lli_och_mutex);
 888         if (fd->fd_lease_och != NULL)
 889                 GOTO(out_unlock, rc = -EBUSY);
 890
 891         if (fd->fd_och == NULL) {
 892                 if (file->f_mode & FMODE_WRITE) {
 893                         LASSERT(lli->lli_mds_write_och != NULL);
 894                         och_p = &lli->lli_mds_write_och;
 895                         och_usecount = &lli->lli_open_fd_write_count;
 896                 } else {
 897                         LASSERT(lli->lli_mds_read_och != NULL);
 898                         och_p = &lli->lli_mds_read_och;
 899                         och_usecount = &lli->lli_open_fd_read_count;
 900                 }
 901
 902                 if (*och_usecount > 1)
 903                         GOTO(out_unlock, rc = -EBUSY);
 904
 905                 fd->fd_och = *och_p;
 906                 *och_usecount = 0;
 907                 *och_p = NULL;
 908         }
 909
 910         *old_open_handle = fd->fd_och->och_open_handle;
 911
 912         EXIT;
 913 out_unlock:
 914         mutex_unlock(&lli->lli_och_mutex);
 915         return rc;
 916 }
 917
 918 /**
 919  * Release ownership on lli_mds_*_och when putting back a file lease.
 920  */
 921 static int ll_lease_och_release(struct inode *inode, struct file *file)
 922 {
 923         struct ll_inode_info *lli = ll_i2info(inode);
 924         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 925         struct obd_client_handle **och_p;
 926         struct obd_client_handle *old_och = NULL;
 927         __u64 *och_usecount;
 928         int rc = 0;
 929         ENTRY;
 930
 931         mutex_lock(&lli->lli_och_mutex);
 932         if (file->f_mode & FMODE_WRITE) {
 933                 och_p = &lli->lli_mds_write_och;
 934                 och_usecount = &lli->lli_open_fd_write_count;
 935         } else {
 936                 och_p = &lli->lli_mds_read_och;
 937                 och_usecount = &lli->lli_open_fd_read_count;
 938         }
 939
 940         /* The file may have been open by another process (broken lease) so
 941          * *och_p is not NULL. In this case we should simply increase usecount
 942          * and close fd_och.
 943          */
 944         if (*och_p != NULL) {
 945                 old_och = fd->fd_och;
 946                 (*och_usecount)++;
 947         } else {
 948                 *och_p = fd->fd_och;
 949                 *och_usecount = 1;
 950         }
 951         fd->fd_och = NULL;
 952         mutex_unlock(&lli->lli_och_mutex);
 953
 954         if (old_och != NULL)
 955                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 956
 957         RETURN(rc);
 958 }
 959
/**
 * Acquire a lease and open the file.
 *
 * Issues an IT_OPEN intent with MDS_OPEN_LOCK | MDS_OPEN_BY_FID |
 * MDS_OPEN_LEASE and, on success, returns a newly allocated open handle
 * filled in from that intent.
 *
 * \param[in] inode       inode to take the lease on
 * \param[in] file        open file requesting the lease, may be NULL; when
 *                        given, its current open handle is taken over via
 *                        ll_lease_och_acquire() and sent along to the MDT
 * \param[in] fmode       must be exactly FMODE_READ or FMODE_WRITE
 * \param[in] open_flags  extra flags OR-ed into the intent's it_flags
 *
 * \retval new open handle on success
 * \retval ERR_PTR(-EINVAL) if \a fmode is neither FMODE_READ nor FMODE_WRITE
 * \retval ERR_PTR(-EPERM) if \a file was not opened with \a fmode, or was
 *         opened with FMODE_EXEC
 * \retval ERR_PTR(-ENOMEM) if the open handle cannot be allocated
 * \retval ERR_PTR(-ENOENT) if the intent reports a negative lookup
 * \retval ERR_PTR(-EOPNOTSUPP) if the reply carries no DISP_OPEN_LEASE
 * \retval ERR_PTR(-EPROTO) if a lease was granted without an open lock
 * \retval ERR_PTR() of any other error from the helpers called here
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
              __u64 open_flags)
{
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lustre_handle old_open_handle = { 0 };
        struct obd_client_handle *och = NULL;
        int rc;
        int rc2;
        ENTRY;

        /* exactly one of read or write may be requested */
        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
                RETURN(ERR_PTR(-EINVAL));

        if (file != NULL) {
                /* the lease mode must match how the file was opened, and
                 * files opened for execution are refused */
                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                        RETURN(ERR_PTR(-EPERM));

                rc = ll_lease_och_acquire(inode, file, &old_open_handle);
                if (rc)
                        RETURN(ERR_PTR(rc));
        }

        OBD_ALLOC_PTR(och);
        if (och == NULL)
                RETURN(ERR_PTR(-ENOMEM));

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
                                        LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out, rc = PTR_ERR(op_data));

        /* To tell the MDT this openhandle is from the same owner */
        op_data->op_open_handle = old_open_handle;

        it.it_flags = fmode | open_flags;
        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_lease_ast,
        /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
         * it can be cancelled which may mislead applications that the lease is
         * broken;
         * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
         * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
         * doesn't deal with openhandle, so normal openhandle will be leaked. */
                            LDLM_FL_NO_LRU | LDLM_FL_EXCL);
        /* op_data and req are dropped here whether or not the intent
         * succeeded */
        ll_finish_md_op_data(op_data);
        ptlrpc_req_finished(req);
        if (rc < 0)
                GOTO(out_release_it, rc);

        if (it_disposition(&it, DISP_LOOKUP_NEG))
                GOTO(out_release_it, rc = -ENOENT);

        rc = it_open_error(DISP_OPEN_OPEN, &it);
        if (rc)
                GOTO(out_release_it, rc);

        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
        ll_och_fill(sbi->ll_md_exp, &it, och);

        /* from here on och describes a real open, so failures must go
         * through out_close rather than just freeing och */
        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
                GOTO(out_close, rc = -EOPNOTSUPP);

        /* already get lease, handle lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
        if (it.it_lock_mode == 0 ||
            it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* open lock must return for lease */
                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
                        PFID(ll_inode2fid(inode)), it.it_lock_mode,
                        it.it_lock_bits);
                GOTO(out_close, rc = -EPROTO);
        }

        ll_intent_release(&it);
        RETURN(och);

out_close:
        /* Cancel open lock */
        if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
                                            it.it_lock_mode);
                it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;
        }
        /* a failure to close is only logged; rc keeps the original error */
        rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&ll_i2info(inode)->lli_fid), rc2);
        och = NULL; /* och has been freed in ll_close_inode_openhandle() */
out_release_it:
        ll_intent_release(&it);
out:
        /* och is still non-NULL here unless out_close consumed it */
        if (och != NULL)
                OBD_FREE_PTR(och);
        RETURN(ERR_PTR(rc));
}
1065
1066 /**
1067  * Check whether a layout swap can be done between two inodes.
1068  *
1069  * \param[in] inode1  First inode to check
1070  * \param[in] inode2  Second inode to check
1071  *
1072  * \retval 0 on success, layout swap can be performed between both inodes
1073  * \retval negative error code if requirements are not met
1074  */
1075 static int ll_check_swap_layouts_validity(struct inode *inode1,
1076                                           struct inode *inode2)
1077 {
1078         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1079                 return -EINVAL;
1080
1081         if (inode_permission(inode1, MAY_WRITE) ||
1082             inode_permission(inode2, MAY_WRITE))
1083                 return -EPERM;
1084
1085         if (inode1->i_sb != inode2->i_sb)
1086                 return -EXDEV;
1087
1088         return 0;
1089 }
1090
1091 static int ll_swap_layouts_close(struct obd_client_handle *och,
1092                                  struct inode *inode, struct inode *inode2)
1093 {
1094         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1095         const struct lu_fid     *fid2;
1096         int                      rc;
1097         ENTRY;
1098
1099         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1100                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1101
1102         rc = ll_check_swap_layouts_validity(inode, inode2);
1103         if (rc < 0)
1104                 GOTO(out_free_och, rc);
1105
1106         /* We now know that inode2 is a lustre inode */
1107         fid2 = ll_inode2fid(inode2);
1108
1109         rc = lu_fid_cmp(fid1, fid2);
1110         if (rc == 0)
1111                 GOTO(out_free_och, rc = -EINVAL);
1112
1113         /* Close the file and {swap,merge} layouts between inode & inode2.
1114          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1115          * because we still need it to pack l_remote_handle to MDT. */
1116         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1117                                        inode2);
1118
1119         och = NULL; /* freed in ll_close_inode_openhandle() */
1120
1121 out_free_och:
1122         if (och != NULL)
1123                 OBD_FREE_PTR(och);
1124
1125         RETURN(rc);
1126 }
1127
1128 /**
1129  * Release lease and close the file.
1130  * It will check if the lease has ever broken.
1131  */
1132 static int ll_lease_close_intent(struct obd_client_handle *och,
1133                                  struct inode *inode,
1134                                  bool *lease_broken, enum mds_op_bias bias,
1135                                  void *data)
1136 {
1137         struct ldlm_lock *lock;
1138         bool cancelled = true;
1139         int rc;
1140         ENTRY;
1141
1142         lock = ldlm_handle2lock(&och->och_lease_handle);
1143         if (lock != NULL) {
1144                 lock_res_and_lock(lock);
1145                 cancelled = ldlm_is_cancel(lock);
1146                 unlock_res_and_lock(lock);
1147                 LDLM_LOCK_PUT(lock);
1148         }
1149
1150         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1151                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1152
1153         if (lease_broken != NULL)
1154                 *lease_broken = cancelled;
1155
1156         if (!cancelled && !bias)
1157                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1158
1159         if (cancelled) { /* no need to excute intent */
1160                 bias = 0;
1161                 data = NULL;
1162         }
1163
1164         rc = ll_close_inode_openhandle(inode, och, bias, data);
1165         RETURN(rc);
1166 }
1167
1168 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1169                           bool *lease_broken)
1170 {
1171         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1172 }
1173
1174 /**
1175  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1176  */
1177 static int ll_lease_file_resync(struct obd_client_handle *och,
1178                                 struct inode *inode, unsigned long arg)
1179 {
1180         struct ll_sb_info *sbi = ll_i2sbi(inode);
1181         struct md_op_data *op_data;
1182         struct ll_ioc_lease_id ioc;
1183         __u64 data_version_unused;
1184         int rc;
1185         ENTRY;
1186
1187         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1188                                      LUSTRE_OPC_ANY, NULL);
1189         if (IS_ERR(op_data))
1190                 RETURN(PTR_ERR(op_data));
1191
1192         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1193                            sizeof(ioc)))
1194                 RETURN(-EFAULT);
1195
1196         /* before starting file resync, it's necessary to clean up page cache
1197          * in client memory, otherwise once the layout version is increased,
1198          * writing back cached data will be denied the OSTs. */
1199         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1200         if (rc)
1201                 GOTO(out, rc);
1202
1203         op_data->op_lease_handle = och->och_lease_handle;
1204         op_data->op_mirror_id = ioc.lil_mirror_id;
1205         rc = md_file_resync(sbi->ll_md_exp, op_data);
1206         if (rc)
1207                 GOTO(out, rc);
1208
1209         EXIT;
1210 out:
1211         ll_finish_md_op_data(op_data);
1212         return rc;
1213 }
1214
1215 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1216 {
1217         struct ll_inode_info *lli = ll_i2info(inode);
1218         struct cl_object *obj = lli->lli_clob;
1219         struct cl_attr *attr = vvp_env_thread_attr(env);
1220         s64 atime;
1221         s64 mtime;
1222         s64 ctime;
1223         int rc = 0;
1224
1225         ENTRY;
1226
1227         ll_inode_size_lock(inode);
1228
1229         /* Merge timestamps the most recently obtained from MDS with
1230          * timestamps obtained from OSTs.
1231          *
1232          * Do not overwrite atime of inode because it may be refreshed
1233          * by file_accessed() function. If the read was served by cache
1234          * data, there is no RPC to be sent so that atime may not be
1235          * transferred to OSTs at all. MDT only updates atime at close time
1236          * if it's at least 'mdd.*.atime_diff' older.
1237          * All in all, the atime in Lustre does not strictly comply with
1238          * POSIX. Solving this problem needs to send an RPC to MDT for each
1239          * read, this will hurt performance.
1240          */
1241         if (inode->i_atime.tv_sec < lli->lli_atime ||
1242             lli->lli_update_atime) {
1243                 inode->i_atime.tv_sec = lli->lli_atime;
1244                 lli->lli_update_atime = 0;
1245         }
1246         inode->i_mtime.tv_sec = lli->lli_mtime;
1247         inode->i_ctime.tv_sec = lli->lli_ctime;
1248
1249         mtime = inode->i_mtime.tv_sec;
1250         atime = inode->i_atime.tv_sec;
1251         ctime = inode->i_ctime.tv_sec;
1252
1253         cl_object_attr_lock(obj);
1254         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1255                 rc = -EINVAL;
1256         else
1257                 rc = cl_object_attr_get(env, obj, attr);
1258         cl_object_attr_unlock(obj);
1259
1260         if (rc != 0)
1261                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1262
1263         if (atime < attr->cat_atime)
1264                 atime = attr->cat_atime;
1265
1266         if (ctime < attr->cat_ctime)
1267                 ctime = attr->cat_ctime;
1268
1269         if (mtime < attr->cat_mtime)
1270                 mtime = attr->cat_mtime;
1271
1272         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1273                PFID(&lli->lli_fid), attr->cat_size);
1274
1275         i_size_write(inode, attr->cat_size);
1276         inode->i_blocks = attr->cat_blocks;
1277
1278         inode->i_mtime.tv_sec = mtime;
1279         inode->i_atime.tv_sec = atime;
1280         inode->i_ctime.tv_sec = ctime;
1281
1282 out_size_unlock:
1283         ll_inode_size_unlock(inode);
1284
1285         RETURN(rc);
1286 }
1287
1288 /**
1289  * Set designated mirror for I/O.
1290  *
1291  * So far only read, write, and truncated can support to issue I/O to
1292  * designated mirror.
1293  */
1294 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1295 {
1296         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1297
1298         /* clear layout version for generic(non-resync) I/O in case it carries
1299          * stale layout version due to I/O restart */
1300         io->ci_layout_version = 0;
1301
1302         /* FLR: disable non-delay for designated mirror I/O because obviously
1303          * only one mirror is available */
1304         if (fd->fd_designated_mirror > 0) {
1305                 io->ci_ndelay = 0;
1306                 io->ci_designated_mirror = fd->fd_designated_mirror;
1307                 io->ci_layout_version = fd->fd_layout_version;
1308         }
1309
1310         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1311                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1312 }
1313
1314 static bool file_is_noatime(const struct file *file)
1315 {
1316         const struct vfsmount *mnt = file->f_path.mnt;
1317         const struct inode *inode = file_inode((struct file *)file);
1318
1319         /* Adapted from file_accessed() and touch_atime().*/
1320         if (file->f_flags & O_NOATIME)
1321                 return true;
1322
1323         if (inode->i_flags & S_NOATIME)
1324                 return true;
1325
1326         if (IS_NOATIME(inode))
1327                 return true;
1328
1329         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1330                 return true;
1331
1332         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1333                 return true;
1334
1335         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1336                 return true;
1337
1338         return false;
1339 }
1340
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1342 {
1343         struct inode *inode = file_inode(file);
1344         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1345
1346         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1347         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1348
1349         if (iot == CIT_WRITE) {
1350                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1351                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1352                                            file->f_flags & O_DIRECT ||
1353                                            IS_SYNC(inode));
1354         }
1355         io->ci_obj = ll_i2info(inode)->lli_clob;
1356         io->ci_lockreq = CILR_MAYBE;
1357         if (ll_file_nolock(file)) {
1358                 io->ci_lockreq = CILR_NEVER;
1359                 io->ci_no_srvlock = 1;
1360         } else if (file->f_flags & O_APPEND) {
1361                 io->ci_lockreq = CILR_MANDATORY;
1362         }
1363         io->ci_noatime = file_is_noatime(file);
1364
1365         /* FLR: only use non-delay I/O for read as there is only one
1366          * avaliable mirror for write. */
1367         io->ci_ndelay = !(iot == CIT_WRITE);
1368
1369         ll_io_set_mirror(io, file);
1370 }
1371
1372 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1373                         __u64 count)
1374 {
1375         struct ll_inode_info *lli = ll_i2info(inode);
1376         struct ll_sb_info *sbi = ll_i2sbi(inode);
1377         enum obd_heat_type sample_type;
1378         enum obd_heat_type iobyte_type;
1379         __u64 now = ktime_get_real_seconds();
1380
1381         if (!ll_sbi_has_file_heat(sbi) ||
1382             lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1383                 return;
1384
1385         if (iot == CIT_READ) {
1386                 sample_type = OBD_HEAT_READSAMPLE;
1387                 iobyte_type = OBD_HEAT_READBYTE;
1388         } else if (iot == CIT_WRITE) {
1389                 sample_type = OBD_HEAT_WRITESAMPLE;
1390                 iobyte_type = OBD_HEAT_WRITEBYTE;
1391         } else {
1392                 return;
1393         }
1394
1395         spin_lock(&lli->lli_heat_lock);
1396         obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1397                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1398         obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1399                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1400         spin_unlock(&lli->lli_heat_lock);
1401 }
1402
/**
 * Common read/write path: run a cl_io of type \a iot on \a file.
 *
 * The I/O is set up with ll_io_init() and cl_io_rw_init(), executed with
 * cl_io_loop(), and restarted from the updated position for as long as the
 * I/O asks for a restart, bytes remain, and rc is 0 or -ENODATA.
 *
 * \param[in]     env    execution environment
 * \param[in,out] args   I/O arguments (iterator/iocb or splice pipe/flags);
 *                       the iterator is refreshed before a restart
 * \param[in]     file   open file the I/O is issued on
 * \param[in]     iot    I/O type, CIT_READ or CIT_WRITE
 * \param[in,out] ppos   file position, advanced when bytes were transferred
 * \param[in]     count  number of bytes requested
 *
 * \retval number of bytes transferred if greater than zero
 * \retval rc of the last I/O attempt otherwise (0 or negative error code)
 */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                   struct file *file, enum cl_io_type iot,
                   loff_t *ppos, size_t count)
{
        struct vvp_io           *vio = vvp_env_io(env);
        struct inode            *inode = file_inode(file);
        struct ll_inode_info    *lli = ll_i2info(inode);
        struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
        struct range_lock       range;
        struct cl_io            *io;
        ssize_t                 result = 0;
        int                     rc = 0;
        unsigned                retried = 0;
        /* NOTE(review): restarted is only ever written in this function */
        bool                    restarted = false;

        ENTRY;

        CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
                file_dentry(file)->d_name.name,
                iot == CIT_READ ? "read" : "write", *ppos, count);

restart:
        /* each attempt re-initialises the I/O descriptor, carrying over
         * only the FLR retry count from the previous attempt */
        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot);
        io->ci_ndelay_tried = retried;

        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
                bool range_locked = false;

                /* append covers the whole file since the final offset is
                 * not known at this point */
                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                else
                        range_lock_init(&range, *ppos, *ppos + count - 1);

                vio->vui_fd  = LUSTRE_FPRIVATE(file);
                vio->vui_io_subtype = args->via_io_subtype;

                switch (vio->vui_io_subtype) {
                case IO_NORMAL:
                        vio->vui_iter = args->u.normal.via_iter;
                        vio->vui_iocb = args->u.normal.via_iocb;
                        /* Direct IO reads must also take range lock,
                         * or multiple reads will try to work on the same pages
                         * See LU-6227 for details. */
                        if (((iot == CIT_WRITE) ||
                            (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
                            !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                                CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
                                       RL_PARA(&range));
                                rc = range_lock(&lli->lli_write_tree, &range);
                                if (rc < 0)
                                        GOTO(out, rc);

                                range_locked = true;
                        }
                        break;
                case IO_SPLICE:
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        vio->u.splice.vui_flags = args->u.splice.via_flags;
                        break;
                default:
                        CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
                        LBUG();
                }

                /* the I/O is registered with ll_cl_add() only for the
                 * duration of the loop */
                ll_cl_add(file, env, io, LCC_RW);
                rc = cl_io_loop(env, io);
                ll_cl_remove(file, env);

                if (range_locked) {
                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
                        range_unlock(&lli->lli_write_tree, &range);
                }
        } else {
                /* cl_io_rw_init() handled IO */
                rc = io->ci_result;
        }

        /* account the bytes moved by this attempt and advance the position */
        if (io->ci_nob > 0) {
                result += io->ci_nob;
                count  -= io->ci_nob;
                *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

                /* prepare IO restart */
                if (count > 0 && args->via_io_subtype == IO_NORMAL)
                        args->u.normal.via_iter = vio->vui_iter;
        }
out:
        cl_io_fini(env, io);

        CDEBUG(D_VFSTRACE,
               "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);

        /* restart only when bytes remain and the attempt did not fail */
        if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
                       file_dentry(file)->d_name.name,
                       iot == CIT_READ ? "read" : "write",
                       *ppos, count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;
                restarted = true;
                goto restart;
        }

        if (iot == CIT_READ) {
                if (result > 0)
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_READ_BYTES, result);
        } else if (iot == CIT_WRITE) {
                /* fd_write_failed records whether the last write on this
                 * file descriptor failed; -ERESTARTSYS leaves it as is */
                if (result > 0) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_WRITE_BYTES, result);
                        fd->fd_write_failed = false;
                } else if (result == 0 && rc == 0) {
                        rc = io->ci_result;
                        if (rc < 0)
                                fd->fd_write_failed = true;
                        else
                                fd->fd_write_failed = false;
                } else if (rc != -ERESTARTSYS) {
                        fd->fd_write_failed = true;
                }
        }

        CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
        if (result > 0)
                ll_heat_add(inode, iot, result);

        /* a partial transfer is reported as success with its byte count */
        RETURN(result > 0 ? result : rc);
}
1538
1539 /**
1540  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1541  * especially for small I/O.
1542  *
1543  * To serve a read request, CLIO has to create and initialize a cl_io and
1544  * then request DLM lock. This has turned out to have siginificant overhead
1545  * and affects the performance of small I/O dramatically.
1546  *
1547  * It's not necessary to create a cl_io for each I/O. Under the help of read
1548  * ahead, most of the pages being read are already in memory cache and we can
1549  * read those pages directly because if the pages exist, the corresponding DLM
1550  * lock must exist so that page content must be valid.
1551  *
1552  * In fast read implementation, the llite speculatively finds and reads pages
1553  * in memory cache. There are three scenarios for fast read:
1554  *   - If the page exists and is uptodate, kernel VM will provide the data and
1555  *     CLIO won't be intervened;
1556  *   - If the page was brought into memory by read ahead, it will be exported
1557  *     and read ahead parameters will be updated;
1558  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1559  *     it will go back and invoke normal read, i.e., a cl_io will be created
1560  *     and DLM lock will be requested.
1561  *
1562  * POSIX compliance: posix standard states that read is intended to be atomic.
1563  * Lustre read implementation is in line with Linux kernel read implementation
1564  * and neither of them complies with POSIX standard in this matter. Fast read
1565  * doesn't make the situation worse on single node but it may interleave write
1566  * results from multiple nodes due to short read handling in ll_file_aio_read().
1567  *
1568  * \param env - lu_env
1569  * \param iocb - kiocb from kernel
1570  * \param iter - user space buffers where the data will be copied
1571  *
1572  * \retval - number of bytes have been read, or error code if error occurred.
1573  */
1574 static ssize_t
1575 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1576 {
1577         ssize_t result;
1578
1579         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1580                 return 0;
1581
1582         /* NB: we can't do direct IO for fast read because it will need a lock
1583          * to make IO engine happy. */
1584         if (iocb->ki_filp->f_flags & O_DIRECT)
1585                 return 0;
1586
1587         result = generic_file_read_iter(iocb, iter);
1588
1589         /* If the first page is not in cache, generic_file_aio_read() will be
1590          * returned with -ENODATA.
1591          * See corresponding code in ll_readpage(). */
1592         if (result == -ENODATA)
1593                 result = 0;
1594
1595         if (result > 0) {
1596                 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1597                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1598                                 LPROC_LL_READ_BYTES, result);
1599         }
1600
1601         return result;
1602 }
1603
1604 /*
1605  * Read from a file (through the page cache).
1606  */
1607 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1608 {
1609         struct lu_env *env;
1610         struct vvp_io_args *args;
1611         ssize_t result;
1612         ssize_t rc2;
1613         __u16 refcheck;
1614
1615         result = ll_do_fast_read(iocb, to);
1616         if (result < 0 || iov_iter_count(to) == 0)
1617                 GOTO(out, result);
1618
1619         env = cl_env_get(&refcheck);
1620         if (IS_ERR(env))
1621                 return PTR_ERR(env);
1622
1623         args = ll_env_args(env, IO_NORMAL);
1624         args->u.normal.via_iter = to;
1625         args->u.normal.via_iocb = iocb;
1626
1627         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1628                                  &iocb->ki_pos, iov_iter_count(to));
1629         if (rc2 > 0)
1630                 result += rc2;
1631         else if (result == 0)
1632                 result = rc2;
1633
1634         cl_env_put(env, &refcheck);
1635 out:
1636         return result;
1637 }
1638
1639 /**
1640  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1641  * If a page is already in the page cache and dirty (and some other things -
1642  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1643  * write to it without doing a full I/O, because Lustre already knows about it
1644  * and will write it out.  This saves a lot of processing time.
1645  *
1646  * All writes here are within one page, so exclusion is handled by the page
1647  * lock on the vm page.  We do not do tiny writes for writes which touch
1648  * multiple pages because it's very unlikely multiple sequential pages are
1649  * are already dirty.
1650  *
1651  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1652  * and are unlikely to be to already dirty pages.
1653  *
1654  * Attribute updates are important here, we do them in ll_tiny_write_end.
1655  */
1656 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1657 {
1658         ssize_t count = iov_iter_count(iter);
1659         struct  file *file = iocb->ki_filp;
1660         struct  inode *inode = file_inode(file);
1661         bool    lock_inode = !IS_NOSEC(inode);
1662         ssize_t result = 0;
1663
1664         ENTRY;
1665
1666         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1667          * of function for why.
1668          */
1669         if (count >= PAGE_SIZE ||
1670             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1671                 RETURN(0);
1672
1673         if (unlikely(lock_inode))
1674                 inode_lock(inode);
1675         result = __generic_file_write_iter(iocb, iter);
1676
1677         if (unlikely(lock_inode))
1678                 inode_unlock(inode);
1679
1680         /* If the page is not already dirty, ll_tiny_write_begin returns
1681          * -ENODATA.  We continue on to normal write.
1682          */
1683         if (result == -ENODATA)
1684                 result = 0;
1685
1686         if (result > 0) {
1687                 ll_heat_add(inode, CIT_WRITE, result);
1688                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1689                                    result);
1690                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1691         }
1692
1693         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1694
1695         RETURN(result);
1696 }
1697
1698 /*
1699  * Write to a file (through the page cache).
1700  */
1701 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1702 {
1703         struct vvp_io_args *args;
1704         struct lu_env *env;
1705         ssize_t rc_tiny = 0, rc_normal;
1706         __u16 refcheck;
1707
1708         ENTRY;
1709
1710         /* NB: we can't do direct IO for tiny writes because they use the page
1711          * cache, we can't do sync writes because tiny writes can't flush
1712          * pages, and we can't do append writes because we can't guarantee the
1713          * required DLM locks are held to protect file size.
1714          */
1715         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1716             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1717                 rc_tiny = ll_do_tiny_write(iocb, from);
1718
1719         /* In case of error, go on and try normal write - Only stop if tiny
1720          * write completed I/O.
1721          */
1722         if (iov_iter_count(from) == 0)
1723                 GOTO(out, rc_normal = rc_tiny);
1724
1725         env = cl_env_get(&refcheck);
1726         if (IS_ERR(env))
1727                 return PTR_ERR(env);
1728
1729         args = ll_env_args(env, IO_NORMAL);
1730         args->u.normal.via_iter = from;
1731         args->u.normal.via_iocb = iocb;
1732
1733         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1734                                     &iocb->ki_pos, iov_iter_count(from));
1735
1736         /* On success, combine bytes written. */
1737         if (rc_tiny >= 0 && rc_normal > 0)
1738                 rc_normal += rc_tiny;
1739         /* On error, only return error from normal write if tiny write did not
1740          * write any bytes.  Otherwise return bytes written by tiny write.
1741          */
1742         else if (rc_tiny > 0)
1743                 rc_normal = rc_tiny;
1744
1745         cl_env_put(env, &refcheck);
1746 out:
1747         RETURN(rc_normal);
1748 }
1749
1750 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1751 /*
1752  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1753  */
1754 static int ll_file_get_iov_count(const struct iovec *iov,
1755                                  unsigned long *nr_segs, size_t *count)
1756 {
1757         size_t cnt = 0;
1758         unsigned long seg;
1759
1760         for (seg = 0; seg < *nr_segs; seg++) {
1761                 const struct iovec *iv = &iov[seg];
1762
1763                 /*
1764                  * If any segment has a negative length, or the cumulative
1765                  * length ever wraps negative then return -EINVAL.
1766                  */
1767                 cnt += iv->iov_len;
1768                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1769                         return -EINVAL;
1770                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1771                         continue;
1772                 if (seg == 0)
1773                         return -EFAULT;
1774                 *nr_segs = seg;
1775                 cnt -= iv->iov_len;     /* This segment is no good */
1776                 break;
1777         }
1778         *count = cnt;
1779         return 0;
1780 }
1781
1782 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1783                                 unsigned long nr_segs, loff_t pos)
1784 {
1785         struct iov_iter to;
1786         size_t iov_count;
1787         ssize_t result;
1788         ENTRY;
1789
1790         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1791         if (result)
1792                 RETURN(result);
1793
1794 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1795         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1796 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1797         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1798 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1799
1800         result = ll_file_read_iter(iocb, &to);
1801
1802         RETURN(result);
1803 }
1804
1805 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1806                             loff_t *ppos)
1807 {
1808         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1809         struct kiocb   kiocb;
1810         ssize_t        result;
1811         ENTRY;
1812
1813         init_sync_kiocb(&kiocb, file);
1814         kiocb.ki_pos = *ppos;
1815 #ifdef HAVE_KIOCB_KI_LEFT
1816         kiocb.ki_left = count;
1817 #elif defined(HAVE_KI_NBYTES)
1818         kiocb.i_nbytes = count;
1819 #endif
1820
1821         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1822         *ppos = kiocb.ki_pos;
1823
1824         RETURN(result);
1825 }
1826
1827 /*
1828  * Write to a file (through the page cache).
1829  * AIO stuff
1830  */
1831 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1832                                  unsigned long nr_segs, loff_t pos)
1833 {
1834         struct iov_iter from;
1835         size_t iov_count;
1836         ssize_t result;
1837         ENTRY;
1838
1839         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1840         if (result)
1841                 RETURN(result);
1842
1843 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1844         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1845 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1846         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1847 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1848
1849         result = ll_file_write_iter(iocb, &from);
1850
1851         RETURN(result);
1852 }
1853
1854 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1855                              size_t count, loff_t *ppos)
1856 {
1857         struct iovec   iov = { .iov_base = (void __user *)buf,
1858                                .iov_len = count };
1859         struct kiocb   kiocb;
1860         ssize_t        result;
1861
1862         ENTRY;
1863
1864         init_sync_kiocb(&kiocb, file);
1865         kiocb.ki_pos = *ppos;
1866 #ifdef HAVE_KIOCB_KI_LEFT
1867         kiocb.ki_left = count;
1868 #elif defined(HAVE_KI_NBYTES)
1869         kiocb.ki_nbytes = count;
1870 #endif
1871
1872         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1873         *ppos = kiocb.ki_pos;
1874
1875         RETURN(result);
1876 }
1877 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1878
1879 /*
1880  * Send file content (through pagecache) somewhere with helper
1881  */
1882 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1883                                    struct pipe_inode_info *pipe, size_t count,
1884                                    unsigned int flags)
1885 {
1886         struct lu_env      *env;
1887         struct vvp_io_args *args;
1888         ssize_t             result;
1889         __u16               refcheck;
1890         ENTRY;
1891
1892         env = cl_env_get(&refcheck);
1893         if (IS_ERR(env))
1894                 RETURN(PTR_ERR(env));
1895
1896         args = ll_env_args(env, IO_SPLICE);
1897         args->u.splice.via_pipe = pipe;
1898         args->u.splice.via_flags = flags;
1899
1900         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1901         cl_env_put(env, &refcheck);
1902         RETURN(result);
1903 }
1904
1905 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1906                              __u64 flags, struct lov_user_md *lum, int lum_size)
1907 {
1908         struct lookup_intent oit = {
1909                 .it_op = IT_OPEN,
1910                 .it_flags = flags | MDS_OPEN_BY_FID,
1911         };
1912         int rc;
1913         ENTRY;
1914
1915         ll_inode_size_lock(inode);
1916         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1917         if (rc < 0)
1918                 GOTO(out_unlock, rc);
1919
1920         ll_release_openhandle(dentry, &oit);
1921
1922 out_unlock:
1923         ll_inode_size_unlock(inode);
1924         ll_intent_release(&oit);
1925
1926         RETURN(rc);
1927 }
1928
1929 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1930                              struct lov_mds_md **lmmp, int *lmm_size,
1931                              struct ptlrpc_request **request)
1932 {
1933         struct ll_sb_info *sbi = ll_i2sbi(inode);
1934         struct mdt_body  *body;
1935         struct lov_mds_md *lmm = NULL;
1936         struct ptlrpc_request *req = NULL;
1937         struct md_op_data *op_data;
1938         int rc, lmmsize;
1939
1940         rc = ll_get_default_mdsize(sbi, &lmmsize);
1941         if (rc)
1942                 RETURN(rc);
1943
1944         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1945                                      strlen(filename), lmmsize,
1946                                      LUSTRE_OPC_ANY, NULL);
1947         if (IS_ERR(op_data))
1948                 RETURN(PTR_ERR(op_data));
1949
1950         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1951         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1952         ll_finish_md_op_data(op_data);
1953         if (rc < 0) {
1954                 CDEBUG(D_INFO, "md_getattr_name failed "
1955                        "on %s: rc %d\n", filename, rc);
1956                 GOTO(out, rc);
1957         }
1958
1959         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1960         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1961
1962         lmmsize = body->mbo_eadatasize;
1963
1964         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1965                         lmmsize == 0) {
1966                 GOTO(out, rc = -ENODATA);
1967         }
1968
1969         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1970         LASSERT(lmm != NULL);
1971
1972         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1973             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1974             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1975                 GOTO(out, rc = -EPROTO);
1976
1977         /*
1978          * This is coming from the MDS, so is probably in
1979          * little endian.  We convert it to host endian before
1980          * passing it to userspace.
1981          */
1982         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1983                 int stripe_count;
1984
1985                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1986                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1987                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1988                         if (le32_to_cpu(lmm->lmm_pattern) &
1989                             LOV_PATTERN_F_RELEASED)
1990                                 stripe_count = 0;
1991                 }
1992
1993                 /* if function called for directory - we should
1994                  * avoid swab not existent lsm objects */
1995                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1996                         lustre_swab_lov_user_md_v1(
1997                                         (struct lov_user_md_v1 *)lmm);
1998                         if (S_ISREG(body->mbo_mode))
1999                                 lustre_swab_lov_user_md_objects(
2000                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2001                                     stripe_count);
2002                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2003                         lustre_swab_lov_user_md_v3(
2004                                         (struct lov_user_md_v3 *)lmm);
2005                         if (S_ISREG(body->mbo_mode))
2006                                 lustre_swab_lov_user_md_objects(
2007                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2008                                     stripe_count);
2009                 } else if (lmm->lmm_magic ==
2010                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2011                         lustre_swab_lov_comp_md_v1(
2012                                         (struct lov_comp_md_v1 *)lmm);
2013                 }
2014         }
2015
2016 out:
2017         *lmmp = lmm;
2018         *lmm_size = lmmsize;
2019         *request = req;
2020         return rc;
2021 }
2022
2023 static int ll_lov_setea(struct inode *inode, struct file *file,
2024                         void __user *arg)
2025 {
2026         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2027         struct lov_user_md      *lump;
2028         int                      lum_size = sizeof(struct lov_user_md) +
2029                                             sizeof(struct lov_user_ost_data);
2030         int                      rc;
2031         ENTRY;
2032
2033         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2034                 RETURN(-EPERM);
2035
2036         OBD_ALLOC_LARGE(lump, lum_size);
2037         if (lump == NULL)
2038                 RETURN(-ENOMEM);
2039
2040         if (copy_from_user(lump, arg, lum_size))
2041                 GOTO(out_lump, rc = -EFAULT);
2042
2043         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2044                                       lum_size);
2045         cl_lov_delay_create_clear(&file->f_flags);
2046
2047 out_lump:
2048         OBD_FREE_LARGE(lump, lum_size);
2049         RETURN(rc);
2050 }
2051
2052 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2053 {
2054         struct lu_env   *env;
2055         __u16           refcheck;
2056         int             rc;
2057         ENTRY;
2058
2059         env = cl_env_get(&refcheck);
2060         if (IS_ERR(env))
2061                 RETURN(PTR_ERR(env));
2062
2063         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2064         cl_env_put(env, &refcheck);
2065         RETURN(rc);
2066 }
2067
2068 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2069                             void __user *arg)
2070 {
2071         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2072         struct lov_user_md        *klum;
2073         int                        lum_size, rc;
2074         __u64                      flags = FMODE_WRITE;
2075         ENTRY;
2076
2077         rc = ll_copy_user_md(lum, &klum);
2078         if (rc < 0)
2079                 RETURN(rc);
2080
2081         lum_size = rc;
2082         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2083                                       lum_size);
2084         if (!rc) {
2085                 __u32 gen;
2086
2087                 rc = put_user(0, &lum->lmm_stripe_count);
2088                 if (rc)
2089                         GOTO(out, rc);
2090
2091                 rc = ll_layout_refresh(inode, &gen);
2092                 if (rc)
2093                         GOTO(out, rc);
2094
2095                 rc = ll_file_getstripe(inode, arg, lum_size);
2096         }
2097         cl_lov_delay_create_clear(&file->f_flags);
2098
2099 out:
2100         OBD_FREE(klum, lum_size);
2101         RETURN(rc);
2102 }
2103
2104 static int
2105 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2106 {
2107         struct ll_inode_info *lli = ll_i2info(inode);
2108         struct cl_object *obj = lli->lli_clob;
2109         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2110         struct ll_grouplock grouplock;
2111         int rc;
2112         ENTRY;
2113
2114         if (arg == 0) {
2115                 CWARN("group id for group lock must not be 0\n");
2116                 RETURN(-EINVAL);
2117         }
2118
2119         if (ll_file_nolock(file))
2120                 RETURN(-EOPNOTSUPP);
2121
2122         spin_lock(&lli->lli_lock);
2123         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2124                 CWARN("group lock already existed with gid %lu\n",
2125                       fd->fd_grouplock.lg_gid);
2126                 spin_unlock(&lli->lli_lock);
2127                 RETURN(-EINVAL);
2128         }
2129         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2130         spin_unlock(&lli->lli_lock);
2131
2132         /**
2133          * XXX: group lock needs to protect all OST objects while PFL
2134          * can add new OST objects during the IO, so we'd instantiate
2135          * all OST objects before getting its group lock.
2136          */
2137         if (obj) {
2138                 struct lu_env *env;
2139                 __u16 refcheck;
2140                 struct cl_layout cl = {
2141                         .cl_is_composite = false,
2142                 };
2143                 struct lu_extent ext = {
2144                         .e_start = 0,
2145                         .e_end = OBD_OBJECT_EOF,
2146                 };
2147
2148                 env = cl_env_get(&refcheck);
2149                 if (IS_ERR(env))
2150                         RETURN(PTR_ERR(env));
2151
2152                 rc = cl_object_layout_get(env, obj, &cl);
2153                 if (!rc && cl.cl_is_composite)
2154                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2155                                                     &ext);
2156
2157                 cl_env_put(env, &refcheck);
2158                 if (rc)
2159                         RETURN(rc);
2160         }
2161
2162         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2163                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2164         if (rc)
2165                 RETURN(rc);
2166
2167         spin_lock(&lli->lli_lock);
2168         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2169                 spin_unlock(&lli->lli_lock);
2170                 CERROR("another thread just won the race\n");
2171                 cl_put_grouplock(&grouplock);
2172                 RETURN(-EINVAL);
2173         }
2174
2175         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2176         fd->fd_grouplock = grouplock;
2177         spin_unlock(&lli->lli_lock);
2178
2179         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2180         RETURN(0);
2181 }
2182
2183 static int ll_put_grouplock(struct inode *inode, struct file *file,
2184                             unsigned long arg)
2185 {
2186         struct ll_inode_info   *lli = ll_i2info(inode);
2187         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2188         struct ll_grouplock     grouplock;
2189         ENTRY;
2190
2191         spin_lock(&lli->lli_lock);
2192         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2193                 spin_unlock(&lli->lli_lock);
2194                 CWARN("no group lock held\n");
2195                 RETURN(-EINVAL);
2196         }
2197
2198         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2199
2200         if (fd->fd_grouplock.lg_gid != arg) {
2201                 CWARN("group lock %lu doesn't match current id %lu\n",
2202                       arg, fd->fd_grouplock.lg_gid);
2203                 spin_unlock(&lli->lli_lock);
2204                 RETURN(-EINVAL);
2205         }
2206
2207         grouplock = fd->fd_grouplock;
2208         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2209         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2210         spin_unlock(&lli->lli_lock);
2211
2212         cl_put_grouplock(&grouplock);
2213         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2214         RETURN(0);
2215 }
2216
2217 /**
2218  * Close inode open handle
2219  *
2220  * \param dentry [in]     dentry which contains the inode
2221  * \param it     [in,out] intent which contains open info and result
2222  *
2223  * \retval 0     success
2224  * \retval <0    failure
2225  */
2226 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2227 {
2228         struct inode *inode = dentry->d_inode;
2229         struct obd_client_handle *och;
2230         int rc;
2231         ENTRY;
2232
2233         LASSERT(inode);
2234
2235         /* Root ? Do nothing. */
2236         if (dentry->d_inode->i_sb->s_root == dentry)
2237                 RETURN(0);
2238
2239         /* No open handle to close? Move away */
2240         if (!it_disposition(it, DISP_OPEN_OPEN))
2241                 RETURN(0);
2242
2243         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2244
2245         OBD_ALLOC(och, sizeof(*och));
2246         if (!och)
2247                 GOTO(out, rc = -ENOMEM);
2248
2249         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2250
2251         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2252 out:
2253         /* this one is in place of ll_file_open */
2254         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2255                 ptlrpc_req_finished(it->it_request);
2256                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2257         }
2258         RETURN(rc);
2259 }
2260
2261 /**
2262  * Get size for inode for which FIEMAP mapping is requested.
2263  * Make the FIEMAP get_info call and returns the result.
2264  * \param fiemap        kernel buffer to hold extens
2265  * \param num_bytes     kernel buffer size
2266  */
2267 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2268                         size_t num_bytes)
2269 {
2270         struct lu_env                   *env;
2271         __u16                           refcheck;
2272         int                             rc = 0;
2273         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2274         ENTRY;
2275
2276         /* Checks for fiemap flags */
2277         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2278                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2279                 return -EBADR;
2280         }
2281
2282         /* Check for FIEMAP_FLAG_SYNC */
2283         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2284                 rc = filemap_fdatawrite(inode->i_mapping);
2285                 if (rc)
2286                         return rc;
2287         }
2288
2289         env = cl_env_get(&refcheck);
2290         if (IS_ERR(env))
2291                 RETURN(PTR_ERR(env));
2292
2293         if (i_size_read(inode) == 0) {
2294                 rc = ll_glimpse_size(inode);
2295                 if (rc)
2296                         GOTO(out, rc);
2297         }
2298
2299         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2300         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2301         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2302
2303         /* If filesize is 0, then there would be no objects for mapping */
2304         if (fmkey.lfik_oa.o_size == 0) {
2305                 fiemap->fm_mapped_extents = 0;
2306                 GOTO(out, rc = 0);
2307         }
2308
2309         fmkey.lfik_fiemap = *fiemap;
2310
2311         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2312                               &fmkey, fiemap, &num_bytes);
2313 out:
2314         cl_env_put(env, &refcheck);
2315         RETURN(rc);
2316 }
2317
2318 int ll_fid2path(struct inode *inode, void __user *arg)
2319 {
2320         struct obd_export       *exp = ll_i2mdexp(inode);
2321         const struct getinfo_fid2path __user *gfin = arg;
2322         __u32                    pathlen;
2323         struct getinfo_fid2path *gfout;
2324         size_t                   outsize;
2325         int                      rc;
2326
2327         ENTRY;
2328
2329         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2330             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2331                 RETURN(-EPERM);
2332
2333         /* Only need to get the buflen */
2334         if (get_user(pathlen, &gfin->gf_pathlen))
2335                 RETURN(-EFAULT);
2336
2337         if (pathlen > PATH_MAX)
2338                 RETURN(-EINVAL);
2339
2340         outsize = sizeof(*gfout) + pathlen;
2341         OBD_ALLOC(gfout, outsize);
2342         if (gfout == NULL)
2343                 RETURN(-ENOMEM);
2344
2345         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2346                 GOTO(gf_free, rc = -EFAULT);
2347         /* append root FID after gfout to let MDT know the root FID so that it
2348          * can lookup the correct path, this is mainly for fileset.
2349          * old server without fileset mount support will ignore this. */
2350         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2351
2352         /* Call mdc_iocontrol */
2353         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2354         if (rc != 0)
2355                 GOTO(gf_free, rc);
2356
2357         if (copy_to_user(arg, gfout, outsize))
2358                 rc = -EFAULT;
2359
2360 gf_free:
2361         OBD_FREE(gfout, outsize);
2362         RETURN(rc);
2363 }
2364
2365 static int
2366 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2367 {
2368         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2369         struct lu_env *env;
2370         struct cl_io *io;
2371         __u16  refcheck;
2372         int result;
2373
2374         ENTRY;
2375
2376         ioc->idv_version = 0;
2377         ioc->idv_layout_version = UINT_MAX;
2378
2379         /* If no file object initialized, we consider its version is 0. */
2380         if (obj == NULL)
2381                 RETURN(0);
2382
2383         env = cl_env_get(&refcheck);
2384         if (IS_ERR(env))
2385                 RETURN(PTR_ERR(env));
2386
2387         io = vvp_env_thread_io(env);
2388         io->ci_obj = obj;
2389         io->u.ci_data_version.dv_data_version = 0;
2390         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2391         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2392
2393 restart:
2394         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2395                 result = cl_io_loop(env, io);
2396         else
2397                 result = io->ci_result;
2398
2399         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2400         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2401
2402         cl_io_fini(env, io);
2403
2404         if (unlikely(io->ci_need_restart))
2405                 goto restart;
2406
2407         cl_env_put(env, &refcheck);
2408
2409         RETURN(result);
2410 }
2411
2412 /*
2413  * Read the data_version for inode.
2414  *
2415  * This value is computed using stripe object version on OST.
2416  * Version is computed using server side locking.
2417  *
2418  * @param flags if do sync on the OST side;
2419  *              0: no sync
2420  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2421  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2422  */
2423 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2424 {
2425         struct ioc_data_version ioc = { .idv_flags = flags };
2426         int rc;
2427
2428         rc = ll_ioc_data_version(inode, &ioc);
2429         if (!rc)
2430                 *data_version = ioc.idv_version;
2431
2432         return rc;
2433 }
2434
2435 /*
2436  * Trigger a HSM release request for the provided inode.
2437  */
2438 int ll_hsm_release(struct inode *inode)
2439 {
2440         struct lu_env *env;
2441         struct obd_client_handle *och = NULL;
2442         __u64 data_version = 0;
2443         int rc;
2444         __u16 refcheck;
2445         ENTRY;
2446
2447         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2448                ll_get_fsname(inode->i_sb, NULL, 0),
2449                PFID(&ll_i2info(inode)->lli_fid));
2450
2451         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2452         if (IS_ERR(och))
2453                 GOTO(out, rc = PTR_ERR(och));
2454
2455         /* Grab latest data_version and [am]time values */
2456         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2457         if (rc != 0)
2458                 GOTO(out, rc);
2459
2460         env = cl_env_get(&refcheck);
2461         if (IS_ERR(env))
2462                 GOTO(out, rc = PTR_ERR(env));
2463
2464         rc = ll_merge_attr(env, inode);
2465         cl_env_put(env, &refcheck);
2466
2467         /* If error happen, we have the wrong size for a file.
2468          * Don't release it.
2469          */
2470         if (rc != 0)
2471                 GOTO(out, rc);
2472
2473         /* Release the file.
2474          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2475          * we still need it to pack l_remote_handle to MDT. */
2476         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2477                                        &data_version);
2478         och = NULL;
2479
2480         EXIT;
2481 out:
2482         if (och != NULL && !IS_ERR(och)) /* close the file */
2483                 ll_lease_close(och, inode, NULL);
2484
2485         return rc;
2486 }
2487
2488 struct ll_swap_stack {
2489         __u64                    dv1;
2490         __u64                    dv2;
2491         struct inode            *inode1;
2492         struct inode            *inode2;
2493         bool                     check_dv1;
2494         bool                     check_dv2;
2495 };
2496
2497 static int ll_swap_layouts(struct file *file1, struct file *file2,
2498                            struct lustre_swap_layouts *lsl)
2499 {
2500         struct mdc_swap_layouts  msl;
2501         struct md_op_data       *op_data;
2502         __u32                    gid;
2503         __u64                    dv;
2504         struct ll_swap_stack    *llss = NULL;
2505         int                      rc;
2506
2507         OBD_ALLOC_PTR(llss);
2508         if (llss == NULL)
2509                 RETURN(-ENOMEM);
2510
2511         llss->inode1 = file_inode(file1);
2512         llss->inode2 = file_inode(file2);
2513
2514         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2515         if (rc < 0)
2516                 GOTO(free, rc);
2517
2518         /* we use 2 bool because it is easier to swap than 2 bits */
2519         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2520                 llss->check_dv1 = true;
2521
2522         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2523                 llss->check_dv2 = true;
2524
2525         /* we cannot use lsl->sl_dvX directly because we may swap them */
2526         llss->dv1 = lsl->sl_dv1;
2527         llss->dv2 = lsl->sl_dv2;
2528
2529         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2530         if (rc == 0) /* same file, done! */
2531                 GOTO(free, rc);
2532
2533         if (rc < 0) { /* sequentialize it */
2534                 swap(llss->inode1, llss->inode2);
2535                 swap(file1, file2);
2536                 swap(llss->dv1, llss->dv2);
2537                 swap(llss->check_dv1, llss->check_dv2);
2538         }
2539
2540         gid = lsl->sl_gid;
2541         if (gid != 0) { /* application asks to flush dirty cache */
2542                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2543                 if (rc < 0)
2544                         GOTO(free, rc);
2545
2546                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2547                 if (rc < 0) {
2548                         ll_put_grouplock(llss->inode1, file1, gid);
2549                         GOTO(free, rc);
2550                 }
2551         }
2552
2553         /* ultimate check, before swaping the layouts we check if
2554          * dataversion has changed (if requested) */
2555         if (llss->check_dv1) {
2556                 rc = ll_data_version(llss->inode1, &dv, 0);
2557                 if (rc)
2558                         GOTO(putgl, rc);
2559                 if (dv != llss->dv1)
2560                         GOTO(putgl, rc = -EAGAIN);
2561         }
2562
2563         if (llss->check_dv2) {
2564                 rc = ll_data_version(llss->inode2, &dv, 0);
2565                 if (rc)
2566                         GOTO(putgl, rc);
2567                 if (dv != llss->dv2)
2568                         GOTO(putgl, rc = -EAGAIN);
2569         }
2570
2571         /* struct md_op_data is used to send the swap args to the mdt
2572          * only flags is missing, so we use struct mdc_swap_layouts
2573          * through the md_op_data->op_data */
2574         /* flags from user space have to be converted before they are send to
2575          * server, no flag is sent today, they are only used on the client */
2576         msl.msl_flags = 0;
2577         rc = -ENOMEM;
2578         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2579                                      0, LUSTRE_OPC_ANY, &msl);
2580         if (IS_ERR(op_data))
2581                 GOTO(free, rc = PTR_ERR(op_data));
2582
2583         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2584                            sizeof(*op_data), op_data, NULL);
2585         ll_finish_md_op_data(op_data);
2586
2587         if (rc < 0)
2588                 GOTO(putgl, rc);
2589
2590 putgl:
2591         if (gid != 0) {
2592                 ll_put_grouplock(llss->inode2, file2, gid);
2593                 ll_put_grouplock(llss->inode1, file1, gid);
2594         }
2595
2596 free:
2597         if (llss != NULL)
2598                 OBD_FREE_PTR(llss);
2599
2600         RETURN(rc);
2601 }
2602
2603 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2604 {
2605         struct obd_export *exp = ll_i2mdexp(inode);
2606         struct md_op_data *op_data;
2607         int rc;
2608         ENTRY;
2609
2610         /* Detect out-of range masks */
2611         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2612                 RETURN(-EINVAL);
2613
2614         /* Non-root users are forbidden to set or clear flags which are
2615          * NOT defined in HSM_USER_MASK. */
2616         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2617             !cfs_capable(CFS_CAP_SYS_ADMIN))
2618                 RETURN(-EPERM);
2619
2620         if (!exp_connect_archive_id_array(exp)) {
2621                 /* Detect out-of range archive id */
2622                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2623                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2624                         RETURN(-EINVAL);
2625         }
2626
2627         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2628                                      LUSTRE_OPC_ANY, hss);
2629         if (IS_ERR(op_data))
2630                 RETURN(PTR_ERR(op_data));
2631
2632         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2633                            op_data, NULL);
2634
2635         ll_finish_md_op_data(op_data);
2636
2637         RETURN(rc);
2638 }
2639
2640 static int ll_hsm_import(struct inode *inode, struct file *file,
2641                          struct hsm_user_import *hui)
2642 {
2643         struct hsm_state_set    *hss = NULL;
2644         struct iattr            *attr = NULL;
2645         int                      rc;
2646         ENTRY;
2647
2648         if (!S_ISREG(inode->i_mode))
2649                 RETURN(-EINVAL);
2650
2651         /* set HSM flags */
2652         OBD_ALLOC_PTR(hss);
2653         if (hss == NULL)
2654                 GOTO(out, rc = -ENOMEM);
2655
2656         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2657         hss->hss_archive_id = hui->hui_archive_id;
2658         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2659         rc = ll_hsm_state_set(inode, hss);
2660         if (rc != 0)
2661                 GOTO(out, rc);
2662
2663         OBD_ALLOC_PTR(attr);
2664         if (attr == NULL)
2665                 GOTO(out, rc = -ENOMEM);
2666
2667         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2668         attr->ia_mode |= S_IFREG;
2669         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2670         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2671         attr->ia_size = hui->hui_size;
2672         attr->ia_mtime.tv_sec = hui->hui_mtime;
2673         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2674         attr->ia_atime.tv_sec = hui->hui_atime;
2675         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2676
2677         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2678                          ATTR_UID | ATTR_GID |
2679                          ATTR_MTIME | ATTR_MTIME_SET |
2680                          ATTR_ATIME | ATTR_ATIME_SET;
2681
2682         inode_lock(inode);
2683
2684         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2685         if (rc == -ENODATA)
2686                 rc = 0;
2687
2688         inode_unlock(inode);
2689
2690 out:
2691         if (hss != NULL)
2692                 OBD_FREE_PTR(hss);
2693
2694         if (attr != NULL)
2695                 OBD_FREE_PTR(attr);
2696
2697         RETURN(rc);
2698 }
2699
2700 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2701 {
2702         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2703                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2704 }
2705
2706 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2707 {
2708         struct inode *inode = file_inode(file);
2709         struct iattr ia = {
2710                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2711                             ATTR_MTIME | ATTR_MTIME_SET |
2712                             ATTR_CTIME,
2713                 .ia_atime = {
2714                         .tv_sec = lfu->lfu_atime_sec,
2715                         .tv_nsec = lfu->lfu_atime_nsec,
2716                 },
2717                 .ia_mtime = {
2718                         .tv_sec = lfu->lfu_mtime_sec,
2719                         .tv_nsec = lfu->lfu_mtime_nsec,
2720                 },
2721                 .ia_ctime = {
2722                         .tv_sec = lfu->lfu_ctime_sec,
2723                         .tv_nsec = lfu->lfu_ctime_nsec,
2724                 },
2725         };
2726         int rc;
2727         ENTRY;
2728
2729         if (!capable(CAP_SYS_ADMIN))
2730                 RETURN(-EPERM);
2731
2732         if (!S_ISREG(inode->i_mode))
2733                 RETURN(-EINVAL);
2734
2735         inode_lock(inode);
2736         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2737                             false);
2738         inode_unlock(inode);
2739
2740         RETURN(rc);
2741 }
2742
2743 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2744 {
2745         switch (mode) {
2746         case MODE_READ_USER:
2747                 return CLM_READ;
2748         case MODE_WRITE_USER:
2749                 return CLM_WRITE;
2750         default:
2751                 return -EINVAL;
2752         }
2753 }
2754
2755 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2756
2757 /* Used to allow the upper layers of the client to request an LDLM lock
2758  * without doing an actual read or write.
2759  *
2760  * Used for ladvise lockahead to manually request specific locks.
2761  *
2762  * \param[in] file      file this ladvise lock request is on
2763  * \param[in] ladvise   ladvise struct describing this lock request
2764  *
2765  * \retval 0            success, no detailed result available (sync requests
2766  *                      and requests sent to the server [not handled locally]
2767  *                      cannot return detailed results)
2768  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2769  *                                       see definitions for details.
2770  * \retval negative     negative errno on error
2771  */
2772 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2773 {
2774         struct lu_env *env = NULL;
2775         struct cl_io *io  = NULL;
2776         struct cl_lock *lock = NULL;
2777         struct cl_lock_descr *descr = NULL;
2778         struct dentry *dentry = file->f_path.dentry;
2779         struct inode *inode = dentry->d_inode;
2780         enum cl_lock_mode cl_mode;
2781         off_t start = ladvise->lla_start;
2782         off_t end = ladvise->lla_end;
2783         int result;
2784         __u16 refcheck;
2785
2786         ENTRY;
2787
2788         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2789                "start=%llu, end=%llu\n", dentry->d_name.len,
2790                dentry->d_name.name, dentry->d_inode,
2791                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2792                (__u64) end);
2793
2794         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2795         if (cl_mode < 0)
2796                 GOTO(out, result = cl_mode);
2797
2798         /* Get IO environment */
2799         result = cl_io_get(inode, &env, &io, &refcheck);
2800         if (result <= 0)
2801                 GOTO(out, result);
2802
2803         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2804         if (result > 0) {
2805                 /*
2806                  * nothing to do for this io. This currently happens when
2807                  * stripe sub-object's are not yet created.
2808                  */
2809                 result = io->ci_result;
2810         } else if (result == 0) {
2811                 lock = vvp_env_lock(env);
2812                 descr = &lock->cll_descr;
2813
2814                 descr->cld_obj   = io->ci_obj;
2815                 /* Convert byte offsets to pages */
2816                 descr->cld_start = cl_index(io->ci_obj, start);
2817                 descr->cld_end   = cl_index(io->ci_obj, end);
2818                 descr->cld_mode  = cl_mode;
2819                 /* CEF_MUST is used because we do not want to convert a
2820                  * lockahead request to a lockless lock */
2821                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2822                                        CEF_NONBLOCK;
2823
2824                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2825                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2826
2827                 result = cl_lock_request(env, io, lock);
2828
2829                 /* On success, we need to release the lock */
2830                 if (result >= 0)
2831                         cl_lock_release(env, lock);
2832         }
2833         cl_io_fini(env, io);
2834         cl_env_put(env, &refcheck);
2835
2836         /* -ECANCELED indicates a matching lock with a different extent
2837          * was already present, and -EEXIST indicates a matching lock
2838          * on exactly the same extent was already present.
2839          * We convert them to positive values for userspace to make
2840          * recognizing true errors easier.
2841          * Note we can only return these detailed results on async requests,
2842          * as sync requests look the same as i/o requests for locking. */
2843         if (result == -ECANCELED)
2844                 result = LLA_RESULT_DIFFERENT;
2845         else if (result == -EEXIST)
2846                 result = LLA_RESULT_SAME;
2847
2848 out:
2849         RETURN(result);
2850 }
2851 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2852
2853 static int ll_ladvise_sanity(struct inode *inode,
2854                              struct llapi_lu_ladvise *ladvise)
2855 {
2856         enum lu_ladvise_type advice = ladvise->lla_advice;
2857         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2858          * be in the first 32 bits of enum ladvise_flags */
2859         __u32 flags = ladvise->lla_peradvice_flags;
2860         /* 3 lines at 80 characters per line, should be plenty */
2861         int rc = 0;
2862
2863         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2864                 rc = -EINVAL;
2865                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2866                        "last supported advice is %s (value '%d'): rc = %d\n",
2867                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2868                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2869                 GOTO(out, rc);
2870         }
2871
2872         /* Per-advice checks */
2873         switch (advice) {
2874         case LU_LADVISE_LOCKNOEXPAND:
2875                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2876                         rc = -EINVAL;
2877                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2878                                "rc = %d\n",
2879                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2880                                ladvise_names[advice], rc);
2881                         GOTO(out, rc);
2882                 }
2883                 break;
2884         case LU_LADVISE_LOCKAHEAD:
2885                 /* Currently only READ and WRITE modes can be requested */
2886                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2887                     ladvise->lla_lockahead_mode == 0) {
2888                         rc = -EINVAL;
2889                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2890                                "rc = %d\n",
2891                                ll_get_fsname(inode->i_sb, NULL, 0),
2892                                ladvise->lla_lockahead_mode,
2893                                ladvise_names[advice], rc);
2894                         GOTO(out, rc);
2895                 }
2896         case LU_LADVISE_WILLREAD:
2897         case LU_LADVISE_DONTNEED:
2898         default:
2899                 /* Note fall through above - These checks apply to all advices
2900                  * except LOCKNOEXPAND */
2901                 if (flags & ~LF_DEFAULT_MASK) {
2902                         rc = -EINVAL;
2903                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2904                                "rc = %d\n",
2905                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2906                                ladvise_names[advice], rc);
2907                         GOTO(out, rc);
2908                 }
2909                 if (ladvise->lla_start >= ladvise->lla_end) {
2910                         rc = -EINVAL;
2911                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2912                                "for %s: rc = %d\n",
2913                                ll_get_fsname(inode->i_sb, NULL, 0),
2914                                ladvise->lla_start, ladvise->lla_end,
2915                                ladvise_names[advice], rc);
2916                         GOTO(out, rc);
2917                 }
2918                 break;
2919         }
2920
2921 out:
2922         return rc;
2923 }
2924 #undef ERRSIZE
2925
2926 /*
2927  * Give file access advices
2928  *
2929  * The ladvise interface is similar to Linux fadvise() system call, except it
2930  * forwards the advices directly from Lustre client to server. The server side
2931  * codes will apply appropriate read-ahead and caching techniques for the
2932  * corresponding files.
2933  *
2934  * A typical workload for ladvise is e.g. a bunch of different clients are
2935  * doing small random reads of a file, so prefetching pages into OSS cache
2936  * with big linear reads before the random IO is a net benefit. Fetching
2937  * all that data into each client cache with fadvise() may not be, due to
2938  * much more data being sent to the client.
2939  */
2940 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2941                       struct llapi_lu_ladvise *ladvise)
2942 {
2943         struct lu_env *env;
2944         struct cl_io *io;
2945         struct cl_ladvise_io *lio;
2946         int rc;
2947         __u16 refcheck;
2948         ENTRY;
2949
2950         env = cl_env_get(&refcheck);
2951         if (IS_ERR(env))
2952                 RETURN(PTR_ERR(env));
2953
2954         io = vvp_env_thread_io(env);
2955         io->ci_obj = ll_i2info(inode)->lli_clob;
2956
2957         /* initialize parameters for ladvise */
2958         lio = &io->u.ci_ladvise;
2959         lio->li_start = ladvise->lla_start;
2960         lio->li_end = ladvise->lla_end;
2961         lio->li_fid = ll_inode2fid(inode);
2962         lio->li_advice = ladvise->lla_advice;
2963         lio->li_flags = flags;
2964
2965         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2966                 rc = cl_io_loop(env, io);
2967         else
2968                 rc = io->ci_result;
2969
2970         cl_io_fini(env, io);
2971         cl_env_put(env, &refcheck);
2972         RETURN(rc);
2973 }
2974
2975 static int ll_lock_noexpand(struct file *file, int flags)
2976 {
2977         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2978
2979         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2980
2981         return 0;
2982 }
2983
2984 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2985                         unsigned long arg)
2986 {
2987         struct fsxattr fsxattr;
2988
2989         if (copy_from_user(&fsxattr,
2990                            (const struct fsxattr __user *)arg,
2991                            sizeof(fsxattr)))
2992                 RETURN(-EFAULT);
2993
2994         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2995         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2996                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2997         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2998         if (copy_to_user((struct fsxattr __user *)arg,
2999                          &fsxattr, sizeof(fsxattr)))
3000                 RETURN(-EFAULT);
3001
3002         RETURN(0);
3003 }
3004
3005 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3006 {
3007         /*
3008          * Project Quota ID state is only allowed to change from within the init
3009          * namespace. Enforce that restriction only if we are trying to change
3010          * the quota ID state. Everything else is allowed in user namespaces.
3011          */
3012         if (current_user_ns() == &init_user_ns)
3013                 return 0;
3014
3015         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3016                 return -EINVAL;
3017
3018         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3019                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3020                         return -EINVAL;
3021         } else {
3022                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3023                         return -EINVAL;
3024         }
3025
3026         return 0;
3027 }
3028
3029 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3030                         unsigned long arg)
3031 {
3032
3033         struct md_op_data *op_data;
3034         struct ptlrpc_request *req = NULL;
3035         int rc = 0;
3036         struct fsxattr fsxattr;
3037         struct cl_object *obj;
3038         struct iattr *attr;
3039         int flags;
3040
3041         if (copy_from_user(&fsxattr,
3042                            (const struct fsxattr __user *)arg,
3043                            sizeof(fsxattr)))
3044                 RETURN(-EFAULT);
3045
3046         rc = ll_ioctl_check_project(inode, &fsxattr);
3047         if (rc)
3048                 RETURN(rc);
3049
3050         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3051                                      LUSTRE_OPC_ANY, NULL);
3052         if (IS_ERR(op_data))
3053                 RETURN(PTR_ERR(op_data));
3054
3055         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3056         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3057         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3058                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3059         op_data->op_projid = fsxattr.fsx_projid;
3060         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3061         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3062                         0, &req);
3063         ptlrpc_req_finished(req);
3064         if (rc)
3065                 GOTO(out_fsxattr, rc);
3066         ll_update_inode_flags(inode, op_data->op_attr_flags);
3067         obj = ll_i2info(inode)->lli_clob;
3068         if (obj == NULL)
3069                 GOTO(out_fsxattr, rc);
3070
3071         OBD_ALLOC_PTR(attr);
3072         if (attr == NULL)
3073                 GOTO(out_fsxattr, rc = -ENOMEM);
3074
3075         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3076                             fsxattr.fsx_xflags);
3077         OBD_FREE_PTR(attr);
3078 out_fsxattr:
3079         ll_finish_md_op_data(op_data);
3080         RETURN(rc);
3081 }
3082
3083 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3084                                  unsigned long arg)
3085 {
3086         struct inode            *inode = file_inode(file);
3087         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3088         struct ll_inode_info    *lli = ll_i2info(inode);
3089         struct obd_client_handle *och = NULL;
3090         struct split_param sp;
3091         bool lease_broken;
3092         fmode_t fmode = 0;
3093         enum mds_op_bias bias = 0;
3094         struct file *layout_file = NULL;
3095         void *data = NULL;
3096         size_t data_size = 0;
3097         long rc;
3098         ENTRY;
3099
3100         mutex_lock(&lli->lli_och_mutex);
3101         if (fd->fd_lease_och != NULL) {
3102                 och = fd->fd_lease_och;
3103                 fd->fd_lease_och = NULL;
3104         }
3105         mutex_unlock(&lli->lli_och_mutex);
3106
3107         if (och == NULL)
3108                 GOTO(out, rc = -ENOLCK);
3109
3110         fmode = och->och_flags;
3111
3112         switch (ioc->lil_flags) {
3113         case LL_LEASE_RESYNC_DONE:
3114                 if (ioc->lil_count > IOC_IDS_MAX)
3115                         GOTO(out, rc = -EINVAL);
3116
3117                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3118                 OBD_ALLOC(data, data_size);
3119                 if (!data)
3120                         GOTO(out, rc = -ENOMEM);
3121
3122                 if (copy_from_user(data, (void __user *)arg, data_size))
3123                         GOTO(out, rc = -EFAULT);
3124
3125                 bias = MDS_CLOSE_RESYNC_DONE;
3126                 break;
3127         case LL_LEASE_LAYOUT_MERGE: {
3128                 int fd;
3129
3130                 if (ioc->lil_count != 1)
3131                         GOTO(out, rc = -EINVAL);
3132
3133                 arg += sizeof(*ioc);
3134                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3135                         GOTO(out, rc = -EFAULT);
3136
3137                 layout_file = fget(fd);
3138                 if (!layout_file)
3139                         GOTO(out, rc = -EBADF);
3140
3141                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3142                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3143                         GOTO(out, rc = -EPERM);
3144
3145                 data = file_inode(layout_file);
3146                 bias = MDS_CLOSE_LAYOUT_MERGE;
3147                 break;
3148         }
3149         case LL_LEASE_LAYOUT_SPLIT: {
3150                 int fdv;
3151                 int mirror_id;
3152
3153                 if (ioc->lil_count != 2)
3154                         GOTO(out, rc = -EINVAL);
3155
3156                 arg += sizeof(*ioc);
3157                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3158                         GOTO(out, rc = -EFAULT);
3159
3160                 arg += sizeof(__u32);
3161                 if (copy_from_user(&mirror_id, (void __user *)arg,
3162                                    sizeof(__u32)))
3163                         GOTO(out, rc = -EFAULT);
3164
3165                 layout_file = fget(fdv);
3166                 if (!layout_file)
3167                         GOTO(out, rc = -EBADF);
3168
3169                 sp.sp_inode = file_inode(layout_file);
3170                 sp.sp_mirror_id = (__u16)mirror_id;
3171                 data = &sp;
3172                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3173                 break;
3174         }
3175         default:
3176                 /* without close intent */
3177                 break;
3178         }
3179
3180         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3181         if (rc < 0)
3182                 GOTO(out, rc);
3183
3184         rc = ll_lease_och_release(inode, file);
3185         if (rc < 0)
3186                 GOTO(out, rc);
3187
3188         if (lease_broken)
3189                 fmode = 0;
3190         EXIT;
3191
3192 out:
3193         switch (ioc->lil_flags) {
3194         case LL_LEASE_RESYNC_DONE:
3195                 if (data)
3196                         OBD_FREE(data, data_size);
3197                 break;
3198         case LL_LEASE_LAYOUT_MERGE:
3199         case LL_LEASE_LAYOUT_SPLIT:
3200                 if (layout_file)
3201                         fput(layout_file);
3202                 break;
3203         }
3204
3205         if (!rc)
3206                 rc = ll_lease_type_from_fmode(fmode);
3207         RETURN(rc);
3208 }
3209
3210 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3211                               unsigned long arg)
3212 {
3213         struct inode *inode = file_inode(file);
3214         struct ll_inode_info *lli = ll_i2info(inode);
3215         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3216         struct obd_client_handle *och = NULL;
3217         __u64 open_flags = 0;
3218         bool lease_broken;
3219         fmode_t fmode;
3220         long rc;
3221         ENTRY;
3222
3223         switch (ioc->lil_mode) {
3224         case LL_LEASE_WRLCK:
3225                 if (!(file->f_mode & FMODE_WRITE))
3226                         RETURN(-EPERM);
3227                 fmode = FMODE_WRITE;
3228                 break;
3229         case LL_LEASE_RDLCK:
3230                 if (!(file->f_mode & FMODE_READ))
3231                         RETURN(-EPERM);
3232                 fmode = FMODE_READ;
3233                 break;
3234         case LL_LEASE_UNLCK:
3235                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3236         default:
3237                 RETURN(-EINVAL);
3238         }
3239
3240         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3241
3242         /* apply for lease */
3243         if (ioc->lil_flags & LL_LEASE_RESYNC)
3244                 open_flags = MDS_OPEN_RESYNC;
3245         och = ll_lease_open(inode, file, fmode, open_flags);
3246         if (IS_ERR(och))
3247                 RETURN(PTR_ERR(och));
3248
3249         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3250                 rc = ll_lease_file_resync(och, inode, arg);
3251                 if (rc) {
3252                         ll_lease_close(och, inode, NULL);
3253                         RETURN(rc);
3254                 }
3255                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3256                 if (rc) {
3257                         ll_lease_close(och, inode, NULL);
3258                         RETURN(rc);
3259                 }
3260         }
3261
3262         rc = 0;
3263         mutex_lock(&lli->lli_och_mutex);
3264         if (fd->fd_lease_och == NULL) {
3265                 fd->fd_lease_och = och;
3266                 och = NULL;
3267         }
3268         mutex_unlock(&lli->lli_och_mutex);
3269         if (och != NULL) {
3270                 /* impossible now that only excl is supported for now */
3271                 ll_lease_close(och, inode, &lease_broken);
3272                 rc = -EBUSY;
3273         }
3274         RETURN(rc);
3275 }
3276
3277 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3278 {
3279         struct ll_inode_info *lli = ll_i2info(inode);
3280         struct ll_sb_info *sbi = ll_i2sbi(inode);
3281         __u64 now = ktime_get_real_seconds();
3282         int i;
3283
3284         spin_lock(&lli->lli_heat_lock);
3285         heat->lh_flags = lli->lli_heat_flags;
3286         for (i = 0; i < heat->lh_count; i++)
3287                 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3288                                                 now, sbi->ll_heat_decay_weight,
3289                                                 sbi->ll_heat_period_second);
3290         spin_unlock(&lli->lli_heat_lock);
3291 }
3292
3293 static int ll_heat_set(struct inode *inode, __u64 flags)
3294 {
3295         struct ll_inode_info *lli = ll_i2info(inode);
3296         int rc = 0;
3297
3298         spin_lock(&lli->lli_heat_lock);
3299         if (flags & LU_HEAT_FLAG_CLEAR)
3300                 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3301
3302         if (flags & LU_HEAT_FLAG_OFF)
3303                 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3304         else
3305                 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3306
3307         spin_unlock(&lli->lli_heat_lock);
3308
3309         RETURN(rc);
3310 }
3311
3312 static long
3313 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3314 {
3315         struct inode            *inode = file_inode(file);
3316         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3317         int                      flags, rc;
3318         ENTRY;
3319
3320         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3321                PFID(ll_inode2fid(inode)), inode, cmd);
3322         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3323
3324         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3325         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3326                 RETURN(-ENOTTY);
3327
3328         switch (cmd) {
3329         case LL_IOC_GETFLAGS:
3330                 /* Get the current value of the file flags */
3331                 return put_user(fd->fd_flags, (int __user *)arg);
3332         case LL_IOC_SETFLAGS:
3333         case LL_IOC_CLRFLAGS:
3334                 /* Set or clear specific file flags */
3335                 /* XXX This probably needs checks to ensure the flags are
3336                  *     not abused, and to handle any flag side effects.
3337                  */
3338                 if (get_user(flags, (int __user *) arg))
3339                         RETURN(-EFAULT);
3340
3341                 if (cmd == LL_IOC_SETFLAGS) {
3342                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3343                             !(file->f_flags & O_DIRECT)) {
3344                                 CERROR("%s: unable to disable locking on "
3345                                        "non-O_DIRECT file\n", current->comm);
3346                                 RETURN(-EINVAL);
3347                         }
3348
3349                         fd->fd_flags |= flags;
3350                 } else {
3351                         fd->fd_flags &= ~flags;
3352                 }
3353                 RETURN(0);
3354         case LL_IOC_LOV_SETSTRIPE:
3355         case LL_IOC_LOV_SETSTRIPE_NEW:
3356                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3357         case LL_IOC_LOV_SETEA:
3358                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3359         case LL_IOC_LOV_SWAP_LAYOUTS: {
3360                 struct file *file2;
3361                 struct lustre_swap_layouts lsl;
3362
3363                 if (copy_from_user(&lsl, (char __user *)arg,
3364                                    sizeof(struct lustre_swap_layouts)))
3365                         RETURN(-EFAULT);
3366
3367                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3368                         RETURN(-EPERM);
3369
3370                 file2 = fget(lsl.sl_fd);
3371                 if (file2 == NULL)
3372                         RETURN(-EBADF);
3373
3374                 /* O_WRONLY or O_RDWR */
3375                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3376                         GOTO(out, rc = -EPERM);
3377
3378                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3379                         struct inode                    *inode2;
3380                         struct ll_inode_info            *lli;
3381                         struct obd_client_handle        *och = NULL;
3382
3383                         lli = ll_i2info(inode);
3384                         mutex_lock(&lli->lli_och_mutex);
3385                         if (fd->fd_lease_och != NULL) {
3386                                 och = fd->fd_lease_och;
3387                                 fd->fd_lease_och = NULL;
3388                         }
3389                         mutex_unlock(&lli->lli_och_mutex);
3390                         if (och == NULL)
3391                                 GOTO(out, rc = -ENOLCK);
3392                         inode2 = file_inode(file2);
3393                         rc = ll_swap_layouts_close(och, inode, inode2);
3394                 } else {
3395                         rc = ll_swap_layouts(file, file2, &lsl);
3396                 }
3397 out:
3398                 fput(file2);
3399                 RETURN(rc);
3400         }
3401         case LL_IOC_LOV_GETSTRIPE:
3402         case LL_IOC_LOV_GETSTRIPE_NEW:
3403                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3404         case FS_IOC_GETFLAGS:
3405         case FS_IOC_SETFLAGS:
3406                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3407         case FSFILT_IOC_GETVERSION:
3408         case FS_IOC_GETVERSION:
3409                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3410         /* We need to special case any other ioctls we want to handle,
3411          * to send them to the MDS/OST as appropriate and to properly
3412          * network encode the arg field. */
3413         case FS_IOC_SETVERSION:
3414                 RETURN(-ENOTSUPP);
3415
3416         case LL_IOC_GROUP_LOCK:
3417                 RETURN(ll_get_grouplock(inode, file, arg));
3418         case LL_IOC_GROUP_UNLOCK:
3419                 RETURN(ll_put_grouplock(inode, file, arg));
3420         case IOC_OBD_STATFS:
3421                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3422
3423         case LL_IOC_FLUSHCTX:
3424                 RETURN(ll_flush_ctx(inode));
3425         case LL_IOC_PATH2FID: {
3426                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3427                                  sizeof(struct lu_fid)))
3428                         RETURN(-EFAULT);
3429
3430                 RETURN(0);
3431         }
3432         case LL_IOC_GETPARENT:
3433                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3434
3435         case OBD_IOC_FID2PATH:
3436                 RETURN(ll_fid2path(inode, (void __user *)arg));
3437         case LL_IOC_DATA_VERSION: {
3438                 struct ioc_data_version idv;
3439                 int rc;
3440
3441                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3442                         RETURN(-EFAULT);
3443
3444                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3445                 rc = ll_ioc_data_version(inode, &idv);
3446
3447                 if (rc == 0 &&
3448                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3449                         RETURN(-EFAULT);
3450
3451                 RETURN(rc);
3452         }
3453
3454         case LL_IOC_GET_MDTIDX: {
3455                 int mdtidx;
3456
3457                 mdtidx = ll_get_mdt_idx(inode);
3458                 if (mdtidx < 0)
3459                         RETURN(mdtidx);
3460
3461                 if (put_user((int)mdtidx, (int __user *)arg))
3462                         RETURN(-EFAULT);
3463
3464                 RETURN(0);
3465         }
3466         case OBD_IOC_GETDTNAME:
3467         case OBD_IOC_GETMDNAME:
3468                 RETURN(ll_get_obd_name(inode, cmd, arg));
3469         case LL_IOC_HSM_STATE_GET: {
3470                 struct md_op_data       *op_data;
3471                 struct hsm_user_state   *hus;
3472                 int                      rc;
3473
3474                 OBD_ALLOC_PTR(hus);
3475                 if (hus == NULL)
3476                         RETURN(-ENOMEM);
3477
3478                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3479                                              LUSTRE_OPC_ANY, hus);
3480                 if (IS_ERR(op_data)) {
3481                         OBD_FREE_PTR(hus);
3482                         RETURN(PTR_ERR(op_data));
3483                 }
3484
3485                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3486                                    op_data, NULL);
3487
3488                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3489                         rc = -EFAULT;
3490
3491                 ll_finish_md_op_data(op_data);
3492                 OBD_FREE_PTR(hus);
3493                 RETURN(rc);
3494         }
3495         case LL_IOC_HSM_STATE_SET: {
3496                 struct hsm_state_set    *hss;
3497                 int                      rc;
3498
3499                 OBD_ALLOC_PTR(hss);
3500                 if (hss == NULL)
3501                         RETURN(-ENOMEM);
3502
3503                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3504                         OBD_FREE_PTR(hss);
3505                         RETURN(-EFAULT);
3506                 }
3507
3508                 rc = ll_hsm_state_set(inode, hss);
3509
3510                 OBD_FREE_PTR(hss);
3511                 RETURN(rc);
3512         }
3513         case LL_IOC_HSM_ACTION: {
3514                 struct md_op_data               *op_data;
3515                 struct hsm_current_action       *hca;
3516                 int                              rc;
3517
3518                 OBD_ALLOC_PTR(hca);
3519                 if (hca == NULL)
3520                         RETURN(-ENOMEM);
3521
3522                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3523                                              LUSTRE_OPC_ANY, hca);
3524                 if (IS_ERR(op_data)) {
3525                         OBD_FREE_PTR(hca);
3526                         RETURN(PTR_ERR(op_data));
3527                 }
3528
3529                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3530                                    op_data, NULL);
3531
3532                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3533                         rc = -EFAULT;
3534
3535                 ll_finish_md_op_data(op_data);
3536                 OBD_FREE_PTR(hca);
3537                 RETURN(rc);
3538         }
3539         case LL_IOC_SET_LEASE_OLD: {
3540                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3541
3542                 RETURN(ll_file_set_lease(file, &ioc, 0));
3543         }
3544         case LL_IOC_SET_LEASE: {
3545                 struct ll_ioc_lease ioc;
3546
3547                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3548                         RETURN(-EFAULT);
3549
3550                 RETURN(ll_file_set_lease(file, &ioc, arg));
3551         }
3552         case LL_IOC_GET_LEASE: {
3553                 struct ll_inode_info *lli = ll_i2info(inode);
3554                 struct ldlm_lock *lock = NULL;
3555                 fmode_t fmode = 0;
3556
3557                 mutex_lock(&lli->lli_och_mutex);
3558                 if (fd->fd_lease_och != NULL) {
3559                         struct obd_client_handle *och = fd->fd_lease_och;
3560
3561                         lock = ldlm_handle2lock(&och->och_lease_handle);
3562                         if (lock != NULL) {
3563                                 lock_res_and_lock(lock);
3564                                 if (!ldlm_is_cancel(lock))
3565                                         fmode = och->och_flags;
3566
3567                                 unlock_res_and_lock(lock);
3568                                 LDLM_LOCK_PUT(lock);
3569                         }
3570                 }
3571                 mutex_unlock(&lli->lli_och_mutex);
3572
3573                 RETURN(ll_lease_type_from_fmode(fmode));
3574         }
3575         case LL_IOC_HSM_IMPORT: {
3576                 struct hsm_user_import *hui;
3577
3578                 OBD_ALLOC_PTR(hui);
3579                 if (hui == NULL)
3580                         RETURN(-ENOMEM);
3581
3582                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3583                         OBD_FREE_PTR(hui);
3584                         RETURN(-EFAULT);
3585                 }
3586
3587                 rc = ll_hsm_import(inode, file, hui);
3588
3589                 OBD_FREE_PTR(hui);
3590                 RETURN(rc);
3591         }
3592         case LL_IOC_FUTIMES_3: {
3593                 struct ll_futimes_3 lfu;
3594
3595                 if (copy_from_user(&lfu,
3596                                    (const struct ll_futimes_3 __user *)arg,
3597                                    sizeof(lfu)))
3598                         RETURN(-EFAULT);
3599
3600                 RETURN(ll_file_futimes_3(file, &lfu));
3601         }
3602         case LL_IOC_LADVISE: {
3603                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3604                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3605                 int i;
3606                 int num_advise;
3607                 int alloc_size = sizeof(*k_ladvise_hdr);
3608
3609                 rc = 0;
3610                 u_ladvise_hdr = (void __user *)arg;
3611                 OBD_ALLOC_PTR(k_ladvise_hdr);
3612                 if (k_ladvise_hdr == NULL)
3613                         RETURN(-ENOMEM);
3614
3615                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3616                         GOTO(out_ladvise, rc = -EFAULT);
3617
3618                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3619                     k_ladvise_hdr->lah_count < 1)
3620                         GOTO(out_ladvise, rc = -EINVAL);
3621
3622                 num_advise = k_ladvise_hdr->lah_count;
3623                 if (num_advise >= LAH_COUNT_MAX)
3624                         GOTO(out_ladvise, rc = -EFBIG);
3625
3626                 OBD_FREE_PTR(k_ladvise_hdr);
3627                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3628                                       lah_advise[num_advise]);
3629                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3630                 if (k_ladvise_hdr == NULL)
3631                         RETURN(-ENOMEM);
3632
3633                 /*
3634                  * TODO: submit multiple advices to one server in a single RPC
3635                  */
3636                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3637                         GOTO(out_ladvise, rc = -EFAULT);
3638
3639                 for (i = 0; i < num_advise; i++) {
3640                         struct llapi_lu_ladvise *k_ladvise =
3641                                         &k_ladvise_hdr->lah_advise[i];
3642                         struct llapi_lu_ladvise __user *u_ladvise =
3643                                         &u_ladvise_hdr->lah_advise[i];
3644
3645                         rc = ll_ladvise_sanity(inode, k_ladvise);
3646                         if (rc)
3647                                 GOTO(out_ladvise, rc);
3648
3649                         switch (k_ladvise->lla_advice) {
3650                         case LU_LADVISE_LOCKNOEXPAND:
3651                                 rc = ll_lock_noexpand(file,
3652                                                k_ladvise->lla_peradvice_flags);
3653                                 GOTO(out_ladvise, rc);
3654                         case LU_LADVISE_LOCKAHEAD:
3655
3656                                 rc = ll_file_lock_ahead(file, k_ladvise);
3657
3658                                 if (rc < 0)
3659                                         GOTO(out_ladvise, rc);
3660
3661                                 if (put_user(rc,
3662                                              &u_ladvise->lla_lockahead_result))
3663                                         GOTO(out_ladvise, rc = -EFAULT);
3664                                 break;
3665                         default:
3666                                 rc = ll_ladvise(inode, file,
3667                                                 k_ladvise_hdr->lah_flags,
3668                                                 k_ladvise);
3669                                 if (rc)
3670                                         GOTO(out_ladvise, rc);
3671                                 break;
3672                         }
3673
3674                 }
3675
3676 out_ladvise:
3677                 OBD_FREE(k_ladvise_hdr, alloc_size);
3678                 RETURN(rc);
3679         }
3680         case LL_IOC_FLR_SET_MIRROR: {
3681                 /* mirror I/O must be direct to avoid polluting page cache
3682                  * by stale data. */
3683                 if (!(file->f_flags & O_DIRECT))
3684                         RETURN(-EINVAL);
3685
3686                 fd->fd_designated_mirror = (__u32)arg;
3687                 RETURN(0);
3688         }
3689         case LL_IOC_FSGETXATTR:
3690                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3691         case LL_IOC_FSSETXATTR:
3692                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3693         case BLKSSZGET:
3694                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3695         case LL_IOC_HEAT_GET: {
3696                 struct lu_heat uheat;
3697                 struct lu_heat *heat;
3698                 int size;
3699
3700                 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3701                         RETURN(-EFAULT);
3702
3703                 if (uheat.lh_count > OBD_HEAT_COUNT)
3704                         uheat.lh_count = OBD_HEAT_COUNT;
3705
3706                 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3707                 OBD_ALLOC(heat, size);
3708                 if (heat == NULL)
3709                         RETURN(-ENOMEM);
3710
3711                 heat->lh_count = uheat.lh_count;
3712                 ll_heat_get(inode, heat);
3713                 rc = copy_to_user((char __user *)arg, heat, size);
3714                 OBD_FREE(heat, size);
3715                 RETURN(rc ? -EFAULT : 0);
3716         }
3717         case LL_IOC_HEAT_SET: {
3718                 __u64 flags;
3719
3720                 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3721                         RETURN(-EFAULT);
3722
3723                 rc = ll_heat_set(inode, flags);
3724                 RETURN(rc);
3725         }
3726         default:
3727                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3728                                      (void __user *)arg));
3729         }
3730 }
3731
3732 #ifndef HAVE_FILE_LLSEEK_SIZE
3733 static inline loff_t
3734 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3735 {
3736         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3737                 return -EINVAL;
3738         if (offset > maxsize)
3739                 return -EINVAL;
3740
3741         if (offset != file->f_pos) {
3742                 file->f_pos = offset;
3743                 file->f_version = 0;
3744         }
3745         return offset;
3746 }
3747
3748 static loff_t
3749 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3750                 loff_t maxsize, loff_t eof)
3751 {
3752         struct inode *inode = file_inode(file);
3753
3754         switch (origin) {
3755         case SEEK_END:
3756                 offset += eof;
3757                 break;
3758         case SEEK_CUR:
3759                 /*
3760                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3761                  * position-querying operation.  Avoid rewriting the "same"
3762                  * f_pos value back to the file because a concurrent read(),
3763                  * write() or lseek() might have altered it
3764                  */
3765                 if (offset == 0)
3766                         return file->f_pos;
3767                 /*
3768                  * f_lock protects against read/modify/write race with other
3769                  * SEEK_CURs. Note that parallel writes and reads behave
3770                  * like SEEK_SET.
3771                  */
3772                 inode_lock(inode);
3773                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3774                 inode_unlock(inode);
3775                 return offset;
3776         case SEEK_DATA:
3777                 /*
3778                  * In the generic case the entire file is data, so as long as
3779                  * offset isn't at the end of the file then the offset is data.
3780                  */
3781                 if (offset >= eof)
3782                         return -ENXIO;
3783                 break;
3784         case SEEK_HOLE:
3785                 /*
3786                  * There is a virtual hole at the end of the file, so as long as
3787                  * offset isn't i_size or larger, return i_size.
3788                  */
3789                 if (offset >= eof)
3790                         return -ENXIO;
3791                 offset = eof;
3792                 break;
3793         }
3794
3795         return llseek_execute(file, offset, maxsize);
3796 }
3797 #endif
3798
3799 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3800 {
3801         struct inode *inode = file_inode(file);
3802         loff_t retval, eof = 0;
3803
3804         ENTRY;
3805         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3806                            (origin == SEEK_CUR) ? file->f_pos : 0);
3807         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3808                PFID(ll_inode2fid(inode)), inode, retval, retval,
3809                origin);
3810         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3811
3812         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3813                 retval = ll_glimpse_size(inode);
3814                 if (retval != 0)
3815                         RETURN(retval);
3816                 eof = i_size_read(inode);
3817         }
3818
3819         retval = ll_generic_file_llseek_size(file, offset, origin,
3820                                           ll_file_maxbytes(inode), eof);
3821         RETURN(retval);
3822 }
3823
3824 static int ll_flush(struct file *file, fl_owner_t id)
3825 {
3826         struct inode *inode = file_inode(file);
3827         struct ll_inode_info *lli = ll_i2info(inode);
3828         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3829         int rc, err;
3830
3831         LASSERT(!S_ISDIR(inode->i_mode));
3832
3833         /* catch async errors that were recorded back when async writeback
3834          * failed for pages in this mapping. */
3835         rc = lli->lli_async_rc;
3836         lli->lli_async_rc = 0;
3837         if (lli->lli_clob != NULL) {
3838                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3839                 if (rc == 0)
3840                         rc = err;
3841         }
3842
3843         /* The application has been told write failure already.
3844          * Do not report failure again. */
3845         if (fd->fd_write_failed)
3846                 return 0;
3847         return rc ? -EIO : 0;
3848 }
3849
3850 /**
3851  * Called to make sure a portion of file has been written out.
3852  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3853  *
3854  * Return how many pages have been written.
3855  */
3856 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3857                        enum cl_fsync_mode mode, int ignore_layout)
3858 {
3859         struct lu_env *env;
3860         struct cl_io *io;
3861         struct cl_fsync_io *fio;
3862         int result;
3863         __u16 refcheck;
3864         ENTRY;
3865
3866         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3867             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3868                 RETURN(-EINVAL);
3869
3870         env = cl_env_get(&refcheck);
3871         if (IS_ERR(env))
3872                 RETURN(PTR_ERR(env));
3873
3874         io = vvp_env_thread_io(env);
3875         io->ci_obj = ll_i2info(inode)->lli_clob;
3876         io->ci_ignore_layout = ignore_layout;
3877
3878         /* initialize parameters for sync */
3879         fio = &io->u.ci_fsync;
3880         fio->fi_start = start;
3881         fio->fi_end = end;
3882         fio->fi_fid = ll_inode2fid(inode);
3883         fio->fi_mode = mode;
3884         fio->fi_nr_written = 0;
3885
3886         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3887                 result = cl_io_loop(env, io);
3888         else
3889                 result = io->ci_result;
3890         if (result == 0)
3891                 result = fio->fi_nr_written;
3892         cl_io_fini(env, io);
3893         cl_env_put(env, &refcheck);
3894
3895         RETURN(result);
3896 }
3897
3898 /*
3899  * When dentry is provided (the 'else' case), file_dentry() may be
3900  * null and dentry must be used directly rather than pulled from
3901  * file_dentry() as is done otherwise.
3902  */
3903
3904 #ifdef HAVE_FILE_FSYNC_4ARGS
3905 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3906 {
3907         struct dentry *dentry = file_dentry(file);
3908 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3909 int ll_fsync(struct file *file, int datasync)
3910 {
3911         struct dentry *dentry = file_dentry(file);
3912         loff_t start = 0;
3913         loff_t end = LLONG_MAX;
3914 #else
3915 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3916 {
3917         loff_t start = 0;
3918         loff_t end = LLONG_MAX;
3919 #endif
3920         struct inode *inode = dentry->d_inode;
3921         struct ll_inode_info *lli = ll_i2info(inode);
3922         struct ptlrpc_request *req;
3923         int rc, err;
3924         ENTRY;
3925
3926         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3927                PFID(ll_inode2fid(inode)), inode);
3928         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3929
3930 #ifdef HAVE_FILE_FSYNC_4ARGS
3931         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3932         inode_lock(inode);
3933 #else
3934         /* fsync's caller has already called _fdata{sync,write}, we want
3935          * that IO to finish before calling the osc and mdc sync methods */
3936         rc = filemap_fdatawait(inode->i_mapping);
3937 #endif
3938
3939         /* catch async errors that were recorded back when async writeback
3940          * failed for pages in this mapping. */
3941         if (!S_ISDIR(inode->i_mode)) {
3942                 err = lli->lli_async_rc;
3943                 lli->lli_async_rc = 0;
3944                 if (rc == 0)
3945                         rc = err;
3946                 if (lli->lli_clob != NULL) {
3947                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3948                         if (rc == 0)
3949                                 rc = err;
3950                 }
3951         }
3952
3953         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3954         if (!rc)
3955                 rc = err;
3956         if (!err)
3957                 ptlrpc_req_finished(req);
3958
3959         if (S_ISREG(inode->i_mode)) {
3960                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3961
3962                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3963                 if (rc == 0 && err < 0)
3964                         rc = err;
3965                 if (rc < 0)
3966                         fd->fd_write_failed = true;
3967                 else
3968                         fd->fd_write_failed = false;
3969         }
3970
3971 #ifdef HAVE_FILE_FSYNC_4ARGS
3972         inode_unlock(inode);
3973 #endif
3974         RETURN(rc);
3975 }
3976
3977 static int
3978 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3979 {
3980         struct inode *inode = file_inode(file);
3981         struct ll_sb_info *sbi = ll_i2sbi(inode);
3982         struct ldlm_enqueue_info einfo = {
3983                 .ei_type        = LDLM_FLOCK,
3984                 .ei_cb_cp       = ldlm_flock_completion_ast,
3985                 .ei_cbdata      = file_lock,
3986         };
3987         struct md_op_data *op_data;
3988         struct lustre_handle lockh = { 0 };
3989         union ldlm_policy_data flock = { { 0 } };
3990         int fl_type = file_lock->fl_type;
3991         __u64 flags = 0;
3992         int rc;
3993         int rc2 = 0;
3994         ENTRY;
3995
3996         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3997                PFID(ll_inode2fid(inode)), file_lock);
3998
3999         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4000
4001         if (file_lock->fl_flags & FL_FLOCK) {
4002                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4003                 /* flocks are whole-file locks */
4004                 flock.l_flock.end = OFFSET_MAX;
4005                 /* For flocks owner is determined by the local file desctiptor*/
4006                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4007         } else if (file_lock->fl_flags & FL_POSIX) {
4008                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4009                 flock.l_flock.start = file_lock->fl_start;
4010                 flock.l_flock.end = file_lock->fl_end;
4011         } else {
4012                 RETURN(-EINVAL);
4013         }
4014         flock.l_flock.pid = file_lock->fl_pid;
4015
4016         /* Somewhat ugly workaround for svc lockd.
4017          * lockd installs custom fl_lmops->lm_compare_owner that checks
4018          * for the fl_owner to be the same (which it always is on local node
4019          * I guess between lockd processes) and then compares pid.
4020          * As such we assign pid to the owner field to make it all work,
4021          * conflict with normal locks is unlikely since pid space and
4022          * pointer space for current->files are not intersecting */
4023         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4024                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4025
4026         switch (fl_type) {
4027         case F_RDLCK:
4028                 einfo.ei_mode = LCK_PR;
4029                 break;
4030         case F_UNLCK:
4031                 /* An unlock request may or may not have any relation to
4032                  * existing locks so we may not be able to pass a lock handle
4033                  * via a normal ldlm_lock_cancel() request. The request may even
4034                  * unlock a byte range in the middle of an existing lock. In
4035                  * order to process an unlock request we need all of the same
4036                  * information that is given with a normal read or write record
4037                  * lock request. To avoid creating another ldlm unlock (cancel)
4038                  * message we'll treat a LCK_NL flock request as an unlock. */
4039                 einfo.ei_mode = LCK_NL;
4040                 break;
4041         case F_WRLCK:
4042                 einfo.ei_mode = LCK_PW;
4043                 break;
4044         default:
4045                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4046                 RETURN (-ENOTSUPP);
4047         }
4048
4049         switch (cmd) {
4050         case F_SETLKW:
4051 #ifdef F_SETLKW64
4052         case F_SETLKW64:
4053 #endif
4054                 flags = 0;
4055                 break;
4056         case F_SETLK:
4057 #ifdef F_SETLK64
4058         case F_SETLK64:
4059 #endif
4060                 flags = LDLM_FL_BLOCK_NOWAIT;
4061                 break;
4062         case F_GETLK:
4063 #ifdef F_GETLK64
4064         case F_GETLK64:
4065 #endif
4066                 flags = LDLM_FL_TEST_LOCK;
4067                 break;
4068         default:
4069                 CERROR("unknown fcntl lock command: %d\n", cmd);
4070                 RETURN (-EINVAL);
4071         }
4072
4073         /* Save the old mode so that if the mode in the lock changes we
4074          * can decrement the appropriate reader or writer refcount. */
4075         file_lock->fl_type = einfo.ei_mode;
4076
4077         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4078                                      LUSTRE_OPC_ANY, NULL);
4079         if (IS_ERR(op_data))
4080                 RETURN(PTR_ERR(op_data));
4081
4082         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4083                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4084                flock.l_flock.pid, flags, einfo.ei_mode,
4085                flock.l_flock.start, flock.l_flock.end);
4086
4087         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4088                         flags);
4089
4090         /* Restore the file lock type if not TEST lock. */
4091         if (!(flags & LDLM_FL_TEST_LOCK))
4092                 file_lock->fl_type = fl_type;
4093
4094 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4095         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4096             !(flags & LDLM_FL_TEST_LOCK))
4097                 rc2  = locks_lock_file_wait(file, file_lock);
4098 #else
4099         if ((file_lock->fl_flags & FL_FLOCK) &&
4100             (rc == 0 || file_lock->fl_type == F_UNLCK))
4101                 rc2  = flock_lock_file_wait(file, file_lock);
4102         if ((file_lock->fl_flags & FL_POSIX) &&
4103             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4104             !(flags & LDLM_FL_TEST_LOCK))
4105                 rc2  = posix_lock_file_wait(file, file_lock);
4106 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4107
4108         if (rc2 && file_lock->fl_type != F_UNLCK) {
4109                 einfo.ei_mode = LCK_NL;
4110                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4111                            &lockh, flags);
4112                 rc = rc2;
4113         }
4114
4115         ll_finish_md_op_data(op_data);
4116
4117         RETURN(rc);
4118 }
4119
4120 int ll_get_fid_by_name(struct inode *parent, const char *name,
4121                        int namelen, struct lu_fid *fid,
4122                        struct inode **inode)
4123 {
4124         struct md_op_data       *op_data = NULL;
4125         struct mdt_body         *body;
4126         struct ptlrpc_request   *req;
4127         int                     rc;
4128         ENTRY;
4129
4130         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4131                                      LUSTRE_OPC_ANY, NULL);
4132         if (IS_ERR(op_data))
4133                 RETURN(PTR_ERR(op_data));
4134
4135         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4136         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4137         ll_finish_md_op_data(op_data);
4138         if (rc < 0)
4139                 RETURN(rc);
4140
4141         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4142         if (body == NULL)
4143                 GOTO(out_req, rc = -EFAULT);
4144         if (fid != NULL)
4145                 *fid = body->mbo_fid1;
4146
4147         if (inode != NULL)
4148                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4149 out_req:
4150         ptlrpc_req_finished(req);
4151         RETURN(rc);
4152 }
4153
4154 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4155                const char *name)
4156 {
4157         struct dentry *dchild = NULL;
4158         struct inode *child_inode = NULL;
4159         struct md_op_data *op_data;
4160         struct ptlrpc_request *request = NULL;
4161         struct obd_client_handle *och = NULL;
4162         struct qstr qstr;
4163         struct mdt_body *body;
4164         __u64 data_version = 0;
4165         size_t namelen = strlen(name);
4166         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4167         int rc;
4168         ENTRY;
4169
4170         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4171                PFID(ll_inode2fid(parent)), name,
4172                lum->lum_stripe_offset, lum->lum_stripe_count);
4173
4174         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4175             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4176                 lustre_swab_lmv_user_md(lum);
4177
4178         /* Get child FID first */
4179         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4180         qstr.name = name;
4181         qstr.len = namelen;
4182         dchild = d_lookup(file_dentry(file), &qstr);
4183         if (dchild) {
4184                 if (dchild->d_inode)
4185                         child_inode = igrab(dchild->d_inode);
4186                 dput(dchild);
4187         }
4188
4189         if (!child_inode) {
4190                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4191                                         &child_inode);
4192                 if (rc)
4193                         RETURN(rc);
4194         }
4195
4196         if (!child_inode)
4197                 RETURN(-ENOENT);
4198
4199         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4200               OBD_CONNECT2_DIR_MIGRATE)) {
4201                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4202                     ll_i2info(child_inode)->lli_lsm_md) {
4203                         CERROR("%s: MDT doesn't support stripe directory "
4204                                "migration!\n",
4205                                ll_get_fsname(parent->i_sb, NULL, 0));
4206                         GOTO(out_iput, rc = -EOPNOTSUPP);
4207                 }
4208         }
4209
4210         /*
4211          * lfs migrate command needs to be blocked on the client
4212          * by checking the migrate FID against the FID of the
4213          * filesystem root.
4214          */
4215         if (child_inode == parent->i_sb->s_root->d_inode)
4216                 GOTO(out_iput, rc = -EINVAL);
4217
4218         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4219                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4220         if (IS_ERR(op_data))
4221                 GOTO(out_iput, rc = PTR_ERR(op_data));
4222
4223         inode_lock(child_inode);
4224         op_data->op_fid3 = *ll_inode2fid(child_inode);
4225         if (!fid_is_sane(&op_data->op_fid3)) {
4226                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4227                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4228                        PFID(&op_data->op_fid3));
4229                 GOTO(out_unlock, rc = -EINVAL);
4230         }
4231
4232         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4233         op_data->op_data = lum;
4234         op_data->op_data_size = lumlen;
4235
4236 again:
4237         if (S_ISREG(child_inode->i_mode)) {
4238                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4239                 if (IS_ERR(och)) {
4240                         rc = PTR_ERR(och);
4241                         och = NULL;
4242                         GOTO(out_unlock, rc);
4243                 }
4244
4245                 rc = ll_data_version(child_inode, &data_version,
4246                                      LL_DV_WR_FLUSH);
4247                 if (rc != 0)
4248                         GOTO(out_close, rc);
4249
4250                 op_data->op_open_handle = och->och_open_handle;
4251                 op_data->op_data_version = data_version;
4252                 op_data->op_lease_handle = och->och_lease_handle;
4253                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4254
4255                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4256                 och->och_mod->mod_open_req->rq_replay = 0;
4257                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4258         }
4259
4260         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4261                        name, namelen, &request);
4262         if (rc == 0) {
4263                 LASSERT(request != NULL);
4264                 ll_update_times(request, parent);
4265         }
4266
4267         if (rc == 0 || rc == -EAGAIN) {
4268                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4269                 LASSERT(body != NULL);
4270
4271                 /* If the server does release layout lock, then we cleanup
4272                  * the client och here, otherwise release it in out_close: */
4273                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4274                         obd_mod_put(och->och_mod);
4275                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4276                                                   och);
4277                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4278                         OBD_FREE_PTR(och);
4279                         och = NULL;
4280                 }
4281         }
4282
4283         if (request != NULL) {
4284                 ptlrpc_req_finished(request);
4285                 request = NULL;
4286         }
4287
4288         /* Try again if the lease has cancelled. */
4289         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4290                 goto again;
4291
4292 out_close:
4293         if (och)
4294                 ll_lease_close(och, child_inode, NULL);
4295         if (!rc)
4296                 clear_nlink(child_inode);
4297 out_unlock:
4298         inode_unlock(child_inode);
4299         ll_finish_md_op_data(op_data);
4300 out_iput:
4301         iput(child_inode);
4302         RETURN(rc);
4303 }
4304
4305 static int
4306 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4307 {
4308         ENTRY;
4309
4310         RETURN(-ENOSYS);
4311 }
4312
4313 /**
4314  * test if some locks matching bits and l_req_mode are acquired
4315  * - bits can be in different locks
4316  * - if found clear the common lock bits in *bits
4317  * - the bits not found, are kept in *bits
4318  * \param inode [IN]
4319  * \param bits [IN] searched lock bits [IN]
4320  * \param l_req_mode [IN] searched lock mode
4321  * \retval boolean, true iff all bits are found
4322  */
4323 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4324 {
4325         struct lustre_handle lockh;
4326         union ldlm_policy_data policy;
4327         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4328                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4329         struct lu_fid *fid;
4330         __u64 flags;
4331         int i;
4332         ENTRY;
4333
4334         if (!inode)
4335                RETURN(0);
4336
4337         fid = &ll_i2info(inode)->lli_fid;
4338         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4339                ldlm_lockname[mode]);
4340
4341         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4342         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4343                 policy.l_inodebits.bits = *bits & (1 << i);
4344                 if (policy.l_inodebits.bits == 0)
4345                         continue;
4346
4347                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4348                                   &policy, mode, &lockh)) {
4349                         struct ldlm_lock *lock;
4350
4351                         lock = ldlm_handle2lock(&lockh);
4352                         if (lock) {
4353                                 *bits &=
4354                                       ~(lock->l_policy_data.l_inodebits.bits);
4355                                 LDLM_LOCK_PUT(lock);
4356                         } else {
4357                                 *bits &= ~policy.l_inodebits.bits;
4358                         }
4359                 }
4360         }
4361         RETURN(*bits == 0);
4362 }
4363
4364 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4365                                struct lustre_handle *lockh, __u64 flags,
4366                                enum ldlm_mode mode)
4367 {
4368         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4369         struct lu_fid *fid;
4370         enum ldlm_mode rc;
4371         ENTRY;
4372
4373         fid = &ll_i2info(inode)->lli_fid;
4374         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4375
4376         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4377                            fid, LDLM_IBITS, &policy, mode, lockh);
4378
4379         RETURN(rc);
4380 }
4381
4382 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4383 {
4384         /* Already unlinked. Just update nlink and return success */
4385         if (rc == -ENOENT) {
4386                 clear_nlink(inode);
4387                 /* If it is striped directory, and there is bad stripe
4388                  * Let's revalidate the dentry again, instead of returning
4389                  * error */
4390                 if (S_ISDIR(inode->i_mode) &&
4391                     ll_i2info(inode)->lli_lsm_md != NULL)
4392                         return 0;
4393
4394                 /* This path cannot be hit for regular files unless in
4395                  * case of obscure races, so no need to to validate
4396                  * size. */
4397                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4398                         return 0;
4399         } else if (rc != 0) {
4400                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4401                              "%s: revalidate FID "DFID" error: rc = %d\n",
4402                              ll_get_fsname(inode->i_sb, NULL, 0),
4403                              PFID(ll_inode2fid(inode)), rc);
4404         }
4405
4406         return rc;
4407 }
4408
4409 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4410 {
4411         struct inode *inode = dentry->d_inode;
4412         struct obd_export *exp = ll_i2mdexp(inode);
4413         struct lookup_intent oit = {
4414                 .it_op = op,
4415         };
4416         struct ptlrpc_request *req = NULL;
4417         struct md_op_data *op_data;
4418         int rc = 0;
4419         ENTRY;
4420
4421         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4422                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4423
4424         /* Call getattr by fid, so do not provide name at all. */
4425         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4426                                      LUSTRE_OPC_ANY, NULL);
4427         if (IS_ERR(op_data))
4428                 RETURN(PTR_ERR(op_data));
4429
4430         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4431         ll_finish_md_op_data(op_data);
4432         if (rc < 0) {
4433                 rc = ll_inode_revalidate_fini(inode, rc);
4434                 GOTO(out, rc);
4435         }
4436
4437         rc = ll_revalidate_it_finish(req, &oit, dentry);
4438         if (rc != 0) {
4439                 ll_intent_release(&oit);
4440                 GOTO(out, rc);
4441         }
4442
4443         /* Unlinked? Unhash dentry, so it is not picked up later by
4444          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4445          * here to preserve get_cwd functionality on 2.6.
4446          * Bug 10503 */
4447         if (!dentry->d_inode->i_nlink) {
4448                 ll_lock_dcache(inode);
4449                 d_lustre_invalidate(dentry, 0);
4450                 ll_unlock_dcache(inode);
4451         }
4452
4453         ll_lookup_finish_locks(&oit, dentry);
4454 out:
4455         ptlrpc_req_finished(req);
4456
4457         return rc;
4458 }
4459
4460 static int ll_merge_md_attr(struct inode *inode)
4461 {
4462         struct ll_inode_info *lli = ll_i2info(inode);
4463         struct cl_attr attr = { 0 };
4464         int rc;
4465
4466         LASSERT(lli->lli_lsm_md != NULL);
4467         down_read(&lli->lli_lsm_sem);
4468         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4469                            &attr, ll_md_blocking_ast);
4470         up_read(&lli->lli_lsm_sem);
4471         if (rc != 0)
4472                 RETURN(rc);
4473
4474         set_nlink(inode, attr.cat_nlink);
4475         inode->i_blocks = attr.cat_blocks;
4476         i_size_write(inode, attr.cat_size);
4477
4478         ll_i2info(inode)->lli_atime = attr.cat_atime;
4479         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4480         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4481
4482         RETURN(0);
4483 }
4484
4485 static inline dev_t ll_compat_encode_dev(dev_t dev)
4486 {
4487         /* The compat_sys_*stat*() syscalls will fail unless the
4488          * device majors and minors are both less than 256. Note that
4489          * the value returned here will be passed through
4490          * old_encode_dev() in cp_compat_stat(). And so we are not
4491          * trying to return a valid compat (u16) device number, just
4492          * one that will pass the old_valid_dev() check. */
4493
4494         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4495 }
4496
4497 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4498 int ll_getattr(const struct path *path, struct kstat *stat,
4499                u32 request_mask, unsigned int flags)
4500 {
4501         struct dentry *de = path->dentry;
4502 #else
4503 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4504 {
4505 #endif
4506         struct inode *inode = de->d_inode;
4507         struct ll_sb_info *sbi = ll_i2sbi(inode);
4508         struct ll_inode_info *lli = ll_i2info(inode);
4509         int rc;
4510
4511         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4512
4513         rc = ll_inode_revalidate(de, IT_GETATTR);
4514         if (rc < 0)
4515                 RETURN(rc);
4516
4517         if (S_ISREG(inode->i_mode)) {
4518                 /* In case of restore, the MDT has the right size and has
4519                  * already send it back without granting the layout lock,
4520                  * inode is up-to-date so glimpse is useless.
4521                  * Also to glimpse we need the layout, in case of a running
4522                  * restore the MDT holds the layout lock so the glimpse will
4523                  * block up to the end of restore (getattr will block)
4524                  */
4525                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4526                         rc = ll_glimpse_size(inode);
4527                         if (rc < 0)
4528                                 RETURN(rc);
4529                 }
4530         } else {
4531                 /* If object isn't regular a file then don't validate size. */
4532                 if (S_ISDIR(inode->i_mode) &&
4533                     lli->lli_lsm_md != NULL) {
4534                         rc = ll_merge_md_attr(inode);
4535                         if (rc < 0)
4536                                 RETURN(rc);
4537                 }
4538
4539                 inode->i_atime.tv_sec = lli->lli_atime;
4540                 inode->i_mtime.tv_sec = lli->lli_mtime;
4541                 inode->i_ctime.tv_sec = lli->lli_ctime;
4542         }
4543
4544         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4545
4546         if (ll_need_32bit_api(sbi)) {
4547                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4548                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4549                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4550         } else {
4551                 stat->ino = inode->i_ino;
4552                 stat->dev = inode->i_sb->s_dev;
4553                 stat->rdev = inode->i_rdev;
4554         }
4555
4556         stat->mode = inode->i_mode;
4557         stat->uid = inode->i_uid;
4558         stat->gid = inode->i_gid;
4559         stat->atime = inode->i_atime;
4560         stat->mtime = inode->i_mtime;
4561         stat->ctime = inode->i_ctime;
4562         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4563
4564         stat->nlink = inode->i_nlink;
4565         stat->size = i_size_read(inode);
4566         stat->blocks = inode->i_blocks;
4567
4568         return 0;
4569 }
4570
4571 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4572                      __u64 start, __u64 len)
4573 {
4574         int             rc;
4575         size_t          num_bytes;
4576         struct fiemap   *fiemap;
4577         unsigned int    extent_count = fieinfo->fi_extents_max;
4578
4579         num_bytes = sizeof(*fiemap) + (extent_count *
4580                                        sizeof(struct fiemap_extent));
4581         OBD_ALLOC_LARGE(fiemap, num_bytes);
4582
4583         if (fiemap == NULL)
4584                 RETURN(-ENOMEM);
4585
4586         fiemap->fm_flags = fieinfo->fi_flags;
4587         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4588         fiemap->fm_start = start;
4589         fiemap->fm_length = len;
4590         if (extent_count > 0 &&
4591             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4592                            sizeof(struct fiemap_extent)) != 0)
4593                 GOTO(out, rc = -EFAULT);
4594
4595         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4596
4597         fieinfo->fi_flags = fiemap->fm_flags;
4598         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4599         if (extent_count > 0 &&
4600             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4601                          fiemap->fm_mapped_extents *
4602                          sizeof(struct fiemap_extent)) != 0)
4603                 GOTO(out, rc = -EFAULT);
4604 out:
4605         OBD_FREE_LARGE(fiemap, num_bytes);
4606         return rc;
4607 }
4608
4609 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4610 {
4611         struct ll_inode_info *lli = ll_i2info(inode);
4612         struct posix_acl *acl = NULL;
4613         ENTRY;
4614
4615         spin_lock(&lli->lli_lock);
4616         /* VFS' acl_permission_check->check_acl will release the refcount */
4617         acl = posix_acl_dup(lli->lli_posix_acl);
4618         spin_unlock(&lli->lli_lock);
4619
4620         RETURN(acl);
4621 }
4622
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * set_acl inode operation: store or remove a POSIX ACL as an xattr on the
 * MDS.
 *
 * \param[in] inode  inode to modify
 * \param[in] acl    ACL to store, or NULL to remove the xattr
 * \param[in] type   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0         success; the ACL is also placed in the inode ACL cache
 * \retval -EINVAL   unknown \a type
 * \retval -EACCES   a default ACL was given for a non-directory
 * \retval -ENOMEM   the xattr buffer could not be allocated
 * \retval negative  error from posix_acl_update_mode(),
 *                   posix_acl_to_xattr() or md_setxattr()
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req = NULL;
	const char *xattr_name = NULL;
	size_t buf_size = 0;
	char *buf = NULL;
	int rc = 0;
	ENTRY;

	if (type == ACL_TYPE_ACCESS) {
		xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
		if (acl != NULL)
			rc = posix_acl_update_mode(inode, &inode->i_mode,
						   &acl);
	} else if (type == ACL_TYPE_DEFAULT) {
		xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
		/* default ACLs only make sense on directories; removing a
		 * non-existent one from another object is a no-op */
		if (!S_ISDIR(inode->i_mode))
			rc = acl ? -EACCES : 0;
	} else {
		rc = -EINVAL;
	}
	if (rc)
		return rc;

	if (acl != NULL) {
		buf_size = posix_acl_xattr_size(acl->a_count);
		buf = kmalloc(buf_size, GFP_NOFS);
		if (buf == NULL)
			GOTO(out, rc = -ENOMEM);

		rc = posix_acl_to_xattr(&init_user_ns, acl, buf, buf_size);
		if (rc < 0)
			GOTO(out_value, rc);
	}

	/* no value means the xattr is to be removed */
	rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
			 buf ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
			 xattr_name, buf, buf_size, 0, 0, &req);

	ptlrpc_req_finished(req);
out_value:
	kfree(buf);
out:
	if (rc)
		forget_cached_acl(inode, type);
	else
		set_cached_acl(inode, type, acl);
	RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4682
4683 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4684 static int
4685 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4686 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4687 # else
4688 ll_check_acl(struct inode *inode, int mask)
4689 # endif
4690 {
4691 # ifdef CONFIG_FS_POSIX_ACL
4692         struct posix_acl *acl;
4693         int rc;
4694         ENTRY;
4695
4696 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4697         if (flags & IPERM_FLAG_RCU)
4698                 return -ECHILD;
4699 #  endif
4700         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4701
4702         if (!acl)
4703                 RETURN(-EAGAIN);
4704
4705         rc = posix_acl_permission(inode, acl, mask);
4706         posix_acl_release(acl);
4707
4708         RETURN(rc);
4709 # else /* !CONFIG_FS_POSIX_ACL */
4710         return -EAGAIN;
4711 # endif /* CONFIG_FS_POSIX_ACL */
4712 }
4713 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4714
4715 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4716 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4717 #else
4718 # ifdef HAVE_INODE_PERMISION_2ARGS
4719 int ll_inode_permission(struct inode *inode, int mask)
4720 # else
4721 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4722 # endif
4723 #endif
4724 {
4725         int rc = 0;
4726         struct ll_sb_info *sbi;
4727         struct root_squash_info *squash;
4728         struct cred *cred = NULL;
4729         const struct cred *old_cred = NULL;
4730         cfs_cap_t cap;
4731         bool squash_id = false;
4732         ENTRY;
4733
4734 #ifdef MAY_NOT_BLOCK
4735         if (mask & MAY_NOT_BLOCK)
4736                 return -ECHILD;
4737 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4738         if (flags & IPERM_FLAG_RCU)
4739                 return -ECHILD;
4740 #endif
4741
4742        /* as root inode are NOT getting validated in lookup operation,
4743         * need to do it before permission check. */
4744
4745         if (inode == inode->i_sb->s_root->d_inode) {
4746                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4747                 if (rc)
4748                         RETURN(rc);
4749         }
4750
4751         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4752                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4753
4754         /* squash fsuid/fsgid if needed */
4755         sbi = ll_i2sbi(inode);
4756         squash = &sbi->ll_squash;
4757         if (unlikely(squash->rsi_uid != 0 &&
4758                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4759                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4760                         squash_id = true;
4761         }
4762         if (squash_id) {
4763                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4764                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4765                        squash->rsi_uid, squash->rsi_gid);
4766
4767                 /* update current process's credentials
4768                  * and FS capability */
4769                 cred = prepare_creds();
4770                 if (cred == NULL)
4771                         RETURN(-ENOMEM);
4772
4773                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4774                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4775                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4776                         if ((1 << cap) & CFS_CAP_FS_MASK)
4777                                 cap_lower(cred->cap_effective, cap);
4778                 }
4779                 old_cred = override_creds(cred);
4780         }
4781
4782         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4783         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4784         /* restore current process's credentials and FS capability */
4785         if (squash_id) {
4786                 revert_creds(old_cred);
4787                 put_cred(cred);
4788         }
4789
4790         RETURN(rc);
4791 }
4792
4793 /* -o localflock - only provides locally consistent flock locks */
4794 struct file_operations ll_file_operations = {
4795 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4796 # ifdef HAVE_SYNC_READ_WRITE
4797         .read           = new_sync_read,
4798         .write          = new_sync_write,
4799 # endif
4800         .read_iter      = ll_file_read_iter,
4801         .write_iter     = ll_file_write_iter,
4802 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4803         .read           = ll_file_read,
4804         .aio_read       = ll_file_aio_read,
4805         .write          = ll_file_write,
4806         .aio_write      = ll_file_aio_write,
4807 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4808         .unlocked_ioctl = ll_file_ioctl,
4809         .open           = ll_file_open,
4810         .release        = ll_file_release,
4811         .mmap           = ll_file_mmap,
4812         .llseek         = ll_file_seek,
4813         .splice_read    = ll_file_splice_read,
4814         .fsync          = ll_fsync,
4815         .flush          = ll_flush
4816 };
4817
4818 struct file_operations ll_file_operations_flock = {
4819 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4820 # ifdef HAVE_SYNC_READ_WRITE
4821         .read           = new_sync_read,
4822         .write          = new_sync_write,
4823 # endif /* HAVE_SYNC_READ_WRITE */
4824         .read_iter      = ll_file_read_iter,
4825         .write_iter     = ll_file_write_iter,
4826 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4827         .read           = ll_file_read,
4828         .aio_read       = ll_file_aio_read,
4829         .write          = ll_file_write,
4830         .aio_write      = ll_file_aio_write,
4831 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4832         .unlocked_ioctl = ll_file_ioctl,
4833         .open           = ll_file_open,
4834         .release        = ll_file_release,
4835         .mmap           = ll_file_mmap,
4836         .llseek         = ll_file_seek,
4837         .splice_read    = ll_file_splice_read,
4838         .fsync          = ll_fsync,
4839         .flush          = ll_flush,
4840         .flock          = ll_file_flock,
4841         .lock           = ll_file_flock
4842 };
4843
4844 /* These are for -o noflock - to return ENOSYS on flock calls */
4845 struct file_operations ll_file_operations_noflock = {
4846 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4847 # ifdef HAVE_SYNC_READ_WRITE
4848         .read           = new_sync_read,
4849         .write          = new_sync_write,
4850 # endif /* HAVE_SYNC_READ_WRITE */
4851         .read_iter      = ll_file_read_iter,
4852         .write_iter     = ll_file_write_iter,
4853 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4854         .read           = ll_file_read,
4855         .aio_read       = ll_file_aio_read,
4856         .write          = ll_file_write,
4857         .aio_write      = ll_file_aio_write,
4858 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4859         .unlocked_ioctl = ll_file_ioctl,
4860         .open           = ll_file_open,
4861         .release        = ll_file_release,
4862         .mmap           = ll_file_mmap,
4863         .llseek         = ll_file_seek,
4864         .splice_read    = ll_file_splice_read,
4865         .fsync          = ll_fsync,
4866         .flush          = ll_flush,
4867         .flock          = ll_file_noflock,
4868         .lock           = ll_file_noflock
4869 };
4870
4871 struct inode_operations ll_file_inode_operations = {
4872         .setattr        = ll_setattr,
4873         .getattr        = ll_getattr,
4874         .permission     = ll_inode_permission,
4875 #ifdef HAVE_IOP_XATTR
4876         .setxattr       = ll_setxattr,
4877         .getxattr       = ll_getxattr,
4878         .removexattr    = ll_removexattr,
4879 #endif
4880         .listxattr      = ll_listxattr,
4881         .fiemap         = ll_fiemap,
4882 #ifdef HAVE_IOP_GET_ACL
4883         .get_acl        = ll_get_acl,
4884 #endif
4885 #ifdef HAVE_IOP_SET_ACL
4886         .set_acl        = ll_set_acl,
4887 #endif
4888 };
4889
4890 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4891 {
4892         struct ll_inode_info *lli = ll_i2info(inode);
4893         struct cl_object *obj = lli->lli_clob;
4894         struct lu_env *env;
4895         int rc;
4896         __u16 refcheck;
4897         ENTRY;
4898
4899         if (obj == NULL)
4900                 RETURN(0);
4901
4902         env = cl_env_get(&refcheck);
4903         if (IS_ERR(env))
4904                 RETURN(PTR_ERR(env));
4905
4906         rc = cl_conf_set(env, lli->lli_clob, conf);
4907         if (rc < 0)
4908                 GOTO(out, rc);
4909
4910         if (conf->coc_opc == OBJECT_CONF_SET) {
4911                 struct ldlm_lock *lock = conf->coc_lock;
4912                 struct cl_layout cl = {
4913                         .cl_layout_gen = 0,
4914                 };
4915
4916                 LASSERT(lock != NULL);
4917                 LASSERT(ldlm_has_layout(lock));
4918
4919                 /* it can only be allowed to match after layout is
4920                  * applied to inode otherwise false layout would be
4921                  * seen. Applying layout shoud happen before dropping
4922                  * the intent lock. */
4923                 ldlm_lock_allow_match(lock);
4924
4925                 rc = cl_object_layout_get(env, obj, &cl);
4926                 if (rc < 0)
4927                         GOTO(out, rc);
4928
4929                 CDEBUG(D_VFSTRACE,
4930                        DFID": layout version change: %u -> %u\n",
4931                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4932                        cl.cl_layout_gen);
4933                 ll_layout_version_set(lli, cl.cl_layout_gen);
4934         }
4935
4936 out:
4937         cl_env_put(env, &refcheck);
4938
4939         RETURN(rc);
4940 }
4941
4942 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4943 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4944
4945 {
4946         struct ll_sb_info *sbi = ll_i2sbi(inode);
4947         struct ptlrpc_request *req;
4948         void *lvbdata;
4949         void *lmm;
4950         int lmmsize;
4951         int rc;
4952         ENTRY;
4953
4954         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4955                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4956                lock->l_lvb_data, lock->l_lvb_len);
4957
4958         if (lock->l_lvb_data != NULL)
4959                 RETURN(0);
4960
4961         /* if layout lock was granted right away, the layout is returned
4962          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4963          * blocked and then granted via completion ast, we have to fetch
4964          * layout here. Please note that we can't use the LVB buffer in
4965          * completion AST because it doesn't have a large enough buffer */
4966         rc = ll_get_default_mdsize(sbi, &lmmsize);
4967         if (rc < 0)
4968                 RETURN(rc);
4969
4970         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4971                          XATTR_NAME_LOV, lmmsize, &req);
4972         if (rc < 0) {
4973                 if (rc == -ENODATA)
4974                         GOTO(out, rc = 0); /* empty layout */
4975                 else
4976                         RETURN(rc);
4977         }
4978
4979         lmmsize = rc;
4980         rc = 0;
4981         if (lmmsize == 0) /* empty layout */
4982                 GOTO(out, rc = 0);
4983
4984         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4985         if (lmm == NULL)
4986                 GOTO(out, rc = -EFAULT);
4987
4988         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4989         if (lvbdata == NULL)
4990                 GOTO(out, rc = -ENOMEM);
4991
4992         memcpy(lvbdata, lmm, lmmsize);
4993         lock_res_and_lock(lock);
4994         if (unlikely(lock->l_lvb_data == NULL)) {
4995                 lock->l_lvb_type = LVB_T_LAYOUT;
4996                 lock->l_lvb_data = lvbdata;
4997                 lock->l_lvb_len = lmmsize;
4998                 lvbdata = NULL;
4999         }
5000         unlock_res_and_lock(lock);
5001
5002         if (lvbdata)
5003                 OBD_FREE_LARGE(lvbdata, lmmsize);
5004
5005         EXIT;
5006
5007 out:
5008         ptlrpc_req_finished(req);
5009         return rc;
5010 }
5011
5012 /**
5013  * Apply the layout to the inode. Layout lock is held and will be released
5014  * in this function.
5015  */
5016 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5017                               struct inode *inode)
5018 {
5019         struct ll_inode_info *lli = ll_i2info(inode);
5020         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5021         struct ldlm_lock *lock;
5022         struct cl_object_conf conf;
5023         int rc = 0;
5024         bool lvb_ready;
5025         bool wait_layout = false;
5026         ENTRY;
5027
5028         LASSERT(lustre_handle_is_used(lockh));
5029
5030         lock = ldlm_handle2lock(lockh);
5031         LASSERT(lock != NULL);
5032         LASSERT(ldlm_has_layout(lock));
5033
5034         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5035                    PFID(&lli->lli_fid), inode);
5036
5037         /* in case this is a caching lock and reinstate with new inode */
5038         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5039
5040         lock_res_and_lock(lock);
5041         lvb_ready = ldlm_is_lvb_ready(lock);
5042         unlock_res_and_lock(lock);
5043
5044         /* checking lvb_ready is racy but this is okay. The worst case is
5045          * that multi processes may configure the file on the same time. */
5046         if (lvb_ready)
5047                 GOTO(out, rc = 0);
5048
5049         rc = ll_layout_fetch(inode, lock);
5050         if (rc < 0)
5051                 GOTO(out, rc);
5052
5053         /* for layout lock, lmm is stored in lock's lvb.
5054          * lvb_data is immutable if the lock is held so it's safe to access it
5055          * without res lock.
5056          *
5057          * set layout to file. Unlikely this will fail as old layout was
5058          * surely eliminated */
5059         memset(&conf, 0, sizeof conf);
5060         conf.coc_opc = OBJECT_CONF_SET;
5061         conf.coc_inode = inode;
5062         conf.coc_lock = lock;
5063         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5064         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5065         rc = ll_layout_conf(inode, &conf);
5066
5067         /* refresh layout failed, need to wait */
5068         wait_layout = rc == -EBUSY;
5069         EXIT;
5070 out:
5071         LDLM_LOCK_PUT(lock);
5072         ldlm_lock_decref(lockh, mode);
5073
5074         /* wait for IO to complete if it's still being used. */
5075         if (wait_layout) {
5076                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5077                        ll_get_fsname(inode->i_sb, NULL, 0),
5078                        PFID(&lli->lli_fid), inode);
5079
5080                 memset(&conf, 0, sizeof conf);
5081                 conf.coc_opc = OBJECT_CONF_WAIT;
5082                 conf.coc_inode = inode;
5083                 rc = ll_layout_conf(inode, &conf);
5084                 if (rc == 0)
5085                         rc = -EAGAIN;
5086
5087                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5088                        ll_get_fsname(inode->i_sb, NULL, 0),
5089                        PFID(&lli->lli_fid), rc);
5090         }
5091         RETURN(rc);
5092 }
5093
5094 /**
5095  * Issue layout intent RPC to MDS.
5096  * \param inode [in]    file inode
5097  * \param intent [in]   layout intent
5098  *
5099  * \retval 0    on success
5100  * \retval < 0  error code
5101  */
5102 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5103 {
5104         struct ll_inode_info  *lli = ll_i2info(inode);
5105         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5106         struct md_op_data     *op_data;
5107         struct lookup_intent it;
5108         struct ptlrpc_request *req;
5109         int rc;
5110         ENTRY;
5111
5112         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5113                                      0, 0, LUSTRE_OPC_ANY, NULL);
5114         if (IS_ERR(op_data))
5115                 RETURN(PTR_ERR(op_data));
5116
5117         op_data->op_data = intent;
5118         op_data->op_data_size = sizeof(*intent);
5119
5120         memset(&it, 0, sizeof(it));
5121         it.it_op = IT_LAYOUT;
5122         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5123             intent->li_opc == LAYOUT_INTENT_TRUNC)
5124                 it.it_flags = FMODE_WRITE;
5125
5126         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5127                           ll_get_fsname(inode->i_sb, NULL, 0),
5128                           PFID(&lli->lli_fid), inode);
5129
5130         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5131                             &ll_md_blocking_ast, 0);
5132         if (it.it_request != NULL)
5133                 ptlrpc_req_finished(it.it_request);
5134         it.it_request = NULL;
5135
5136         ll_finish_md_op_data(op_data);
5137
5138         /* set lock data in case this is a new lock */
5139         if (!rc)
5140                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5141
5142         ll_intent_drop_lock(&it);
5143
5144         RETURN(rc);
5145 }
5146
5147 /**
5148  * This function checks if there exists a LAYOUT lock on the client side,
5149  * or enqueues it if it doesn't have one in cache.
5150  *
5151  * This function will not hold layout lock so it may be revoked any time after
5152  * this function returns. Any operations depend on layout should be redone
5153  * in that case.
5154  *
5155  * This function should be called before lov_io_init() to get an uptodate
5156  * layout version, the caller should save the version number and after IO
5157  * is finished, this function should be called again to verify that layout
5158  * is not changed during IO time.
5159  */
5160 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5161 {
5162         struct ll_inode_info    *lli = ll_i2info(inode);
5163         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5164         struct lustre_handle lockh;
5165         struct layout_intent intent = {
5166                 .li_opc = LAYOUT_INTENT_ACCESS,
5167         };
5168         enum ldlm_mode mode;
5169         int rc;
5170         ENTRY;
5171
5172         *gen = ll_layout_version_get(lli);
5173         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5174                 RETURN(0);
5175
5176         /* sanity checks */
5177         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5178         LASSERT(S_ISREG(inode->i_mode));
5179
5180         /* take layout lock mutex to enqueue layout lock exclusively. */
5181         mutex_lock(&lli->lli_layout_mutex);
5182
5183         while (1) {
5184                 /* mostly layout lock is caching on the local side, so try to
5185                  * match it before grabbing layout lock mutex. */
5186                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5187                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5188                 if (mode != 0) { /* hit cached lock */
5189                         rc = ll_layout_lock_set(&lockh, mode, inode);
5190                         if (rc == -EAGAIN)
5191                                 continue;
5192                         break;
5193                 }
5194
5195                 rc = ll_layout_intent(inode, &intent);
5196                 if (rc != 0)
5197                         break;
5198         }
5199
5200         if (rc == 0)
5201                 *gen = ll_layout_version_get(lli);
5202         mutex_unlock(&lli->lli_layout_mutex);
5203
5204         RETURN(rc);
5205 }
5206
5207 /**
5208  * Issue layout intent RPC indicating where in a file an IO is about to write.
5209  *
5210  * \param[in] inode     file inode.
5211  * \param[in] ext       write range with start offset of fille in bytes where
5212  *                      an IO is about to write, and exclusive end offset in
5213  *                      bytes.
5214  *
5215  * \retval 0    on success
5216  * \retval < 0  error code
5217  */
5218 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5219                            struct lu_extent *ext)
5220 {
5221         struct layout_intent intent = {
5222                 .li_opc = opc,
5223                 .li_extent.e_start = ext->e_start,
5224                 .li_extent.e_end = ext->e_end,
5225         };
5226         int rc;
5227         ENTRY;
5228
5229         rc = ll_layout_intent(inode, &intent);
5230
5231         RETURN(rc);
5232 }
5233
5234 /**
5235  *  This function send a restore request to the MDT
5236  */
5237 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5238 {
5239         struct hsm_user_request *hur;
5240         int                      len, rc;
5241         ENTRY;
5242
5243         len = sizeof(struct hsm_user_request) +
5244               sizeof(struct hsm_user_item);
5245         OBD_ALLOC(hur, len);
5246         if (hur == NULL)
5247                 RETURN(-ENOMEM);
5248
5249         hur->hur_request.hr_action = HUA_RESTORE;
5250         hur->hur_request.hr_archive_id = 0;
5251         hur->hur_request.hr_flags = 0;
5252         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5253                sizeof(hur->hur_user_item[0].hui_fid));
5254         hur->hur_user_item[0].hui_extent.offset = offset;
5255         hur->hur_user_item[0].hui_extent.length = length;
5256         hur->hur_request.hr_itemcount = 1;
5257         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5258                            len, hur, NULL);
5259         OBD_FREE(hur, len);
5260         RETURN(rc);
5261 }