lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
/*
 * Argument bundle handed to ll_close_inode_openhandle() through its opaque
 * "data" pointer when the close bias is MDS_CLOSE_LAYOUT_SPLIT.
 */
struct split_param {
        struct inode    *sp_inode;      /* inode whose FID is sent as op_fid2 */
        __u16           sp_mirror_id;   /* value copied into op_mirror_id */
};
  60
/*
 * Forward declaration: ll_md_close() calls this to drop a group lock that
 * is still held on the file being closed.
 */
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

/*
 * Forward declaration: ll_md_close() calls this to release a lease handle.
 * The caller passes an uninitialized flag through \a lease_broken and logs
 * it afterwards, so the callee is presumably expected to set it.
 */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP or MDS_CLOSE_LAYOUT_MERGE then \a data
 * is a pointer to the inode to swap/merge layouts with.
 * If \a bias is MDS_CLOSE_LAYOUT_SPLIT then \a data is a struct split_param.
 * If \a bias is MDS_CLOSE_RESYNC_DONE then \a data is a struct ll_ioc_lease.
 * For any other \a bias, \a data must be NULL.
 *
 * \a och is always consumed: on every path its replay data is cleared, its
 * cookie is poisoned and the structure is freed, so the caller must not
 * touch it afterwards.
 *
 * \retval 0        close sent successfully, or the MDC export is invalid
 * \retval -ENOMEM  op_data could not be allocated (no rpc is sent)
 * \retval -EBUSY   the rpc succeeded but the reply lacks
 *                  OBD_MD_CLOSE_INTENT_EXECED for the requested bias
 * \retval other    error returned by md_close()
 */
static int ll_close_inode_openhandle(struct inode *inode,
                                     struct obd_client_handle *och,
                                     enum mds_op_bias bias, void *data)
{
        struct obd_export *md_exp = ll_i2mdexp(inode);
        const struct ll_inode_info *lli = ll_i2info(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        int rc;
        ENTRY;

        /* Without a usable MDC device there is nobody to send the close to;
         * report success but still release och below. */
        if (class_exp2obd(md_exp) == NULL) {
                CERROR("%s: invalid MDC connection handle closing "DFID"\n",
                       ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
                GOTO(out, rc = 0);
        }

        OBD_ALLOC_PTR(op_data);
        /* We leak openhandle and request here on error, but not much to be
         * done in OOM case since app won't retry close on error either.
         * NOTE(review): the local och is still freed at "out:"; the leak
         * presumably refers to the handle left open on the server. */
        if (op_data == NULL)
                GOTO(out, rc = -ENOMEM);

        ll_prepare_close(inode, op_data, och);
        switch (bias) {
        case MDS_CLOSE_LAYOUT_MERGE:
                /* merge blocks from the victim inode */
                op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                /* fallthrough: MERGE shares the lease/fid2 setup below */
        case MDS_CLOSE_LAYOUT_SPLIT:
        case MDS_CLOSE_LAYOUT_SWAP: {
                /* only meaningful (and only dereferenced) for SPLIT */
                struct split_param *sp = data;

                LASSERT(data != NULL);
                op_data->op_bias |= bias;
                op_data->op_data_version = 0;
                op_data->op_lease_handle = och->och_lease_handle;
                if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
                        op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
                        op_data->op_mirror_id = sp->sp_mirror_id;
                } else {
                        /* SWAP and MERGE: data is the other inode itself */
                        op_data->op_fid2 = *ll_inode2fid(data);
                }
                break;
        }

        case MDS_CLOSE_RESYNC_DONE: {
                struct ll_ioc_lease *ioc = data;

                LASSERT(data != NULL);
                /* scale the block count by (1 + lil_count) */
                op_data->op_attr_blocks +=
                        ioc->lil_count * op_data->op_attr_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

                op_data->op_lease_handle = och->och_lease_handle;
                /* the id array from the ioctl is sent as the rpc payload */
                op_data->op_data = &ioc->lil_ids[0];
                op_data->op_data_size =
                        ioc->lil_count * sizeof(ioc->lil_ids[0]);
                break;
        }

        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                break;

        default:
                /* plain close: no auxiliary argument is allowed */
                LASSERT(data == NULL);
                break;
        }

        /* Size/blocks not explicitly marked valid above are flagged as
         * "lazy" instead. */
        if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
                op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
        if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
                op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;

        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc != 0 && rc != -EINTR)
                CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
                       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

        /* For a biased close, success of the rpc is not enough: the reply
         * must confirm that the close intent was actually executed. */
        if (rc == 0 && op_data->op_bias & bias) {
                struct mdt_body *body;

                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
                        rc = -EBUSY;
        }

        ll_finish_md_op_data(op_data);
        EXIT;
out:

        /* och is released unconditionally, whatever the outcome above */
        md_clear_open_replay_data(md_exp, och);
        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
        OBD_FREE_PTR(och);

        ptlrpc_req_finished(req);       /* This is close request */
        return rc;
}
 236
 237 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 238 {
 239         struct ll_inode_info *lli = ll_i2info(inode);
 240         struct obd_client_handle **och_p;
 241         struct obd_client_handle *och;
 242         __u64 *och_usecount;
 243         int rc = 0;
 244         ENTRY;
 245
 246         if (fmode & FMODE_WRITE) {
 247                 och_p = &lli->lli_mds_write_och;
 248                 och_usecount = &lli->lli_open_fd_write_count;
 249         } else if (fmode & FMODE_EXEC) {
 250                 och_p = &lli->lli_mds_exec_och;
 251                 och_usecount = &lli->lli_open_fd_exec_count;
 252         } else {
 253                 LASSERT(fmode & FMODE_READ);
 254                 och_p = &lli->lli_mds_read_och;
 255                 och_usecount = &lli->lli_open_fd_read_count;
 256         }
 257
 258         mutex_lock(&lli->lli_och_mutex);
 259         if (*och_usecount > 0) {
 260                 /* There are still users of this handle, so skip
 261                  * freeing it. */
 262                 mutex_unlock(&lli->lli_och_mutex);
 263                 RETURN(0);
 264         }
 265
 266         och = *och_p;
 267         *och_p = NULL;
 268         mutex_unlock(&lli->lli_och_mutex);
 269
 270         if (och != NULL) {
 271                 /* There might be a race and this handle may already
 272                  * be closed. */
 273                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 274         }
 275
 276         RETURN(rc);
 277 }
 278
 279 static int ll_md_close(struct inode *inode, struct file *file)
 280 {
 281         union ldlm_policy_data policy = {
 282                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 283         };
 284         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 285         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 286         struct ll_inode_info *lli = ll_i2info(inode);
 287         struct lustre_handle lockh;
 288         enum ldlm_mode lockmode;
 289         int rc = 0;
 290         ENTRY;
 291
 292         /* clear group lock, if present */
 293         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 294                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 295
 296         if (fd->fd_lease_och != NULL) {
 297                 bool lease_broken;
 298
 299                 /* Usually the lease is not released when the
 300                  * application crashed, we need to release here. */
 301                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 302                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 303                         PFID(&lli->lli_fid), rc, lease_broken);
 304
 305                 fd->fd_lease_och = NULL;
 306         }
 307
 308         if (fd->fd_och != NULL) {
 309                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 310                 fd->fd_och = NULL;
 311                 GOTO(out, rc);
 312         }
 313
 314         /* Let's see if we have good enough OPEN lock on the file and if
 315            we can skip talking to MDS */
 316         mutex_lock(&lli->lli_och_mutex);
 317         if (fd->fd_omode & FMODE_WRITE) {
 318                 lockmode = LCK_CW;
 319                 LASSERT(lli->lli_open_fd_write_count);
 320                 lli->lli_open_fd_write_count--;
 321         } else if (fd->fd_omode & FMODE_EXEC) {
 322                 lockmode = LCK_PR;
 323                 LASSERT(lli->lli_open_fd_exec_count);
 324                 lli->lli_open_fd_exec_count--;
 325         } else {
 326                 lockmode = LCK_CR;
 327                 LASSERT(lli->lli_open_fd_read_count);
 328                 lli->lli_open_fd_read_count--;
 329         }
 330         mutex_unlock(&lli->lli_och_mutex);
 331
 332         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 333                            LDLM_IBITS, &policy, lockmode, &lockh))
 334                 rc = ll_md_real_close(inode, fd->fd_omode);
 335
 336 out:
 337         LUSTRE_FPRIVATE(file) = NULL;
 338         ll_file_data_put(fd);
 339
 340         RETURN(rc);
 341 }
 342
 343 /* While this returns an error code, fput() the caller does not, so we need
 344  * to make every effort to clean up all of our state here.  Also, applications
 345  * rarely check close errors and even if an error is returned they will not
 346  * re-try the close call.
 347  */
 348 int ll_file_release(struct inode *inode, struct file *file)
 349 {
 350         struct ll_file_data *fd;
 351         struct ll_sb_info *sbi = ll_i2sbi(inode);
 352         struct ll_inode_info *lli = ll_i2info(inode);
 353         int rc;
 354         ENTRY;
 355
 356         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 357                PFID(ll_inode2fid(inode)), inode);
 358
 359         if (inode->i_sb->s_root != file_dentry(file))
 360                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 361         fd = LUSTRE_FPRIVATE(file);
 362         LASSERT(fd != NULL);
 363
 364         /* The last ref on @file, maybe not the the owner pid of statahead,
 365          * because parent and child process can share the same file handle. */
 366         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 367                 ll_deauthorize_statahead(inode, fd);
 368
 369         if (inode->i_sb->s_root == file_dentry(file)) {
 370                 LUSTRE_FPRIVATE(file) = NULL;
 371                 ll_file_data_put(fd);
 372                 RETURN(0);
 373         }
 374
 375         if (!S_ISDIR(inode->i_mode)) {
 376                 if (lli->lli_clob != NULL)
 377                         lov_read_and_clear_async_rc(lli->lli_clob);
 378                 lli->lli_async_rc = 0;
 379         }
 380
 381         rc = ll_md_close(inode, file);
 382
 383         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 384                 libcfs_debug_dumplog();
 385
 386         RETURN(rc);
 387 }
 388
 389 static inline int ll_dom_readpage(void *data, struct page *page)
 390 {
 391         struct niobuf_local *lnb = data;
 392         void *kaddr;
 393
 394         kaddr = ll_kmap_atomic(page, KM_USER0);
 395         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 396         if (lnb->lnb_len < PAGE_SIZE)
 397                 memset(kaddr + lnb->lnb_len, 0,
 398                        PAGE_SIZE - lnb->lnb_len);
 399         flush_dcache_page(page);
 400         SetPageUptodate(page);
 401         ll_kunmap_atomic(kaddr, KM_USER0);
 402         unlock_page(page);
 403
 404         return 0;
 405 }
 406
 407 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 408                         struct lookup_intent *it)
 409 {
 410         struct ll_inode_info *lli = ll_i2info(inode);
 411         struct cl_object *obj = lli->lli_clob;
 412         struct address_space *mapping = inode->i_mapping;
 413         struct page *vmpage;
 414         struct niobuf_remote *rnb;
 415         char *data;
 416         struct lustre_handle lockh;
 417         struct ldlm_lock *lock;
 418         unsigned long index, start;
 419         struct niobuf_local lnb;
 420         bool dom_lock = false;
 421
 422         ENTRY;
 423
 424         if (obj == NULL)
 425                 RETURN_EXIT;
 426
 427         if (it->it_lock_mode != 0) {
 428                 lockh.cookie = it->it_lock_handle;
 429                 lock = ldlm_handle2lock(&lockh);
 430                 if (lock != NULL)
 431                         dom_lock = ldlm_has_dom(lock);
 432                 LDLM_LOCK_PUT(lock);
 433         }
 434         if (!dom_lock)
 435                 RETURN_EXIT;
 436
 437         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 438                                    RCL_SERVER))
 439                 RETURN_EXIT;
 440
 441         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 442         if (rnb == NULL || rnb->rnb_len == 0)
 443                 RETURN_EXIT;
 444
 445         /* LU-11595: Server may return whole file and that is OK always or
 446          * it may return just file tail and its offset must be aligned with
 447          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 448          * smaller then offset may be not aligned and that data is just ignored.
 449          */
 450         if (rnb->rnb_offset % PAGE_SIZE)
 451                 RETURN_EXIT;
 452
 453         /* Server returns whole file or just file tail if it fills in
 454          * reply buffer, in both cases total size should be inode size.
 455          */
 456         if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
 457                 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
 458                        ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
 459                        rnb->rnb_len, i_size_read(inode));
 460                 RETURN_EXIT;
 461         }
 462
 463         CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
 464                rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
 465
 466         data = (char *)rnb + sizeof(*rnb);
 467
 468         lnb.lnb_file_offset = rnb->rnb_offset;
 469         start = lnb.lnb_file_offset / PAGE_SIZE;
 470         index = 0;
 471         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 472         lnb.lnb_page_offset = 0;
 473         do {
 474                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 475                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 476                 if (lnb.lnb_len > PAGE_SIZE)
 477                         lnb.lnb_len = PAGE_SIZE;
 478
 479                 vmpage = read_cache_page(mapping, index + start,
 480                                          ll_dom_readpage, &lnb);
 481                 if (IS_ERR(vmpage)) {
 482                         CWARN("%s: cannot fill page %lu for "DFID
 483                               " with data: rc = %li\n",
 484                               ll_i2sbi(inode)->ll_fsname, index + start,
 485                               PFID(lu_object_fid(&obj->co_lu)),
 486                               PTR_ERR(vmpage));
 487                         break;
 488                 }
 489                 put_page(vmpage);
 490                 index++;
 491         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 492         EXIT;
 493 }
 494
 495 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 496                                 struct lookup_intent *itp)
 497 {
 498         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 499         struct dentry *parent = de->d_parent;
 500         char *name = NULL;
 501         int len = 0;
 502         struct md_op_data *op_data;
 503         struct ptlrpc_request *req = NULL;
 504         int rc;
 505         ENTRY;
 506
 507         LASSERT(parent != NULL);
 508         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 509
 510         /* if server supports open-by-fid, or file name is invalid, don't pack
 511          * name in open request */
 512         if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
 513             !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
 514 retry:
 515                 len = de->d_name.len;
 516                 name = kmalloc(len + 1, GFP_NOFS);
 517                 if (!name)
 518                         RETURN(-ENOMEM);
 519
 520                 /* race here */
 521                 spin_lock(&de->d_lock);
 522                 if (len != de->d_name.len) {
 523                         spin_unlock(&de->d_lock);
 524                         kfree(name);
 525                         goto retry;
 526                 }
 527                 memcpy(name, de->d_name.name, len);
 528                 name[len] = '\0';
 529                 spin_unlock(&de->d_lock);
 530
 531                 if (!lu_name_is_valid_2(name, len)) {
 532                         kfree(name);
 533                         RETURN(-ESTALE);
 534                 }
 535         }
 536
 537         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 538                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 539         if (IS_ERR(op_data)) {
 540                 kfree(name);
 541                 RETURN(PTR_ERR(op_data));
 542         }
 543         op_data->op_data = lmm;
 544         op_data->op_data_size = lmmsize;
 545
 546         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 547                             &ll_md_blocking_ast, 0);
 548         kfree(name);
 549         ll_finish_md_op_data(op_data);
 550         if (rc == -ESTALE) {
 551                 /* reason for keep own exit path - don`t flood log
 552                  * with messages with -ESTALE errors.
 553                  */
 554                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 555                      it_open_error(DISP_OPEN_OPEN, itp))
 556                         GOTO(out, rc);
 557                 ll_release_openhandle(de, itp);
 558                 GOTO(out, rc);
 559         }
 560
 561         if (it_disposition(itp, DISP_LOOKUP_NEG))
 562                 GOTO(out, rc = -ENOENT);
 563
 564         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 565                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 566                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 567                 GOTO(out, rc);
 568         }
 569
 570         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 571
 572         if (!rc && itp->it_lock_mode) {
 573                 ll_dom_finish_open(de->d_inode, req, itp);
 574                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 575         }
 576
 577 out:
 578         ptlrpc_req_finished(req);
 579         ll_intent_drop_lock(itp);
 580
 581         /* We did open by fid, but by the time we got to the server,
 582          * the object disappeared. If this is a create, we cannot really
 583          * tell the userspace that the file it was trying to create
 584          * does not exist. Instead let's return -ESTALE, and the VFS will
 585          * retry the create with LOOKUP_REVAL that we are going to catch
 586          * in ll_revalidate_dentry() and use lookup then.
 587          */
 588         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 589                 rc = -ESTALE;
 590
 591         RETURN(rc);
 592 }
 593
 594 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 595                        struct obd_client_handle *och)
 596 {
 597         struct mdt_body *body;
 598
 599         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 600         och->och_open_handle = body->mbo_open_handle;
 601         och->och_fid = body->mbo_fid1;
 602         och->och_lease_handle.cookie = it->it_lock_handle;
 603         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 604         och->och_flags = it->it_flags;
 605
 606         return md_set_open_replay_data(md_exp, och, it);
 607 }
 608
 609 static int ll_local_open(struct file *file, struct lookup_intent *it,
 610                          struct ll_file_data *fd, struct obd_client_handle *och)
 611 {
 612         struct inode *inode = file_inode(file);
 613         ENTRY;
 614
 615         LASSERT(!LUSTRE_FPRIVATE(file));
 616
 617         LASSERT(fd != NULL);
 618
 619         if (och) {
 620                 int rc;
 621
 622                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 623                 if (rc != 0)
 624                         RETURN(rc);
 625         }
 626
 627         LUSTRE_FPRIVATE(file) = fd;
 628         ll_readahead_init(inode, &fd->fd_ras);
 629         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 630
 631         /* ll_cl_context initialize */
 632         rwlock_init(&fd->fd_lock);
 633         INIT_LIST_HEAD(&fd->fd_lccs);
 634
 635         RETURN(0);
 636 }
 637
 638 /* Open a file, and (for the very first open) create objects on the OSTs at
 639  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 640  * creation or open until ll_lov_setstripe() ioctl is called.
 641  *
 642  * If we already have the stripe MD locally then we don't request it in
 643  * md_open(), by passing a lmm_size = 0.
 644  *
 645  * It is up to the application to ensure no other processes open this file
 646  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 647  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 648  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 649  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 650  */
 651 int ll_file_open(struct inode *inode, struct file *file)
 652 {
 653         struct ll_inode_info *lli = ll_i2info(inode);
 654         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 655                                           .it_flags = file->f_flags };
 656         struct obd_client_handle **och_p = NULL;
 657         __u64 *och_usecount = NULL;
 658         struct ll_file_data *fd;
 659         int rc = 0;
 660         ENTRY;
 661
 662         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 663                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 664
 665         it = file->private_data; /* XXX: compat macro */
 666         file->private_data = NULL; /* prevent ll_local_open assertion */
 667
 668         fd = ll_file_data_get();
 669         if (fd == NULL)
 670                 GOTO(out_nofiledata, rc = -ENOMEM);
 671
 672         fd->fd_file = file;
 673         if (S_ISDIR(inode->i_mode))
 674                 ll_authorize_statahead(inode, fd);
 675
 676         if (inode->i_sb->s_root == file_dentry(file)) {
 677                 LUSTRE_FPRIVATE(file) = fd;
 678                 RETURN(0);
 679         }
 680
 681         if (!it || !it->it_disposition) {
 682                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 683                  * because everything but O_ACCMODE mask was stripped from
 684                  * there */
 685                 if ((oit.it_flags + 1) & O_ACCMODE)
 686                         oit.it_flags++;
 687                 if (file->f_flags & O_TRUNC)
 688                         oit.it_flags |= FMODE_WRITE;
 689
 690                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 691                  * dentry_open after call to open_namei that checks permissions.
 692                  * Only nfsd_open call dentry_open directly without checking
 693                  * permissions and because of that this code below is safe.
 694                  */
 695                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 696                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 697
 698                 /* We do not want O_EXCL here, presumably we opened the file
 699                  * already? XXX - NFS implications? */
 700                 oit.it_flags &= ~O_EXCL;
 701
 702                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 703                  * created if necessary, then "IT_CREAT" should be set to keep
 704                  * consistent with it */
 705                 if (oit.it_flags & O_CREAT)
 706                         oit.it_op |= IT_CREAT;
 707
 708                 it = &oit;
 709         }
 710
 711 restart:
 712         /* Let's see if we have file open on MDS already. */
 713         if (it->it_flags & FMODE_WRITE) {
 714                 och_p = &lli->lli_mds_write_och;
 715                 och_usecount = &lli->lli_open_fd_write_count;
 716         } else if (it->it_flags & FMODE_EXEC) {
 717                 och_p = &lli->lli_mds_exec_och;
 718                 och_usecount = &lli->lli_open_fd_exec_count;
 719          } else {
 720                 och_p = &lli->lli_mds_read_och;
 721                 och_usecount = &lli->lli_open_fd_read_count;
 722         }
 723
 724         mutex_lock(&lli->lli_och_mutex);
 725         if (*och_p) { /* Open handle is present */
 726                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 727                         /* Well, there's extra open request that we do not need,
 728                            let's close it somehow. This will decref request. */
 729                         rc = it_open_error(DISP_OPEN_OPEN, it);
 730                         if (rc) {
 731                                 mutex_unlock(&lli->lli_och_mutex);
 732                                 GOTO(out_openerr, rc);
 733                         }
 734
 735                         ll_release_openhandle(file_dentry(file), it);
 736                 }
 737                 (*och_usecount)++;
 738
 739                 rc = ll_local_open(file, it, fd, NULL);
 740                 if (rc) {
 741                         (*och_usecount)--;
 742                         mutex_unlock(&lli->lli_och_mutex);
 743                         GOTO(out_openerr, rc);
 744                 }
 745         } else {
 746                 LASSERT(*och_usecount == 0);
 747                 if (!it->it_disposition) {
 748                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 749                         /* We cannot just request lock handle now, new ELC code
 750                            means that one of other OPEN locks for this file
 751                            could be cancelled, and since blocking ast handler
 752                            would attempt to grab och_mutex as well, that would
 753                            result in a deadlock */
 754                         mutex_unlock(&lli->lli_och_mutex);
 755                         /*
 756                          * Normally called under two situations:
 757                          * 1. NFS export.
 758                          * 2. A race/condition on MDS resulting in no open
 759                          *    handle to be returned from LOOKUP|OPEN request,
 760                          *    for example if the target entry was a symlink.
 761                          *
 762                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 763                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 764                          *  bit so that it's not confusing later callers.
 765                          *
 766                          *  NB; when ldd is NULL, it must have come via normal
 767                          *  lookup path only, since ll_iget_for_nfs always calls
 768                          *  ll_d_init().
 769                          */
 770                         if (ldd && ldd->lld_nfs_dentry) {
 771                                 ldd->lld_nfs_dentry = 0;
 772                                 it->it_flags |= MDS_OPEN_LOCK;
 773                         }
 774
 775                          /*
 776                          * Always specify MDS_OPEN_BY_FID because we don't want
 777                          * to get file with different fid.
 778                          */
 779                         it->it_flags |= MDS_OPEN_BY_FID;
 780                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 781                                                  it);
 782                         if (rc)
 783                                 GOTO(out_openerr, rc);
 784
 785                         goto restart;
 786                 }
 787                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 788                 if (!*och_p)
 789                         GOTO(out_och_free, rc = -ENOMEM);
 790
 791                 (*och_usecount)++;
 792
 793                 /* md_intent_lock() didn't get a request ref if there was an
 794                  * open error, so don't do cleanup on the request here
 795                  * (bug 3430) */
 796                 /* XXX (green): Should not we bail out on any error here, not
 797                  * just open error? */
 798                 rc = it_open_error(DISP_OPEN_OPEN, it);
 799                 if (rc != 0)
 800                         GOTO(out_och_free, rc);
 801
 802                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 803                          "inode %p: disposition %x, status %d\n", inode,
 804                          it_disposition(it, ~0), it->it_status);
 805
 806                 rc = ll_local_open(file, it, fd, *och_p);
 807                 if (rc)
 808                         GOTO(out_och_free, rc);
 809         }
 810         mutex_unlock(&lli->lli_och_mutex);
 811         fd = NULL;
 812
 813         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 814            different kind of OPEN lock for this same inode gets cancelled
 815            by ldlm_cancel_lru */
 816         if (!S_ISREG(inode->i_mode))
 817                 GOTO(out_och_free, rc);
 818
 819         cl_lov_delay_create_clear(&file->f_flags);
 820         GOTO(out_och_free, rc);
 821
 822 out_och_free:
 823         if (rc) {
 824                 if (och_p && *och_p) {
 825                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 826                         *och_p = NULL; /* OBD_FREE writes some magic there */
 827                         (*och_usecount)--;
 828                 }
 829                 mutex_unlock(&lli->lli_och_mutex);
 830
 831 out_openerr:
 832                 if (lli->lli_opendir_key == fd)
 833                         ll_deauthorize_statahead(inode, fd);
 834                 if (fd != NULL)
 835                         ll_file_data_put(fd);
 836         } else {
 837                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 838         }
 839
 840 out_nofiledata:
 841         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 842                 ptlrpc_req_finished(it->it_request);
 843                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 844         }
 845
 846         return rc;
 847 }
 848
 849 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 850                         struct ldlm_lock_desc *desc, void *data, int flag)
 851 {
 852         int rc;
 853         struct lustre_handle lockh;
 854         ENTRY;
 855
 856         switch (flag) {
 857         case LDLM_CB_BLOCKING:
 858                 ldlm_lock2handle(lock, &lockh);
 859                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 860                 if (rc < 0) {
 861                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 862                         RETURN(rc);
 863                 }
 864                 break;
 865         case LDLM_CB_CANCELING:
 866                 /* do nothing */
 867                 break;
 868         }
 869         RETURN(0);
 870 }
 871
 872 /**
 873  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 874  * and save it as fd->fd_och so as to force client to reopen the file even
 875  * if it has an open lock in cache already.
 876  */
 877 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 878                                 struct lustre_handle *old_open_handle)
 879 {
 880         struct ll_inode_info *lli = ll_i2info(inode);
 881         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 882         struct obd_client_handle **och_p;
 883         __u64 *och_usecount;
 884         int rc = 0;
 885         ENTRY;
 886
 887         /* Get the openhandle of the file */
 888         mutex_lock(&lli->lli_och_mutex);
 889         if (fd->fd_lease_och != NULL)
 890                 GOTO(out_unlock, rc = -EBUSY);
 891
 892         if (fd->fd_och == NULL) {
 893                 if (file->f_mode & FMODE_WRITE) {
 894                         LASSERT(lli->lli_mds_write_och != NULL);
 895                         och_p = &lli->lli_mds_write_och;
 896                         och_usecount = &lli->lli_open_fd_write_count;
 897                 } else {
 898                         LASSERT(lli->lli_mds_read_och != NULL);
 899                         och_p = &lli->lli_mds_read_och;
 900                         och_usecount = &lli->lli_open_fd_read_count;
 901                 }
 902
 903                 if (*och_usecount > 1)
 904                         GOTO(out_unlock, rc = -EBUSY);
 905
 906                 fd->fd_och = *och_p;
 907                 *och_usecount = 0;
 908                 *och_p = NULL;
 909         }
 910
 911         *old_open_handle = fd->fd_och->och_open_handle;
 912
 913         EXIT;
 914 out_unlock:
 915         mutex_unlock(&lli->lli_och_mutex);
 916         return rc;
 917 }
 918
 919 /**
 920  * Release ownership on lli_mds_*_och when putting back a file lease.
 921  */
 922 static int ll_lease_och_release(struct inode *inode, struct file *file)
 923 {
 924         struct ll_inode_info *lli = ll_i2info(inode);
 925         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 926         struct obd_client_handle **och_p;
 927         struct obd_client_handle *old_och = NULL;
 928         __u64 *och_usecount;
 929         int rc = 0;
 930         ENTRY;
 931
 932         mutex_lock(&lli->lli_och_mutex);
 933         if (file->f_mode & FMODE_WRITE) {
 934                 och_p = &lli->lli_mds_write_och;
 935                 och_usecount = &lli->lli_open_fd_write_count;
 936         } else {
 937                 och_p = &lli->lli_mds_read_och;
 938                 och_usecount = &lli->lli_open_fd_read_count;
 939         }
 940
 941         /* The file may have been open by another process (broken lease) so
 942          * *och_p is not NULL. In this case we should simply increase usecount
 943          * and close fd_och.
 944          */
 945         if (*och_p != NULL) {
 946                 old_och = fd->fd_och;
 947                 (*och_usecount)++;
 948         } else {
 949                 *och_p = fd->fd_och;
 950                 *och_usecount = 1;
 951         }
 952         fd->fd_och = NULL;
 953         mutex_unlock(&lli->lli_och_mutex);
 954
 955         if (old_och != NULL)
 956                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 957
 958         RETURN(rc);
 959 }
 960
 961 /**
 962  * Acquire a lease and open the file.
 963  */
 964 static struct obd_client_handle *
 965 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 966               __u64 open_flags)
 967 {
 968         struct lookup_intent it = { .it_op = IT_OPEN };
 969         struct ll_sb_info *sbi = ll_i2sbi(inode);
 970         struct md_op_data *op_data;
 971         struct ptlrpc_request *req = NULL;
 972         struct lustre_handle old_open_handle = { 0 };
 973         struct obd_client_handle *och = NULL;
 974         int rc;
 975         int rc2;
 976         ENTRY;
 977
 978         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 979                 RETURN(ERR_PTR(-EINVAL));
 980
 981         if (file != NULL) {
 982                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 983                         RETURN(ERR_PTR(-EPERM));
 984
 985                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
 986                 if (rc)
 987                         RETURN(ERR_PTR(rc));
 988         }
 989
 990         OBD_ALLOC_PTR(och);
 991         if (och == NULL)
 992                 RETURN(ERR_PTR(-ENOMEM));
 993
 994         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 995                                         LUSTRE_OPC_ANY, NULL);
 996         if (IS_ERR(op_data))
 997                 GOTO(out, rc = PTR_ERR(op_data));
 998
 999         /* To tell the MDT this openhandle is from the same owner */
1000         op_data->op_open_handle = old_open_handle;
1001
1002         it.it_flags = fmode | open_flags;
1003         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1004         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1005                             &ll_md_blocking_lease_ast,
1006         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1007          * it can be cancelled which may mislead applications that the lease is
1008          * broken;
1009          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1010          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1011          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1012                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1013         ll_finish_md_op_data(op_data);
1014         ptlrpc_req_finished(req);
1015         if (rc < 0)
1016                 GOTO(out_release_it, rc);
1017
1018         if (it_disposition(&it, DISP_LOOKUP_NEG))
1019                 GOTO(out_release_it, rc = -ENOENT);
1020
1021         rc = it_open_error(DISP_OPEN_OPEN, &it);
1022         if (rc)
1023                 GOTO(out_release_it, rc);
1024
1025         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1026         ll_och_fill(sbi->ll_md_exp, &it, och);
1027
1028         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1029                 GOTO(out_close, rc = -EOPNOTSUPP);
1030
1031         /* already get lease, handle lease lock */
1032         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1033         if (it.it_lock_mode == 0 ||
1034             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1035                 /* open lock must return for lease */
1036                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1037                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1038                         it.it_lock_bits);
1039                 GOTO(out_close, rc = -EPROTO);
1040         }
1041
1042         ll_intent_release(&it);
1043         RETURN(och);
1044
1045 out_close:
1046         /* Cancel open lock */
1047         if (it.it_lock_mode != 0) {
1048                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1049                                             it.it_lock_mode);
1050                 it.it_lock_mode = 0;
1051                 och->och_lease_handle.cookie = 0ULL;
1052         }
1053         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1054         if (rc2 < 0)
1055                 CERROR("%s: error closing file "DFID": %d\n",
1056                        sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1057         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1058 out_release_it:
1059         ll_intent_release(&it);
1060 out:
1061         if (och != NULL)
1062                 OBD_FREE_PTR(och);
1063         RETURN(ERR_PTR(rc));
1064 }
1065
1066 /**
1067  * Check whether a layout swap can be done between two inodes.
1068  *
1069  * \param[in] inode1  First inode to check
1070  * \param[in] inode2  Second inode to check
1071  *
1072  * \retval 0 on success, layout swap can be performed between both inodes
1073  * \retval negative error code if requirements are not met
1074  */
1075 static int ll_check_swap_layouts_validity(struct inode *inode1,
1076                                           struct inode *inode2)
1077 {
1078         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1079                 return -EINVAL;
1080
1081         if (inode_permission(inode1, MAY_WRITE) ||
1082             inode_permission(inode2, MAY_WRITE))
1083                 return -EPERM;
1084
1085         if (inode1->i_sb != inode2->i_sb)
1086                 return -EXDEV;
1087
1088         return 0;
1089 }
1090
1091 static int ll_swap_layouts_close(struct obd_client_handle *och,
1092                                  struct inode *inode, struct inode *inode2)
1093 {
1094         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1095         const struct lu_fid     *fid2;
1096         int                      rc;
1097         ENTRY;
1098
1099         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1100                ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1101
1102         rc = ll_check_swap_layouts_validity(inode, inode2);
1103         if (rc < 0)
1104                 GOTO(out_free_och, rc);
1105
1106         /* We now know that inode2 is a lustre inode */
1107         fid2 = ll_inode2fid(inode2);
1108
1109         rc = lu_fid_cmp(fid1, fid2);
1110         if (rc == 0)
1111                 GOTO(out_free_och, rc = -EINVAL);
1112
1113         /* Close the file and {swap,merge} layouts between inode & inode2.
1114          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1115          * because we still need it to pack l_remote_handle to MDT. */
1116         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1117                                        inode2);
1118
1119         och = NULL; /* freed in ll_close_inode_openhandle() */
1120
1121 out_free_och:
1122         if (och != NULL)
1123                 OBD_FREE_PTR(och);
1124
1125         RETURN(rc);
1126 }
1127
1128 /**
1129  * Release lease and close the file.
1130  * It will check if the lease has ever broken.
1131  */
1132 static int ll_lease_close_intent(struct obd_client_handle *och,
1133                                  struct inode *inode,
1134                                  bool *lease_broken, enum mds_op_bias bias,
1135                                  void *data)
1136 {
1137         struct ldlm_lock *lock;
1138         bool cancelled = true;
1139         int rc;
1140         ENTRY;
1141
1142         lock = ldlm_handle2lock(&och->och_lease_handle);
1143         if (lock != NULL) {
1144                 lock_res_and_lock(lock);
1145                 cancelled = ldlm_is_cancel(lock);
1146                 unlock_res_and_lock(lock);
1147                 LDLM_LOCK_PUT(lock);
1148         }
1149
1150         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1151                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1152
1153         if (lease_broken != NULL)
1154                 *lease_broken = cancelled;
1155
1156         if (!cancelled && !bias)
1157                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1158
1159         if (cancelled) { /* no need to excute intent */
1160                 bias = 0;
1161                 data = NULL;
1162         }
1163
1164         rc = ll_close_inode_openhandle(inode, och, bias, data);
1165         RETURN(rc);
1166 }
1167
1168 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1169                           bool *lease_broken)
1170 {
1171         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1172 }
1173
1174 /**
1175  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1176  */
1177 static int ll_lease_file_resync(struct obd_client_handle *och,
1178                                 struct inode *inode, unsigned long arg)
1179 {
1180         struct ll_sb_info *sbi = ll_i2sbi(inode);
1181         struct md_op_data *op_data;
1182         struct ll_ioc_lease_id ioc;
1183         __u64 data_version_unused;
1184         int rc;
1185         ENTRY;
1186
1187         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1188                                      LUSTRE_OPC_ANY, NULL);
1189         if (IS_ERR(op_data))
1190                 RETURN(PTR_ERR(op_data));
1191
1192         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1193                            sizeof(ioc)))
1194                 RETURN(-EFAULT);
1195
1196         /* before starting file resync, it's necessary to clean up page cache
1197          * in client memory, otherwise once the layout version is increased,
1198          * writing back cached data will be denied the OSTs. */
1199         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1200         if (rc)
1201                 GOTO(out, rc);
1202
1203         op_data->op_lease_handle = och->och_lease_handle;
1204         op_data->op_mirror_id = ioc.lil_mirror_id;
1205         rc = md_file_resync(sbi->ll_md_exp, op_data);
1206         if (rc)
1207                 GOTO(out, rc);
1208
1209         EXIT;
1210 out:
1211         ll_finish_md_op_data(op_data);
1212         return rc;
1213 }
1214
1215 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1216 {
1217         struct ll_inode_info *lli = ll_i2info(inode);
1218         struct cl_object *obj = lli->lli_clob;
1219         struct cl_attr *attr = vvp_env_thread_attr(env);
1220         s64 atime;
1221         s64 mtime;
1222         s64 ctime;
1223         int rc = 0;
1224
1225         ENTRY;
1226
1227         ll_inode_size_lock(inode);
1228
1229         /* Merge timestamps the most recently obtained from MDS with
1230          * timestamps obtained from OSTs.
1231          *
1232          * Do not overwrite atime of inode because it may be refreshed
1233          * by file_accessed() function. If the read was served by cache
1234          * data, there is no RPC to be sent so that atime may not be
1235          * transferred to OSTs at all. MDT only updates atime at close time
1236          * if it's at least 'mdd.*.atime_diff' older.
1237          * All in all, the atime in Lustre does not strictly comply with
1238          * POSIX. Solving this problem needs to send an RPC to MDT for each
1239          * read, this will hurt performance.
1240          */
1241         if (inode->i_atime.tv_sec < lli->lli_atime ||
1242             lli->lli_update_atime) {
1243                 inode->i_atime.tv_sec = lli->lli_atime;
1244                 lli->lli_update_atime = 0;
1245         }
1246         inode->i_mtime.tv_sec = lli->lli_mtime;
1247         inode->i_ctime.tv_sec = lli->lli_ctime;
1248
1249         mtime = inode->i_mtime.tv_sec;
1250         atime = inode->i_atime.tv_sec;
1251         ctime = inode->i_ctime.tv_sec;
1252
1253         cl_object_attr_lock(obj);
1254         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1255                 rc = -EINVAL;
1256         else
1257                 rc = cl_object_attr_get(env, obj, attr);
1258         cl_object_attr_unlock(obj);
1259
1260         if (rc != 0)
1261                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1262
1263         if (atime < attr->cat_atime)
1264                 atime = attr->cat_atime;
1265
1266         if (ctime < attr->cat_ctime)
1267                 ctime = attr->cat_ctime;
1268
1269         if (mtime < attr->cat_mtime)
1270                 mtime = attr->cat_mtime;
1271
1272         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1273                PFID(&lli->lli_fid), attr->cat_size);
1274
1275         i_size_write(inode, attr->cat_size);
1276         inode->i_blocks = attr->cat_blocks;
1277
1278         inode->i_mtime.tv_sec = mtime;
1279         inode->i_atime.tv_sec = atime;
1280         inode->i_ctime.tv_sec = ctime;
1281
1282 out_size_unlock:
1283         ll_inode_size_unlock(inode);
1284
1285         RETURN(rc);
1286 }
1287
1288 /**
1289  * Set designated mirror for I/O.
1290  *
1291  * So far only read, write, and truncated can support to issue I/O to
1292  * designated mirror.
1293  */
1294 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1295 {
1296         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1297
1298         /* clear layout version for generic(non-resync) I/O in case it carries
1299          * stale layout version due to I/O restart */
1300         io->ci_layout_version = 0;
1301
1302         /* FLR: disable non-delay for designated mirror I/O because obviously
1303          * only one mirror is available */
1304         if (fd->fd_designated_mirror > 0) {
1305                 io->ci_ndelay = 0;
1306                 io->ci_designated_mirror = fd->fd_designated_mirror;
1307                 io->ci_layout_version = fd->fd_layout_version;
1308         }
1309
1310         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1311                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1312 }
1313
1314 static bool file_is_noatime(const struct file *file)
1315 {
1316         const struct vfsmount *mnt = file->f_path.mnt;
1317         const struct inode *inode = file_inode((struct file *)file);
1318
1319         /* Adapted from file_accessed() and touch_atime().*/
1320         if (file->f_flags & O_NOATIME)
1321                 return true;
1322
1323         if (inode->i_flags & S_NOATIME)
1324                 return true;
1325
1326         if (IS_NOATIME(inode))
1327                 return true;
1328
1329         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1330                 return true;
1331
1332         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1333                 return true;
1334
1335         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1336                 return true;
1337
1338         return false;
1339 }
1340
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1342 {
1343         struct inode *inode = file_inode(file);
1344         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1345
1346         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1347         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1348
1349         if (iot == CIT_WRITE) {
1350                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1351                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1352                                            file->f_flags & O_DIRECT ||
1353                                            IS_SYNC(inode));
1354         }
1355         io->ci_obj = ll_i2info(inode)->lli_clob;
1356         io->ci_lockreq = CILR_MAYBE;
1357         if (ll_file_nolock(file)) {
1358                 io->ci_lockreq = CILR_NEVER;
1359                 io->ci_no_srvlock = 1;
1360         } else if (file->f_flags & O_APPEND) {
1361                 io->ci_lockreq = CILR_MANDATORY;
1362         }
1363         io->ci_noatime = file_is_noatime(file);
1364
1365         /* FLR: only use non-delay I/O for read as there is only one
1366          * avaliable mirror for write. */
1367         io->ci_ndelay = !(iot == CIT_WRITE);
1368
1369         ll_io_set_mirror(io, file);
1370 }
1371
1372 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1373                         __u64 count)
1374 {
1375         struct ll_inode_info *lli = ll_i2info(inode);
1376         struct ll_sb_info *sbi = ll_i2sbi(inode);
1377         enum obd_heat_type sample_type;
1378         enum obd_heat_type iobyte_type;
1379         __u64 now = ktime_get_real_seconds();
1380
1381         if (!ll_sbi_has_file_heat(sbi) ||
1382             lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1383                 return;
1384
1385         if (iot == CIT_READ) {
1386                 sample_type = OBD_HEAT_READSAMPLE;
1387                 iobyte_type = OBD_HEAT_READBYTE;
1388         } else if (iot == CIT_WRITE) {
1389                 sample_type = OBD_HEAT_WRITESAMPLE;
1390                 iobyte_type = OBD_HEAT_WRITEBYTE;
1391         } else {
1392                 return;
1393         }
1394
1395         spin_lock(&lli->lli_heat_lock);
1396         obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1397                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1398         obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1399                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1400         spin_unlock(&lli->lli_heat_lock);
1401 }
1402
1403 static ssize_t
1404 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1405                    struct file *file, enum cl_io_type iot,
1406                    loff_t *ppos, size_t count)
1407 {
1408         struct vvp_io           *vio = vvp_env_io(env);
1409         struct inode            *inode = file_inode(file);
1410         struct ll_inode_info    *lli = ll_i2info(inode);
1411         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1412         struct range_lock       range;
1413         struct cl_io            *io;
1414         ssize_t                 result = 0;
1415         int                     rc = 0;
1416         unsigned                retried = 0;
1417         bool                    restarted = false;
1418
1419         ENTRY;
1420
1421         CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1422                 file_dentry(file)->d_name.name,
1423                 iot == CIT_READ ? "read" : "write", *ppos, count);
1424
1425 restart:
1426         io = vvp_env_thread_io(env);
1427         ll_io_init(io, file, iot);
1428         io->ci_ndelay_tried = retried;
1429
1430         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1431                 bool range_locked = false;
1432
1433                 if (file->f_flags & O_APPEND)
1434                         range_lock_init(&range, 0, LUSTRE_EOF);
1435                 else
1436                         range_lock_init(&range, *ppos, *ppos + count - 1);
1437
1438                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1439                 vio->vui_io_subtype = args->via_io_subtype;
1440
1441                 switch (vio->vui_io_subtype) {
1442                 case IO_NORMAL:
1443                         vio->vui_iter = args->u.normal.via_iter;
1444                         vio->vui_iocb = args->u.normal.via_iocb;
1445                         /* Direct IO reads must also take range lock,
1446                          * or multiple reads will try to work on the same pages
1447                          * See LU-6227 for details. */
1448                         if (((iot == CIT_WRITE) ||
1449                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1450                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1451                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1452                                        RL_PARA(&range));
1453                                 rc = range_lock(&lli->lli_write_tree, &range);
1454                                 if (rc < 0)
1455                                         GOTO(out, rc);
1456
1457                                 range_locked = true;
1458                         }
1459                         break;
1460                 case IO_SPLICE:
1461                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1462                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1463                         break;
1464                 default:
1465                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1466                         LBUG();
1467                 }
1468
1469                 ll_cl_add(file, env, io, LCC_RW);
1470                 rc = cl_io_loop(env, io);
1471                 ll_cl_remove(file, env);
1472
1473                 if (range_locked) {
1474                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1475                                RL_PARA(&range));
1476                         range_unlock(&lli->lli_write_tree, &range);
1477                 }
1478         } else {
1479                 /* cl_io_rw_init() handled IO */
1480                 rc = io->ci_result;
1481         }
1482
1483         if (io->ci_nob > 0) {
1484                 result += io->ci_nob;
1485                 count  -= io->ci_nob;
1486                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1487
1488                 /* prepare IO restart */
1489                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1490                         args->u.normal.via_iter = vio->vui_iter;
1491         }
1492 out:
1493         cl_io_fini(env, io);
1494
1495         CDEBUG(D_VFSTRACE,
1496                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1497                file->f_path.dentry->d_name.name,
1498                iot, rc, result, io->ci_need_restart);
1499
1500         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1501                 CDEBUG(D_VFSTRACE,
1502                        "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1503                        file_dentry(file)->d_name.name,
1504                        iot == CIT_READ ? "read" : "write",
1505                        *ppos, count, result, rc);
1506                 /* preserve the tried count for FLR */
1507                 retried = io->ci_ndelay_tried;
1508                 restarted = true;
1509                 goto restart;
1510         }
1511
1512         if (iot == CIT_READ) {
1513                 if (result > 0)
1514                         ll_stats_ops_tally(ll_i2sbi(inode),
1515                                            LPROC_LL_READ_BYTES, result);
1516         } else if (iot == CIT_WRITE) {
1517                 if (result > 0) {
1518                         ll_stats_ops_tally(ll_i2sbi(inode),
1519                                            LPROC_LL_WRITE_BYTES, result);
1520                         fd->fd_write_failed = false;
1521                 } else if (result == 0 && rc == 0) {
1522                         rc = io->ci_result;
1523                         if (rc < 0)
1524                                 fd->fd_write_failed = true;
1525                         else
1526                                 fd->fd_write_failed = false;
1527                 } else if (rc != -ERESTARTSYS) {
1528                         fd->fd_write_failed = true;
1529                 }
1530         }
1531
1532         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1533         if (result > 0)
1534                 ll_heat_add(inode, iot, result);
1535
1536         RETURN(result > 0 ? result : rc);
1537 }
1538
1539 /**
1540  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1541  * especially for small I/O.
1542  *
1543  * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request DLM lock. This has turned out to have significant overhead
1545  * and affects the performance of small I/O dramatically.
1546  *
1547  * It's not necessary to create a cl_io for each I/O. Under the help of read
1548  * ahead, most of the pages being read are already in memory cache and we can
1549  * read those pages directly because if the pages exist, the corresponding DLM
1550  * lock must exist so that page content must be valid.
1551  *
1552  * In fast read implementation, the llite speculatively finds and reads pages
1553  * in memory cache. There are three scenarios for fast read:
1554  *   - If the page exists and is uptodate, kernel VM will provide the data and
1555  *     CLIO won't be intervened;
1556  *   - If the page was brought into memory by read ahead, it will be exported
1557  *     and read ahead parameters will be updated;
1558  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1559  *     it will go back and invoke normal read, i.e., a cl_io will be created
1560  *     and DLM lock will be requested.
1561  *
1562  * POSIX compliance: posix standard states that read is intended to be atomic.
1563  * Lustre read implementation is in line with Linux kernel read implementation
1564  * and neither of them complies with POSIX standard in this matter. Fast read
1565  * doesn't make the situation worse on single node but it may interleave write
 * results from multiple nodes due to short read handling in ll_file_read_iter().
1567  *
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes that have been read (0 if fast read could not be
 *           used), or error code if an error occurred.
1573  */
1574 static ssize_t
1575 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1576 {
1577         ssize_t result;
1578
1579         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1580                 return 0;
1581
1582         /* NB: we can't do direct IO for fast read because it will need a lock
1583          * to make IO engine happy. */
1584         if (iocb->ki_filp->f_flags & O_DIRECT)
1585                 return 0;
1586
1587         result = generic_file_read_iter(iocb, iter);
1588
1589         /* If the first page is not in cache, generic_file_aio_read() will be
1590          * returned with -ENODATA.
1591          * See corresponding code in ll_readpage(). */
1592         if (result == -ENODATA)
1593                 result = 0;
1594
1595         if (result > 0) {
1596                 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1597                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1598                                 LPROC_LL_READ_BYTES, result);
1599         }
1600
1601         return result;
1602 }
1603
1604 /*
1605  * Read from a file (through the page cache).
1606  */
1607 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1608 {
1609         struct lu_env *env;
1610         struct vvp_io_args *args;
1611         ssize_t result;
1612         ssize_t rc2;
1613         __u16 refcheck;
1614
1615         ll_ras_enter(iocb->ki_filp);
1616
1617         result = ll_do_fast_read(iocb, to);
1618         if (result < 0 || iov_iter_count(to) == 0)
1619                 GOTO(out, result);
1620
1621         env = cl_env_get(&refcheck);
1622         if (IS_ERR(env))
1623                 return PTR_ERR(env);
1624
1625         args = ll_env_args(env, IO_NORMAL);
1626         args->u.normal.via_iter = to;
1627         args->u.normal.via_iocb = iocb;
1628
1629         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1630                                  &iocb->ki_pos, iov_iter_count(to));
1631         if (rc2 > 0)
1632                 result += rc2;
1633         else if (result == 0)
1634                 result = rc2;
1635
1636         cl_env_put(env, &refcheck);
1637 out:
1638         return result;
1639 }
1640
1641 /**
1642  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1643  * If a page is already in the page cache and dirty (and some other things -
1644  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1645  * write to it without doing a full I/O, because Lustre already knows about it
1646  * and will write it out.  This saves a lot of processing time.
1647  *
1648  * All writes here are within one page, so exclusion is handled by the page
1649  * lock on the vm page.  We do not do tiny writes for writes which touch
1650  * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
1652  *
1653  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1654  * and are unlikely to be to already dirty pages.
1655  *
1656  * Attribute updates are important here, we do them in ll_tiny_write_end.
1657  */
1658 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1659 {
1660         ssize_t count = iov_iter_count(iter);
1661         struct  file *file = iocb->ki_filp;
1662         struct  inode *inode = file_inode(file);
1663         bool    lock_inode = !IS_NOSEC(inode);
1664         ssize_t result = 0;
1665
1666         ENTRY;
1667
1668         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1669          * of function for why.
1670          */
1671         if (count >= PAGE_SIZE ||
1672             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1673                 RETURN(0);
1674
1675         if (unlikely(lock_inode))
1676                 inode_lock(inode);
1677         result = __generic_file_write_iter(iocb, iter);
1678
1679         if (unlikely(lock_inode))
1680                 inode_unlock(inode);
1681
1682         /* If the page is not already dirty, ll_tiny_write_begin returns
1683          * -ENODATA.  We continue on to normal write.
1684          */
1685         if (result == -ENODATA)
1686                 result = 0;
1687
1688         if (result > 0) {
1689                 ll_heat_add(inode, CIT_WRITE, result);
1690                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1691                                    result);
1692                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1693         }
1694
1695         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1696
1697         RETURN(result);
1698 }
1699
1700 /*
1701  * Write to a file (through the page cache).
1702  */
1703 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1704 {
1705         struct vvp_io_args *args;
1706         struct lu_env *env;
1707         ssize_t rc_tiny = 0, rc_normal;
1708         __u16 refcheck;
1709
1710         ENTRY;
1711
1712         /* NB: we can't do direct IO for tiny writes because they use the page
1713          * cache, we can't do sync writes because tiny writes can't flush
1714          * pages, and we can't do append writes because we can't guarantee the
1715          * required DLM locks are held to protect file size.
1716          */
1717         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1718             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1719                 rc_tiny = ll_do_tiny_write(iocb, from);
1720
1721         /* In case of error, go on and try normal write - Only stop if tiny
1722          * write completed I/O.
1723          */
1724         if (iov_iter_count(from) == 0)
1725                 GOTO(out, rc_normal = rc_tiny);
1726
1727         env = cl_env_get(&refcheck);
1728         if (IS_ERR(env))
1729                 return PTR_ERR(env);
1730
1731         args = ll_env_args(env, IO_NORMAL);
1732         args->u.normal.via_iter = from;
1733         args->u.normal.via_iocb = iocb;
1734
1735         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1736                                     &iocb->ki_pos, iov_iter_count(from));
1737
1738         /* On success, combine bytes written. */
1739         if (rc_tiny >= 0 && rc_normal > 0)
1740                 rc_normal += rc_tiny;
1741         /* On error, only return error from normal write if tiny write did not
1742          * write any bytes.  Otherwise return bytes written by tiny write.
1743          */
1744         else if (rc_tiny > 0)
1745                 rc_normal = rc_tiny;
1746
1747         cl_env_put(env, &refcheck);
1748 out:
1749         RETURN(rc_normal);
1750 }
1751
1752 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1753 /*
1754  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1755  */
1756 static int ll_file_get_iov_count(const struct iovec *iov,
1757                                  unsigned long *nr_segs, size_t *count)
1758 {
1759         size_t cnt = 0;
1760         unsigned long seg;
1761
1762         for (seg = 0; seg < *nr_segs; seg++) {
1763                 const struct iovec *iv = &iov[seg];
1764
1765                 /*
1766                  * If any segment has a negative length, or the cumulative
1767                  * length ever wraps negative then return -EINVAL.
1768                  */
1769                 cnt += iv->iov_len;
1770                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1771                         return -EINVAL;
1772                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1773                         continue;
1774                 if (seg == 0)
1775                         return -EFAULT;
1776                 *nr_segs = seg;
1777                 cnt -= iv->iov_len;     /* This segment is no good */
1778                 break;
1779         }
1780         *count = cnt;
1781         return 0;
1782 }
1783
1784 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1785                                 unsigned long nr_segs, loff_t pos)
1786 {
1787         struct iov_iter to;
1788         size_t iov_count;
1789         ssize_t result;
1790         ENTRY;
1791
1792         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1793         if (result)
1794                 RETURN(result);
1795
1796 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1797         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1798 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1799         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1800 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1801
1802         result = ll_file_read_iter(iocb, &to);
1803
1804         RETURN(result);
1805 }
1806
1807 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1808                             loff_t *ppos)
1809 {
1810         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1811         struct kiocb   kiocb;
1812         ssize_t        result;
1813         ENTRY;
1814
1815         init_sync_kiocb(&kiocb, file);
1816         kiocb.ki_pos = *ppos;
1817 #ifdef HAVE_KIOCB_KI_LEFT
1818         kiocb.ki_left = count;
1819 #elif defined(HAVE_KI_NBYTES)
1820         kiocb.i_nbytes = count;
1821 #endif
1822
1823         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1824         *ppos = kiocb.ki_pos;
1825
1826         RETURN(result);
1827 }
1828
1829 /*
1830  * Write to a file (through the page cache).
1831  * AIO stuff
1832  */
1833 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1834                                  unsigned long nr_segs, loff_t pos)
1835 {
1836         struct iov_iter from;
1837         size_t iov_count;
1838         ssize_t result;
1839         ENTRY;
1840
1841         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1842         if (result)
1843                 RETURN(result);
1844
1845 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1846         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1847 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1848         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1849 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1850
1851         result = ll_file_write_iter(iocb, &from);
1852
1853         RETURN(result);
1854 }
1855
1856 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1857                              size_t count, loff_t *ppos)
1858 {
1859         struct iovec   iov = { .iov_base = (void __user *)buf,
1860                                .iov_len = count };
1861         struct kiocb   kiocb;
1862         ssize_t        result;
1863
1864         ENTRY;
1865
1866         init_sync_kiocb(&kiocb, file);
1867         kiocb.ki_pos = *ppos;
1868 #ifdef HAVE_KIOCB_KI_LEFT
1869         kiocb.ki_left = count;
1870 #elif defined(HAVE_KI_NBYTES)
1871         kiocb.ki_nbytes = count;
1872 #endif
1873
1874         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1875         *ppos = kiocb.ki_pos;
1876
1877         RETURN(result);
1878 }
1879 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1880
1881 /*
1882  * Send file content (through pagecache) somewhere with helper
1883  */
1884 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1885                                    struct pipe_inode_info *pipe, size_t count,
1886                                    unsigned int flags)
1887 {
1888         struct lu_env      *env;
1889         struct vvp_io_args *args;
1890         ssize_t             result;
1891         __u16               refcheck;
1892         ENTRY;
1893
1894         ll_ras_enter(in_file);
1895
1896         env = cl_env_get(&refcheck);
1897         if (IS_ERR(env))
1898                 RETURN(PTR_ERR(env));
1899
1900         args = ll_env_args(env, IO_SPLICE);
1901         args->u.splice.via_pipe = pipe;
1902         args->u.splice.via_flags = flags;
1903
1904         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1905         cl_env_put(env, &refcheck);
1906         RETURN(result);
1907 }
1908
1909 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1910                              __u64 flags, struct lov_user_md *lum, int lum_size)
1911 {
1912         struct lookup_intent oit = {
1913                 .it_op = IT_OPEN,
1914                 .it_flags = flags | MDS_OPEN_BY_FID,
1915         };
1916         int rc;
1917         ENTRY;
1918
1919         ll_inode_size_lock(inode);
1920         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1921         if (rc < 0)
1922                 GOTO(out_unlock, rc);
1923
1924         ll_release_openhandle(dentry, &oit);
1925
1926 out_unlock:
1927         ll_inode_size_unlock(inode);
1928         ll_intent_release(&oit);
1929
1930         RETURN(rc);
1931 }
1932
1933 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1934                              struct lov_mds_md **lmmp, int *lmm_size,
1935                              struct ptlrpc_request **request)
1936 {
1937         struct ll_sb_info *sbi = ll_i2sbi(inode);
1938         struct mdt_body  *body;
1939         struct lov_mds_md *lmm = NULL;
1940         struct ptlrpc_request *req = NULL;
1941         struct md_op_data *op_data;
1942         int rc, lmmsize;
1943
1944         rc = ll_get_default_mdsize(sbi, &lmmsize);
1945         if (rc)
1946                 RETURN(rc);
1947
1948         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1949                                      strlen(filename), lmmsize,
1950                                      LUSTRE_OPC_ANY, NULL);
1951         if (IS_ERR(op_data))
1952                 RETURN(PTR_ERR(op_data));
1953
1954         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1955         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1956         ll_finish_md_op_data(op_data);
1957         if (rc < 0) {
1958                 CDEBUG(D_INFO, "md_getattr_name failed "
1959                        "on %s: rc %d\n", filename, rc);
1960                 GOTO(out, rc);
1961         }
1962
1963         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1964         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1965
1966         lmmsize = body->mbo_eadatasize;
1967
1968         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1969                         lmmsize == 0) {
1970                 GOTO(out, rc = -ENODATA);
1971         }
1972
1973         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1974         LASSERT(lmm != NULL);
1975
1976         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1977             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1978             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
1979             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
1980                 GOTO(out, rc = -EPROTO);
1981
1982         /*
1983          * This is coming from the MDS, so is probably in
1984          * little endian.  We convert it to host endian before
1985          * passing it to userspace.
1986          */
1987         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1988                 int stripe_count;
1989
1990                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1991                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1992                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1993                         if (le32_to_cpu(lmm->lmm_pattern) &
1994                             LOV_PATTERN_F_RELEASED)
1995                                 stripe_count = 0;
1996                 }
1997
1998                 /* if function called for directory - we should
1999                  * avoid swab not existent lsm objects */
2000                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2001                         lustre_swab_lov_user_md_v1(
2002                                         (struct lov_user_md_v1 *)lmm);
2003                         if (S_ISREG(body->mbo_mode))
2004                                 lustre_swab_lov_user_md_objects(
2005                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2006                                     stripe_count);
2007                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2008                         lustre_swab_lov_user_md_v3(
2009                                         (struct lov_user_md_v3 *)lmm);
2010                         if (S_ISREG(body->mbo_mode))
2011                                 lustre_swab_lov_user_md_objects(
2012                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2013                                     stripe_count);
2014                 } else if (lmm->lmm_magic ==
2015                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2016                         lustre_swab_lov_comp_md_v1(
2017                                         (struct lov_comp_md_v1 *)lmm);
2018                 } else if (lmm->lmm_magic ==
2019                            cpu_to_le32(LOV_MAGIC_FOREIGN)) {
2020                         struct lov_foreign_md *lfm;
2021
2022                         lfm = (struct lov_foreign_md *)lmm;
2023                         __swab32s(&lfm->lfm_magic);
2024                         __swab32s(&lfm->lfm_length);
2025                         __swab32s(&lfm->lfm_type);
2026                         __swab32s(&lfm->lfm_flags);
2027                 }
2028         }
2029
2030 out:
2031         *lmmp = lmm;
2032         *lmm_size = lmmsize;
2033         *request = req;
2034         return rc;
2035 }
2036
2037 static int ll_lov_setea(struct inode *inode, struct file *file,
2038                         void __user *arg)
2039 {
2040         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2041         struct lov_user_md      *lump;
2042         int                      lum_size = sizeof(struct lov_user_md) +
2043                                             sizeof(struct lov_user_ost_data);
2044         int                      rc;
2045         ENTRY;
2046
2047         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2048                 RETURN(-EPERM);
2049
2050         OBD_ALLOC_LARGE(lump, lum_size);
2051         if (lump == NULL)
2052                 RETURN(-ENOMEM);
2053
2054         if (copy_from_user(lump, arg, lum_size))
2055                 GOTO(out_lump, rc = -EFAULT);
2056
2057         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2058                                       lum_size);
2059         cl_lov_delay_create_clear(&file->f_flags);
2060
2061 out_lump:
2062         OBD_FREE_LARGE(lump, lum_size);
2063         RETURN(rc);
2064 }
2065
2066 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2067 {
2068         struct lu_env   *env;
2069         __u16           refcheck;
2070         int             rc;
2071         ENTRY;
2072
2073         env = cl_env_get(&refcheck);
2074         if (IS_ERR(env))
2075                 RETURN(PTR_ERR(env));
2076
2077         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2078         cl_env_put(env, &refcheck);
2079         RETURN(rc);
2080 }
2081
2082 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2083                             void __user *arg)
2084 {
2085         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2086         struct lov_user_md        *klum;
2087         int                        lum_size, rc;
2088         __u64                      flags = FMODE_WRITE;
2089         ENTRY;
2090
2091         rc = ll_copy_user_md(lum, &klum);
2092         if (rc < 0)
2093                 RETURN(rc);
2094
2095         lum_size = rc;
2096         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2097                                       lum_size);
2098         if (!rc) {
2099                 __u32 gen;
2100
2101                 rc = put_user(0, &lum->lmm_stripe_count);
2102                 if (rc)
2103                         GOTO(out, rc);
2104
2105                 rc = ll_layout_refresh(inode, &gen);
2106                 if (rc)
2107                         GOTO(out, rc);
2108
2109                 rc = ll_file_getstripe(inode, arg, lum_size);
2110         }
2111         cl_lov_delay_create_clear(&file->f_flags);
2112
2113 out:
2114         OBD_FREE(klum, lum_size);
2115         RETURN(rc);
2116 }
2117
2118 static int
2119 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2120 {
2121         struct ll_inode_info *lli = ll_i2info(inode);
2122         struct cl_object *obj = lli->lli_clob;
2123         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2124         struct ll_grouplock grouplock;
2125         int rc;
2126         ENTRY;
2127
2128         if (arg == 0) {
2129                 CWARN("group id for group lock must not be 0\n");
2130                 RETURN(-EINVAL);
2131         }
2132
2133         if (ll_file_nolock(file))
2134                 RETURN(-EOPNOTSUPP);
2135
2136         spin_lock(&lli->lli_lock);
2137         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2138                 CWARN("group lock already existed with gid %lu\n",
2139                       fd->fd_grouplock.lg_gid);
2140                 spin_unlock(&lli->lli_lock);
2141                 RETURN(-EINVAL);
2142         }
2143         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2144         spin_unlock(&lli->lli_lock);
2145
2146         /**
2147          * XXX: group lock needs to protect all OST objects while PFL
2148          * can add new OST objects during the IO, so we'd instantiate
2149          * all OST objects before getting its group lock.
2150          */
2151         if (obj) {
2152                 struct lu_env *env;
2153                 __u16 refcheck;
2154                 struct cl_layout cl = {
2155                         .cl_is_composite = false,
2156                 };
2157                 struct lu_extent ext = {
2158                         .e_start = 0,
2159                         .e_end = OBD_OBJECT_EOF,
2160                 };
2161
2162                 env = cl_env_get(&refcheck);
2163                 if (IS_ERR(env))
2164                         RETURN(PTR_ERR(env));
2165
2166                 rc = cl_object_layout_get(env, obj, &cl);
2167                 if (!rc && cl.cl_is_composite)
2168                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2169                                                     &ext);
2170
2171                 cl_env_put(env, &refcheck);
2172                 if (rc)
2173                         RETURN(rc);
2174         }
2175
2176         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2177                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2178         if (rc)
2179                 RETURN(rc);
2180
2181         spin_lock(&lli->lli_lock);
2182         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2183                 spin_unlock(&lli->lli_lock);
2184                 CERROR("another thread just won the race\n");
2185                 cl_put_grouplock(&grouplock);
2186                 RETURN(-EINVAL);
2187         }
2188
2189         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2190         fd->fd_grouplock = grouplock;
2191         spin_unlock(&lli->lli_lock);
2192
2193         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2194         RETURN(0);
2195 }
2196
2197 static int ll_put_grouplock(struct inode *inode, struct file *file,
2198                             unsigned long arg)
2199 {
2200         struct ll_inode_info   *lli = ll_i2info(inode);
2201         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2202         struct ll_grouplock     grouplock;
2203         ENTRY;
2204
2205         spin_lock(&lli->lli_lock);
2206         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2207                 spin_unlock(&lli->lli_lock);
2208                 CWARN("no group lock held\n");
2209                 RETURN(-EINVAL);
2210         }
2211
2212         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2213
2214         if (fd->fd_grouplock.lg_gid != arg) {
2215                 CWARN("group lock %lu doesn't match current id %lu\n",
2216                       arg, fd->fd_grouplock.lg_gid);
2217                 spin_unlock(&lli->lli_lock);
2218                 RETURN(-EINVAL);
2219         }
2220
2221         grouplock = fd->fd_grouplock;
2222         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2223         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2224         spin_unlock(&lli->lli_lock);
2225
2226         cl_put_grouplock(&grouplock);
2227         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2228         RETURN(0);
2229 }
2230
2231 /**
2232  * Close inode open handle
2233  *
2234  * \param dentry [in]     dentry which contains the inode
2235  * \param it     [in,out] intent which contains open info and result
2236  *
2237  * \retval 0     success
2238  * \retval <0    failure
2239  */
2240 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2241 {
2242         struct inode *inode = dentry->d_inode;
2243         struct obd_client_handle *och;
2244         int rc;
2245         ENTRY;
2246
2247         LASSERT(inode);
2248
2249         /* Root ? Do nothing. */
2250         if (dentry->d_inode->i_sb->s_root == dentry)
2251                 RETURN(0);
2252
2253         /* No open handle to close? Move away */
2254         if (!it_disposition(it, DISP_OPEN_OPEN))
2255                 RETURN(0);
2256
2257         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2258
2259         OBD_ALLOC(och, sizeof(*och));
2260         if (!och)
2261                 GOTO(out, rc = -ENOMEM);
2262
2263         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2264
2265         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2266 out:
2267         /* this one is in place of ll_file_open */
2268         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2269                 ptlrpc_req_finished(it->it_request);
2270                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2271         }
2272         RETURN(rc);
2273 }
2274
2275 /**
2276  * Get size for inode for which FIEMAP mapping is requested.
2277  * Make the FIEMAP get_info call and returns the result.
2278  * \param fiemap        kernel buffer to hold extens
2279  * \param num_bytes     kernel buffer size
2280  */
2281 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2282                         size_t num_bytes)
2283 {
2284         struct lu_env                   *env;
2285         __u16                           refcheck;
2286         int                             rc = 0;
2287         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2288         ENTRY;
2289
2290         /* Checks for fiemap flags */
2291         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2292                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2293                 return -EBADR;
2294         }
2295
2296         /* Check for FIEMAP_FLAG_SYNC */
2297         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2298                 rc = filemap_fdatawrite(inode->i_mapping);
2299                 if (rc)
2300                         return rc;
2301         }
2302
2303         env = cl_env_get(&refcheck);
2304         if (IS_ERR(env))
2305                 RETURN(PTR_ERR(env));
2306
2307         if (i_size_read(inode) == 0) {
2308                 rc = ll_glimpse_size(inode);
2309                 if (rc)
2310                         GOTO(out, rc);
2311         }
2312
2313         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2314         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2315         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2316
2317         /* If filesize is 0, then there would be no objects for mapping */
2318         if (fmkey.lfik_oa.o_size == 0) {
2319                 fiemap->fm_mapped_extents = 0;
2320                 GOTO(out, rc = 0);
2321         }
2322
2323         fmkey.lfik_fiemap = *fiemap;
2324
2325         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2326                               &fmkey, fiemap, &num_bytes);
2327 out:
2328         cl_env_put(env, &refcheck);
2329         RETURN(rc);
2330 }
2331
2332 int ll_fid2path(struct inode *inode, void __user *arg)
2333 {
2334         struct obd_export       *exp = ll_i2mdexp(inode);
2335         const struct getinfo_fid2path __user *gfin = arg;
2336         __u32                    pathlen;
2337         struct getinfo_fid2path *gfout;
2338         size_t                   outsize;
2339         int                      rc;
2340
2341         ENTRY;
2342
2343         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2344             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2345                 RETURN(-EPERM);
2346
2347         /* Only need to get the buflen */
2348         if (get_user(pathlen, &gfin->gf_pathlen))
2349                 RETURN(-EFAULT);
2350
2351         if (pathlen > PATH_MAX)
2352                 RETURN(-EINVAL);
2353
2354         outsize = sizeof(*gfout) + pathlen;
2355         OBD_ALLOC(gfout, outsize);
2356         if (gfout == NULL)
2357                 RETURN(-ENOMEM);
2358
2359         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2360                 GOTO(gf_free, rc = -EFAULT);
2361         /* append root FID after gfout to let MDT know the root FID so that it
2362          * can lookup the correct path, this is mainly for fileset.
2363          * old server without fileset mount support will ignore this. */
2364         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2365
2366         /* Call mdc_iocontrol */
2367         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2368         if (rc != 0)
2369                 GOTO(gf_free, rc);
2370
2371         if (copy_to_user(arg, gfout, outsize))
2372                 rc = -EFAULT;
2373
2374 gf_free:
2375         OBD_FREE(gfout, outsize);
2376         RETURN(rc);
2377 }
2378
2379 static int
2380 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2381 {
2382         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2383         struct lu_env *env;
2384         struct cl_io *io;
2385         __u16  refcheck;
2386         int result;
2387
2388         ENTRY;
2389
2390         ioc->idv_version = 0;
2391         ioc->idv_layout_version = UINT_MAX;
2392
2393         /* If no file object initialized, we consider its version is 0. */
2394         if (obj == NULL)
2395                 RETURN(0);
2396
2397         env = cl_env_get(&refcheck);
2398         if (IS_ERR(env))
2399                 RETURN(PTR_ERR(env));
2400
2401         io = vvp_env_thread_io(env);
2402         io->ci_obj = obj;
2403         io->u.ci_data_version.dv_data_version = 0;
2404         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2405         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2406
2407 restart:
2408         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2409                 result = cl_io_loop(env, io);
2410         else
2411                 result = io->ci_result;
2412
2413         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2414         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2415
2416         cl_io_fini(env, io);
2417
2418         if (unlikely(io->ci_need_restart))
2419                 goto restart;
2420
2421         cl_env_put(env, &refcheck);
2422
2423         RETURN(result);
2424 }
2425
2426 /*
2427  * Read the data_version for inode.
2428  *
2429  * This value is computed using stripe object version on OST.
2430  * Version is computed using server side locking.
2431  *
2432  * @param flags if do sync on the OST side;
2433  *              0: no sync
2434  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2435  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2436  */
2437 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2438 {
2439         struct ioc_data_version ioc = { .idv_flags = flags };
2440         int rc;
2441
2442         rc = ll_ioc_data_version(inode, &ioc);
2443         if (!rc)
2444                 *data_version = ioc.idv_version;
2445
2446         return rc;
2447 }
2448
2449 /*
2450  * Trigger a HSM release request for the provided inode.
2451  */
2452 int ll_hsm_release(struct inode *inode)
2453 {
2454         struct lu_env *env;
2455         struct obd_client_handle *och = NULL;
2456         __u64 data_version = 0;
2457         int rc;
2458         __u16 refcheck;
2459         ENTRY;
2460
2461         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2462                ll_i2sbi(inode)->ll_fsname,
2463                PFID(&ll_i2info(inode)->lli_fid));
2464
2465         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2466         if (IS_ERR(och))
2467                 GOTO(out, rc = PTR_ERR(och));
2468
2469         /* Grab latest data_version and [am]time values */
2470         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2471         if (rc != 0)
2472                 GOTO(out, rc);
2473
2474         env = cl_env_get(&refcheck);
2475         if (IS_ERR(env))
2476                 GOTO(out, rc = PTR_ERR(env));
2477
2478         rc = ll_merge_attr(env, inode);
2479         cl_env_put(env, &refcheck);
2480
2481         /* If error happen, we have the wrong size for a file.
2482          * Don't release it.
2483          */
2484         if (rc != 0)
2485                 GOTO(out, rc);
2486
2487         /* Release the file.
2488          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2489          * we still need it to pack l_remote_handle to MDT. */
2490         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2491                                        &data_version);
2492         och = NULL;
2493
2494         EXIT;
2495 out:
2496         if (och != NULL && !IS_ERR(och)) /* close the file */
2497                 ll_lease_close(och, inode, NULL);
2498
2499         return rc;
2500 }
2501
2502 struct ll_swap_stack {
2503         __u64                    dv1;
2504         __u64                    dv2;
2505         struct inode            *inode1;
2506         struct inode            *inode2;
2507         bool                     check_dv1;
2508         bool                     check_dv2;
2509 };
2510
2511 static int ll_swap_layouts(struct file *file1, struct file *file2,
2512                            struct lustre_swap_layouts *lsl)
2513 {
2514         struct mdc_swap_layouts  msl;
2515         struct md_op_data       *op_data;
2516         __u32                    gid;
2517         __u64                    dv;
2518         struct ll_swap_stack    *llss = NULL;
2519         int                      rc;
2520
2521         OBD_ALLOC_PTR(llss);
2522         if (llss == NULL)
2523                 RETURN(-ENOMEM);
2524
2525         llss->inode1 = file_inode(file1);
2526         llss->inode2 = file_inode(file2);
2527
2528         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2529         if (rc < 0)
2530                 GOTO(free, rc);
2531
2532         /* we use 2 bool because it is easier to swap than 2 bits */
2533         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2534                 llss->check_dv1 = true;
2535
2536         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2537                 llss->check_dv2 = true;
2538
2539         /* we cannot use lsl->sl_dvX directly because we may swap them */
2540         llss->dv1 = lsl->sl_dv1;
2541         llss->dv2 = lsl->sl_dv2;
2542
2543         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2544         if (rc == 0) /* same file, done! */
2545                 GOTO(free, rc);
2546
2547         if (rc < 0) { /* sequentialize it */
2548                 swap(llss->inode1, llss->inode2);
2549                 swap(file1, file2);
2550                 swap(llss->dv1, llss->dv2);
2551                 swap(llss->check_dv1, llss->check_dv2);
2552         }
2553
2554         gid = lsl->sl_gid;
2555         if (gid != 0) { /* application asks to flush dirty cache */
2556                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2557                 if (rc < 0)
2558                         GOTO(free, rc);
2559
2560                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2561                 if (rc < 0) {
2562                         ll_put_grouplock(llss->inode1, file1, gid);
2563                         GOTO(free, rc);
2564                 }
2565         }
2566
2567         /* ultimate check, before swaping the layouts we check if
2568          * dataversion has changed (if requested) */
2569         if (llss->check_dv1) {
2570                 rc = ll_data_version(llss->inode1, &dv, 0);
2571                 if (rc)
2572                         GOTO(putgl, rc);
2573                 if (dv != llss->dv1)
2574                         GOTO(putgl, rc = -EAGAIN);
2575         }
2576
2577         if (llss->check_dv2) {
2578                 rc = ll_data_version(llss->inode2, &dv, 0);
2579                 if (rc)
2580                         GOTO(putgl, rc);
2581                 if (dv != llss->dv2)
2582                         GOTO(putgl, rc = -EAGAIN);
2583         }
2584
2585         /* struct md_op_data is used to send the swap args to the mdt
2586          * only flags is missing, so we use struct mdc_swap_layouts
2587          * through the md_op_data->op_data */
2588         /* flags from user space have to be converted before they are send to
2589          * server, no flag is sent today, they are only used on the client */
2590         msl.msl_flags = 0;
2591         rc = -ENOMEM;
2592         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2593                                      0, LUSTRE_OPC_ANY, &msl);
2594         if (IS_ERR(op_data))
2595                 GOTO(free, rc = PTR_ERR(op_data));
2596
2597         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2598                            sizeof(*op_data), op_data, NULL);
2599         ll_finish_md_op_data(op_data);
2600
2601         if (rc < 0)
2602                 GOTO(putgl, rc);
2603
2604 putgl:
2605         if (gid != 0) {
2606                 ll_put_grouplock(llss->inode2, file2, gid);
2607                 ll_put_grouplock(llss->inode1, file1, gid);
2608         }
2609
2610 free:
2611         if (llss != NULL)
2612                 OBD_FREE_PTR(llss);
2613
2614         RETURN(rc);
2615 }
2616
2617 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2618 {
2619         struct obd_export *exp = ll_i2mdexp(inode);
2620         struct md_op_data *op_data;
2621         int rc;
2622         ENTRY;
2623
2624         /* Detect out-of range masks */
2625         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2626                 RETURN(-EINVAL);
2627
2628         /* Non-root users are forbidden to set or clear flags which are
2629          * NOT defined in HSM_USER_MASK. */
2630         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2631             !cfs_capable(CFS_CAP_SYS_ADMIN))
2632                 RETURN(-EPERM);
2633
2634         if (!exp_connect_archive_id_array(exp)) {
2635                 /* Detect out-of range archive id */
2636                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2637                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2638                         RETURN(-EINVAL);
2639         }
2640
2641         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2642                                      LUSTRE_OPC_ANY, hss);
2643         if (IS_ERR(op_data))
2644                 RETURN(PTR_ERR(op_data));
2645
2646         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2647                            op_data, NULL);
2648
2649         ll_finish_md_op_data(op_data);
2650
2651         RETURN(rc);
2652 }
2653
2654 static int ll_hsm_import(struct inode *inode, struct file *file,
2655                          struct hsm_user_import *hui)
2656 {
2657         struct hsm_state_set    *hss = NULL;
2658         struct iattr            *attr = NULL;
2659         int                      rc;
2660         ENTRY;
2661
2662         if (!S_ISREG(inode->i_mode))
2663                 RETURN(-EINVAL);
2664
2665         /* set HSM flags */
2666         OBD_ALLOC_PTR(hss);
2667         if (hss == NULL)
2668                 GOTO(out, rc = -ENOMEM);
2669
2670         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2671         hss->hss_archive_id = hui->hui_archive_id;
2672         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2673         rc = ll_hsm_state_set(inode, hss);
2674         if (rc != 0)
2675                 GOTO(out, rc);
2676
2677         OBD_ALLOC_PTR(attr);
2678         if (attr == NULL)
2679                 GOTO(out, rc = -ENOMEM);
2680
2681         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2682         attr->ia_mode |= S_IFREG;
2683         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2684         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2685         attr->ia_size = hui->hui_size;
2686         attr->ia_mtime.tv_sec = hui->hui_mtime;
2687         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2688         attr->ia_atime.tv_sec = hui->hui_atime;
2689         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2690
2691         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2692                          ATTR_UID | ATTR_GID |
2693                          ATTR_MTIME | ATTR_MTIME_SET |
2694                          ATTR_ATIME | ATTR_ATIME_SET;
2695
2696         inode_lock(inode);
2697
2698         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2699         if (rc == -ENODATA)
2700                 rc = 0;
2701
2702         inode_unlock(inode);
2703
2704 out:
2705         if (hss != NULL)
2706                 OBD_FREE_PTR(hss);
2707
2708         if (attr != NULL)
2709                 OBD_FREE_PTR(attr);
2710
2711         RETURN(rc);
2712 }
2713
2714 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2715 {
2716         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2717                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2718 }
2719
2720 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2721 {
2722         struct inode *inode = file_inode(file);
2723         struct iattr ia = {
2724                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2725                             ATTR_MTIME | ATTR_MTIME_SET |
2726                             ATTR_CTIME,
2727                 .ia_atime = {
2728                         .tv_sec = lfu->lfu_atime_sec,
2729                         .tv_nsec = lfu->lfu_atime_nsec,
2730                 },
2731                 .ia_mtime = {
2732                         .tv_sec = lfu->lfu_mtime_sec,
2733                         .tv_nsec = lfu->lfu_mtime_nsec,
2734                 },
2735                 .ia_ctime = {
2736                         .tv_sec = lfu->lfu_ctime_sec,
2737                         .tv_nsec = lfu->lfu_ctime_nsec,
2738                 },
2739         };
2740         int rc;
2741         ENTRY;
2742
2743         if (!capable(CAP_SYS_ADMIN))
2744                 RETURN(-EPERM);
2745
2746         if (!S_ISREG(inode->i_mode))
2747                 RETURN(-EINVAL);
2748
2749         inode_lock(inode);
2750         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2751                             false);
2752         inode_unlock(inode);
2753
2754         RETURN(rc);
2755 }
2756
2757 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2758 {
2759         switch (mode) {
2760         case MODE_READ_USER:
2761                 return CLM_READ;
2762         case MODE_WRITE_USER:
2763                 return CLM_WRITE;
2764         default:
2765                 return -EINVAL;
2766         }
2767 }
2768
2769 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2770
2771 /* Used to allow the upper layers of the client to request an LDLM lock
2772  * without doing an actual read or write.
2773  *
2774  * Used for ladvise lockahead to manually request specific locks.
2775  *
2776  * \param[in] file      file this ladvise lock request is on
2777  * \param[in] ladvise   ladvise struct describing this lock request
2778  *
2779  * \retval 0            success, no detailed result available (sync requests
2780  *                      and requests sent to the server [not handled locally]
2781  *                      cannot return detailed results)
2782  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2783  *                                       see definitions for details.
2784  * \retval negative     negative errno on error
2785  */
2786 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2787 {
2788         struct lu_env *env = NULL;
2789         struct cl_io *io  = NULL;
2790         struct cl_lock *lock = NULL;
2791         struct cl_lock_descr *descr = NULL;
2792         struct dentry *dentry = file->f_path.dentry;
2793         struct inode *inode = dentry->d_inode;
2794         enum cl_lock_mode cl_mode;
2795         off_t start = ladvise->lla_start;
2796         off_t end = ladvise->lla_end;
2797         int result;
2798         __u16 refcheck;
2799
2800         ENTRY;
2801
2802         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2803                "start=%llu, end=%llu\n", dentry->d_name.len,
2804                dentry->d_name.name, dentry->d_inode,
2805                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2806                (__u64) end);
2807
2808         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2809         if (cl_mode < 0)
2810                 GOTO(out, result = cl_mode);
2811
2812         /* Get IO environment */
2813         result = cl_io_get(inode, &env, &io, &refcheck);
2814         if (result <= 0)
2815                 GOTO(out, result);
2816
2817         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2818         if (result > 0) {
2819                 /*
2820                  * nothing to do for this io. This currently happens when
2821                  * stripe sub-object's are not yet created.
2822                  */
2823                 result = io->ci_result;
2824         } else if (result == 0) {
2825                 lock = vvp_env_lock(env);
2826                 descr = &lock->cll_descr;
2827
2828                 descr->cld_obj   = io->ci_obj;
2829                 /* Convert byte offsets to pages */
2830                 descr->cld_start = cl_index(io->ci_obj, start);
2831                 descr->cld_end   = cl_index(io->ci_obj, end);
2832                 descr->cld_mode  = cl_mode;
2833                 /* CEF_MUST is used because we do not want to convert a
2834                  * lockahead request to a lockless lock */
2835                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2836                                        CEF_NONBLOCK;
2837
2838                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2839                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2840
2841                 result = cl_lock_request(env, io, lock);
2842
2843                 /* On success, we need to release the lock */
2844                 if (result >= 0)
2845                         cl_lock_release(env, lock);
2846         }
2847         cl_io_fini(env, io);
2848         cl_env_put(env, &refcheck);
2849
2850         /* -ECANCELED indicates a matching lock with a different extent
2851          * was already present, and -EEXIST indicates a matching lock
2852          * on exactly the same extent was already present.
2853          * We convert them to positive values for userspace to make
2854          * recognizing true errors easier.
2855          * Note we can only return these detailed results on async requests,
2856          * as sync requests look the same as i/o requests for locking. */
2857         if (result == -ECANCELED)
2858                 result = LLA_RESULT_DIFFERENT;
2859         else if (result == -EEXIST)
2860                 result = LLA_RESULT_SAME;
2861
2862 out:
2863         RETURN(result);
2864 }
2865 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2866
2867 static int ll_ladvise_sanity(struct inode *inode,
2868                              struct llapi_lu_ladvise *ladvise)
2869 {
2870         struct ll_sb_info *sbi = ll_i2sbi(inode);
2871         enum lu_ladvise_type advice = ladvise->lla_advice;
2872         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2873          * be in the first 32 bits of enum ladvise_flags */
2874         __u32 flags = ladvise->lla_peradvice_flags;
2875         /* 3 lines at 80 characters per line, should be plenty */
2876         int rc = 0;
2877
2878         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2879                 rc = -EINVAL;
2880                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2881                        "last supported advice is %s (value '%d'): rc = %d\n",
2882                        sbi->ll_fsname, advice,
2883                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2884                 GOTO(out, rc);
2885         }
2886
2887         /* Per-advice checks */
2888         switch (advice) {
2889         case LU_LADVISE_LOCKNOEXPAND:
2890                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2891                         rc = -EINVAL;
2892                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2893                                "rc = %d\n", sbi->ll_fsname, flags,
2894                                ladvise_names[advice], rc);
2895                         GOTO(out, rc);
2896                 }
2897                 break;
2898         case LU_LADVISE_LOCKAHEAD:
2899                 /* Currently only READ and WRITE modes can be requested */
2900                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2901                     ladvise->lla_lockahead_mode == 0) {
2902                         rc = -EINVAL;
2903                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2904                                "rc = %d\n", sbi->ll_fsname,
2905                                ladvise->lla_lockahead_mode,
2906                                ladvise_names[advice], rc);
2907                         GOTO(out, rc);
2908                 }
2909         case LU_LADVISE_WILLREAD:
2910         case LU_LADVISE_DONTNEED:
2911         default:
2912                 /* Note fall through above - These checks apply to all advices
2913                  * except LOCKNOEXPAND */
2914                 if (flags & ~LF_DEFAULT_MASK) {
2915                         rc = -EINVAL;
2916                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2917                                "rc = %d\n", sbi->ll_fsname, flags,
2918                                ladvise_names[advice], rc);
2919                         GOTO(out, rc);
2920                 }
2921                 if (ladvise->lla_start >= ladvise->lla_end) {
2922                         rc = -EINVAL;
2923                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2924                                "for %s: rc = %d\n", sbi->ll_fsname,
2925                                ladvise->lla_start, ladvise->lla_end,
2926                                ladvise_names[advice], rc);
2927                         GOTO(out, rc);
2928                 }
2929                 break;
2930         }
2931
2932 out:
2933         return rc;
2934 }
2935 #undef ERRSIZE
2936
2937 /*
2938  * Give file access advices
2939  *
2940  * The ladvise interface is similar to Linux fadvise() system call, except it
2941  * forwards the advices directly from Lustre client to server. The server side
2942  * codes will apply appropriate read-ahead and caching techniques for the
2943  * corresponding files.
2944  *
2945  * A typical workload for ladvise is e.g. a bunch of different clients are
2946  * doing small random reads of a file, so prefetching pages into OSS cache
2947  * with big linear reads before the random IO is a net benefit. Fetching
2948  * all that data into each client cache with fadvise() may not be, due to
2949  * much more data being sent to the client.
2950  */
2951 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2952                       struct llapi_lu_ladvise *ladvise)
2953 {
2954         struct lu_env *env;
2955         struct cl_io *io;
2956         struct cl_ladvise_io *lio;
2957         int rc;
2958         __u16 refcheck;
2959         ENTRY;
2960
2961         env = cl_env_get(&refcheck);
2962         if (IS_ERR(env))
2963                 RETURN(PTR_ERR(env));
2964
2965         io = vvp_env_thread_io(env);
2966         io->ci_obj = ll_i2info(inode)->lli_clob;
2967
2968         /* initialize parameters for ladvise */
2969         lio = &io->u.ci_ladvise;
2970         lio->li_start = ladvise->lla_start;
2971         lio->li_end = ladvise->lla_end;
2972         lio->li_fid = ll_inode2fid(inode);
2973         lio->li_advice = ladvise->lla_advice;
2974         lio->li_flags = flags;
2975
2976         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2977                 rc = cl_io_loop(env, io);
2978         else
2979                 rc = io->ci_result;
2980
2981         cl_io_fini(env, io);
2982         cl_env_put(env, &refcheck);
2983         RETURN(rc);
2984 }
2985
2986 static int ll_lock_noexpand(struct file *file, int flags)
2987 {
2988         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2989
2990         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2991
2992         return 0;
2993 }
2994
2995 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2996                         unsigned long arg)
2997 {
2998         struct fsxattr fsxattr;
2999
3000         if (copy_from_user(&fsxattr,
3001                            (const struct fsxattr __user *)arg,
3002                            sizeof(fsxattr)))
3003                 RETURN(-EFAULT);
3004
3005         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3006         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3007                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3008         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3009         if (copy_to_user((struct fsxattr __user *)arg,
3010                          &fsxattr, sizeof(fsxattr)))
3011                 RETURN(-EFAULT);
3012
3013         RETURN(0);
3014 }
3015
3016 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3017 {
3018         /*
3019          * Project Quota ID state is only allowed to change from within the init
3020          * namespace. Enforce that restriction only if we are trying to change
3021          * the quota ID state. Everything else is allowed in user namespaces.
3022          */
3023         if (current_user_ns() == &init_user_ns)
3024                 return 0;
3025
3026         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3027                 return -EINVAL;
3028
3029         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3030                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3031                         return -EINVAL;
3032         } else {
3033                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3034                         return -EINVAL;
3035         }
3036
3037         return 0;
3038 }
3039
3040 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3041                         unsigned long arg)
3042 {
3043
3044         struct md_op_data *op_data;
3045         struct ptlrpc_request *req = NULL;
3046         int rc = 0;
3047         struct fsxattr fsxattr;
3048         struct cl_object *obj;
3049         struct iattr *attr;
3050         int flags;
3051
3052         if (copy_from_user(&fsxattr,
3053                            (const struct fsxattr __user *)arg,
3054                            sizeof(fsxattr)))
3055                 RETURN(-EFAULT);
3056
3057         rc = ll_ioctl_check_project(inode, &fsxattr);
3058         if (rc)
3059                 RETURN(rc);
3060
3061         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3062                                      LUSTRE_OPC_ANY, NULL);
3063         if (IS_ERR(op_data))
3064                 RETURN(PTR_ERR(op_data));
3065
3066         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3067         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3068         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3069                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3070         op_data->op_projid = fsxattr.fsx_projid;
3071         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3072         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3073                         0, &req);
3074         ptlrpc_req_finished(req);
3075         if (rc)
3076                 GOTO(out_fsxattr, rc);
3077         ll_update_inode_flags(inode, op_data->op_attr_flags);
3078         obj = ll_i2info(inode)->lli_clob;
3079         if (obj == NULL)
3080                 GOTO(out_fsxattr, rc);
3081
3082         OBD_ALLOC_PTR(attr);
3083         if (attr == NULL)
3084                 GOTO(out_fsxattr, rc = -ENOMEM);
3085
3086         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3087                             fsxattr.fsx_xflags);
3088         OBD_FREE_PTR(attr);
3089 out_fsxattr:
3090         ll_finish_md_op_data(op_data);
3091         RETURN(rc);
3092 }
3093
3094 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3095                                  unsigned long arg)
3096 {
3097         struct inode            *inode = file_inode(file);
3098         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3099         struct ll_inode_info    *lli = ll_i2info(inode);
3100         struct obd_client_handle *och = NULL;
3101         struct split_param sp;
3102         bool lease_broken;
3103         fmode_t fmode = 0;
3104         enum mds_op_bias bias = 0;
3105         struct file *layout_file = NULL;
3106         void *data = NULL;
3107         size_t data_size = 0;
3108         long rc;
3109         ENTRY;
3110
3111         mutex_lock(&lli->lli_och_mutex);
3112         if (fd->fd_lease_och != NULL) {
3113                 och = fd->fd_lease_och;
3114                 fd->fd_lease_och = NULL;
3115         }
3116         mutex_unlock(&lli->lli_och_mutex);
3117
3118         if (och == NULL)
3119                 GOTO(out, rc = -ENOLCK);
3120
3121         fmode = och->och_flags;
3122
3123         switch (ioc->lil_flags) {
3124         case LL_LEASE_RESYNC_DONE:
3125                 if (ioc->lil_count > IOC_IDS_MAX)
3126                         GOTO(out, rc = -EINVAL);
3127
3128                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3129                 OBD_ALLOC(data, data_size);
3130                 if (!data)
3131                         GOTO(out, rc = -ENOMEM);
3132
3133                 if (copy_from_user(data, (void __user *)arg, data_size))
3134                         GOTO(out, rc = -EFAULT);
3135
3136                 bias = MDS_CLOSE_RESYNC_DONE;
3137                 break;
3138         case LL_LEASE_LAYOUT_MERGE: {
3139                 int fd;
3140
3141                 if (ioc->lil_count != 1)
3142                         GOTO(out, rc = -EINVAL);
3143
3144                 arg += sizeof(*ioc);
3145                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3146                         GOTO(out, rc = -EFAULT);
3147
3148                 layout_file = fget(fd);
3149                 if (!layout_file)
3150                         GOTO(out, rc = -EBADF);
3151
3152                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3153                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3154                         GOTO(out, rc = -EPERM);
3155
3156                 data = file_inode(layout_file);
3157                 bias = MDS_CLOSE_LAYOUT_MERGE;
3158                 break;
3159         }
3160         case LL_LEASE_LAYOUT_SPLIT: {
3161                 int fdv;
3162                 int mirror_id;
3163
3164                 if (ioc->lil_count != 2)
3165                         GOTO(out, rc = -EINVAL);
3166
3167                 arg += sizeof(*ioc);
3168                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3169                         GOTO(out, rc = -EFAULT);
3170
3171                 arg += sizeof(__u32);
3172                 if (copy_from_user(&mirror_id, (void __user *)arg,
3173                                    sizeof(__u32)))
3174                         GOTO(out, rc = -EFAULT);
3175
3176                 layout_file = fget(fdv);
3177                 if (!layout_file)
3178                         GOTO(out, rc = -EBADF);
3179
3180                 sp.sp_inode = file_inode(layout_file);
3181                 sp.sp_mirror_id = (__u16)mirror_id;
3182                 data = &sp;
3183                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3184                 break;
3185         }
3186         default:
3187                 /* without close intent */
3188                 break;
3189         }
3190
3191         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3192         if (rc < 0)
3193                 GOTO(out, rc);
3194
3195         rc = ll_lease_och_release(inode, file);
3196         if (rc < 0)
3197                 GOTO(out, rc);
3198
3199         if (lease_broken)
3200                 fmode = 0;
3201         EXIT;
3202
3203 out:
3204         switch (ioc->lil_flags) {
3205         case LL_LEASE_RESYNC_DONE:
3206                 if (data)
3207                         OBD_FREE(data, data_size);
3208                 break;
3209         case LL_LEASE_LAYOUT_MERGE:
3210         case LL_LEASE_LAYOUT_SPLIT:
3211                 if (layout_file)
3212                         fput(layout_file);
3213                 break;
3214         }
3215
3216         if (!rc)
3217                 rc = ll_lease_type_from_fmode(fmode);
3218         RETURN(rc);
3219 }
3220
3221 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3222                               unsigned long arg)
3223 {
3224         struct inode *inode = file_inode(file);
3225         struct ll_inode_info *lli = ll_i2info(inode);
3226         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3227         struct obd_client_handle *och = NULL;
3228         __u64 open_flags = 0;
3229         bool lease_broken;
3230         fmode_t fmode;
3231         long rc;
3232         ENTRY;
3233
3234         switch (ioc->lil_mode) {
3235         case LL_LEASE_WRLCK:
3236                 if (!(file->f_mode & FMODE_WRITE))
3237                         RETURN(-EPERM);
3238                 fmode = FMODE_WRITE;
3239                 break;
3240         case LL_LEASE_RDLCK:
3241                 if (!(file->f_mode & FMODE_READ))
3242                         RETURN(-EPERM);
3243                 fmode = FMODE_READ;
3244                 break;
3245         case LL_LEASE_UNLCK:
3246                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3247         default:
3248                 RETURN(-EINVAL);
3249         }
3250
3251         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3252
3253         /* apply for lease */
3254         if (ioc->lil_flags & LL_LEASE_RESYNC)
3255                 open_flags = MDS_OPEN_RESYNC;
3256         och = ll_lease_open(inode, file, fmode, open_flags);
3257         if (IS_ERR(och))
3258                 RETURN(PTR_ERR(och));
3259
3260         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3261                 rc = ll_lease_file_resync(och, inode, arg);
3262                 if (rc) {
3263                         ll_lease_close(och, inode, NULL);
3264                         RETURN(rc);
3265                 }
3266                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3267                 if (rc) {
3268                         ll_lease_close(och, inode, NULL);
3269                         RETURN(rc);
3270                 }
3271         }
3272
3273         rc = 0;
3274         mutex_lock(&lli->lli_och_mutex);
3275         if (fd->fd_lease_och == NULL) {
3276                 fd->fd_lease_och = och;
3277                 och = NULL;
3278         }
3279         mutex_unlock(&lli->lli_och_mutex);
3280         if (och != NULL) {
3281                 /* impossible now that only excl is supported for now */
3282                 ll_lease_close(och, inode, &lease_broken);
3283                 rc = -EBUSY;
3284         }
3285         RETURN(rc);
3286 }
3287
3288 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3289 {
3290         struct ll_inode_info *lli = ll_i2info(inode);
3291         struct ll_sb_info *sbi = ll_i2sbi(inode);
3292         __u64 now = ktime_get_real_seconds();
3293         int i;
3294
3295         spin_lock(&lli->lli_heat_lock);
3296         heat->lh_flags = lli->lli_heat_flags;
3297         for (i = 0; i < heat->lh_count; i++)
3298                 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3299                                                 now, sbi->ll_heat_decay_weight,
3300                                                 sbi->ll_heat_period_second);
3301         spin_unlock(&lli->lli_heat_lock);
3302 }
3303
3304 static int ll_heat_set(struct inode *inode, __u64 flags)
3305 {
3306         struct ll_inode_info *lli = ll_i2info(inode);
3307         int rc = 0;
3308
3309         spin_lock(&lli->lli_heat_lock);
3310         if (flags & LU_HEAT_FLAG_CLEAR)
3311                 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3312
3313         if (flags & LU_HEAT_FLAG_OFF)
3314                 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3315         else
3316                 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3317
3318         spin_unlock(&lli->lli_heat_lock);
3319
3320         RETURN(rc);
3321 }
3322
3323 static long
3324 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3325 {
3326         struct inode            *inode = file_inode(file);
3327         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3328         int                      flags, rc;
3329         ENTRY;
3330
3331         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3332                PFID(ll_inode2fid(inode)), inode, cmd);
3333         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3334
3335         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3336         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3337                 RETURN(-ENOTTY);
3338
3339         switch (cmd) {
3340         case LL_IOC_GETFLAGS:
3341                 /* Get the current value of the file flags */
3342                 return put_user(fd->fd_flags, (int __user *)arg);
3343         case LL_IOC_SETFLAGS:
3344         case LL_IOC_CLRFLAGS:
3345                 /* Set or clear specific file flags */
3346                 /* XXX This probably needs checks to ensure the flags are
3347                  *     not abused, and to handle any flag side effects.
3348                  */
3349                 if (get_user(flags, (int __user *) arg))
3350                         RETURN(-EFAULT);
3351
3352                 if (cmd == LL_IOC_SETFLAGS) {
3353                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3354                             !(file->f_flags & O_DIRECT)) {
3355                                 CERROR("%s: unable to disable locking on "
3356                                        "non-O_DIRECT file\n", current->comm);
3357                                 RETURN(-EINVAL);
3358                         }
3359
3360                         fd->fd_flags |= flags;
3361                 } else {
3362                         fd->fd_flags &= ~flags;
3363                 }
3364                 RETURN(0);
3365         case LL_IOC_LOV_SETSTRIPE:
3366         case LL_IOC_LOV_SETSTRIPE_NEW:
3367                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3368         case LL_IOC_LOV_SETEA:
3369                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3370         case LL_IOC_LOV_SWAP_LAYOUTS: {
3371                 struct file *file2;
3372                 struct lustre_swap_layouts lsl;
3373
3374                 if (copy_from_user(&lsl, (char __user *)arg,
3375                                    sizeof(struct lustre_swap_layouts)))
3376                         RETURN(-EFAULT);
3377
3378                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3379                         RETURN(-EPERM);
3380
3381                 file2 = fget(lsl.sl_fd);
3382                 if (file2 == NULL)
3383                         RETURN(-EBADF);
3384
3385                 /* O_WRONLY or O_RDWR */
3386                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3387                         GOTO(out, rc = -EPERM);
3388
3389                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3390                         struct inode                    *inode2;
3391                         struct ll_inode_info            *lli;
3392                         struct obd_client_handle        *och = NULL;
3393
3394                         lli = ll_i2info(inode);
3395                         mutex_lock(&lli->lli_och_mutex);
3396                         if (fd->fd_lease_och != NULL) {
3397                                 och = fd->fd_lease_och;
3398                                 fd->fd_lease_och = NULL;
3399                         }
3400                         mutex_unlock(&lli->lli_och_mutex);
3401                         if (och == NULL)
3402                                 GOTO(out, rc = -ENOLCK);
3403                         inode2 = file_inode(file2);
3404                         rc = ll_swap_layouts_close(och, inode, inode2);
3405                 } else {
3406                         rc = ll_swap_layouts(file, file2, &lsl);
3407                 }
3408 out:
3409                 fput(file2);
3410                 RETURN(rc);
3411         }
3412         case LL_IOC_LOV_GETSTRIPE:
3413         case LL_IOC_LOV_GETSTRIPE_NEW:
3414                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3415         case FS_IOC_GETFLAGS:
3416         case FS_IOC_SETFLAGS:
3417                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3418         case FSFILT_IOC_GETVERSION:
3419         case FS_IOC_GETVERSION:
3420                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3421         /* We need to special case any other ioctls we want to handle,
3422          * to send them to the MDS/OST as appropriate and to properly
3423          * network encode the arg field. */
3424         case FS_IOC_SETVERSION:
3425                 RETURN(-ENOTSUPP);
3426
3427         case LL_IOC_GROUP_LOCK:
3428                 RETURN(ll_get_grouplock(inode, file, arg));
3429         case LL_IOC_GROUP_UNLOCK:
3430                 RETURN(ll_put_grouplock(inode, file, arg));
3431         case IOC_OBD_STATFS:
3432                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3433
3434         case LL_IOC_FLUSHCTX:
3435                 RETURN(ll_flush_ctx(inode));
3436         case LL_IOC_PATH2FID: {
3437                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3438                                  sizeof(struct lu_fid)))
3439                         RETURN(-EFAULT);
3440
3441                 RETURN(0);
3442         }
3443         case LL_IOC_GETPARENT:
3444                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3445
3446         case OBD_IOC_FID2PATH:
3447                 RETURN(ll_fid2path(inode, (void __user *)arg));
3448         case LL_IOC_DATA_VERSION: {
3449                 struct ioc_data_version idv;
3450                 int rc;
3451
3452                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3453                         RETURN(-EFAULT);
3454
3455                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3456                 rc = ll_ioc_data_version(inode, &idv);
3457
3458                 if (rc == 0 &&
3459                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3460                         RETURN(-EFAULT);
3461
3462                 RETURN(rc);
3463         }
3464
3465         case LL_IOC_GET_MDTIDX: {
3466                 int mdtidx;
3467
3468                 mdtidx = ll_get_mdt_idx(inode);
3469                 if (mdtidx < 0)
3470                         RETURN(mdtidx);
3471
3472                 if (put_user((int)mdtidx, (int __user *)arg))
3473                         RETURN(-EFAULT);
3474
3475                 RETURN(0);
3476         }
3477         case OBD_IOC_GETDTNAME:
3478         case OBD_IOC_GETMDNAME:
3479                 RETURN(ll_get_obd_name(inode, cmd, arg));
3480         case LL_IOC_HSM_STATE_GET: {
3481                 struct md_op_data       *op_data;
3482                 struct hsm_user_state   *hus;
3483                 int                      rc;
3484
3485                 OBD_ALLOC_PTR(hus);
3486                 if (hus == NULL)
3487                         RETURN(-ENOMEM);
3488
3489                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3490                                              LUSTRE_OPC_ANY, hus);
3491                 if (IS_ERR(op_data)) {
3492                         OBD_FREE_PTR(hus);
3493                         RETURN(PTR_ERR(op_data));
3494                 }
3495
3496                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3497                                    op_data, NULL);
3498
3499                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3500                         rc = -EFAULT;
3501
3502                 ll_finish_md_op_data(op_data);
3503                 OBD_FREE_PTR(hus);
3504                 RETURN(rc);
3505         }
3506         case LL_IOC_HSM_STATE_SET: {
3507                 struct hsm_state_set    *hss;
3508                 int                      rc;
3509
3510                 OBD_ALLOC_PTR(hss);
3511                 if (hss == NULL)
3512                         RETURN(-ENOMEM);
3513
3514                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3515                         OBD_FREE_PTR(hss);
3516                         RETURN(-EFAULT);
3517                 }
3518
3519                 rc = ll_hsm_state_set(inode, hss);
3520
3521                 OBD_FREE_PTR(hss);
3522                 RETURN(rc);
3523         }
3524         case LL_IOC_HSM_ACTION: {
3525                 struct md_op_data               *op_data;
3526                 struct hsm_current_action       *hca;
3527                 int                              rc;
3528
3529                 OBD_ALLOC_PTR(hca);
3530                 if (hca == NULL)
3531                         RETURN(-ENOMEM);
3532
3533                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3534                                              LUSTRE_OPC_ANY, hca);
3535                 if (IS_ERR(op_data)) {
3536                         OBD_FREE_PTR(hca);
3537                         RETURN(PTR_ERR(op_data));
3538                 }
3539
3540                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3541                                    op_data, NULL);
3542
3543                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3544                         rc = -EFAULT;
3545
3546                 ll_finish_md_op_data(op_data);
3547                 OBD_FREE_PTR(hca);
3548                 RETURN(rc);
3549         }
3550         case LL_IOC_SET_LEASE_OLD: {
3551                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3552
3553                 RETURN(ll_file_set_lease(file, &ioc, 0));
3554         }
3555         case LL_IOC_SET_LEASE: {
3556                 struct ll_ioc_lease ioc;
3557
3558                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3559                         RETURN(-EFAULT);
3560
3561                 RETURN(ll_file_set_lease(file, &ioc, arg));
3562         }
3563         case LL_IOC_GET_LEASE: {
3564                 struct ll_inode_info *lli = ll_i2info(inode);
3565                 struct ldlm_lock *lock = NULL;
3566                 fmode_t fmode = 0;
3567
3568                 mutex_lock(&lli->lli_och_mutex);
3569                 if (fd->fd_lease_och != NULL) {
3570                         struct obd_client_handle *och = fd->fd_lease_och;
3571
3572                         lock = ldlm_handle2lock(&och->och_lease_handle);
3573                         if (lock != NULL) {
3574                                 lock_res_and_lock(lock);
3575                                 if (!ldlm_is_cancel(lock))
3576                                         fmode = och->och_flags;
3577
3578                                 unlock_res_and_lock(lock);
3579                                 LDLM_LOCK_PUT(lock);
3580                         }
3581                 }
3582                 mutex_unlock(&lli->lli_och_mutex);
3583
3584                 RETURN(ll_lease_type_from_fmode(fmode));
3585         }
3586         case LL_IOC_HSM_IMPORT: {
3587                 struct hsm_user_import *hui;
3588
3589                 OBD_ALLOC_PTR(hui);
3590                 if (hui == NULL)
3591                         RETURN(-ENOMEM);
3592
3593                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3594                         OBD_FREE_PTR(hui);
3595                         RETURN(-EFAULT);
3596                 }
3597
3598                 rc = ll_hsm_import(inode, file, hui);
3599
3600                 OBD_FREE_PTR(hui);
3601                 RETURN(rc);
3602         }
3603         case LL_IOC_FUTIMES_3: {
3604                 struct ll_futimes_3 lfu;
3605
3606                 if (copy_from_user(&lfu,
3607                                    (const struct ll_futimes_3 __user *)arg,
3608                                    sizeof(lfu)))
3609                         RETURN(-EFAULT);
3610
3611                 RETURN(ll_file_futimes_3(file, &lfu));
3612         }
3613         case LL_IOC_LADVISE: {
3614                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3615                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3616                 int i;
3617                 int num_advise;
3618                 int alloc_size = sizeof(*k_ladvise_hdr);
3619
3620                 rc = 0;
3621                 u_ladvise_hdr = (void __user *)arg;
3622                 OBD_ALLOC_PTR(k_ladvise_hdr);
3623                 if (k_ladvise_hdr == NULL)
3624                         RETURN(-ENOMEM);
3625
3626                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3627                         GOTO(out_ladvise, rc = -EFAULT);
3628
3629                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3630                     k_ladvise_hdr->lah_count < 1)
3631                         GOTO(out_ladvise, rc = -EINVAL);
3632
3633                 num_advise = k_ladvise_hdr->lah_count;
3634                 if (num_advise >= LAH_COUNT_MAX)
3635                         GOTO(out_ladvise, rc = -EFBIG);
3636
3637                 OBD_FREE_PTR(k_ladvise_hdr);
3638                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3639                                       lah_advise[num_advise]);
3640                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3641                 if (k_ladvise_hdr == NULL)
3642                         RETURN(-ENOMEM);
3643
3644                 /*
3645                  * TODO: submit multiple advices to one server in a single RPC
3646                  */
3647                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3648                         GOTO(out_ladvise, rc = -EFAULT);
3649
3650                 for (i = 0; i < num_advise; i++) {
3651                         struct llapi_lu_ladvise *k_ladvise =
3652                                         &k_ladvise_hdr->lah_advise[i];
3653                         struct llapi_lu_ladvise __user *u_ladvise =
3654                                         &u_ladvise_hdr->lah_advise[i];
3655
3656                         rc = ll_ladvise_sanity(inode, k_ladvise);
3657                         if (rc)
3658                                 GOTO(out_ladvise, rc);
3659
3660                         switch (k_ladvise->lla_advice) {
3661                         case LU_LADVISE_LOCKNOEXPAND:
3662                                 rc = ll_lock_noexpand(file,
3663                                                k_ladvise->lla_peradvice_flags);
3664                                 GOTO(out_ladvise, rc);
3665                         case LU_LADVISE_LOCKAHEAD:
3666
3667                                 rc = ll_file_lock_ahead(file, k_ladvise);
3668
3669                                 if (rc < 0)
3670                                         GOTO(out_ladvise, rc);
3671
3672                                 if (put_user(rc,
3673                                              &u_ladvise->lla_lockahead_result))
3674                                         GOTO(out_ladvise, rc = -EFAULT);
3675                                 break;
3676                         default:
3677                                 rc = ll_ladvise(inode, file,
3678                                                 k_ladvise_hdr->lah_flags,
3679                                                 k_ladvise);
3680                                 if (rc)
3681                                         GOTO(out_ladvise, rc);
3682                                 break;
3683                         }
3684
3685                 }
3686
3687 out_ladvise:
3688                 OBD_FREE(k_ladvise_hdr, alloc_size);
3689                 RETURN(rc);
3690         }
3691         case LL_IOC_FLR_SET_MIRROR: {
3692                 /* mirror I/O must be direct to avoid polluting page cache
3693                  * by stale data. */
3694                 if (!(file->f_flags & O_DIRECT))
3695                         RETURN(-EINVAL);
3696
3697                 fd->fd_designated_mirror = (__u32)arg;
3698                 RETURN(0);
3699         }
3700         case LL_IOC_FSGETXATTR:
3701                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3702         case LL_IOC_FSSETXATTR:
3703                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3704         case BLKSSZGET:
3705                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3706         case LL_IOC_HEAT_GET: {
3707                 struct lu_heat uheat;
3708                 struct lu_heat *heat;
3709                 int size;
3710
3711                 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3712                         RETURN(-EFAULT);
3713
3714                 if (uheat.lh_count > OBD_HEAT_COUNT)
3715                         uheat.lh_count = OBD_HEAT_COUNT;
3716
3717                 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3718                 OBD_ALLOC(heat, size);
3719                 if (heat == NULL)
3720                         RETURN(-ENOMEM);
3721
3722                 heat->lh_count = uheat.lh_count;
3723                 ll_heat_get(inode, heat);
3724                 rc = copy_to_user((char __user *)arg, heat, size);
3725                 OBD_FREE(heat, size);
3726                 RETURN(rc ? -EFAULT : 0);
3727         }
3728         case LL_IOC_HEAT_SET: {
3729                 __u64 flags;
3730
3731                 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3732                         RETURN(-EFAULT);
3733
3734                 rc = ll_heat_set(inode, flags);
3735                 RETURN(rc);
3736         }
3737         default:
3738                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3739                                      (void __user *)arg));
3740         }
3741 }
3742
3743 #ifndef HAVE_FILE_LLSEEK_SIZE
3744 static inline loff_t
3745 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3746 {
3747         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3748                 return -EINVAL;
3749         if (offset > maxsize)
3750                 return -EINVAL;
3751
3752         if (offset != file->f_pos) {
3753                 file->f_pos = offset;
3754                 file->f_version = 0;
3755         }
3756         return offset;
3757 }
3758
3759 static loff_t
3760 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3761                 loff_t maxsize, loff_t eof)
3762 {
3763         struct inode *inode = file_inode(file);
3764
3765         switch (origin) {
3766         case SEEK_END:
3767                 offset += eof;
3768                 break;
3769         case SEEK_CUR:
3770                 /*
3771                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3772                  * position-querying operation.  Avoid rewriting the "same"
3773                  * f_pos value back to the file because a concurrent read(),
3774                  * write() or lseek() might have altered it
3775                  */
3776                 if (offset == 0)
3777                         return file->f_pos;
3778                 /*
3779                  * f_lock protects against read/modify/write race with other
3780                  * SEEK_CURs. Note that parallel writes and reads behave
3781                  * like SEEK_SET.
3782                  */
3783                 inode_lock(inode);
3784                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3785                 inode_unlock(inode);
3786                 return offset;
3787         case SEEK_DATA:
3788                 /*
3789                  * In the generic case the entire file is data, so as long as
3790                  * offset isn't at the end of the file then the offset is data.
3791                  */
3792                 if (offset >= eof)
3793                         return -ENXIO;
3794                 break;
3795         case SEEK_HOLE:
3796                 /*
3797                  * There is a virtual hole at the end of the file, so as long as
3798                  * offset isn't i_size or larger, return i_size.
3799                  */
3800                 if (offset >= eof)
3801                         return -ENXIO;
3802                 offset = eof;
3803                 break;
3804         }
3805
3806         return llseek_execute(file, offset, maxsize);
3807 }
3808 #endif
3809
3810 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3811 {
3812         struct inode *inode = file_inode(file);
3813         loff_t retval, eof = 0;
3814
3815         ENTRY;
3816         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3817                            (origin == SEEK_CUR) ? file->f_pos : 0);
3818         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3819                PFID(ll_inode2fid(inode)), inode, retval, retval,
3820                origin);
3821         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3822
3823         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3824                 retval = ll_glimpse_size(inode);
3825                 if (retval != 0)
3826                         RETURN(retval);
3827                 eof = i_size_read(inode);
3828         }
3829
3830         retval = ll_generic_file_llseek_size(file, offset, origin,
3831                                           ll_file_maxbytes(inode), eof);
3832         RETURN(retval);
3833 }
3834
3835 static int ll_flush(struct file *file, fl_owner_t id)
3836 {
3837         struct inode *inode = file_inode(file);
3838         struct ll_inode_info *lli = ll_i2info(inode);
3839         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3840         int rc, err;
3841
3842         LASSERT(!S_ISDIR(inode->i_mode));
3843
3844         /* catch async errors that were recorded back when async writeback
3845          * failed for pages in this mapping. */
3846         rc = lli->lli_async_rc;
3847         lli->lli_async_rc = 0;
3848         if (lli->lli_clob != NULL) {
3849                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3850                 if (rc == 0)
3851                         rc = err;
3852         }
3853
3854         /* The application has been told write failure already.
3855          * Do not report failure again. */
3856         if (fd->fd_write_failed)
3857                 return 0;
3858         return rc ? -EIO : 0;
3859 }
3860
3861 /**
3862  * Called to make sure a portion of file has been written out.
3863  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3864  *
3865  * Return how many pages have been written.
3866  */
3867 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3868                        enum cl_fsync_mode mode, int ignore_layout)
3869 {
3870         struct lu_env *env;
3871         struct cl_io *io;
3872         struct cl_fsync_io *fio;
3873         int result;
3874         __u16 refcheck;
3875         ENTRY;
3876
3877         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3878             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3879                 RETURN(-EINVAL);
3880
3881         env = cl_env_get(&refcheck);
3882         if (IS_ERR(env))
3883                 RETURN(PTR_ERR(env));
3884
3885         io = vvp_env_thread_io(env);
3886         io->ci_obj = ll_i2info(inode)->lli_clob;
3887         io->ci_ignore_layout = ignore_layout;
3888
3889         /* initialize parameters for sync */
3890         fio = &io->u.ci_fsync;
3891         fio->fi_start = start;
3892         fio->fi_end = end;
3893         fio->fi_fid = ll_inode2fid(inode);
3894         fio->fi_mode = mode;
3895         fio->fi_nr_written = 0;
3896
3897         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3898                 result = cl_io_loop(env, io);
3899         else
3900                 result = io->ci_result;
3901         if (result == 0)
3902                 result = fio->fi_nr_written;
3903         cl_io_fini(env, io);
3904         cl_env_put(env, &refcheck);
3905
3906         RETURN(result);
3907 }
3908
3909 /*
3910  * When dentry is provided (the 'else' case), file_dentry() may be
3911  * null and dentry must be used directly rather than pulled from
3912  * file_dentry() as is done otherwise.
3913  */
3914
3915 #ifdef HAVE_FILE_FSYNC_4ARGS
3916 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3917 {
3918         struct dentry *dentry = file_dentry(file);
3919 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3920 int ll_fsync(struct file *file, int datasync)
3921 {
3922         struct dentry *dentry = file_dentry(file);
3923         loff_t start = 0;
3924         loff_t end = LLONG_MAX;
3925 #else
3926 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3927 {
3928         loff_t start = 0;
3929         loff_t end = LLONG_MAX;
3930 #endif
3931         struct inode *inode = dentry->d_inode;
3932         struct ll_inode_info *lli = ll_i2info(inode);
3933         struct ptlrpc_request *req;
3934         int rc, err;
3935         ENTRY;
3936
3937         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3938                PFID(ll_inode2fid(inode)), inode);
3939         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3940
3941 #ifdef HAVE_FILE_FSYNC_4ARGS
3942         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3943         inode_lock(inode);
3944 #else
3945         /* fsync's caller has already called _fdata{sync,write}, we want
3946          * that IO to finish before calling the osc and mdc sync methods */
3947         rc = filemap_fdatawait(inode->i_mapping);
3948 #endif
3949
3950         /* catch async errors that were recorded back when async writeback
3951          * failed for pages in this mapping. */
3952         if (!S_ISDIR(inode->i_mode)) {
3953                 err = lli->lli_async_rc;
3954                 lli->lli_async_rc = 0;
3955                 if (rc == 0)
3956                         rc = err;
3957                 if (lli->lli_clob != NULL) {
3958                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3959                         if (rc == 0)
3960                                 rc = err;
3961                 }
3962         }
3963
3964         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3965         if (!rc)
3966                 rc = err;
3967         if (!err)
3968                 ptlrpc_req_finished(req);
3969
3970         if (S_ISREG(inode->i_mode)) {
3971                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3972
3973                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3974                 if (rc == 0 && err < 0)
3975                         rc = err;
3976                 if (rc < 0)
3977                         fd->fd_write_failed = true;
3978                 else
3979                         fd->fd_write_failed = false;
3980         }
3981
3982 #ifdef HAVE_FILE_FSYNC_4ARGS
3983         inode_unlock(inode);
3984 #endif
3985         RETURN(rc);
3986 }
3987
3988 static int
3989 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3990 {
3991         struct inode *inode = file_inode(file);
3992         struct ll_sb_info *sbi = ll_i2sbi(inode);
3993         struct ldlm_enqueue_info einfo = {
3994                 .ei_type        = LDLM_FLOCK,
3995                 .ei_cb_cp       = ldlm_flock_completion_ast,
3996                 .ei_cbdata      = file_lock,
3997         };
3998         struct md_op_data *op_data;
3999         struct lustre_handle lockh = { 0 };
4000         union ldlm_policy_data flock = { { 0 } };
4001         int fl_type = file_lock->fl_type;
4002         __u64 flags = 0;
4003         int rc;
4004         int rc2 = 0;
4005         ENTRY;
4006
4007         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4008                PFID(ll_inode2fid(inode)), file_lock);
4009
4010         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4011
4012         if (file_lock->fl_flags & FL_FLOCK) {
4013                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4014                 /* flocks are whole-file locks */
4015                 flock.l_flock.end = OFFSET_MAX;
4016                 /* For flocks owner is determined by the local file desctiptor*/
4017                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4018         } else if (file_lock->fl_flags & FL_POSIX) {
4019                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4020                 flock.l_flock.start = file_lock->fl_start;
4021                 flock.l_flock.end = file_lock->fl_end;
4022         } else {
4023                 RETURN(-EINVAL);
4024         }
4025         flock.l_flock.pid = file_lock->fl_pid;
4026
4027         /* Somewhat ugly workaround for svc lockd.
4028          * lockd installs custom fl_lmops->lm_compare_owner that checks
4029          * for the fl_owner to be the same (which it always is on local node
4030          * I guess between lockd processes) and then compares pid.
4031          * As such we assign pid to the owner field to make it all work,
4032          * conflict with normal locks is unlikely since pid space and
4033          * pointer space for current->files are not intersecting */
4034         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4035                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4036
4037         switch (fl_type) {
4038         case F_RDLCK:
4039                 einfo.ei_mode = LCK_PR;
4040                 break;
4041         case F_UNLCK:
4042                 /* An unlock request may or may not have any relation to
4043                  * existing locks so we may not be able to pass a lock handle
4044                  * via a normal ldlm_lock_cancel() request. The request may even
4045                  * unlock a byte range in the middle of an existing lock. In
4046                  * order to process an unlock request we need all of the same
4047                  * information that is given with a normal read or write record
4048                  * lock request. To avoid creating another ldlm unlock (cancel)
4049                  * message we'll treat a LCK_NL flock request as an unlock. */
4050                 einfo.ei_mode = LCK_NL;
4051                 break;
4052         case F_WRLCK:
4053                 einfo.ei_mode = LCK_PW;
4054                 break;
4055         default:
4056                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4057                 RETURN (-ENOTSUPP);
4058         }
4059
4060         switch (cmd) {
4061         case F_SETLKW:
4062 #ifdef F_SETLKW64
4063         case F_SETLKW64:
4064 #endif
4065                 flags = 0;
4066                 break;
4067         case F_SETLK:
4068 #ifdef F_SETLK64
4069         case F_SETLK64:
4070 #endif
4071                 flags = LDLM_FL_BLOCK_NOWAIT;
4072                 break;
4073         case F_GETLK:
4074 #ifdef F_GETLK64
4075         case F_GETLK64:
4076 #endif
4077                 flags = LDLM_FL_TEST_LOCK;
4078                 break;
4079         default:
4080                 CERROR("unknown fcntl lock command: %d\n", cmd);
4081                 RETURN (-EINVAL);
4082         }
4083
4084         /* Save the old mode so that if the mode in the lock changes we
4085          * can decrement the appropriate reader or writer refcount. */
4086         file_lock->fl_type = einfo.ei_mode;
4087
4088         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4089                                      LUSTRE_OPC_ANY, NULL);
4090         if (IS_ERR(op_data))
4091                 RETURN(PTR_ERR(op_data));
4092
4093         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4094                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4095                flock.l_flock.pid, flags, einfo.ei_mode,
4096                flock.l_flock.start, flock.l_flock.end);
4097
4098         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4099                         flags);
4100
4101         /* Restore the file lock type if not TEST lock. */
4102         if (!(flags & LDLM_FL_TEST_LOCK))
4103                 file_lock->fl_type = fl_type;
4104
4105 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4106         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4107             !(flags & LDLM_FL_TEST_LOCK))
4108                 rc2  = locks_lock_file_wait(file, file_lock);
4109 #else
4110         if ((file_lock->fl_flags & FL_FLOCK) &&
4111             (rc == 0 || file_lock->fl_type == F_UNLCK))
4112                 rc2  = flock_lock_file_wait(file, file_lock);
4113         if ((file_lock->fl_flags & FL_POSIX) &&
4114             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4115             !(flags & LDLM_FL_TEST_LOCK))
4116                 rc2  = posix_lock_file_wait(file, file_lock);
4117 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4118
4119         if (rc2 && file_lock->fl_type != F_UNLCK) {
4120                 einfo.ei_mode = LCK_NL;
4121                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4122                            &lockh, flags);
4123                 rc = rc2;
4124         }
4125
4126         ll_finish_md_op_data(op_data);
4127
4128         RETURN(rc);
4129 }
4130
4131 int ll_get_fid_by_name(struct inode *parent, const char *name,
4132                        int namelen, struct lu_fid *fid,
4133                        struct inode **inode)
4134 {
4135         struct md_op_data       *op_data = NULL;
4136         struct mdt_body         *body;
4137         struct ptlrpc_request   *req;
4138         int                     rc;
4139         ENTRY;
4140
4141         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4142                                      LUSTRE_OPC_ANY, NULL);
4143         if (IS_ERR(op_data))
4144                 RETURN(PTR_ERR(op_data));
4145
4146         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4147         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4148         ll_finish_md_op_data(op_data);
4149         if (rc < 0)
4150                 RETURN(rc);
4151
4152         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4153         if (body == NULL)
4154                 GOTO(out_req, rc = -EFAULT);
4155         if (fid != NULL)
4156                 *fid = body->mbo_fid1;
4157
4158         if (inode != NULL)
4159                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4160 out_req:
4161         ptlrpc_req_finished(req);
4162         RETURN(rc);
4163 }
4164
4165 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4166                const char *name)
4167 {
4168         struct dentry *dchild = NULL;
4169         struct inode *child_inode = NULL;
4170         struct md_op_data *op_data;
4171         struct ptlrpc_request *request = NULL;
4172         struct obd_client_handle *och = NULL;
4173         struct qstr qstr;
4174         struct mdt_body *body;
4175         __u64 data_version = 0;
4176         size_t namelen = strlen(name);
4177         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4178         int rc;
4179         ENTRY;
4180
4181         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4182                PFID(ll_inode2fid(parent)), name,
4183                lum->lum_stripe_offset, lum->lum_stripe_count);
4184
4185         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4186             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4187                 lustre_swab_lmv_user_md(lum);
4188
4189         /* Get child FID first */
4190         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4191         qstr.name = name;
4192         qstr.len = namelen;
4193         dchild = d_lookup(file_dentry(file), &qstr);
4194         if (dchild) {
4195                 if (dchild->d_inode)
4196                         child_inode = igrab(dchild->d_inode);
4197                 dput(dchild);
4198         }
4199
4200         if (!child_inode) {
4201                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4202                                         &child_inode);
4203                 if (rc)
4204                         RETURN(rc);
4205         }
4206
4207         if (!child_inode)
4208                 RETURN(-ENOENT);
4209
4210         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4211               OBD_CONNECT2_DIR_MIGRATE)) {
4212                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4213                     ll_i2info(child_inode)->lli_lsm_md) {
4214                         CERROR("%s: MDT doesn't support stripe directory "
4215                                "migration!\n", ll_i2sbi(parent)->ll_fsname);
4216                         GOTO(out_iput, rc = -EOPNOTSUPP);
4217                 }
4218         }
4219
4220         /*
4221          * lfs migrate command needs to be blocked on the client
4222          * by checking the migrate FID against the FID of the
4223          * filesystem root.
4224          */
4225         if (child_inode == parent->i_sb->s_root->d_inode)
4226                 GOTO(out_iput, rc = -EINVAL);
4227
4228         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4229                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4230         if (IS_ERR(op_data))
4231                 GOTO(out_iput, rc = PTR_ERR(op_data));
4232
4233         inode_lock(child_inode);
4234         op_data->op_fid3 = *ll_inode2fid(child_inode);
4235         if (!fid_is_sane(&op_data->op_fid3)) {
4236                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4237                        ll_i2sbi(parent)->ll_fsname, name,
4238                        PFID(&op_data->op_fid3));
4239                 GOTO(out_unlock, rc = -EINVAL);
4240         }
4241
4242         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4243         op_data->op_data = lum;
4244         op_data->op_data_size = lumlen;
4245
4246 again:
4247         if (S_ISREG(child_inode->i_mode)) {
4248                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4249                 if (IS_ERR(och)) {
4250                         rc = PTR_ERR(och);
4251                         och = NULL;
4252                         GOTO(out_unlock, rc);
4253                 }
4254
4255                 rc = ll_data_version(child_inode, &data_version,
4256                                      LL_DV_WR_FLUSH);
4257                 if (rc != 0)
4258                         GOTO(out_close, rc);
4259
4260                 op_data->op_open_handle = och->och_open_handle;
4261                 op_data->op_data_version = data_version;
4262                 op_data->op_lease_handle = och->och_lease_handle;
4263                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4264
4265                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4266                 och->och_mod->mod_open_req->rq_replay = 0;
4267                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4268         }
4269
4270         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4271                        name, namelen, &request);
4272         if (rc == 0) {
4273                 LASSERT(request != NULL);
4274                 ll_update_times(request, parent);
4275         }
4276
4277         if (rc == 0 || rc == -EAGAIN) {
4278                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4279                 LASSERT(body != NULL);
4280
4281                 /* If the server does release layout lock, then we cleanup
4282                  * the client och here, otherwise release it in out_close: */
4283                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4284                         obd_mod_put(och->och_mod);
4285                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4286                                                   och);
4287                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4288                         OBD_FREE_PTR(och);
4289                         och = NULL;
4290                 }
4291         }
4292
4293         if (request != NULL) {
4294                 ptlrpc_req_finished(request);
4295                 request = NULL;
4296         }
4297
4298         /* Try again if the lease has cancelled. */
4299         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4300                 goto again;
4301
4302 out_close:
4303         if (och)
4304                 ll_lease_close(och, child_inode, NULL);
4305         if (!rc)
4306                 clear_nlink(child_inode);
4307 out_unlock:
4308         inode_unlock(child_inode);
4309         ll_finish_md_op_data(op_data);
4310 out_iput:
4311         iput(child_inode);
4312         RETURN(rc);
4313 }
4314
4315 static int
4316 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4317 {
4318         ENTRY;
4319
4320         RETURN(-ENOSYS);
4321 }
4322
4323 /**
4324  * test if some locks matching bits and l_req_mode are acquired
4325  * - bits can be in different locks
4326  * - if found clear the common lock bits in *bits
4327  * - the bits not found, are kept in *bits
4328  * \param inode [IN]
4329  * \param bits [IN] searched lock bits [IN]
4330  * \param l_req_mode [IN] searched lock mode
4331  * \retval boolean, true iff all bits are found
4332  */
4333 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4334 {
4335         struct lustre_handle lockh;
4336         union ldlm_policy_data policy;
4337         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4338                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4339         struct lu_fid *fid;
4340         __u64 flags;
4341         int i;
4342         ENTRY;
4343
4344         if (!inode)
4345                RETURN(0);
4346
4347         fid = &ll_i2info(inode)->lli_fid;
4348         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4349                ldlm_lockname[mode]);
4350
4351         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4352         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4353                 policy.l_inodebits.bits = *bits & (1 << i);
4354                 if (policy.l_inodebits.bits == 0)
4355                         continue;
4356
4357                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4358                                   &policy, mode, &lockh)) {
4359                         struct ldlm_lock *lock;
4360
4361                         lock = ldlm_handle2lock(&lockh);
4362                         if (lock) {
4363                                 *bits &=
4364                                       ~(lock->l_policy_data.l_inodebits.bits);
4365                                 LDLM_LOCK_PUT(lock);
4366                         } else {
4367                                 *bits &= ~policy.l_inodebits.bits;
4368                         }
4369                 }
4370         }
4371         RETURN(*bits == 0);
4372 }
4373
4374 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4375                                struct lustre_handle *lockh, __u64 flags,
4376                                enum ldlm_mode mode)
4377 {
4378         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4379         struct lu_fid *fid;
4380         enum ldlm_mode rc;
4381         ENTRY;
4382
4383         fid = &ll_i2info(inode)->lli_fid;
4384         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4385
4386         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4387                            fid, LDLM_IBITS, &policy, mode, lockh);
4388
4389         RETURN(rc);
4390 }
4391
4392 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4393 {
4394         /* Already unlinked. Just update nlink and return success */
4395         if (rc == -ENOENT) {
4396                 clear_nlink(inode);
4397                 /* If it is striped directory, and there is bad stripe
4398                  * Let's revalidate the dentry again, instead of returning
4399                  * error */
4400                 if (S_ISDIR(inode->i_mode) &&
4401                     ll_i2info(inode)->lli_lsm_md != NULL)
4402                         return 0;
4403
4404                 /* This path cannot be hit for regular files unless in
4405                  * case of obscure races, so no need to to validate
4406                  * size. */
4407                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4408                         return 0;
4409         } else if (rc != 0) {
4410                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4411                              "%s: revalidate FID "DFID" error: rc = %d\n",
4412                              ll_i2sbi(inode)->ll_fsname,
4413                              PFID(ll_inode2fid(inode)), rc);
4414         }
4415
4416         return rc;
4417 }
4418
4419 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4420 {
4421         struct inode *inode = dentry->d_inode;
4422         struct obd_export *exp = ll_i2mdexp(inode);
4423         struct lookup_intent oit = {
4424                 .it_op = op,
4425         };
4426         struct ptlrpc_request *req = NULL;
4427         struct md_op_data *op_data;
4428         int rc = 0;
4429         ENTRY;
4430
4431         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4432                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4433
4434         /* Call getattr by fid, so do not provide name at all. */
4435         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4436                                      LUSTRE_OPC_ANY, NULL);
4437         if (IS_ERR(op_data))
4438                 RETURN(PTR_ERR(op_data));
4439
4440         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4441         ll_finish_md_op_data(op_data);
4442         if (rc < 0) {
4443                 rc = ll_inode_revalidate_fini(inode, rc);
4444                 GOTO(out, rc);
4445         }
4446
4447         rc = ll_revalidate_it_finish(req, &oit, dentry);
4448         if (rc != 0) {
4449                 ll_intent_release(&oit);
4450                 GOTO(out, rc);
4451         }
4452
4453         /* Unlinked? Unhash dentry, so it is not picked up later by
4454          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4455          * here to preserve get_cwd functionality on 2.6.
4456          * Bug 10503 */
4457         if (!dentry->d_inode->i_nlink) {
4458                 ll_lock_dcache(inode);
4459                 d_lustre_invalidate(dentry, 0);
4460                 ll_unlock_dcache(inode);
4461         }
4462
4463         ll_lookup_finish_locks(&oit, dentry);
4464 out:
4465         ptlrpc_req_finished(req);
4466
4467         return rc;
4468 }
4469
4470 static int ll_merge_md_attr(struct inode *inode)
4471 {
4472         struct ll_inode_info *lli = ll_i2info(inode);
4473         struct cl_attr attr = { 0 };
4474         int rc;
4475
4476         LASSERT(lli->lli_lsm_md != NULL);
4477         down_read(&lli->lli_lsm_sem);
4478         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4479                            &attr, ll_md_blocking_ast);
4480         up_read(&lli->lli_lsm_sem);
4481         if (rc != 0)
4482                 RETURN(rc);
4483
4484         set_nlink(inode, attr.cat_nlink);
4485         inode->i_blocks = attr.cat_blocks;
4486         i_size_write(inode, attr.cat_size);
4487
4488         ll_i2info(inode)->lli_atime = attr.cat_atime;
4489         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4490         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4491
4492         RETURN(0);
4493 }
4494
4495 static inline dev_t ll_compat_encode_dev(dev_t dev)
4496 {
4497         /* The compat_sys_*stat*() syscalls will fail unless the
4498          * device majors and minors are both less than 256. Note that
4499          * the value returned here will be passed through
4500          * old_encode_dev() in cp_compat_stat(). And so we are not
4501          * trying to return a valid compat (u16) device number, just
4502          * one that will pass the old_valid_dev() check. */
4503
4504         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4505 }
4506
4507 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4508 int ll_getattr(const struct path *path, struct kstat *stat,
4509                u32 request_mask, unsigned int flags)
4510 {
4511         struct dentry *de = path->dentry;
4512 #else
4513 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4514 {
4515 #endif
4516         struct inode *inode = de->d_inode;
4517         struct ll_sb_info *sbi = ll_i2sbi(inode);
4518         struct ll_inode_info *lli = ll_i2info(inode);
4519         int rc;
4520
4521         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4522
4523         rc = ll_inode_revalidate(de, IT_GETATTR);
4524         if (rc < 0)
4525                 RETURN(rc);
4526
4527         if (S_ISREG(inode->i_mode)) {
4528                 /* In case of restore, the MDT has the right size and has
4529                  * already send it back without granting the layout lock,
4530                  * inode is up-to-date so glimpse is useless.
4531                  * Also to glimpse we need the layout, in case of a running
4532                  * restore the MDT holds the layout lock so the glimpse will
4533                  * block up to the end of restore (getattr will block)
4534                  */
4535                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4536                         rc = ll_glimpse_size(inode);
4537                         if (rc < 0)
4538                                 RETURN(rc);
4539                 }
4540         } else {
4541                 /* If object isn't regular a file then don't validate size. */
4542                 if (S_ISDIR(inode->i_mode) &&
4543                     lli->lli_lsm_md != NULL) {
4544                         rc = ll_merge_md_attr(inode);
4545                         if (rc < 0)
4546                                 RETURN(rc);
4547                 }
4548
4549                 inode->i_atime.tv_sec = lli->lli_atime;
4550                 inode->i_mtime.tv_sec = lli->lli_mtime;
4551                 inode->i_ctime.tv_sec = lli->lli_ctime;
4552         }
4553
4554         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4555
4556         if (ll_need_32bit_api(sbi)) {
4557                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4558                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4559                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4560         } else {
4561                 stat->ino = inode->i_ino;
4562                 stat->dev = inode->i_sb->s_dev;
4563                 stat->rdev = inode->i_rdev;
4564         }
4565
4566         stat->mode = inode->i_mode;
4567         stat->uid = inode->i_uid;
4568         stat->gid = inode->i_gid;
4569         stat->atime = inode->i_atime;
4570         stat->mtime = inode->i_mtime;
4571         stat->ctime = inode->i_ctime;
4572         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4573
4574         stat->nlink = inode->i_nlink;
4575         stat->size = i_size_read(inode);
4576         stat->blocks = inode->i_blocks;
4577
4578         return 0;
4579 }
4580
4581 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4582                      __u64 start, __u64 len)
4583 {
4584         int             rc;
4585         size_t          num_bytes;
4586         struct fiemap   *fiemap;
4587         unsigned int    extent_count = fieinfo->fi_extents_max;
4588
4589         num_bytes = sizeof(*fiemap) + (extent_count *
4590                                        sizeof(struct fiemap_extent));
4591         OBD_ALLOC_LARGE(fiemap, num_bytes);
4592
4593         if (fiemap == NULL)
4594                 RETURN(-ENOMEM);
4595
4596         fiemap->fm_flags = fieinfo->fi_flags;
4597         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4598         fiemap->fm_start = start;
4599         fiemap->fm_length = len;
4600         if (extent_count > 0 &&
4601             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4602                            sizeof(struct fiemap_extent)) != 0)
4603                 GOTO(out, rc = -EFAULT);
4604
4605         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4606
4607         fieinfo->fi_flags = fiemap->fm_flags;
4608         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4609         if (extent_count > 0 &&
4610             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4611                          fiemap->fm_mapped_extents *
4612                          sizeof(struct fiemap_extent)) != 0)
4613                 GOTO(out, rc = -EFAULT);
4614 out:
4615         OBD_FREE_LARGE(fiemap, num_bytes);
4616         return rc;
4617 }
4618
4619 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4620 {
4621         struct ll_inode_info *lli = ll_i2info(inode);
4622         struct posix_acl *acl = NULL;
4623         ENTRY;
4624
4625         spin_lock(&lli->lli_lock);
4626         /* VFS' acl_permission_check->check_acl will release the refcount */
4627         acl = posix_acl_dup(lli->lli_posix_acl);
4628         spin_unlock(&lli->lli_lock);
4629
4630         RETURN(acl);
4631 }
4632
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * set_acl inode operation: store or remove a POSIX ACL via md_setxattr().
 *
 * \param inode	inode the ACL belongs to
 * \param acl		ACL to store, or NULL to remove the xattr
 * \param type		ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0		success, or a NULL default ACL on a non-directory;
 *			when the request was sent the ACL cache is updated
 * \retval -EINVAL	unknown \a type
 * \retval -EACCES	default ACL given for a non-directory
 * \retval -ENOMEM	the xattr buffer could not be allocated
 * \retval other	error from posix_acl_update_mode(),
 *			posix_acl_to_xattr() or md_setxattr()
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req = NULL;
	const char *name = NULL;
	char *value = NULL;
	size_t value_size = 0;
	int rc = 0;
	ENTRY;

	switch (type) {
	case ACL_TYPE_ACCESS:
		name = XATTR_NAME_POSIX_ACL_ACCESS;
		/* may update i_mode and replace acl */
		if (acl)
			rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
		break;

	case ACL_TYPE_DEFAULT:
		name = XATTR_NAME_POSIX_ACL_DEFAULT;
		/* on a non-directory, removing a default ACL is a successful
		 * no-op; setting one is refused */
		if (!S_ISDIR(inode->i_mode))
			rc = acl ? -EACCES : 0;
		break;

	default:
		rc = -EINVAL;
		break;
	}
	/* leave through RETURN() so the ENTRY above stays paired */
	if (rc)
		RETURN(rc);

	if (acl) {
		value_size = posix_acl_xattr_size(acl->a_count);
		value = kmalloc(value_size, GFP_NOFS);
		if (value == NULL)
			GOTO(out, rc = -ENOMEM);

		rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
		if (rc < 0)
			GOTO(out_value, rc);
	}

	/* no value means the xattr is removed rather than set */
	rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
			 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
			 name, value, value_size, 0, 0, &req);

	ptlrpc_req_finished(req);
out_value:
	kfree(value);
out:
	if (rc)
		forget_cached_acl(inode, type);
	else
		set_cached_acl(inode, type, acl);
	RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4692
4693 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4694 static int
4695 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4696 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4697 # else
4698 ll_check_acl(struct inode *inode, int mask)
4699 # endif
4700 {
4701 # ifdef CONFIG_FS_POSIX_ACL
4702         struct posix_acl *acl;
4703         int rc;
4704         ENTRY;
4705
4706 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4707         if (flags & IPERM_FLAG_RCU)
4708                 return -ECHILD;
4709 #  endif
4710         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4711
4712         if (!acl)
4713                 RETURN(-EAGAIN);
4714
4715         rc = posix_acl_permission(inode, acl, mask);
4716         posix_acl_release(acl);
4717
4718         RETURN(rc);
4719 # else /* !CONFIG_FS_POSIX_ACL */
4720         return -EAGAIN;
4721 # endif /* CONFIG_FS_POSIX_ACL */
4722 }
4723 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4724
4725 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4726 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4727 #else
4728 # ifdef HAVE_INODE_PERMISION_2ARGS
4729 int ll_inode_permission(struct inode *inode, int mask)
4730 # else
4731 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4732 # endif
4733 #endif
4734 {
4735         int rc = 0;
4736         struct ll_sb_info *sbi;
4737         struct root_squash_info *squash;
4738         struct cred *cred = NULL;
4739         const struct cred *old_cred = NULL;
4740         cfs_cap_t cap;
4741         bool squash_id = false;
4742         ENTRY;
4743
4744 #ifdef MAY_NOT_BLOCK
4745         if (mask & MAY_NOT_BLOCK)
4746                 return -ECHILD;
4747 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4748         if (flags & IPERM_FLAG_RCU)
4749                 return -ECHILD;
4750 #endif
4751
4752        /* as root inode are NOT getting validated in lookup operation,
4753         * need to do it before permission check. */
4754
4755         if (inode == inode->i_sb->s_root->d_inode) {
4756                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4757                 if (rc)
4758                         RETURN(rc);
4759         }
4760
4761         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4762                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4763
4764         /* squash fsuid/fsgid if needed */
4765         sbi = ll_i2sbi(inode);
4766         squash = &sbi->ll_squash;
4767         if (unlikely(squash->rsi_uid != 0 &&
4768                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4769                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4770                         squash_id = true;
4771         }
4772         if (squash_id) {
4773                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4774                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4775                        squash->rsi_uid, squash->rsi_gid);
4776
4777                 /* update current process's credentials
4778                  * and FS capability */
4779                 cred = prepare_creds();
4780                 if (cred == NULL)
4781                         RETURN(-ENOMEM);
4782
4783                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4784                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4785                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4786                         if ((1 << cap) & CFS_CAP_FS_MASK)
4787                                 cap_lower(cred->cap_effective, cap);
4788                 }
4789                 old_cred = override_creds(cred);
4790         }
4791
4792         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4793         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4794         /* restore current process's credentials and FS capability */
4795         if (squash_id) {
4796                 revert_creds(old_cred);
4797                 put_cred(cred);
4798         }
4799
4800         RETURN(rc);
4801 }
4802
4803 /* -o localflock - only provides locally consistent flock locks */
4804 struct file_operations ll_file_operations = {
4805 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4806 # ifdef HAVE_SYNC_READ_WRITE
4807         .read           = new_sync_read,
4808         .write          = new_sync_write,
4809 # endif
4810         .read_iter      = ll_file_read_iter,
4811         .write_iter     = ll_file_write_iter,
4812 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4813         .read           = ll_file_read,
4814         .aio_read       = ll_file_aio_read,
4815         .write          = ll_file_write,
4816         .aio_write      = ll_file_aio_write,
4817 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4818         .unlocked_ioctl = ll_file_ioctl,
4819         .open           = ll_file_open,
4820         .release        = ll_file_release,
4821         .mmap           = ll_file_mmap,
4822         .llseek         = ll_file_seek,
4823         .splice_read    = ll_file_splice_read,
4824         .fsync          = ll_fsync,
4825         .flush          = ll_flush
4826 };
4827
4828 struct file_operations ll_file_operations_flock = {
4829 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4830 # ifdef HAVE_SYNC_READ_WRITE
4831         .read           = new_sync_read,
4832         .write          = new_sync_write,
4833 # endif /* HAVE_SYNC_READ_WRITE */
4834         .read_iter      = ll_file_read_iter,
4835         .write_iter     = ll_file_write_iter,
4836 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4837         .read           = ll_file_read,
4838         .aio_read       = ll_file_aio_read,
4839         .write          = ll_file_write,
4840         .aio_write      = ll_file_aio_write,
4841 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4842         .unlocked_ioctl = ll_file_ioctl,
4843         .open           = ll_file_open,
4844         .release        = ll_file_release,
4845         .mmap           = ll_file_mmap,
4846         .llseek         = ll_file_seek,
4847         .splice_read    = ll_file_splice_read,
4848         .fsync          = ll_fsync,
4849         .flush          = ll_flush,
4850         .flock          = ll_file_flock,
4851         .lock           = ll_file_flock
4852 };
4853
4854 /* These are for -o noflock - to return ENOSYS on flock calls */
4855 struct file_operations ll_file_operations_noflock = {
4856 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4857 # ifdef HAVE_SYNC_READ_WRITE
4858         .read           = new_sync_read,
4859         .write          = new_sync_write,
4860 # endif /* HAVE_SYNC_READ_WRITE */
4861         .read_iter      = ll_file_read_iter,
4862         .write_iter     = ll_file_write_iter,
4863 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4864         .read           = ll_file_read,
4865         .aio_read       = ll_file_aio_read,
4866         .write          = ll_file_write,
4867         .aio_write      = ll_file_aio_write,
4868 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4869         .unlocked_ioctl = ll_file_ioctl,
4870         .open           = ll_file_open,
4871         .release        = ll_file_release,
4872         .mmap           = ll_file_mmap,
4873         .llseek         = ll_file_seek,
4874         .splice_read    = ll_file_splice_read,
4875         .fsync          = ll_fsync,
4876         .flush          = ll_flush,
4877         .flock          = ll_file_noflock,
4878         .lock           = ll_file_noflock
4879 };
4880
4881 struct inode_operations ll_file_inode_operations = {
4882         .setattr        = ll_setattr,
4883         .getattr        = ll_getattr,
4884         .permission     = ll_inode_permission,
4885 #ifdef HAVE_IOP_XATTR
4886         .setxattr       = ll_setxattr,
4887         .getxattr       = ll_getxattr,
4888         .removexattr    = ll_removexattr,
4889 #endif
4890         .listxattr      = ll_listxattr,
4891         .fiemap         = ll_fiemap,
4892 #ifdef HAVE_IOP_GET_ACL
4893         .get_acl        = ll_get_acl,
4894 #endif
4895 #ifdef HAVE_IOP_SET_ACL
4896         .set_acl        = ll_set_acl,
4897 #endif
4898 };
4899
4900 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4901 {
4902         struct ll_inode_info *lli = ll_i2info(inode);
4903         struct cl_object *obj = lli->lli_clob;
4904         struct lu_env *env;
4905         int rc;
4906         __u16 refcheck;
4907         ENTRY;
4908
4909         if (obj == NULL)
4910                 RETURN(0);
4911
4912         env = cl_env_get(&refcheck);
4913         if (IS_ERR(env))
4914                 RETURN(PTR_ERR(env));
4915
4916         rc = cl_conf_set(env, lli->lli_clob, conf);
4917         if (rc < 0)
4918                 GOTO(out, rc);
4919
4920         if (conf->coc_opc == OBJECT_CONF_SET) {
4921                 struct ldlm_lock *lock = conf->coc_lock;
4922                 struct cl_layout cl = {
4923                         .cl_layout_gen = 0,
4924                 };
4925
4926                 LASSERT(lock != NULL);
4927                 LASSERT(ldlm_has_layout(lock));
4928
4929                 /* it can only be allowed to match after layout is
4930                  * applied to inode otherwise false layout would be
4931                  * seen. Applying layout shoud happen before dropping
4932                  * the intent lock. */
4933                 ldlm_lock_allow_match(lock);
4934
4935                 rc = cl_object_layout_get(env, obj, &cl);
4936                 if (rc < 0)
4937                         GOTO(out, rc);
4938
4939                 CDEBUG(D_VFSTRACE,
4940                        DFID": layout version change: %u -> %u\n",
4941                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4942                        cl.cl_layout_gen);
4943                 ll_layout_version_set(lli, cl.cl_layout_gen);
4944         }
4945
4946 out:
4947         cl_env_put(env, &refcheck);
4948
4949         RETURN(rc);
4950 }
4951
4952 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4953 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4954
4955 {
4956         struct ll_sb_info *sbi = ll_i2sbi(inode);
4957         struct ptlrpc_request *req;
4958         void *lvbdata;
4959         void *lmm;
4960         int lmmsize;
4961         int rc;
4962         ENTRY;
4963
4964         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4965                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4966                lock->l_lvb_data, lock->l_lvb_len);
4967
4968         if (lock->l_lvb_data != NULL)
4969                 RETURN(0);
4970
4971         /* if layout lock was granted right away, the layout is returned
4972          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4973          * blocked and then granted via completion ast, we have to fetch
4974          * layout here. Please note that we can't use the LVB buffer in
4975          * completion AST because it doesn't have a large enough buffer */
4976         rc = ll_get_default_mdsize(sbi, &lmmsize);
4977         if (rc < 0)
4978                 RETURN(rc);
4979
4980         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4981                          XATTR_NAME_LOV, lmmsize, &req);
4982         if (rc < 0) {
4983                 if (rc == -ENODATA)
4984                         GOTO(out, rc = 0); /* empty layout */
4985                 else
4986                         RETURN(rc);
4987         }
4988
4989         lmmsize = rc;
4990         rc = 0;
4991         if (lmmsize == 0) /* empty layout */
4992                 GOTO(out, rc = 0);
4993
4994         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4995         if (lmm == NULL)
4996                 GOTO(out, rc = -EFAULT);
4997
4998         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4999         if (lvbdata == NULL)
5000                 GOTO(out, rc = -ENOMEM);
5001
5002         memcpy(lvbdata, lmm, lmmsize);
5003         lock_res_and_lock(lock);
5004         if (unlikely(lock->l_lvb_data == NULL)) {
5005                 lock->l_lvb_type = LVB_T_LAYOUT;
5006                 lock->l_lvb_data = lvbdata;
5007                 lock->l_lvb_len = lmmsize;
5008                 lvbdata = NULL;
5009         }
5010         unlock_res_and_lock(lock);
5011
5012         if (lvbdata)
5013                 OBD_FREE_LARGE(lvbdata, lmmsize);
5014
5015         EXIT;
5016
5017 out:
5018         ptlrpc_req_finished(req);
5019         return rc;
5020 }
5021
5022 /**
5023  * Apply the layout to the inode. Layout lock is held and will be released
5024  * in this function.
5025  */
5026 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5027                               struct inode *inode)
5028 {
5029         struct ll_inode_info *lli = ll_i2info(inode);
5030         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5031         struct ldlm_lock *lock;
5032         struct cl_object_conf conf;
5033         int rc = 0;
5034         bool lvb_ready;
5035         bool wait_layout = false;
5036         ENTRY;
5037
5038         LASSERT(lustre_handle_is_used(lockh));
5039
5040         lock = ldlm_handle2lock(lockh);
5041         LASSERT(lock != NULL);
5042         LASSERT(ldlm_has_layout(lock));
5043
5044         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5045                    PFID(&lli->lli_fid), inode);
5046
5047         /* in case this is a caching lock and reinstate with new inode */
5048         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5049
5050         lock_res_and_lock(lock);
5051         lvb_ready = ldlm_is_lvb_ready(lock);
5052         unlock_res_and_lock(lock);
5053
5054         /* checking lvb_ready is racy but this is okay. The worst case is
5055          * that multi processes may configure the file on the same time. */
5056         if (lvb_ready)
5057                 GOTO(out, rc = 0);
5058
5059         rc = ll_layout_fetch(inode, lock);
5060         if (rc < 0)
5061                 GOTO(out, rc);
5062
5063         /* for layout lock, lmm is stored in lock's lvb.
5064          * lvb_data is immutable if the lock is held so it's safe to access it
5065          * without res lock.
5066          *
5067          * set layout to file. Unlikely this will fail as old layout was
5068          * surely eliminated */
5069         memset(&conf, 0, sizeof conf);
5070         conf.coc_opc = OBJECT_CONF_SET;
5071         conf.coc_inode = inode;
5072         conf.coc_lock = lock;
5073         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5074         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5075         rc = ll_layout_conf(inode, &conf);
5076
5077         /* refresh layout failed, need to wait */
5078         wait_layout = rc == -EBUSY;
5079         EXIT;
5080 out:
5081         LDLM_LOCK_PUT(lock);
5082         ldlm_lock_decref(lockh, mode);
5083
5084         /* wait for IO to complete if it's still being used. */
5085         if (wait_layout) {
5086                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5087                        sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5088
5089                 memset(&conf, 0, sizeof conf);
5090                 conf.coc_opc = OBJECT_CONF_WAIT;
5091                 conf.coc_inode = inode;
5092                 rc = ll_layout_conf(inode, &conf);
5093                 if (rc == 0)
5094                         rc = -EAGAIN;
5095
5096                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5097                        sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5098         }
5099         RETURN(rc);
5100 }
5101
5102 /**
5103  * Issue layout intent RPC to MDS.
5104  * \param inode [in]    file inode
5105  * \param intent [in]   layout intent
5106  *
5107  * \retval 0    on success
5108  * \retval < 0  error code
5109  */
5110 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5111 {
5112         struct ll_inode_info  *lli = ll_i2info(inode);
5113         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5114         struct md_op_data     *op_data;
5115         struct lookup_intent it;
5116         struct ptlrpc_request *req;
5117         int rc;
5118         ENTRY;
5119
5120         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5121                                      0, 0, LUSTRE_OPC_ANY, NULL);
5122         if (IS_ERR(op_data))
5123                 RETURN(PTR_ERR(op_data));
5124
5125         op_data->op_data = intent;
5126         op_data->op_data_size = sizeof(*intent);
5127
5128         memset(&it, 0, sizeof(it));
5129         it.it_op = IT_LAYOUT;
5130         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5131             intent->li_opc == LAYOUT_INTENT_TRUNC)
5132                 it.it_flags = FMODE_WRITE;
5133
5134         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5135                           sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5136
5137         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5138                             &ll_md_blocking_ast, 0);
5139         if (it.it_request != NULL)
5140                 ptlrpc_req_finished(it.it_request);
5141         it.it_request = NULL;
5142
5143         ll_finish_md_op_data(op_data);
5144
5145         /* set lock data in case this is a new lock */
5146         if (!rc)
5147                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5148
5149         ll_intent_drop_lock(&it);
5150
5151         RETURN(rc);
5152 }
5153
5154 /**
5155  * This function checks if there exists a LAYOUT lock on the client side,
5156  * or enqueues it if it doesn't have one in cache.
5157  *
5158  * This function will not hold layout lock so it may be revoked any time after
5159  * this function returns. Any operations depend on layout should be redone
5160  * in that case.
5161  *
5162  * This function should be called before lov_io_init() to get an uptodate
5163  * layout version, the caller should save the version number and after IO
5164  * is finished, this function should be called again to verify that layout
5165  * is not changed during IO time.
5166  */
5167 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5168 {
5169         struct ll_inode_info    *lli = ll_i2info(inode);
5170         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5171         struct lustre_handle lockh;
5172         struct layout_intent intent = {
5173                 .li_opc = LAYOUT_INTENT_ACCESS,
5174         };
5175         enum ldlm_mode mode;
5176         int rc;
5177         ENTRY;
5178
5179         *gen = ll_layout_version_get(lli);
5180         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5181                 RETURN(0);
5182
5183         /* sanity checks */
5184         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5185         LASSERT(S_ISREG(inode->i_mode));
5186
5187         /* take layout lock mutex to enqueue layout lock exclusively. */
5188         mutex_lock(&lli->lli_layout_mutex);
5189
5190         while (1) {
5191                 /* mostly layout lock is caching on the local side, so try to
5192                  * match it before grabbing layout lock mutex. */
5193                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5194                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5195                 if (mode != 0) { /* hit cached lock */
5196                         rc = ll_layout_lock_set(&lockh, mode, inode);
5197                         if (rc == -EAGAIN)
5198                                 continue;
5199                         break;
5200                 }
5201
5202                 rc = ll_layout_intent(inode, &intent);
5203                 if (rc != 0)
5204                         break;
5205         }
5206
5207         if (rc == 0)
5208                 *gen = ll_layout_version_get(lli);
5209         mutex_unlock(&lli->lli_layout_mutex);
5210
5211         RETURN(rc);
5212 }
5213
5214 /**
5215  * Issue layout intent RPC indicating where in a file an IO is about to write.
5216  *
5217  * \param[in] inode     file inode.
5218  * \param[in] ext       write range with start offset of fille in bytes where
5219  *                      an IO is about to write, and exclusive end offset in
5220  *                      bytes.
5221  *
5222  * \retval 0    on success
5223  * \retval < 0  error code
5224  */
5225 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5226                            struct lu_extent *ext)
5227 {
5228         struct layout_intent intent = {
5229                 .li_opc = opc,
5230                 .li_extent.e_start = ext->e_start,
5231                 .li_extent.e_end = ext->e_end,
5232         };
5233         int rc;
5234         ENTRY;
5235
5236         rc = ll_layout_intent(inode, &intent);
5237
5238         RETURN(rc);
5239 }
5240
5241 /**
5242  *  This function send a restore request to the MDT
5243  */
5244 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5245 {
5246         struct hsm_user_request *hur;
5247         int                      len, rc;
5248         ENTRY;
5249
5250         len = sizeof(struct hsm_user_request) +
5251               sizeof(struct hsm_user_item);
5252         OBD_ALLOC(hur, len);
5253         if (hur == NULL)
5254                 RETURN(-ENOMEM);
5255
5256         hur->hur_request.hr_action = HUA_RESTORE;
5257         hur->hur_request.hr_archive_id = 0;
5258         hur->hur_request.hr_flags = 0;
5259         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5260                sizeof(hur->hur_user_item[0].hui_fid));
5261         hur->hur_user_item[0].hui_extent.offset = offset;
5262         hur->hur_user_item[0].hui_extent.length = length;
5263         hur->hur_request.hr_itemcount = 1;
5264         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5265                            len, hur, NULL);
5266         OBD_FREE(hur, len);
5267         RETURN(rc);
5268 }