lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
 143                 GOTO(out, rc = 0);
 144         }
 145
 146         OBD_ALLOC_PTR(op_data);
 147         /* We leak openhandle and request here on error, but not much to be
 148          * done in OOM case since app won't retry close on error either. */
 149         if (op_data == NULL)
 150                 GOTO(out, rc = -ENOMEM);
 151
 152         ll_prepare_close(inode, op_data, och);
 153         switch (bias) {
 154         case MDS_CLOSE_LAYOUT_MERGE:
 155                 /* merge blocks from the victim inode */
 156                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 157                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 158                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 159         case MDS_CLOSE_LAYOUT_SPLIT:
 160         case MDS_CLOSE_LAYOUT_SWAP: {
 161                 struct split_param *sp = data;
 162
 163                 LASSERT(data != NULL);
 164                 op_data->op_bias |= bias;
 165                 op_data->op_data_version = 0;
 166                 op_data->op_lease_handle = och->och_lease_handle;
 167                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 168                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 169                         op_data->op_mirror_id = sp->sp_mirror_id;
 170                 } else {
 171                         op_data->op_fid2 = *ll_inode2fid(data);
 172                 }
 173                 break;
 174         }
 175
 176         case MDS_CLOSE_RESYNC_DONE: {
 177                 struct ll_ioc_lease *ioc = data;
 178
 179                 LASSERT(data != NULL);
 180                 op_data->op_attr_blocks +=
 181                         ioc->lil_count * op_data->op_attr_blocks;
 182                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 183                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 184                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 185
 186                 op_data->op_lease_handle = och->och_lease_handle;
 187                 op_data->op_data = &ioc->lil_ids[0];
 188                 op_data->op_data_size =
 189                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 190                 break;
 191         }
 192
 193         case MDS_HSM_RELEASE:
 194                 LASSERT(data != NULL);
 195                 op_data->op_bias |= MDS_HSM_RELEASE;
 196                 op_data->op_data_version = *(__u64 *)data;
 197                 op_data->op_lease_handle = och->och_lease_handle;
 198                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 199                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 200                 break;
 201
 202         default:
 203                 LASSERT(data == NULL);
 204                 break;
 205         }
 206
 207         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 208                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 209         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 210                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 211
 212         rc = md_close(md_exp, op_data, och->och_mod, &req);
 213         if (rc != 0 && rc != -EINTR)
 214                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 215                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 216
 217         if (rc == 0 && op_data->op_bias & bias) {
 218                 struct mdt_body *body;
 219
 220                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 221                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 222                         rc = -EBUSY;
 223         }
 224
 225         ll_finish_md_op_data(op_data);
 226         EXIT;
 227 out:
 228
 229         md_clear_open_replay_data(md_exp, och);
 230         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 231         OBD_FREE_PTR(och);
 232
 233         ptlrpc_req_finished(req);       /* This is close request */
 234         return rc;
 235 }
 236
 237 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 238 {
 239         struct ll_inode_info *lli = ll_i2info(inode);
 240         struct obd_client_handle **och_p;
 241         struct obd_client_handle *och;
 242         __u64 *och_usecount;
 243         int rc = 0;
 244         ENTRY;
 245
 246         if (fmode & FMODE_WRITE) {
 247                 och_p = &lli->lli_mds_write_och;
 248                 och_usecount = &lli->lli_open_fd_write_count;
 249         } else if (fmode & FMODE_EXEC) {
 250                 och_p = &lli->lli_mds_exec_och;
 251                 och_usecount = &lli->lli_open_fd_exec_count;
 252         } else {
 253                 LASSERT(fmode & FMODE_READ);
 254                 och_p = &lli->lli_mds_read_och;
 255                 och_usecount = &lli->lli_open_fd_read_count;
 256         }
 257
 258         mutex_lock(&lli->lli_och_mutex);
 259         if (*och_usecount > 0) {
 260                 /* There are still users of this handle, so skip
 261                  * freeing it. */
 262                 mutex_unlock(&lli->lli_och_mutex);
 263                 RETURN(0);
 264         }
 265
 266         och = *och_p;
 267         *och_p = NULL;
 268         mutex_unlock(&lli->lli_och_mutex);
 269
 270         if (och != NULL) {
 271                 /* There might be a race and this handle may already
 272                  * be closed. */
 273                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 274         }
 275
 276         RETURN(rc);
 277 }
 278
 279 static int ll_md_close(struct inode *inode, struct file *file)
 280 {
 281         union ldlm_policy_data policy = {
 282                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 283         };
 284         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 285         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 286         struct ll_inode_info *lli = ll_i2info(inode);
 287         struct lustre_handle lockh;
 288         enum ldlm_mode lockmode;
 289         int rc = 0;
 290         ENTRY;
 291
 292         /* clear group lock, if present */
 293         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 294                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 295
 296         if (fd->fd_lease_och != NULL) {
 297                 bool lease_broken;
 298
 299                 /* Usually the lease is not released when the
 300                  * application crashed, we need to release here. */
 301                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 302                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 303                         PFID(&lli->lli_fid), rc, lease_broken);
 304
 305                 fd->fd_lease_och = NULL;
 306         }
 307
 308         if (fd->fd_och != NULL) {
 309                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 310                 fd->fd_och = NULL;
 311                 GOTO(out, rc);
 312         }
 313
 314         /* Let's see if we have good enough OPEN lock on the file and if
 315            we can skip talking to MDS */
 316         mutex_lock(&lli->lli_och_mutex);
 317         if (fd->fd_omode & FMODE_WRITE) {
 318                 lockmode = LCK_CW;
 319                 LASSERT(lli->lli_open_fd_write_count);
 320                 lli->lli_open_fd_write_count--;
 321         } else if (fd->fd_omode & FMODE_EXEC) {
 322                 lockmode = LCK_PR;
 323                 LASSERT(lli->lli_open_fd_exec_count);
 324                 lli->lli_open_fd_exec_count--;
 325         } else {
 326                 lockmode = LCK_CR;
 327                 LASSERT(lli->lli_open_fd_read_count);
 328                 lli->lli_open_fd_read_count--;
 329         }
 330         mutex_unlock(&lli->lli_och_mutex);
 331
 332         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 333                            LDLM_IBITS, &policy, lockmode, &lockh))
 334                 rc = ll_md_real_close(inode, fd->fd_omode);
 335
 336 out:
 337         LUSTRE_FPRIVATE(file) = NULL;
 338         ll_file_data_put(fd);
 339
 340         RETURN(rc);
 341 }
 342
 343 /* While this returns an error code, fput() the caller does not, so we need
 344  * to make every effort to clean up all of our state here.  Also, applications
 345  * rarely check close errors and even if an error is returned they will not
 346  * re-try the close call.
 347  */
 348 int ll_file_release(struct inode *inode, struct file *file)
 349 {
 350         struct ll_file_data *fd;
 351         struct ll_sb_info *sbi = ll_i2sbi(inode);
 352         struct ll_inode_info *lli = ll_i2info(inode);
 353         int rc;
 354         ENTRY;
 355
 356         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 357                PFID(ll_inode2fid(inode)), inode);
 358
 359         if (inode->i_sb->s_root != file_dentry(file))
 360                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 361         fd = LUSTRE_FPRIVATE(file);
 362         LASSERT(fd != NULL);
 363
 364         /* The last ref on @file, maybe not the the owner pid of statahead,
 365          * because parent and child process can share the same file handle. */
 366         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 367                 ll_deauthorize_statahead(inode, fd);
 368
 369         if (inode->i_sb->s_root == file_dentry(file)) {
 370                 LUSTRE_FPRIVATE(file) = NULL;
 371                 ll_file_data_put(fd);
 372                 RETURN(0);
 373         }
 374
 375         if (!S_ISDIR(inode->i_mode)) {
 376                 if (lli->lli_clob != NULL)
 377                         lov_read_and_clear_async_rc(lli->lli_clob);
 378                 lli->lli_async_rc = 0;
 379         }
 380
 381         rc = ll_md_close(inode, file);
 382
 383         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 384                 libcfs_debug_dumplog();
 385
 386         RETURN(rc);
 387 }
 388
 389 static inline int ll_dom_readpage(void *data, struct page *page)
 390 {
 391         struct niobuf_local *lnb = data;
 392         void *kaddr;
 393
 394         kaddr = ll_kmap_atomic(page, KM_USER0);
 395         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 396         if (lnb->lnb_len < PAGE_SIZE)
 397                 memset(kaddr + lnb->lnb_len, 0,
 398                        PAGE_SIZE - lnb->lnb_len);
 399         flush_dcache_page(page);
 400         SetPageUptodate(page);
 401         ll_kunmap_atomic(kaddr, KM_USER0);
 402         unlock_page(page);
 403
 404         return 0;
 405 }
 406
 407 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 408                         struct lookup_intent *it)
 409 {
 410         struct ll_inode_info *lli = ll_i2info(inode);
 411         struct cl_object *obj = lli->lli_clob;
 412         struct address_space *mapping = inode->i_mapping;
 413         struct page *vmpage;
 414         struct niobuf_remote *rnb;
 415         char *data;
 416         struct lustre_handle lockh;
 417         struct ldlm_lock *lock;
 418         unsigned long index, start;
 419         struct niobuf_local lnb;
 420         bool dom_lock = false;
 421
 422         ENTRY;
 423
 424         if (obj == NULL)
 425                 RETURN_EXIT;
 426
 427         if (it->it_lock_mode != 0) {
 428                 lockh.cookie = it->it_lock_handle;
 429                 lock = ldlm_handle2lock(&lockh);
 430                 if (lock != NULL)
 431                         dom_lock = ldlm_has_dom(lock);
 432                 LDLM_LOCK_PUT(lock);
 433         }
 434         if (!dom_lock)
 435                 RETURN_EXIT;
 436
 437         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 438                                    RCL_SERVER))
 439                 RETURN_EXIT;
 440
 441         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 442         if (rnb == NULL || rnb->rnb_len == 0)
 443                 RETURN_EXIT;
 444
 445         /* LU-11595: Server may return whole file and that is OK always or
 446          * it may return just file tail and its offset must be aligned with
 447          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 448          * smaller then offset may be not aligned and that data is just ignored.
 449          */
 450         if (rnb->rnb_offset % PAGE_SIZE)
 451                 RETURN_EXIT;
 452
 453         /* Server returns whole file or just file tail if it fills in
 454          * reply buffer, in both cases total size should be inode size.
 455          */
 456         if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
 457                 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
 458                        ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
 459                        rnb->rnb_len, i_size_read(inode));
 460                 RETURN_EXIT;
 461         }
 462
 463         CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
 464                rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
 465
 466         data = (char *)rnb + sizeof(*rnb);
 467
 468         lnb.lnb_file_offset = rnb->rnb_offset;
 469         start = lnb.lnb_file_offset / PAGE_SIZE;
 470         index = 0;
 471         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 472         lnb.lnb_page_offset = 0;
 473         do {
 474                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 475                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 476                 if (lnb.lnb_len > PAGE_SIZE)
 477                         lnb.lnb_len = PAGE_SIZE;
 478
 479                 vmpage = read_cache_page(mapping, index + start,
 480                                          ll_dom_readpage, &lnb);
 481                 if (IS_ERR(vmpage)) {
 482                         CWARN("%s: cannot fill page %lu for "DFID
 483                               " with data: rc = %li\n",
 484                               ll_i2sbi(inode)->ll_fsname, index + start,
 485                               PFID(lu_object_fid(&obj->co_lu)),
 486                               PTR_ERR(vmpage));
 487                         break;
 488                 }
 489                 put_page(vmpage);
 490                 index++;
 491         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 492         EXIT;
 493 }
 494
 495 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 496                                 struct lookup_intent *itp)
 497 {
 498         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 499         struct dentry *parent = de->d_parent;
 500         char *name = NULL;
 501         int len = 0;
 502         struct md_op_data *op_data;
 503         struct ptlrpc_request *req = NULL;
 504         int rc;
 505         ENTRY;
 506
 507         LASSERT(parent != NULL);
 508         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 509
 510         /* if server supports open-by-fid, or file name is invalid, don't pack
 511          * name in open request */
 512         if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
 513             !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
 514 retry:
 515                 len = de->d_name.len;
 516                 name = kmalloc(len + 1, GFP_NOFS);
 517                 if (!name)
 518                         RETURN(-ENOMEM);
 519
 520                 /* race here */
 521                 spin_lock(&de->d_lock);
 522                 if (len != de->d_name.len) {
 523                         spin_unlock(&de->d_lock);
 524                         kfree(name);
 525                         goto retry;
 526                 }
 527                 memcpy(name, de->d_name.name, len);
 528                 name[len] = '\0';
 529                 spin_unlock(&de->d_lock);
 530
 531                 if (!lu_name_is_valid_2(name, len)) {
 532                         kfree(name);
 533                         RETURN(-ESTALE);
 534                 }
 535         }
 536
 537         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 538                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 539         if (IS_ERR(op_data)) {
 540                 kfree(name);
 541                 RETURN(PTR_ERR(op_data));
 542         }
 543         op_data->op_data = lmm;
 544         op_data->op_data_size = lmmsize;
 545
 546         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 547                             &ll_md_blocking_ast, 0);
 548         kfree(name);
 549         ll_finish_md_op_data(op_data);
 550         if (rc == -ESTALE) {
 551                 /* reason for keep own exit path - don`t flood log
 552                  * with messages with -ESTALE errors.
 553                  */
 554                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 555                      it_open_error(DISP_OPEN_OPEN, itp))
 556                         GOTO(out, rc);
 557                 ll_release_openhandle(de, itp);
 558                 GOTO(out, rc);
 559         }
 560
 561         if (it_disposition(itp, DISP_LOOKUP_NEG))
 562                 GOTO(out, rc = -ENOENT);
 563
 564         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 565                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 566                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 567                 GOTO(out, rc);
 568         }
 569
 570         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 571
 572         if (!rc && itp->it_lock_mode) {
 573                 ll_dom_finish_open(de->d_inode, req, itp);
 574                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 575         }
 576
 577 out:
 578         ptlrpc_req_finished(req);
 579         ll_intent_drop_lock(itp);
 580
 581         /* We did open by fid, but by the time we got to the server,
 582          * the object disappeared. If this is a create, we cannot really
 583          * tell the userspace that the file it was trying to create
 584          * does not exist. Instead let's return -ESTALE, and the VFS will
 585          * retry the create with LOOKUP_REVAL that we are going to catch
 586          * in ll_revalidate_dentry() and use lookup then.
 587          */
 588         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 589                 rc = -ESTALE;
 590
 591         RETURN(rc);
 592 }
 593
 594 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 595                        struct obd_client_handle *och)
 596 {
 597         struct mdt_body *body;
 598
 599         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 600         och->och_open_handle = body->mbo_open_handle;
 601         och->och_fid = body->mbo_fid1;
 602         och->och_lease_handle.cookie = it->it_lock_handle;
 603         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 604         och->och_flags = it->it_flags;
 605
 606         return md_set_open_replay_data(md_exp, och, it);
 607 }
 608
 609 static int ll_local_open(struct file *file, struct lookup_intent *it,
 610                          struct ll_file_data *fd, struct obd_client_handle *och)
 611 {
 612         struct inode *inode = file_inode(file);
 613         ENTRY;
 614
 615         LASSERT(!LUSTRE_FPRIVATE(file));
 616
 617         LASSERT(fd != NULL);
 618
 619         if (och) {
 620                 int rc;
 621
 622                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 623                 if (rc != 0)
 624                         RETURN(rc);
 625         }
 626
 627         LUSTRE_FPRIVATE(file) = fd;
 628         ll_readahead_init(inode, &fd->fd_ras);
 629         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 630
 631         /* ll_cl_context initialize */
 632         rwlock_init(&fd->fd_lock);
 633         INIT_LIST_HEAD(&fd->fd_lccs);
 634
 635         RETURN(0);
 636 }
 637
 638 /* Open a file, and (for the very first open) create objects on the OSTs at
 639  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 640  * creation or open until ll_lov_setstripe() ioctl is called.
 641  *
 642  * If we already have the stripe MD locally then we don't request it in
 643  * md_open(), by passing a lmm_size = 0.
 644  *
 645  * It is up to the application to ensure no other processes open this file
 646  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 647  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 648  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 649  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 650  */
 651 int ll_file_open(struct inode *inode, struct file *file)
 652 {
 653         struct ll_inode_info *lli = ll_i2info(inode);
 654         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 655                                           .it_flags = file->f_flags };
 656         struct obd_client_handle **och_p = NULL;
 657         __u64 *och_usecount = NULL;
 658         struct ll_file_data *fd;
 659         int rc = 0;
 660         ENTRY;
 661
 662         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 663                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 664
 665         it = file->private_data; /* XXX: compat macro */
 666         file->private_data = NULL; /* prevent ll_local_open assertion */
 667
 668         fd = ll_file_data_get();
 669         if (fd == NULL)
 670                 GOTO(out_nofiledata, rc = -ENOMEM);
 671
 672         fd->fd_file = file;
 673         if (S_ISDIR(inode->i_mode))
 674                 ll_authorize_statahead(inode, fd);
 675
 676         if (inode->i_sb->s_root == file_dentry(file)) {
 677                 LUSTRE_FPRIVATE(file) = fd;
 678                 RETURN(0);
 679         }
 680
 681         if (!it || !it->it_disposition) {
 682                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 683                  * because everything but O_ACCMODE mask was stripped from
 684                  * there */
 685                 if ((oit.it_flags + 1) & O_ACCMODE)
 686                         oit.it_flags++;
 687                 if (file->f_flags & O_TRUNC)
 688                         oit.it_flags |= FMODE_WRITE;
 689
 690                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 691                  * dentry_open after call to open_namei that checks permissions.
 692                  * Only nfsd_open call dentry_open directly without checking
 693                  * permissions and because of that this code below is safe.
 694                  */
 695                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 696                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 697
 698                 /* We do not want O_EXCL here, presumably we opened the file
 699                  * already? XXX - NFS implications? */
 700                 oit.it_flags &= ~O_EXCL;
 701
 702                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 703                  * created if necessary, then "IT_CREAT" should be set to keep
 704                  * consistent with it */
 705                 if (oit.it_flags & O_CREAT)
 706                         oit.it_op |= IT_CREAT;
 707
 708                 it = &oit;
 709         }
 710
 711 restart:
 712         /* Let's see if we have file open on MDS already. */
 713         if (it->it_flags & FMODE_WRITE) {
 714                 och_p = &lli->lli_mds_write_och;
 715                 och_usecount = &lli->lli_open_fd_write_count;
 716         } else if (it->it_flags & FMODE_EXEC) {
 717                 och_p = &lli->lli_mds_exec_och;
 718                 och_usecount = &lli->lli_open_fd_exec_count;
 719          } else {
 720                 och_p = &lli->lli_mds_read_och;
 721                 och_usecount = &lli->lli_open_fd_read_count;
 722         }
 723
 724         mutex_lock(&lli->lli_och_mutex);
 725         if (*och_p) { /* Open handle is present */
 726                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 727                         /* Well, there's extra open request that we do not need,
 728                            let's close it somehow. This will decref request. */
 729                         rc = it_open_error(DISP_OPEN_OPEN, it);
 730                         if (rc) {
 731                                 mutex_unlock(&lli->lli_och_mutex);
 732                                 GOTO(out_openerr, rc);
 733                         }
 734
 735                         ll_release_openhandle(file_dentry(file), it);
 736                 }
 737                 (*och_usecount)++;
 738
 739                 rc = ll_local_open(file, it, fd, NULL);
 740                 if (rc) {
 741                         (*och_usecount)--;
 742                         mutex_unlock(&lli->lli_och_mutex);
 743                         GOTO(out_openerr, rc);
 744                 }
 745         } else {
 746                 LASSERT(*och_usecount == 0);
 747                 if (!it->it_disposition) {
 748                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 749                         /* We cannot just request lock handle now, new ELC code
 750                            means that one of other OPEN locks for this file
 751                            could be cancelled, and since blocking ast handler
 752                            would attempt to grab och_mutex as well, that would
 753                            result in a deadlock */
 754                         mutex_unlock(&lli->lli_och_mutex);
 755                         /*
 756                          * Normally called under two situations:
 757                          * 1. NFS export.
 758                          * 2. A race/condition on MDS resulting in no open
 759                          *    handle to be returned from LOOKUP|OPEN request,
 760                          *    for example if the target entry was a symlink.
 761                          *
 762                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 763                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 764                          *  bit so that it's not confusing later callers.
 765                          *
 766                          *  NB; when ldd is NULL, it must have come via normal
 767                          *  lookup path only, since ll_iget_for_nfs always calls
 768                          *  ll_d_init().
 769                          */
 770                         if (ldd && ldd->lld_nfs_dentry) {
 771                                 ldd->lld_nfs_dentry = 0;
 772                                 it->it_flags |= MDS_OPEN_LOCK;
 773                         }
 774
 775                          /*
 776                          * Always specify MDS_OPEN_BY_FID because we don't want
 777                          * to get file with different fid.
 778                          */
 779                         it->it_flags |= MDS_OPEN_BY_FID;
 780                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 781                                                  it);
 782                         if (rc)
 783                                 GOTO(out_openerr, rc);
 784
 785                         goto restart;
 786                 }
 787                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 788                 if (!*och_p)
 789                         GOTO(out_och_free, rc = -ENOMEM);
 790
 791                 (*och_usecount)++;
 792
 793                 /* md_intent_lock() didn't get a request ref if there was an
 794                  * open error, so don't do cleanup on the request here
 795                  * (bug 3430) */
 796                 /* XXX (green): Should not we bail out on any error here, not
 797                  * just open error? */
 798                 rc = it_open_error(DISP_OPEN_OPEN, it);
 799                 if (rc != 0)
 800                         GOTO(out_och_free, rc);
 801
 802                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 803                          "inode %p: disposition %x, status %d\n", inode,
 804                          it_disposition(it, ~0), it->it_status);
 805
 806                 rc = ll_local_open(file, it, fd, *och_p);
 807                 if (rc)
 808                         GOTO(out_och_free, rc);
 809         }
 810         mutex_unlock(&lli->lli_och_mutex);
 811         fd = NULL;
 812
 813         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 814            different kind of OPEN lock for this same inode gets cancelled
 815            by ldlm_cancel_lru */
 816         if (!S_ISREG(inode->i_mode))
 817                 GOTO(out_och_free, rc);
 818
 819         cl_lov_delay_create_clear(&file->f_flags);
 820         GOTO(out_och_free, rc);
 821
 822 out_och_free:
 823         if (rc) {
 824                 if (och_p && *och_p) {
 825                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 826                         *och_p = NULL; /* OBD_FREE writes some magic there */
 827                         (*och_usecount)--;
 828                 }
 829                 mutex_unlock(&lli->lli_och_mutex);
 830
 831 out_openerr:
 832                 if (lli->lli_opendir_key == fd)
 833                         ll_deauthorize_statahead(inode, fd);
 834                 if (fd != NULL)
 835                         ll_file_data_put(fd);
 836         } else {
 837                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 838         }
 839
 840 out_nofiledata:
 841         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 842                 ptlrpc_req_finished(it->it_request);
 843                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 844         }
 845
 846         return rc;
 847 }
 848
 849 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 850                         struct ldlm_lock_desc *desc, void *data, int flag)
 851 {
 852         int rc;
 853         struct lustre_handle lockh;
 854         ENTRY;
 855
 856         switch (flag) {
 857         case LDLM_CB_BLOCKING:
 858                 ldlm_lock2handle(lock, &lockh);
 859                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 860                 if (rc < 0) {
 861                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 862                         RETURN(rc);
 863                 }
 864                 break;
 865         case LDLM_CB_CANCELING:
 866                 /* do nothing */
 867                 break;
 868         }
 869         RETURN(0);
 870 }
 871
 872 /**
 873  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 874  * and save it as fd->fd_och so as to force client to reopen the file even
 875  * if it has an open lock in cache already.
 876  */
 877 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 878                                 struct lustre_handle *old_open_handle)
 879 {
 880         struct ll_inode_info *lli = ll_i2info(inode);
 881         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 882         struct obd_client_handle **och_p;
 883         __u64 *och_usecount;
 884         int rc = 0;
 885         ENTRY;
 886
 887         /* Get the openhandle of the file */
 888         mutex_lock(&lli->lli_och_mutex);
 889         if (fd->fd_lease_och != NULL)
 890                 GOTO(out_unlock, rc = -EBUSY);
 891
 892         if (fd->fd_och == NULL) {
 893                 if (file->f_mode & FMODE_WRITE) {
 894                         LASSERT(lli->lli_mds_write_och != NULL);
 895                         och_p = &lli->lli_mds_write_och;
 896                         och_usecount = &lli->lli_open_fd_write_count;
 897                 } else {
 898                         LASSERT(lli->lli_mds_read_och != NULL);
 899                         och_p = &lli->lli_mds_read_och;
 900                         och_usecount = &lli->lli_open_fd_read_count;
 901                 }
 902
 903                 if (*och_usecount > 1)
 904                         GOTO(out_unlock, rc = -EBUSY);
 905
 906                 fd->fd_och = *och_p;
 907                 *och_usecount = 0;
 908                 *och_p = NULL;
 909         }
 910
 911         *old_open_handle = fd->fd_och->och_open_handle;
 912
 913         EXIT;
 914 out_unlock:
 915         mutex_unlock(&lli->lli_och_mutex);
 916         return rc;
 917 }
 918
 919 /**
 920  * Release ownership on lli_mds_*_och when putting back a file lease.
 921  */
 922 static int ll_lease_och_release(struct inode *inode, struct file *file)
 923 {
 924         struct ll_inode_info *lli = ll_i2info(inode);
 925         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 926         struct obd_client_handle **och_p;
 927         struct obd_client_handle *old_och = NULL;
 928         __u64 *och_usecount;
 929         int rc = 0;
 930         ENTRY;
 931
 932         mutex_lock(&lli->lli_och_mutex);
 933         if (file->f_mode & FMODE_WRITE) {
 934                 och_p = &lli->lli_mds_write_och;
 935                 och_usecount = &lli->lli_open_fd_write_count;
 936         } else {
 937                 och_p = &lli->lli_mds_read_och;
 938                 och_usecount = &lli->lli_open_fd_read_count;
 939         }
 940
 941         /* The file may have been open by another process (broken lease) so
 942          * *och_p is not NULL. In this case we should simply increase usecount
 943          * and close fd_och.
 944          */
 945         if (*och_p != NULL) {
 946                 old_och = fd->fd_och;
 947                 (*och_usecount)++;
 948         } else {
 949                 *och_p = fd->fd_och;
 950                 *och_usecount = 1;
 951         }
 952         fd->fd_och = NULL;
 953         mutex_unlock(&lli->lli_och_mutex);
 954
 955         if (old_och != NULL)
 956                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 957
 958         RETURN(rc);
 959 }
 960
 961 /**
 962  * Acquire a lease and open the file.
 963  */
 964 static struct obd_client_handle *
 965 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 966               __u64 open_flags)
 967 {
 968         struct lookup_intent it = { .it_op = IT_OPEN };
 969         struct ll_sb_info *sbi = ll_i2sbi(inode);
 970         struct md_op_data *op_data;
 971         struct ptlrpc_request *req = NULL;
 972         struct lustre_handle old_open_handle = { 0 };
 973         struct obd_client_handle *och = NULL;
 974         int rc;
 975         int rc2;
 976         ENTRY;
 977
 978         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 979                 RETURN(ERR_PTR(-EINVAL));
 980
 981         if (file != NULL) {
 982                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 983                         RETURN(ERR_PTR(-EPERM));
 984
 985                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
 986                 if (rc)
 987                         RETURN(ERR_PTR(rc));
 988         }
 989
 990         OBD_ALLOC_PTR(och);
 991         if (och == NULL)
 992                 RETURN(ERR_PTR(-ENOMEM));
 993
 994         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 995                                         LUSTRE_OPC_ANY, NULL);
 996         if (IS_ERR(op_data))
 997                 GOTO(out, rc = PTR_ERR(op_data));
 998
 999         /* To tell the MDT this openhandle is from the same owner */
1000         op_data->op_open_handle = old_open_handle;
1001
1002         it.it_flags = fmode | open_flags;
1003         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1004         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1005                             &ll_md_blocking_lease_ast,
1006         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1007          * it can be cancelled which may mislead applications that the lease is
1008          * broken;
1009          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1010          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1011          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1012                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1013         ll_finish_md_op_data(op_data);
1014         ptlrpc_req_finished(req);
1015         if (rc < 0)
1016                 GOTO(out_release_it, rc);
1017
1018         if (it_disposition(&it, DISP_LOOKUP_NEG))
1019                 GOTO(out_release_it, rc = -ENOENT);
1020
1021         rc = it_open_error(DISP_OPEN_OPEN, &it);
1022         if (rc)
1023                 GOTO(out_release_it, rc);
1024
1025         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1026         ll_och_fill(sbi->ll_md_exp, &it, och);
1027
1028         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1029                 GOTO(out_close, rc = -EOPNOTSUPP);
1030
1031         /* already get lease, handle lease lock */
1032         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1033         if (it.it_lock_mode == 0 ||
1034             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1035                 /* open lock must return for lease */
1036                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1037                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1038                         it.it_lock_bits);
1039                 GOTO(out_close, rc = -EPROTO);
1040         }
1041
1042         ll_intent_release(&it);
1043         RETURN(och);
1044
1045 out_close:
1046         /* Cancel open lock */
1047         if (it.it_lock_mode != 0) {
1048                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1049                                             it.it_lock_mode);
1050                 it.it_lock_mode = 0;
1051                 och->och_lease_handle.cookie = 0ULL;
1052         }
1053         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1054         if (rc2 < 0)
1055                 CERROR("%s: error closing file "DFID": %d\n",
1056                        sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1057         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1058 out_release_it:
1059         ll_intent_release(&it);
1060 out:
1061         if (och != NULL)
1062                 OBD_FREE_PTR(och);
1063         RETURN(ERR_PTR(rc));
1064 }
1065
1066 /**
1067  * Check whether a layout swap can be done between two inodes.
1068  *
1069  * \param[in] inode1  First inode to check
1070  * \param[in] inode2  Second inode to check
1071  *
1072  * \retval 0 on success, layout swap can be performed between both inodes
1073  * \retval negative error code if requirements are not met
1074  */
1075 static int ll_check_swap_layouts_validity(struct inode *inode1,
1076                                           struct inode *inode2)
1077 {
1078         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1079                 return -EINVAL;
1080
1081         if (inode_permission(inode1, MAY_WRITE) ||
1082             inode_permission(inode2, MAY_WRITE))
1083                 return -EPERM;
1084
1085         if (inode1->i_sb != inode2->i_sb)
1086                 return -EXDEV;
1087
1088         return 0;
1089 }
1090
1091 static int ll_swap_layouts_close(struct obd_client_handle *och,
1092                                  struct inode *inode, struct inode *inode2)
1093 {
1094         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1095         const struct lu_fid     *fid2;
1096         int                      rc;
1097         ENTRY;
1098
1099         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1100                ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1101
1102         rc = ll_check_swap_layouts_validity(inode, inode2);
1103         if (rc < 0)
1104                 GOTO(out_free_och, rc);
1105
1106         /* We now know that inode2 is a lustre inode */
1107         fid2 = ll_inode2fid(inode2);
1108
1109         rc = lu_fid_cmp(fid1, fid2);
1110         if (rc == 0)
1111                 GOTO(out_free_och, rc = -EINVAL);
1112
1113         /* Close the file and {swap,merge} layouts between inode & inode2.
1114          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1115          * because we still need it to pack l_remote_handle to MDT. */
1116         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1117                                        inode2);
1118
1119         och = NULL; /* freed in ll_close_inode_openhandle() */
1120
1121 out_free_och:
1122         if (och != NULL)
1123                 OBD_FREE_PTR(och);
1124
1125         RETURN(rc);
1126 }
1127
1128 /**
1129  * Release lease and close the file.
1130  * It will check if the lease has ever broken.
1131  */
1132 static int ll_lease_close_intent(struct obd_client_handle *och,
1133                                  struct inode *inode,
1134                                  bool *lease_broken, enum mds_op_bias bias,
1135                                  void *data)
1136 {
1137         struct ldlm_lock *lock;
1138         bool cancelled = true;
1139         int rc;
1140         ENTRY;
1141
1142         lock = ldlm_handle2lock(&och->och_lease_handle);
1143         if (lock != NULL) {
1144                 lock_res_and_lock(lock);
1145                 cancelled = ldlm_is_cancel(lock);
1146                 unlock_res_and_lock(lock);
1147                 LDLM_LOCK_PUT(lock);
1148         }
1149
1150         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1151                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1152
1153         if (lease_broken != NULL)
1154                 *lease_broken = cancelled;
1155
1156         if (!cancelled && !bias)
1157                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1158
1159         if (cancelled) { /* no need to excute intent */
1160                 bias = 0;
1161                 data = NULL;
1162         }
1163
1164         rc = ll_close_inode_openhandle(inode, och, bias, data);
1165         RETURN(rc);
1166 }
1167
1168 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1169                           bool *lease_broken)
1170 {
1171         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1172 }
1173
1174 /**
1175  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1176  */
1177 static int ll_lease_file_resync(struct obd_client_handle *och,
1178                                 struct inode *inode, unsigned long arg)
1179 {
1180         struct ll_sb_info *sbi = ll_i2sbi(inode);
1181         struct md_op_data *op_data;
1182         struct ll_ioc_lease_id ioc;
1183         __u64 data_version_unused;
1184         int rc;
1185         ENTRY;
1186
1187         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1188                                      LUSTRE_OPC_ANY, NULL);
1189         if (IS_ERR(op_data))
1190                 RETURN(PTR_ERR(op_data));
1191
1192         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1193                            sizeof(ioc)))
1194                 RETURN(-EFAULT);
1195
1196         /* before starting file resync, it's necessary to clean up page cache
1197          * in client memory, otherwise once the layout version is increased,
1198          * writing back cached data will be denied the OSTs. */
1199         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1200         if (rc)
1201                 GOTO(out, rc);
1202
1203         op_data->op_lease_handle = och->och_lease_handle;
1204         op_data->op_mirror_id = ioc.lil_mirror_id;
1205         rc = md_file_resync(sbi->ll_md_exp, op_data);
1206         if (rc)
1207                 GOTO(out, rc);
1208
1209         EXIT;
1210 out:
1211         ll_finish_md_op_data(op_data);
1212         return rc;
1213 }
1214
1215 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1216 {
1217         struct ll_inode_info *lli = ll_i2info(inode);
1218         struct cl_object *obj = lli->lli_clob;
1219         struct cl_attr *attr = vvp_env_thread_attr(env);
1220         s64 atime;
1221         s64 mtime;
1222         s64 ctime;
1223         int rc = 0;
1224
1225         ENTRY;
1226
1227         ll_inode_size_lock(inode);
1228
1229         /* Merge timestamps the most recently obtained from MDS with
1230          * timestamps obtained from OSTs.
1231          *
1232          * Do not overwrite atime of inode because it may be refreshed
1233          * by file_accessed() function. If the read was served by cache
1234          * data, there is no RPC to be sent so that atime may not be
1235          * transferred to OSTs at all. MDT only updates atime at close time
1236          * if it's at least 'mdd.*.atime_diff' older.
1237          * All in all, the atime in Lustre does not strictly comply with
1238          * POSIX. Solving this problem needs to send an RPC to MDT for each
1239          * read, this will hurt performance.
1240          */
1241         if (inode->i_atime.tv_sec < lli->lli_atime ||
1242             lli->lli_update_atime) {
1243                 inode->i_atime.tv_sec = lli->lli_atime;
1244                 lli->lli_update_atime = 0;
1245         }
1246         inode->i_mtime.tv_sec = lli->lli_mtime;
1247         inode->i_ctime.tv_sec = lli->lli_ctime;
1248
1249         mtime = inode->i_mtime.tv_sec;
1250         atime = inode->i_atime.tv_sec;
1251         ctime = inode->i_ctime.tv_sec;
1252
1253         cl_object_attr_lock(obj);
1254         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1255                 rc = -EINVAL;
1256         else
1257                 rc = cl_object_attr_get(env, obj, attr);
1258         cl_object_attr_unlock(obj);
1259
1260         if (rc != 0)
1261                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1262
1263         if (atime < attr->cat_atime)
1264                 atime = attr->cat_atime;
1265
1266         if (ctime < attr->cat_ctime)
1267                 ctime = attr->cat_ctime;
1268
1269         if (mtime < attr->cat_mtime)
1270                 mtime = attr->cat_mtime;
1271
1272         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1273                PFID(&lli->lli_fid), attr->cat_size);
1274
1275         i_size_write(inode, attr->cat_size);
1276         inode->i_blocks = attr->cat_blocks;
1277
1278         inode->i_mtime.tv_sec = mtime;
1279         inode->i_atime.tv_sec = atime;
1280         inode->i_ctime.tv_sec = ctime;
1281
1282 out_size_unlock:
1283         ll_inode_size_unlock(inode);
1284
1285         RETURN(rc);
1286 }
1287
1288 /**
1289  * Set designated mirror for I/O.
1290  *
1291  * So far only read, write, and truncated can support to issue I/O to
1292  * designated mirror.
1293  */
1294 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1295 {
1296         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1297
1298         /* clear layout version for generic(non-resync) I/O in case it carries
1299          * stale layout version due to I/O restart */
1300         io->ci_layout_version = 0;
1301
1302         /* FLR: disable non-delay for designated mirror I/O because obviously
1303          * only one mirror is available */
1304         if (fd->fd_designated_mirror > 0) {
1305                 io->ci_ndelay = 0;
1306                 io->ci_designated_mirror = fd->fd_designated_mirror;
1307                 io->ci_layout_version = fd->fd_layout_version;
1308         }
1309
1310         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1311                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1312 }
1313
1314 static bool file_is_noatime(const struct file *file)
1315 {
1316         const struct vfsmount *mnt = file->f_path.mnt;
1317         const struct inode *inode = file_inode((struct file *)file);
1318
1319         /* Adapted from file_accessed() and touch_atime().*/
1320         if (file->f_flags & O_NOATIME)
1321                 return true;
1322
1323         if (inode->i_flags & S_NOATIME)
1324                 return true;
1325
1326         if (IS_NOATIME(inode))
1327                 return true;
1328
1329         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1330                 return true;
1331
1332         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1333                 return true;
1334
1335         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1336                 return true;
1337
1338         return false;
1339 }
1340
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1342 {
1343         struct inode *inode = file_inode(file);
1344         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1345
1346         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1347         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1348
1349         if (iot == CIT_WRITE) {
1350                 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1351                 io->u.ci_wr.wr_sync   = !!(file->f_flags & O_SYNC ||
1352                                            file->f_flags & O_DIRECT ||
1353                                            IS_SYNC(inode));
1354         }
1355         io->ci_obj = ll_i2info(inode)->lli_clob;
1356         io->ci_lockreq = CILR_MAYBE;
1357         if (ll_file_nolock(file)) {
1358                 io->ci_lockreq = CILR_NEVER;
1359                 io->ci_no_srvlock = 1;
1360         } else if (file->f_flags & O_APPEND) {
1361                 io->ci_lockreq = CILR_MANDATORY;
1362         }
1363         io->ci_noatime = file_is_noatime(file);
1364
1365         /* FLR: only use non-delay I/O for read as there is only one
1366          * avaliable mirror for write. */
1367         io->ci_ndelay = !(iot == CIT_WRITE);
1368
1369         ll_io_set_mirror(io, file);
1370 }
1371
1372 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1373                         __u64 count)
1374 {
1375         struct ll_inode_info *lli = ll_i2info(inode);
1376         struct ll_sb_info *sbi = ll_i2sbi(inode);
1377         enum obd_heat_type sample_type;
1378         enum obd_heat_type iobyte_type;
1379         __u64 now = ktime_get_real_seconds();
1380
1381         if (!ll_sbi_has_file_heat(sbi) ||
1382             lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1383                 return;
1384
1385         if (iot == CIT_READ) {
1386                 sample_type = OBD_HEAT_READSAMPLE;
1387                 iobyte_type = OBD_HEAT_READBYTE;
1388         } else if (iot == CIT_WRITE) {
1389                 sample_type = OBD_HEAT_WRITESAMPLE;
1390                 iobyte_type = OBD_HEAT_WRITEBYTE;
1391         } else {
1392                 return;
1393         }
1394
1395         spin_lock(&lli->lli_heat_lock);
1396         obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1397                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1398         obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1399                      sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1400         spin_unlock(&lli->lli_heat_lock);
1401 }
1402
1403 static ssize_t
1404 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1405                    struct file *file, enum cl_io_type iot,
1406                    loff_t *ppos, size_t count)
1407 {
1408         struct vvp_io           *vio = vvp_env_io(env);
1409         struct inode            *inode = file_inode(file);
1410         struct ll_inode_info    *lli = ll_i2info(inode);
1411         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1412         struct range_lock       range;
1413         struct cl_io            *io;
1414         ssize_t                 result = 0;
1415         int                     rc = 0;
1416         unsigned                retried = 0;
1417         bool                    restarted = false;
1418
1419         ENTRY;
1420
1421         CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1422                 file_dentry(file)->d_name.name,
1423                 iot == CIT_READ ? "read" : "write", *ppos, count);
1424
1425 restart:
1426         io = vvp_env_thread_io(env);
1427         ll_io_init(io, file, iot);
1428         io->ci_ndelay_tried = retried;
1429
1430         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1431                 bool range_locked = false;
1432
1433                 if (file->f_flags & O_APPEND)
1434                         range_lock_init(&range, 0, LUSTRE_EOF);
1435                 else
1436                         range_lock_init(&range, *ppos, *ppos + count - 1);
1437
1438                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1439                 vio->vui_io_subtype = args->via_io_subtype;
1440
1441                 switch (vio->vui_io_subtype) {
1442                 case IO_NORMAL:
1443                         vio->vui_iter = args->u.normal.via_iter;
1444                         vio->vui_iocb = args->u.normal.via_iocb;
1445                         /* Direct IO reads must also take range lock,
1446                          * or multiple reads will try to work on the same pages
1447                          * See LU-6227 for details. */
1448                         if (((iot == CIT_WRITE) ||
1449                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1450                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1451                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1452                                        RL_PARA(&range));
1453                                 rc = range_lock(&lli->lli_write_tree, &range);
1454                                 if (rc < 0)
1455                                         GOTO(out, rc);
1456
1457                                 range_locked = true;
1458                         }
1459                         break;
1460                 case IO_SPLICE:
1461                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1462                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1463                         break;
1464                 default:
1465                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1466                         LBUG();
1467                 }
1468
1469                 ll_cl_add(file, env, io, LCC_RW);
1470                 rc = cl_io_loop(env, io);
1471                 ll_cl_remove(file, env);
1472
1473                 if (range_locked) {
1474                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1475                                RL_PARA(&range));
1476                         range_unlock(&lli->lli_write_tree, &range);
1477                 }
1478         } else {
1479                 /* cl_io_rw_init() handled IO */
1480                 rc = io->ci_result;
1481         }
1482
1483         if (io->ci_nob > 0) {
1484                 result += io->ci_nob;
1485                 count  -= io->ci_nob;
1486                 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1487
1488                 /* prepare IO restart */
1489                 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1490                         args->u.normal.via_iter = vio->vui_iter;
1491         }
1492 out:
1493         cl_io_fini(env, io);
1494
1495         CDEBUG(D_VFSTRACE,
1496                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1497                file->f_path.dentry->d_name.name,
1498                iot, rc, result, io->ci_need_restart);
1499
1500         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1501                 CDEBUG(D_VFSTRACE,
1502                        "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1503                        file_dentry(file)->d_name.name,
1504                        iot == CIT_READ ? "read" : "write",
1505                        *ppos, count, result, rc);
1506                 /* preserve the tried count for FLR */
1507                 retried = io->ci_ndelay_tried;
1508                 restarted = true;
1509                 goto restart;
1510         }
1511
1512         if (iot == CIT_READ) {
1513                 if (result > 0)
1514                         ll_stats_ops_tally(ll_i2sbi(inode),
1515                                            LPROC_LL_READ_BYTES, result);
1516         } else if (iot == CIT_WRITE) {
1517                 if (result > 0) {
1518                         ll_stats_ops_tally(ll_i2sbi(inode),
1519                                            LPROC_LL_WRITE_BYTES, result);
1520                         fd->fd_write_failed = false;
1521                 } else if (result == 0 && rc == 0) {
1522                         rc = io->ci_result;
1523                         if (rc < 0)
1524                                 fd->fd_write_failed = true;
1525                         else
1526                                 fd->fd_write_failed = false;
1527                 } else if (rc != -ERESTARTSYS) {
1528                         fd->fd_write_failed = true;
1529                 }
1530         }
1531
1532         CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1533         if (result > 0)
1534                 ll_heat_add(inode, iot, result);
1535
1536         RETURN(result > 0 ? result : rc);
1537 }
1538
1539 /**
1540  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1541  * especially for small I/O.
1542  *
1543  * To serve a read request, CLIO has to create and initialize a cl_io and
1544  * then request DLM lock. This has turned out to have siginificant overhead
1545  * and affects the performance of small I/O dramatically.
1546  *
1547  * It's not necessary to create a cl_io for each I/O. Under the help of read
1548  * ahead, most of the pages being read are already in memory cache and we can
1549  * read those pages directly because if the pages exist, the corresponding DLM
1550  * lock must exist so that page content must be valid.
1551  *
1552  * In fast read implementation, the llite speculatively finds and reads pages
1553  * in memory cache. There are three scenarios for fast read:
1554  *   - If the page exists and is uptodate, kernel VM will provide the data and
1555  *     CLIO won't be intervened;
1556  *   - If the page was brought into memory by read ahead, it will be exported
1557  *     and read ahead parameters will be updated;
1558  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1559  *     it will go back and invoke normal read, i.e., a cl_io will be created
1560  *     and DLM lock will be requested.
1561  *
1562  * POSIX compliance: posix standard states that read is intended to be atomic.
1563  * Lustre read implementation is in line with Linux kernel read implementation
1564  * and neither of them complies with POSIX standard in this matter. Fast read
1565  * doesn't make the situation worse on single node but it may interleave write
1566  * results from multiple nodes due to short read handling in ll_file_aio_read().
1567  *
1568  * \param env - lu_env
1569  * \param iocb - kiocb from kernel
1570  * \param iter - user space buffers where the data will be copied
1571  *
1572  * \retval - number of bytes have been read, or error code if error occurred.
1573  */
1574 static ssize_t
1575 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1576 {
1577         ssize_t result;
1578
1579         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1580                 return 0;
1581
1582         /* NB: we can't do direct IO for fast read because it will need a lock
1583          * to make IO engine happy. */
1584         if (iocb->ki_filp->f_flags & O_DIRECT)
1585                 return 0;
1586
1587         result = generic_file_read_iter(iocb, iter);
1588
1589         /* If the first page is not in cache, generic_file_aio_read() will be
1590          * returned with -ENODATA.
1591          * See corresponding code in ll_readpage(). */
1592         if (result == -ENODATA)
1593                 result = 0;
1594
1595         if (result > 0) {
1596                 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1597                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1598                                 LPROC_LL_READ_BYTES, result);
1599         }
1600
1601         return result;
1602 }
1603
1604 /*
1605  * Read from a file (through the page cache).
1606  */
1607 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1608 {
1609         struct lu_env *env;
1610         struct vvp_io_args *args;
1611         ssize_t result;
1612         ssize_t rc2;
1613         __u16 refcheck;
1614
1615         result = ll_do_fast_read(iocb, to);
1616         if (result < 0 || iov_iter_count(to) == 0)
1617                 GOTO(out, result);
1618
1619         env = cl_env_get(&refcheck);
1620         if (IS_ERR(env))
1621                 return PTR_ERR(env);
1622
1623         args = ll_env_args(env, IO_NORMAL);
1624         args->u.normal.via_iter = to;
1625         args->u.normal.via_iocb = iocb;
1626
1627         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1628                                  &iocb->ki_pos, iov_iter_count(to));
1629         if (rc2 > 0)
1630                 result += rc2;
1631         else if (result == 0)
1632                 result = rc2;
1633
1634         cl_env_put(env, &refcheck);
1635 out:
1636         return result;
1637 }
1638
1639 /**
1640  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1641  * If a page is already in the page cache and dirty (and some other things -
1642  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1643  * write to it without doing a full I/O, because Lustre already knows about it
1644  * and will write it out.  This saves a lot of processing time.
1645  *
1646  * All writes here are within one page, so exclusion is handled by the page
1647  * lock on the vm page.  We do not do tiny writes for writes which touch
1648  * multiple pages because it's very unlikely multiple sequential pages are
1649  * are already dirty.
1650  *
1651  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1652  * and are unlikely to be to already dirty pages.
1653  *
1654  * Attribute updates are important here, we do them in ll_tiny_write_end.
1655  */
1656 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1657 {
1658         ssize_t count = iov_iter_count(iter);
1659         struct  file *file = iocb->ki_filp;
1660         struct  inode *inode = file_inode(file);
1661         bool    lock_inode = !IS_NOSEC(inode);
1662         ssize_t result = 0;
1663
1664         ENTRY;
1665
1666         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1667          * of function for why.
1668          */
1669         if (count >= PAGE_SIZE ||
1670             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1671                 RETURN(0);
1672
1673         if (unlikely(lock_inode))
1674                 inode_lock(inode);
1675         result = __generic_file_write_iter(iocb, iter);
1676
1677         if (unlikely(lock_inode))
1678                 inode_unlock(inode);
1679
1680         /* If the page is not already dirty, ll_tiny_write_begin returns
1681          * -ENODATA.  We continue on to normal write.
1682          */
1683         if (result == -ENODATA)
1684                 result = 0;
1685
1686         if (result > 0) {
1687                 ll_heat_add(inode, CIT_WRITE, result);
1688                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1689                                    result);
1690                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1691         }
1692
1693         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1694
1695         RETURN(result);
1696 }
1697
1698 /*
1699  * Write to a file (through the page cache).
1700  */
1701 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1702 {
1703         struct vvp_io_args *args;
1704         struct lu_env *env;
1705         ssize_t rc_tiny = 0, rc_normal;
1706         __u16 refcheck;
1707
1708         ENTRY;
1709
1710         /* NB: we can't do direct IO for tiny writes because they use the page
1711          * cache, we can't do sync writes because tiny writes can't flush
1712          * pages, and we can't do append writes because we can't guarantee the
1713          * required DLM locks are held to protect file size.
1714          */
1715         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1716             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1717                 rc_tiny = ll_do_tiny_write(iocb, from);
1718
1719         /* In case of error, go on and try normal write - Only stop if tiny
1720          * write completed I/O.
1721          */
1722         if (iov_iter_count(from) == 0)
1723                 GOTO(out, rc_normal = rc_tiny);
1724
1725         env = cl_env_get(&refcheck);
1726         if (IS_ERR(env))
1727                 return PTR_ERR(env);
1728
1729         args = ll_env_args(env, IO_NORMAL);
1730         args->u.normal.via_iter = from;
1731         args->u.normal.via_iocb = iocb;
1732
1733         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1734                                     &iocb->ki_pos, iov_iter_count(from));
1735
1736         /* On success, combine bytes written. */
1737         if (rc_tiny >= 0 && rc_normal > 0)
1738                 rc_normal += rc_tiny;
1739         /* On error, only return error from normal write if tiny write did not
1740          * write any bytes.  Otherwise return bytes written by tiny write.
1741          */
1742         else if (rc_tiny > 0)
1743                 rc_normal = rc_tiny;
1744
1745         cl_env_put(env, &refcheck);
1746 out:
1747         RETURN(rc_normal);
1748 }
1749
1750 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1751 /*
1752  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1753  */
1754 static int ll_file_get_iov_count(const struct iovec *iov,
1755                                  unsigned long *nr_segs, size_t *count)
1756 {
1757         size_t cnt = 0;
1758         unsigned long seg;
1759
1760         for (seg = 0; seg < *nr_segs; seg++) {
1761                 const struct iovec *iv = &iov[seg];
1762
1763                 /*
1764                  * If any segment has a negative length, or the cumulative
1765                  * length ever wraps negative then return -EINVAL.
1766                  */
1767                 cnt += iv->iov_len;
1768                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1769                         return -EINVAL;
1770                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1771                         continue;
1772                 if (seg == 0)
1773                         return -EFAULT;
1774                 *nr_segs = seg;
1775                 cnt -= iv->iov_len;     /* This segment is no good */
1776                 break;
1777         }
1778         *count = cnt;
1779         return 0;
1780 }
1781
1782 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1783                                 unsigned long nr_segs, loff_t pos)
1784 {
1785         struct iov_iter to;
1786         size_t iov_count;
1787         ssize_t result;
1788         ENTRY;
1789
1790         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1791         if (result)
1792                 RETURN(result);
1793
1794 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1795         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1796 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1797         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1798 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1799
1800         result = ll_file_read_iter(iocb, &to);
1801
1802         RETURN(result);
1803 }
1804
1805 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1806                             loff_t *ppos)
1807 {
1808         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1809         struct kiocb   kiocb;
1810         ssize_t        result;
1811         ENTRY;
1812
1813         init_sync_kiocb(&kiocb, file);
1814         kiocb.ki_pos = *ppos;
1815 #ifdef HAVE_KIOCB_KI_LEFT
1816         kiocb.ki_left = count;
1817 #elif defined(HAVE_KI_NBYTES)
1818         kiocb.i_nbytes = count;
1819 #endif
1820
1821         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1822         *ppos = kiocb.ki_pos;
1823
1824         RETURN(result);
1825 }
1826
1827 /*
1828  * Write to a file (through the page cache).
1829  * AIO stuff
1830  */
1831 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1832                                  unsigned long nr_segs, loff_t pos)
1833 {
1834         struct iov_iter from;
1835         size_t iov_count;
1836         ssize_t result;
1837         ENTRY;
1838
1839         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1840         if (result)
1841                 RETURN(result);
1842
1843 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1844         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1845 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1846         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1847 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1848
1849         result = ll_file_write_iter(iocb, &from);
1850
1851         RETURN(result);
1852 }
1853
1854 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1855                              size_t count, loff_t *ppos)
1856 {
1857         struct iovec   iov = { .iov_base = (void __user *)buf,
1858                                .iov_len = count };
1859         struct kiocb   kiocb;
1860         ssize_t        result;
1861
1862         ENTRY;
1863
1864         init_sync_kiocb(&kiocb, file);
1865         kiocb.ki_pos = *ppos;
1866 #ifdef HAVE_KIOCB_KI_LEFT
1867         kiocb.ki_left = count;
1868 #elif defined(HAVE_KI_NBYTES)
1869         kiocb.ki_nbytes = count;
1870 #endif
1871
1872         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1873         *ppos = kiocb.ki_pos;
1874
1875         RETURN(result);
1876 }
1877 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1878
1879 /*
1880  * Send file content (through pagecache) somewhere with helper
1881  */
1882 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1883                                    struct pipe_inode_info *pipe, size_t count,
1884                                    unsigned int flags)
1885 {
1886         struct lu_env      *env;
1887         struct vvp_io_args *args;
1888         ssize_t             result;
1889         __u16               refcheck;
1890         ENTRY;
1891
1892         env = cl_env_get(&refcheck);
1893         if (IS_ERR(env))
1894                 RETURN(PTR_ERR(env));
1895
1896         args = ll_env_args(env, IO_SPLICE);
1897         args->u.splice.via_pipe = pipe;
1898         args->u.splice.via_flags = flags;
1899
1900         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1901         cl_env_put(env, &refcheck);
1902         RETURN(result);
1903 }
1904
1905 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1906                              __u64 flags, struct lov_user_md *lum, int lum_size)
1907 {
1908         struct lookup_intent oit = {
1909                 .it_op = IT_OPEN,
1910                 .it_flags = flags | MDS_OPEN_BY_FID,
1911         };
1912         int rc;
1913         ENTRY;
1914
1915         ll_inode_size_lock(inode);
1916         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1917         if (rc < 0)
1918                 GOTO(out_unlock, rc);
1919
1920         ll_release_openhandle(dentry, &oit);
1921
1922 out_unlock:
1923         ll_inode_size_unlock(inode);
1924         ll_intent_release(&oit);
1925
1926         RETURN(rc);
1927 }
1928
1929 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1930                              struct lov_mds_md **lmmp, int *lmm_size,
1931                              struct ptlrpc_request **request)
1932 {
1933         struct ll_sb_info *sbi = ll_i2sbi(inode);
1934         struct mdt_body  *body;
1935         struct lov_mds_md *lmm = NULL;
1936         struct ptlrpc_request *req = NULL;
1937         struct md_op_data *op_data;
1938         int rc, lmmsize;
1939
1940         rc = ll_get_default_mdsize(sbi, &lmmsize);
1941         if (rc)
1942                 RETURN(rc);
1943
1944         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1945                                      strlen(filename), lmmsize,
1946                                      LUSTRE_OPC_ANY, NULL);
1947         if (IS_ERR(op_data))
1948                 RETURN(PTR_ERR(op_data));
1949
1950         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1951         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1952         ll_finish_md_op_data(op_data);
1953         if (rc < 0) {
1954                 CDEBUG(D_INFO, "md_getattr_name failed "
1955                        "on %s: rc %d\n", filename, rc);
1956                 GOTO(out, rc);
1957         }
1958
1959         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1960         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1961
1962         lmmsize = body->mbo_eadatasize;
1963
1964         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1965                         lmmsize == 0) {
1966                 GOTO(out, rc = -ENODATA);
1967         }
1968
1969         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1970         LASSERT(lmm != NULL);
1971
1972         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1973             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1974             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1975                 GOTO(out, rc = -EPROTO);
1976
1977         /*
1978          * This is coming from the MDS, so is probably in
1979          * little endian.  We convert it to host endian before
1980          * passing it to userspace.
1981          */
1982         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1983                 int stripe_count;
1984
1985                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1986                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1987                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1988                         if (le32_to_cpu(lmm->lmm_pattern) &
1989                             LOV_PATTERN_F_RELEASED)
1990                                 stripe_count = 0;
1991                 }
1992
1993                 /* if function called for directory - we should
1994                  * avoid swab not existent lsm objects */
1995                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1996                         lustre_swab_lov_user_md_v1(
1997                                         (struct lov_user_md_v1 *)lmm);
1998                         if (S_ISREG(body->mbo_mode))
1999                                 lustre_swab_lov_user_md_objects(
2000                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2001                                     stripe_count);
2002                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2003                         lustre_swab_lov_user_md_v3(
2004                                         (struct lov_user_md_v3 *)lmm);
2005                         if (S_ISREG(body->mbo_mode))
2006                                 lustre_swab_lov_user_md_objects(
2007                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2008                                     stripe_count);
2009                 } else if (lmm->lmm_magic ==
2010                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2011                         lustre_swab_lov_comp_md_v1(
2012                                         (struct lov_comp_md_v1 *)lmm);
2013                 }
2014         }
2015
2016 out:
2017         *lmmp = lmm;
2018         *lmm_size = lmmsize;
2019         *request = req;
2020         return rc;
2021 }
2022
2023 static int ll_lov_setea(struct inode *inode, struct file *file,
2024                         void __user *arg)
2025 {
2026         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2027         struct lov_user_md      *lump;
2028         int                      lum_size = sizeof(struct lov_user_md) +
2029                                             sizeof(struct lov_user_ost_data);
2030         int                      rc;
2031         ENTRY;
2032
2033         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2034                 RETURN(-EPERM);
2035
2036         OBD_ALLOC_LARGE(lump, lum_size);
2037         if (lump == NULL)
2038                 RETURN(-ENOMEM);
2039
2040         if (copy_from_user(lump, arg, lum_size))
2041                 GOTO(out_lump, rc = -EFAULT);
2042
2043         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2044                                       lum_size);
2045         cl_lov_delay_create_clear(&file->f_flags);
2046
2047 out_lump:
2048         OBD_FREE_LARGE(lump, lum_size);
2049         RETURN(rc);
2050 }
2051
2052 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2053 {
2054         struct lu_env   *env;
2055         __u16           refcheck;
2056         int             rc;
2057         ENTRY;
2058
2059         env = cl_env_get(&refcheck);
2060         if (IS_ERR(env))
2061                 RETURN(PTR_ERR(env));
2062
2063         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2064         cl_env_put(env, &refcheck);
2065         RETURN(rc);
2066 }
2067
2068 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2069                             void __user *arg)
2070 {
2071         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2072         struct lov_user_md        *klum;
2073         int                        lum_size, rc;
2074         __u64                      flags = FMODE_WRITE;
2075         ENTRY;
2076
2077         rc = ll_copy_user_md(lum, &klum);
2078         if (rc < 0)
2079                 RETURN(rc);
2080
2081         lum_size = rc;
2082         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2083                                       lum_size);
2084         if (!rc) {
2085                 __u32 gen;
2086
2087                 rc = put_user(0, &lum->lmm_stripe_count);
2088                 if (rc)
2089                         GOTO(out, rc);
2090
2091                 rc = ll_layout_refresh(inode, &gen);
2092                 if (rc)
2093                         GOTO(out, rc);
2094
2095                 rc = ll_file_getstripe(inode, arg, lum_size);
2096         }
2097         cl_lov_delay_create_clear(&file->f_flags);
2098
2099 out:
2100         OBD_FREE(klum, lum_size);
2101         RETURN(rc);
2102 }
2103
2104 static int
2105 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2106 {
2107         struct ll_inode_info *lli = ll_i2info(inode);
2108         struct cl_object *obj = lli->lli_clob;
2109         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2110         struct ll_grouplock grouplock;
2111         int rc;
2112         ENTRY;
2113
2114         if (arg == 0) {
2115                 CWARN("group id for group lock must not be 0\n");
2116                 RETURN(-EINVAL);
2117         }
2118
2119         if (ll_file_nolock(file))
2120                 RETURN(-EOPNOTSUPP);
2121
2122         spin_lock(&lli->lli_lock);
2123         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2124                 CWARN("group lock already existed with gid %lu\n",
2125                       fd->fd_grouplock.lg_gid);
2126                 spin_unlock(&lli->lli_lock);
2127                 RETURN(-EINVAL);
2128         }
2129         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2130         spin_unlock(&lli->lli_lock);
2131
2132         /**
2133          * XXX: group lock needs to protect all OST objects while PFL
2134          * can add new OST objects during the IO, so we'd instantiate
2135          * all OST objects before getting its group lock.
2136          */
2137         if (obj) {
2138                 struct lu_env *env;
2139                 __u16 refcheck;
2140                 struct cl_layout cl = {
2141                         .cl_is_composite = false,
2142                 };
2143                 struct lu_extent ext = {
2144                         .e_start = 0,
2145                         .e_end = OBD_OBJECT_EOF,
2146                 };
2147
2148                 env = cl_env_get(&refcheck);
2149                 if (IS_ERR(env))
2150                         RETURN(PTR_ERR(env));
2151
2152                 rc = cl_object_layout_get(env, obj, &cl);
2153                 if (!rc && cl.cl_is_composite)
2154                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2155                                                     &ext);
2156
2157                 cl_env_put(env, &refcheck);
2158                 if (rc)
2159                         RETURN(rc);
2160         }
2161
2162         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2163                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2164         if (rc)
2165                 RETURN(rc);
2166
2167         spin_lock(&lli->lli_lock);
2168         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2169                 spin_unlock(&lli->lli_lock);
2170                 CERROR("another thread just won the race\n");
2171                 cl_put_grouplock(&grouplock);
2172                 RETURN(-EINVAL);
2173         }
2174
2175         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2176         fd->fd_grouplock = grouplock;
2177         spin_unlock(&lli->lli_lock);
2178
2179         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2180         RETURN(0);
2181 }
2182
2183 static int ll_put_grouplock(struct inode *inode, struct file *file,
2184                             unsigned long arg)
2185 {
2186         struct ll_inode_info   *lli = ll_i2info(inode);
2187         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2188         struct ll_grouplock     grouplock;
2189         ENTRY;
2190
2191         spin_lock(&lli->lli_lock);
2192         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2193                 spin_unlock(&lli->lli_lock);
2194                 CWARN("no group lock held\n");
2195                 RETURN(-EINVAL);
2196         }
2197
2198         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2199
2200         if (fd->fd_grouplock.lg_gid != arg) {
2201                 CWARN("group lock %lu doesn't match current id %lu\n",
2202                       arg, fd->fd_grouplock.lg_gid);
2203                 spin_unlock(&lli->lli_lock);
2204                 RETURN(-EINVAL);
2205         }
2206
2207         grouplock = fd->fd_grouplock;
2208         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2209         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2210         spin_unlock(&lli->lli_lock);
2211
2212         cl_put_grouplock(&grouplock);
2213         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2214         RETURN(0);
2215 }
2216
2217 /**
2218  * Close inode open handle
2219  *
2220  * \param dentry [in]     dentry which contains the inode
2221  * \param it     [in,out] intent which contains open info and result
2222  *
2223  * \retval 0     success
2224  * \retval <0    failure
2225  */
2226 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2227 {
2228         struct inode *inode = dentry->d_inode;
2229         struct obd_client_handle *och;
2230         int rc;
2231         ENTRY;
2232
2233         LASSERT(inode);
2234
2235         /* Root ? Do nothing. */
2236         if (dentry->d_inode->i_sb->s_root == dentry)
2237                 RETURN(0);
2238
2239         /* No open handle to close? Move away */
2240         if (!it_disposition(it, DISP_OPEN_OPEN))
2241                 RETURN(0);
2242
2243         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2244
2245         OBD_ALLOC(och, sizeof(*och));
2246         if (!och)
2247                 GOTO(out, rc = -ENOMEM);
2248
2249         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2250
2251         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2252 out:
2253         /* this one is in place of ll_file_open */
2254         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2255                 ptlrpc_req_finished(it->it_request);
2256                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2257         }
2258         RETURN(rc);
2259 }
2260
2261 /**
2262  * Get size for inode for which FIEMAP mapping is requested.
2263  * Make the FIEMAP get_info call and returns the result.
2264  * \param fiemap        kernel buffer to hold extens
2265  * \param num_bytes     kernel buffer size
2266  */
2267 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2268                         size_t num_bytes)
2269 {
2270         struct lu_env                   *env;
2271         __u16                           refcheck;
2272         int                             rc = 0;
2273         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2274         ENTRY;
2275
2276         /* Checks for fiemap flags */
2277         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2278                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2279                 return -EBADR;
2280         }
2281
2282         /* Check for FIEMAP_FLAG_SYNC */
2283         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2284                 rc = filemap_fdatawrite(inode->i_mapping);
2285                 if (rc)
2286                         return rc;
2287         }
2288
2289         env = cl_env_get(&refcheck);
2290         if (IS_ERR(env))
2291                 RETURN(PTR_ERR(env));
2292
2293         if (i_size_read(inode) == 0) {
2294                 rc = ll_glimpse_size(inode);
2295                 if (rc)
2296                         GOTO(out, rc);
2297         }
2298
2299         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2300         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2301         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2302
2303         /* If filesize is 0, then there would be no objects for mapping */
2304         if (fmkey.lfik_oa.o_size == 0) {
2305                 fiemap->fm_mapped_extents = 0;
2306                 GOTO(out, rc = 0);
2307         }
2308
2309         fmkey.lfik_fiemap = *fiemap;
2310
2311         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2312                               &fmkey, fiemap, &num_bytes);
2313 out:
2314         cl_env_put(env, &refcheck);
2315         RETURN(rc);
2316 }
2317
2318 int ll_fid2path(struct inode *inode, void __user *arg)
2319 {
2320         struct obd_export       *exp = ll_i2mdexp(inode);
2321         const struct getinfo_fid2path __user *gfin = arg;
2322         __u32                    pathlen;
2323         struct getinfo_fid2path *gfout;
2324         size_t                   outsize;
2325         int                      rc;
2326
2327         ENTRY;
2328
2329         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2330             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2331                 RETURN(-EPERM);
2332
2333         /* Only need to get the buflen */
2334         if (get_user(pathlen, &gfin->gf_pathlen))
2335                 RETURN(-EFAULT);
2336
2337         if (pathlen > PATH_MAX)
2338                 RETURN(-EINVAL);
2339
2340         outsize = sizeof(*gfout) + pathlen;
2341         OBD_ALLOC(gfout, outsize);
2342         if (gfout == NULL)
2343                 RETURN(-ENOMEM);
2344
2345         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2346                 GOTO(gf_free, rc = -EFAULT);
2347         /* append root FID after gfout to let MDT know the root FID so that it
2348          * can lookup the correct path, this is mainly for fileset.
2349          * old server without fileset mount support will ignore this. */
2350         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2351
2352         /* Call mdc_iocontrol */
2353         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2354         if (rc != 0)
2355                 GOTO(gf_free, rc);
2356
2357         if (copy_to_user(arg, gfout, outsize))
2358                 rc = -EFAULT;
2359
2360 gf_free:
2361         OBD_FREE(gfout, outsize);
2362         RETURN(rc);
2363 }
2364
2365 static int
2366 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2367 {
2368         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2369         struct lu_env *env;
2370         struct cl_io *io;
2371         __u16  refcheck;
2372         int result;
2373
2374         ENTRY;
2375
2376         ioc->idv_version = 0;
2377         ioc->idv_layout_version = UINT_MAX;
2378
2379         /* If no file object initialized, we consider its version is 0. */
2380         if (obj == NULL)
2381                 RETURN(0);
2382
2383         env = cl_env_get(&refcheck);
2384         if (IS_ERR(env))
2385                 RETURN(PTR_ERR(env));
2386
2387         io = vvp_env_thread_io(env);
2388         io->ci_obj = obj;
2389         io->u.ci_data_version.dv_data_version = 0;
2390         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2391         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2392
2393 restart:
2394         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2395                 result = cl_io_loop(env, io);
2396         else
2397                 result = io->ci_result;
2398
2399         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2400         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2401
2402         cl_io_fini(env, io);
2403
2404         if (unlikely(io->ci_need_restart))
2405                 goto restart;
2406
2407         cl_env_put(env, &refcheck);
2408
2409         RETURN(result);
2410 }
2411
2412 /*
2413  * Read the data_version for inode.
2414  *
2415  * This value is computed using stripe object version on OST.
2416  * Version is computed using server side locking.
2417  *
2418  * @param flags if do sync on the OST side;
2419  *              0: no sync
2420  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2421  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2422  */
2423 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2424 {
2425         struct ioc_data_version ioc = { .idv_flags = flags };
2426         int rc;
2427
2428         rc = ll_ioc_data_version(inode, &ioc);
2429         if (!rc)
2430                 *data_version = ioc.idv_version;
2431
2432         return rc;
2433 }
2434
2435 /*
2436  * Trigger a HSM release request for the provided inode.
2437  */
2438 int ll_hsm_release(struct inode *inode)
2439 {
2440         struct lu_env *env;
2441         struct obd_client_handle *och = NULL;
2442         __u64 data_version = 0;
2443         int rc;
2444         __u16 refcheck;
2445         ENTRY;
2446
2447         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2448                ll_i2sbi(inode)->ll_fsname,
2449                PFID(&ll_i2info(inode)->lli_fid));
2450
2451         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2452         if (IS_ERR(och))
2453                 GOTO(out, rc = PTR_ERR(och));
2454
2455         /* Grab latest data_version and [am]time values */
2456         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2457         if (rc != 0)
2458                 GOTO(out, rc);
2459
2460         env = cl_env_get(&refcheck);
2461         if (IS_ERR(env))
2462                 GOTO(out, rc = PTR_ERR(env));
2463
2464         rc = ll_merge_attr(env, inode);
2465         cl_env_put(env, &refcheck);
2466
2467         /* If error happen, we have the wrong size for a file.
2468          * Don't release it.
2469          */
2470         if (rc != 0)
2471                 GOTO(out, rc);
2472
2473         /* Release the file.
2474          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2475          * we still need it to pack l_remote_handle to MDT. */
2476         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2477                                        &data_version);
2478         och = NULL;
2479
2480         EXIT;
2481 out:
2482         if (och != NULL && !IS_ERR(och)) /* close the file */
2483                 ll_lease_close(och, inode, NULL);
2484
2485         return rc;
2486 }
2487
2488 struct ll_swap_stack {
2489         __u64                    dv1;
2490         __u64                    dv2;
2491         struct inode            *inode1;
2492         struct inode            *inode2;
2493         bool                     check_dv1;
2494         bool                     check_dv2;
2495 };
2496
2497 static int ll_swap_layouts(struct file *file1, struct file *file2,
2498                            struct lustre_swap_layouts *lsl)
2499 {
2500         struct mdc_swap_layouts  msl;
2501         struct md_op_data       *op_data;
2502         __u32                    gid;
2503         __u64                    dv;
2504         struct ll_swap_stack    *llss = NULL;
2505         int                      rc;
2506
2507         OBD_ALLOC_PTR(llss);
2508         if (llss == NULL)
2509                 RETURN(-ENOMEM);
2510
2511         llss->inode1 = file_inode(file1);
2512         llss->inode2 = file_inode(file2);
2513
2514         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2515         if (rc < 0)
2516                 GOTO(free, rc);
2517
2518         /* we use 2 bool because it is easier to swap than 2 bits */
2519         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2520                 llss->check_dv1 = true;
2521
2522         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2523                 llss->check_dv2 = true;
2524
2525         /* we cannot use lsl->sl_dvX directly because we may swap them */
2526         llss->dv1 = lsl->sl_dv1;
2527         llss->dv2 = lsl->sl_dv2;
2528
2529         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2530         if (rc == 0) /* same file, done! */
2531                 GOTO(free, rc);
2532
2533         if (rc < 0) { /* sequentialize it */
2534                 swap(llss->inode1, llss->inode2);
2535                 swap(file1, file2);
2536                 swap(llss->dv1, llss->dv2);
2537                 swap(llss->check_dv1, llss->check_dv2);
2538         }
2539
2540         gid = lsl->sl_gid;
2541         if (gid != 0) { /* application asks to flush dirty cache */
2542                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2543                 if (rc < 0)
2544                         GOTO(free, rc);
2545
2546                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2547                 if (rc < 0) {
2548                         ll_put_grouplock(llss->inode1, file1, gid);
2549                         GOTO(free, rc);
2550                 }
2551         }
2552
2553         /* ultimate check, before swaping the layouts we check if
2554          * dataversion has changed (if requested) */
2555         if (llss->check_dv1) {
2556                 rc = ll_data_version(llss->inode1, &dv, 0);
2557                 if (rc)
2558                         GOTO(putgl, rc);
2559                 if (dv != llss->dv1)
2560                         GOTO(putgl, rc = -EAGAIN);
2561         }
2562
2563         if (llss->check_dv2) {
2564                 rc = ll_data_version(llss->inode2, &dv, 0);
2565                 if (rc)
2566                         GOTO(putgl, rc);
2567                 if (dv != llss->dv2)
2568                         GOTO(putgl, rc = -EAGAIN);
2569         }
2570
2571         /* struct md_op_data is used to send the swap args to the mdt
2572          * only flags is missing, so we use struct mdc_swap_layouts
2573          * through the md_op_data->op_data */
2574         /* flags from user space have to be converted before they are send to
2575          * server, no flag is sent today, they are only used on the client */
2576         msl.msl_flags = 0;
2577         rc = -ENOMEM;
2578         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2579                                      0, LUSTRE_OPC_ANY, &msl);
2580         if (IS_ERR(op_data))
2581                 GOTO(free, rc = PTR_ERR(op_data));
2582
2583         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2584                            sizeof(*op_data), op_data, NULL);
2585         ll_finish_md_op_data(op_data);
2586
2587         if (rc < 0)
2588                 GOTO(putgl, rc);
2589
2590 putgl:
2591         if (gid != 0) {
2592                 ll_put_grouplock(llss->inode2, file2, gid);
2593                 ll_put_grouplock(llss->inode1, file1, gid);
2594         }
2595
2596 free:
2597         if (llss != NULL)
2598                 OBD_FREE_PTR(llss);
2599
2600         RETURN(rc);
2601 }
2602
2603 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2604 {
2605         struct obd_export *exp = ll_i2mdexp(inode);
2606         struct md_op_data *op_data;
2607         int rc;
2608         ENTRY;
2609
2610         /* Detect out-of range masks */
2611         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2612                 RETURN(-EINVAL);
2613
2614         /* Non-root users are forbidden to set or clear flags which are
2615          * NOT defined in HSM_USER_MASK. */
2616         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2617             !cfs_capable(CFS_CAP_SYS_ADMIN))
2618                 RETURN(-EPERM);
2619
2620         if (!exp_connect_archive_id_array(exp)) {
2621                 /* Detect out-of range archive id */
2622                 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2623                     (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2624                         RETURN(-EINVAL);
2625         }
2626
2627         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2628                                      LUSTRE_OPC_ANY, hss);
2629         if (IS_ERR(op_data))
2630                 RETURN(PTR_ERR(op_data));
2631
2632         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2633                            op_data, NULL);
2634
2635         ll_finish_md_op_data(op_data);
2636
2637         RETURN(rc);
2638 }
2639
2640 static int ll_hsm_import(struct inode *inode, struct file *file,
2641                          struct hsm_user_import *hui)
2642 {
2643         struct hsm_state_set    *hss = NULL;
2644         struct iattr            *attr = NULL;
2645         int                      rc;
2646         ENTRY;
2647
2648         if (!S_ISREG(inode->i_mode))
2649                 RETURN(-EINVAL);
2650
2651         /* set HSM flags */
2652         OBD_ALLOC_PTR(hss);
2653         if (hss == NULL)
2654                 GOTO(out, rc = -ENOMEM);
2655
2656         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2657         hss->hss_archive_id = hui->hui_archive_id;
2658         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2659         rc = ll_hsm_state_set(inode, hss);
2660         if (rc != 0)
2661                 GOTO(out, rc);
2662
2663         OBD_ALLOC_PTR(attr);
2664         if (attr == NULL)
2665                 GOTO(out, rc = -ENOMEM);
2666
2667         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2668         attr->ia_mode |= S_IFREG;
2669         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2670         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2671         attr->ia_size = hui->hui_size;
2672         attr->ia_mtime.tv_sec = hui->hui_mtime;
2673         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2674         attr->ia_atime.tv_sec = hui->hui_atime;
2675         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2676
2677         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2678                          ATTR_UID | ATTR_GID |
2679                          ATTR_MTIME | ATTR_MTIME_SET |
2680                          ATTR_ATIME | ATTR_ATIME_SET;
2681
2682         inode_lock(inode);
2683
2684         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2685         if (rc == -ENODATA)
2686                 rc = 0;
2687
2688         inode_unlock(inode);
2689
2690 out:
2691         if (hss != NULL)
2692                 OBD_FREE_PTR(hss);
2693
2694         if (attr != NULL)
2695                 OBD_FREE_PTR(attr);
2696
2697         RETURN(rc);
2698 }
2699
2700 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2701 {
2702         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2703                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2704 }
2705
2706 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2707 {
2708         struct inode *inode = file_inode(file);
2709         struct iattr ia = {
2710                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2711                             ATTR_MTIME | ATTR_MTIME_SET |
2712                             ATTR_CTIME,
2713                 .ia_atime = {
2714                         .tv_sec = lfu->lfu_atime_sec,
2715                         .tv_nsec = lfu->lfu_atime_nsec,
2716                 },
2717                 .ia_mtime = {
2718                         .tv_sec = lfu->lfu_mtime_sec,
2719                         .tv_nsec = lfu->lfu_mtime_nsec,
2720                 },
2721                 .ia_ctime = {
2722                         .tv_sec = lfu->lfu_ctime_sec,
2723                         .tv_nsec = lfu->lfu_ctime_nsec,
2724                 },
2725         };
2726         int rc;
2727         ENTRY;
2728
2729         if (!capable(CAP_SYS_ADMIN))
2730                 RETURN(-EPERM);
2731
2732         if (!S_ISREG(inode->i_mode))
2733                 RETURN(-EINVAL);
2734
2735         inode_lock(inode);
2736         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2737                             false);
2738         inode_unlock(inode);
2739
2740         RETURN(rc);
2741 }
2742
2743 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2744 {
2745         switch (mode) {
2746         case MODE_READ_USER:
2747                 return CLM_READ;
2748         case MODE_WRITE_USER:
2749                 return CLM_WRITE;
2750         default:
2751                 return -EINVAL;
2752         }
2753 }
2754
2755 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2756
2757 /* Used to allow the upper layers of the client to request an LDLM lock
2758  * without doing an actual read or write.
2759  *
2760  * Used for ladvise lockahead to manually request specific locks.
2761  *
2762  * \param[in] file      file this ladvise lock request is on
2763  * \param[in] ladvise   ladvise struct describing this lock request
2764  *
2765  * \retval 0            success, no detailed result available (sync requests
2766  *                      and requests sent to the server [not handled locally]
2767  *                      cannot return detailed results)
2768  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2769  *                                       see definitions for details.
2770  * \retval negative     negative errno on error
2771  */
2772 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2773 {
2774         struct lu_env *env = NULL;
2775         struct cl_io *io  = NULL;
2776         struct cl_lock *lock = NULL;
2777         struct cl_lock_descr *descr = NULL;
2778         struct dentry *dentry = file->f_path.dentry;
2779         struct inode *inode = dentry->d_inode;
2780         enum cl_lock_mode cl_mode;
2781         off_t start = ladvise->lla_start;
2782         off_t end = ladvise->lla_end;
2783         int result;
2784         __u16 refcheck;
2785
2786         ENTRY;
2787
2788         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2789                "start=%llu, end=%llu\n", dentry->d_name.len,
2790                dentry->d_name.name, dentry->d_inode,
2791                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2792                (__u64) end);
2793
2794         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2795         if (cl_mode < 0)
2796                 GOTO(out, result = cl_mode);
2797
2798         /* Get IO environment */
2799         result = cl_io_get(inode, &env, &io, &refcheck);
2800         if (result <= 0)
2801                 GOTO(out, result);
2802
2803         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2804         if (result > 0) {
2805                 /*
2806                  * nothing to do for this io. This currently happens when
2807                  * stripe sub-object's are not yet created.
2808                  */
2809                 result = io->ci_result;
2810         } else if (result == 0) {
2811                 lock = vvp_env_lock(env);
2812                 descr = &lock->cll_descr;
2813
2814                 descr->cld_obj   = io->ci_obj;
2815                 /* Convert byte offsets to pages */
2816                 descr->cld_start = cl_index(io->ci_obj, start);
2817                 descr->cld_end   = cl_index(io->ci_obj, end);
2818                 descr->cld_mode  = cl_mode;
2819                 /* CEF_MUST is used because we do not want to convert a
2820                  * lockahead request to a lockless lock */
2821                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2822                                        CEF_NONBLOCK;
2823
2824                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2825                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2826
2827                 result = cl_lock_request(env, io, lock);
2828
2829                 /* On success, we need to release the lock */
2830                 if (result >= 0)
2831                         cl_lock_release(env, lock);
2832         }
2833         cl_io_fini(env, io);
2834         cl_env_put(env, &refcheck);
2835
2836         /* -ECANCELED indicates a matching lock with a different extent
2837          * was already present, and -EEXIST indicates a matching lock
2838          * on exactly the same extent was already present.
2839          * We convert them to positive values for userspace to make
2840          * recognizing true errors easier.
2841          * Note we can only return these detailed results on async requests,
2842          * as sync requests look the same as i/o requests for locking. */
2843         if (result == -ECANCELED)
2844                 result = LLA_RESULT_DIFFERENT;
2845         else if (result == -EEXIST)
2846                 result = LLA_RESULT_SAME;
2847
2848 out:
2849         RETURN(result);
2850 }
2851 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2852
2853 static int ll_ladvise_sanity(struct inode *inode,
2854                              struct llapi_lu_ladvise *ladvise)
2855 {
2856         struct ll_sb_info *sbi = ll_i2sbi(inode);
2857         enum lu_ladvise_type advice = ladvise->lla_advice;
2858         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2859          * be in the first 32 bits of enum ladvise_flags */
2860         __u32 flags = ladvise->lla_peradvice_flags;
2861         /* 3 lines at 80 characters per line, should be plenty */
2862         int rc = 0;
2863
2864         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2865                 rc = -EINVAL;
2866                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2867                        "last supported advice is %s (value '%d'): rc = %d\n",
2868                        sbi->ll_fsname, advice,
2869                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2870                 GOTO(out, rc);
2871         }
2872
2873         /* Per-advice checks */
2874         switch (advice) {
2875         case LU_LADVISE_LOCKNOEXPAND:
2876                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2877                         rc = -EINVAL;
2878                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2879                                "rc = %d\n", sbi->ll_fsname, flags,
2880                                ladvise_names[advice], rc);
2881                         GOTO(out, rc);
2882                 }
2883                 break;
2884         case LU_LADVISE_LOCKAHEAD:
2885                 /* Currently only READ and WRITE modes can be requested */
2886                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2887                     ladvise->lla_lockahead_mode == 0) {
2888                         rc = -EINVAL;
2889                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2890                                "rc = %d\n", sbi->ll_fsname,
2891                                ladvise->lla_lockahead_mode,
2892                                ladvise_names[advice], rc);
2893                         GOTO(out, rc);
2894                 }
2895         case LU_LADVISE_WILLREAD:
2896         case LU_LADVISE_DONTNEED:
2897         default:
2898                 /* Note fall through above - These checks apply to all advices
2899                  * except LOCKNOEXPAND */
2900                 if (flags & ~LF_DEFAULT_MASK) {
2901                         rc = -EINVAL;
2902                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2903                                "rc = %d\n", sbi->ll_fsname, flags,
2904                                ladvise_names[advice], rc);
2905                         GOTO(out, rc);
2906                 }
2907                 if (ladvise->lla_start >= ladvise->lla_end) {
2908                         rc = -EINVAL;
2909                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2910                                "for %s: rc = %d\n", sbi->ll_fsname,
2911                                ladvise->lla_start, ladvise->lla_end,
2912                                ladvise_names[advice], rc);
2913                         GOTO(out, rc);
2914                 }
2915                 break;
2916         }
2917
2918 out:
2919         return rc;
2920 }
2921 #undef ERRSIZE
2922
2923 /*
2924  * Give file access advices
2925  *
2926  * The ladvise interface is similar to Linux fadvise() system call, except it
2927  * forwards the advices directly from Lustre client to server. The server side
2928  * codes will apply appropriate read-ahead and caching techniques for the
2929  * corresponding files.
2930  *
2931  * A typical workload for ladvise is e.g. a bunch of different clients are
2932  * doing small random reads of a file, so prefetching pages into OSS cache
2933  * with big linear reads before the random IO is a net benefit. Fetching
2934  * all that data into each client cache with fadvise() may not be, due to
2935  * much more data being sent to the client.
2936  */
2937 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2938                       struct llapi_lu_ladvise *ladvise)
2939 {
2940         struct lu_env *env;
2941         struct cl_io *io;
2942         struct cl_ladvise_io *lio;
2943         int rc;
2944         __u16 refcheck;
2945         ENTRY;
2946
2947         env = cl_env_get(&refcheck);
2948         if (IS_ERR(env))
2949                 RETURN(PTR_ERR(env));
2950
2951         io = vvp_env_thread_io(env);
2952         io->ci_obj = ll_i2info(inode)->lli_clob;
2953
2954         /* initialize parameters for ladvise */
2955         lio = &io->u.ci_ladvise;
2956         lio->li_start = ladvise->lla_start;
2957         lio->li_end = ladvise->lla_end;
2958         lio->li_fid = ll_inode2fid(inode);
2959         lio->li_advice = ladvise->lla_advice;
2960         lio->li_flags = flags;
2961
2962         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2963                 rc = cl_io_loop(env, io);
2964         else
2965                 rc = io->ci_result;
2966
2967         cl_io_fini(env, io);
2968         cl_env_put(env, &refcheck);
2969         RETURN(rc);
2970 }
2971
2972 static int ll_lock_noexpand(struct file *file, int flags)
2973 {
2974         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2975
2976         fd->ll_lock_no_expand = !(flags & LF_UNSET);
2977
2978         return 0;
2979 }
2980
2981 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2982                         unsigned long arg)
2983 {
2984         struct fsxattr fsxattr;
2985
2986         if (copy_from_user(&fsxattr,
2987                            (const struct fsxattr __user *)arg,
2988                            sizeof(fsxattr)))
2989                 RETURN(-EFAULT);
2990
2991         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2992         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2993                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2994         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2995         if (copy_to_user((struct fsxattr __user *)arg,
2996                          &fsxattr, sizeof(fsxattr)))
2997                 RETURN(-EFAULT);
2998
2999         RETURN(0);
3000 }
3001
3002 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3003 {
3004         /*
3005          * Project Quota ID state is only allowed to change from within the init
3006          * namespace. Enforce that restriction only if we are trying to change
3007          * the quota ID state. Everything else is allowed in user namespaces.
3008          */
3009         if (current_user_ns() == &init_user_ns)
3010                 return 0;
3011
3012         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3013                 return -EINVAL;
3014
3015         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3016                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3017                         return -EINVAL;
3018         } else {
3019                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3020                         return -EINVAL;
3021         }
3022
3023         return 0;
3024 }
3025
3026 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3027                         unsigned long arg)
3028 {
3029
3030         struct md_op_data *op_data;
3031         struct ptlrpc_request *req = NULL;
3032         int rc = 0;
3033         struct fsxattr fsxattr;
3034         struct cl_object *obj;
3035         struct iattr *attr;
3036         int flags;
3037
3038         if (copy_from_user(&fsxattr,
3039                            (const struct fsxattr __user *)arg,
3040                            sizeof(fsxattr)))
3041                 RETURN(-EFAULT);
3042
3043         rc = ll_ioctl_check_project(inode, &fsxattr);
3044         if (rc)
3045                 RETURN(rc);
3046
3047         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3048                                      LUSTRE_OPC_ANY, NULL);
3049         if (IS_ERR(op_data))
3050                 RETURN(PTR_ERR(op_data));
3051
3052         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3053         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3054         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3055                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3056         op_data->op_projid = fsxattr.fsx_projid;
3057         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3058         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3059                         0, &req);
3060         ptlrpc_req_finished(req);
3061         if (rc)
3062                 GOTO(out_fsxattr, rc);
3063         ll_update_inode_flags(inode, op_data->op_attr_flags);
3064         obj = ll_i2info(inode)->lli_clob;
3065         if (obj == NULL)
3066                 GOTO(out_fsxattr, rc);
3067
3068         OBD_ALLOC_PTR(attr);
3069         if (attr == NULL)
3070                 GOTO(out_fsxattr, rc = -ENOMEM);
3071
3072         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3073                             fsxattr.fsx_xflags);
3074         OBD_FREE_PTR(attr);
3075 out_fsxattr:
3076         ll_finish_md_op_data(op_data);
3077         RETURN(rc);
3078 }
3079
3080 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3081                                  unsigned long arg)
3082 {
3083         struct inode            *inode = file_inode(file);
3084         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3085         struct ll_inode_info    *lli = ll_i2info(inode);
3086         struct obd_client_handle *och = NULL;
3087         struct split_param sp;
3088         bool lease_broken;
3089         fmode_t fmode = 0;
3090         enum mds_op_bias bias = 0;
3091         struct file *layout_file = NULL;
3092         void *data = NULL;
3093         size_t data_size = 0;
3094         long rc;
3095         ENTRY;
3096
3097         mutex_lock(&lli->lli_och_mutex);
3098         if (fd->fd_lease_och != NULL) {
3099                 och = fd->fd_lease_och;
3100                 fd->fd_lease_och = NULL;
3101         }
3102         mutex_unlock(&lli->lli_och_mutex);
3103
3104         if (och == NULL)
3105                 GOTO(out, rc = -ENOLCK);
3106
3107         fmode = och->och_flags;
3108
3109         switch (ioc->lil_flags) {
3110         case LL_LEASE_RESYNC_DONE:
3111                 if (ioc->lil_count > IOC_IDS_MAX)
3112                         GOTO(out, rc = -EINVAL);
3113
3114                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3115                 OBD_ALLOC(data, data_size);
3116                 if (!data)
3117                         GOTO(out, rc = -ENOMEM);
3118
3119                 if (copy_from_user(data, (void __user *)arg, data_size))
3120                         GOTO(out, rc = -EFAULT);
3121
3122                 bias = MDS_CLOSE_RESYNC_DONE;
3123                 break;
3124         case LL_LEASE_LAYOUT_MERGE: {
3125                 int fd;
3126
3127                 if (ioc->lil_count != 1)
3128                         GOTO(out, rc = -EINVAL);
3129
3130                 arg += sizeof(*ioc);
3131                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3132                         GOTO(out, rc = -EFAULT);
3133
3134                 layout_file = fget(fd);
3135                 if (!layout_file)
3136                         GOTO(out, rc = -EBADF);
3137
3138                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3139                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3140                         GOTO(out, rc = -EPERM);
3141
3142                 data = file_inode(layout_file);
3143                 bias = MDS_CLOSE_LAYOUT_MERGE;
3144                 break;
3145         }
3146         case LL_LEASE_LAYOUT_SPLIT: {
3147                 int fdv;
3148                 int mirror_id;
3149
3150                 if (ioc->lil_count != 2)
3151                         GOTO(out, rc = -EINVAL);
3152
3153                 arg += sizeof(*ioc);
3154                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3155                         GOTO(out, rc = -EFAULT);
3156
3157                 arg += sizeof(__u32);
3158                 if (copy_from_user(&mirror_id, (void __user *)arg,
3159                                    sizeof(__u32)))
3160                         GOTO(out, rc = -EFAULT);
3161
3162                 layout_file = fget(fdv);
3163                 if (!layout_file)
3164                         GOTO(out, rc = -EBADF);
3165
3166                 sp.sp_inode = file_inode(layout_file);
3167                 sp.sp_mirror_id = (__u16)mirror_id;
3168                 data = &sp;
3169                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3170                 break;
3171         }
3172         default:
3173                 /* without close intent */
3174                 break;
3175         }
3176
3177         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3178         if (rc < 0)
3179                 GOTO(out, rc);
3180
3181         rc = ll_lease_och_release(inode, file);
3182         if (rc < 0)
3183                 GOTO(out, rc);
3184
3185         if (lease_broken)
3186                 fmode = 0;
3187         EXIT;
3188
3189 out:
3190         switch (ioc->lil_flags) {
3191         case LL_LEASE_RESYNC_DONE:
3192                 if (data)
3193                         OBD_FREE(data, data_size);
3194                 break;
3195         case LL_LEASE_LAYOUT_MERGE:
3196         case LL_LEASE_LAYOUT_SPLIT:
3197                 if (layout_file)
3198                         fput(layout_file);
3199                 break;
3200         }
3201
3202         if (!rc)
3203                 rc = ll_lease_type_from_fmode(fmode);
3204         RETURN(rc);
3205 }
3206
3207 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3208                               unsigned long arg)
3209 {
3210         struct inode *inode = file_inode(file);
3211         struct ll_inode_info *lli = ll_i2info(inode);
3212         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3213         struct obd_client_handle *och = NULL;
3214         __u64 open_flags = 0;
3215         bool lease_broken;
3216         fmode_t fmode;
3217         long rc;
3218         ENTRY;
3219
3220         switch (ioc->lil_mode) {
3221         case LL_LEASE_WRLCK:
3222                 if (!(file->f_mode & FMODE_WRITE))
3223                         RETURN(-EPERM);
3224                 fmode = FMODE_WRITE;
3225                 break;
3226         case LL_LEASE_RDLCK:
3227                 if (!(file->f_mode & FMODE_READ))
3228                         RETURN(-EPERM);
3229                 fmode = FMODE_READ;
3230                 break;
3231         case LL_LEASE_UNLCK:
3232                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3233         default:
3234                 RETURN(-EINVAL);
3235         }
3236
3237         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3238
3239         /* apply for lease */
3240         if (ioc->lil_flags & LL_LEASE_RESYNC)
3241                 open_flags = MDS_OPEN_RESYNC;
3242         och = ll_lease_open(inode, file, fmode, open_flags);
3243         if (IS_ERR(och))
3244                 RETURN(PTR_ERR(och));
3245
3246         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3247                 rc = ll_lease_file_resync(och, inode, arg);
3248                 if (rc) {
3249                         ll_lease_close(och, inode, NULL);
3250                         RETURN(rc);
3251                 }
3252                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3253                 if (rc) {
3254                         ll_lease_close(och, inode, NULL);
3255                         RETURN(rc);
3256                 }
3257         }
3258
3259         rc = 0;
3260         mutex_lock(&lli->lli_och_mutex);
3261         if (fd->fd_lease_och == NULL) {
3262                 fd->fd_lease_och = och;
3263                 och = NULL;
3264         }
3265         mutex_unlock(&lli->lli_och_mutex);
3266         if (och != NULL) {
3267                 /* impossible now that only excl is supported for now */
3268                 ll_lease_close(och, inode, &lease_broken);
3269                 rc = -EBUSY;
3270         }
3271         RETURN(rc);
3272 }
3273
3274 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3275 {
3276         struct ll_inode_info *lli = ll_i2info(inode);
3277         struct ll_sb_info *sbi = ll_i2sbi(inode);
3278         __u64 now = ktime_get_real_seconds();
3279         int i;
3280
3281         spin_lock(&lli->lli_heat_lock);
3282         heat->lh_flags = lli->lli_heat_flags;
3283         for (i = 0; i < heat->lh_count; i++)
3284                 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3285                                                 now, sbi->ll_heat_decay_weight,
3286                                                 sbi->ll_heat_period_second);
3287         spin_unlock(&lli->lli_heat_lock);
3288 }
3289
3290 static int ll_heat_set(struct inode *inode, __u64 flags)
3291 {
3292         struct ll_inode_info *lli = ll_i2info(inode);
3293         int rc = 0;
3294
3295         spin_lock(&lli->lli_heat_lock);
3296         if (flags & LU_HEAT_FLAG_CLEAR)
3297                 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3298
3299         if (flags & LU_HEAT_FLAG_OFF)
3300                 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3301         else
3302                 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3303
3304         spin_unlock(&lli->lli_heat_lock);
3305
3306         RETURN(rc);
3307 }
3308
3309 static long
3310 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3311 {
3312         struct inode            *inode = file_inode(file);
3313         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3314         int                      flags, rc;
3315         ENTRY;
3316
3317         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3318                PFID(ll_inode2fid(inode)), inode, cmd);
3319         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3320
3321         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3322         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3323                 RETURN(-ENOTTY);
3324
3325         switch (cmd) {
3326         case LL_IOC_GETFLAGS:
3327                 /* Get the current value of the file flags */
3328                 return put_user(fd->fd_flags, (int __user *)arg);
3329         case LL_IOC_SETFLAGS:
3330         case LL_IOC_CLRFLAGS:
3331                 /* Set or clear specific file flags */
3332                 /* XXX This probably needs checks to ensure the flags are
3333                  *     not abused, and to handle any flag side effects.
3334                  */
3335                 if (get_user(flags, (int __user *) arg))
3336                         RETURN(-EFAULT);
3337
3338                 if (cmd == LL_IOC_SETFLAGS) {
3339                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3340                             !(file->f_flags & O_DIRECT)) {
3341                                 CERROR("%s: unable to disable locking on "
3342                                        "non-O_DIRECT file\n", current->comm);
3343                                 RETURN(-EINVAL);
3344                         }
3345
3346                         fd->fd_flags |= flags;
3347                 } else {
3348                         fd->fd_flags &= ~flags;
3349                 }
3350                 RETURN(0);
3351         case LL_IOC_LOV_SETSTRIPE:
3352         case LL_IOC_LOV_SETSTRIPE_NEW:
3353                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3354         case LL_IOC_LOV_SETEA:
3355                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3356         case LL_IOC_LOV_SWAP_LAYOUTS: {
3357                 struct file *file2;
3358                 struct lustre_swap_layouts lsl;
3359
3360                 if (copy_from_user(&lsl, (char __user *)arg,
3361                                    sizeof(struct lustre_swap_layouts)))
3362                         RETURN(-EFAULT);
3363
3364                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3365                         RETURN(-EPERM);
3366
3367                 file2 = fget(lsl.sl_fd);
3368                 if (file2 == NULL)
3369                         RETURN(-EBADF);
3370
3371                 /* O_WRONLY or O_RDWR */
3372                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3373                         GOTO(out, rc = -EPERM);
3374
3375                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3376                         struct inode                    *inode2;
3377                         struct ll_inode_info            *lli;
3378                         struct obd_client_handle        *och = NULL;
3379
3380                         lli = ll_i2info(inode);
3381                         mutex_lock(&lli->lli_och_mutex);
3382                         if (fd->fd_lease_och != NULL) {
3383                                 och = fd->fd_lease_och;
3384                                 fd->fd_lease_och = NULL;
3385                         }
3386                         mutex_unlock(&lli->lli_och_mutex);
3387                         if (och == NULL)
3388                                 GOTO(out, rc = -ENOLCK);
3389                         inode2 = file_inode(file2);
3390                         rc = ll_swap_layouts_close(och, inode, inode2);
3391                 } else {
3392                         rc = ll_swap_layouts(file, file2, &lsl);
3393                 }
3394 out:
3395                 fput(file2);
3396                 RETURN(rc);
3397         }
3398         case LL_IOC_LOV_GETSTRIPE:
3399         case LL_IOC_LOV_GETSTRIPE_NEW:
3400                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3401         case FS_IOC_GETFLAGS:
3402         case FS_IOC_SETFLAGS:
3403                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3404         case FSFILT_IOC_GETVERSION:
3405         case FS_IOC_GETVERSION:
3406                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3407         /* We need to special case any other ioctls we want to handle,
3408          * to send them to the MDS/OST as appropriate and to properly
3409          * network encode the arg field. */
3410         case FS_IOC_SETVERSION:
3411                 RETURN(-ENOTSUPP);
3412
3413         case LL_IOC_GROUP_LOCK:
3414                 RETURN(ll_get_grouplock(inode, file, arg));
3415         case LL_IOC_GROUP_UNLOCK:
3416                 RETURN(ll_put_grouplock(inode, file, arg));
3417         case IOC_OBD_STATFS:
3418                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3419
3420         case LL_IOC_FLUSHCTX:
3421                 RETURN(ll_flush_ctx(inode));
3422         case LL_IOC_PATH2FID: {
3423                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3424                                  sizeof(struct lu_fid)))
3425                         RETURN(-EFAULT);
3426
3427                 RETURN(0);
3428         }
3429         case LL_IOC_GETPARENT:
3430                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3431
3432         case OBD_IOC_FID2PATH:
3433                 RETURN(ll_fid2path(inode, (void __user *)arg));
3434         case LL_IOC_DATA_VERSION: {
3435                 struct ioc_data_version idv;
3436                 int rc;
3437
3438                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3439                         RETURN(-EFAULT);
3440
3441                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3442                 rc = ll_ioc_data_version(inode, &idv);
3443
3444                 if (rc == 0 &&
3445                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3446                         RETURN(-EFAULT);
3447
3448                 RETURN(rc);
3449         }
3450
3451         case LL_IOC_GET_MDTIDX: {
3452                 int mdtidx;
3453
3454                 mdtidx = ll_get_mdt_idx(inode);
3455                 if (mdtidx < 0)
3456                         RETURN(mdtidx);
3457
3458                 if (put_user((int)mdtidx, (int __user *)arg))
3459                         RETURN(-EFAULT);
3460
3461                 RETURN(0);
3462         }
3463         case OBD_IOC_GETDTNAME:
3464         case OBD_IOC_GETMDNAME:
3465                 RETURN(ll_get_obd_name(inode, cmd, arg));
3466         case LL_IOC_HSM_STATE_GET: {
3467                 struct md_op_data       *op_data;
3468                 struct hsm_user_state   *hus;
3469                 int                      rc;
3470
3471                 OBD_ALLOC_PTR(hus);
3472                 if (hus == NULL)
3473                         RETURN(-ENOMEM);
3474
3475                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3476                                              LUSTRE_OPC_ANY, hus);
3477                 if (IS_ERR(op_data)) {
3478                         OBD_FREE_PTR(hus);
3479                         RETURN(PTR_ERR(op_data));
3480                 }
3481
3482                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3483                                    op_data, NULL);
3484
3485                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3486                         rc = -EFAULT;
3487
3488                 ll_finish_md_op_data(op_data);
3489                 OBD_FREE_PTR(hus);
3490                 RETURN(rc);
3491         }
3492         case LL_IOC_HSM_STATE_SET: {
3493                 struct hsm_state_set    *hss;
3494                 int                      rc;
3495
3496                 OBD_ALLOC_PTR(hss);
3497                 if (hss == NULL)
3498                         RETURN(-ENOMEM);
3499
3500                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3501                         OBD_FREE_PTR(hss);
3502                         RETURN(-EFAULT);
3503                 }
3504
3505                 rc = ll_hsm_state_set(inode, hss);
3506
3507                 OBD_FREE_PTR(hss);
3508                 RETURN(rc);
3509         }
3510         case LL_IOC_HSM_ACTION: {
3511                 struct md_op_data               *op_data;
3512                 struct hsm_current_action       *hca;
3513                 int                              rc;
3514
3515                 OBD_ALLOC_PTR(hca);
3516                 if (hca == NULL)
3517                         RETURN(-ENOMEM);
3518
3519                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3520                                              LUSTRE_OPC_ANY, hca);
3521                 if (IS_ERR(op_data)) {
3522                         OBD_FREE_PTR(hca);
3523                         RETURN(PTR_ERR(op_data));
3524                 }
3525
3526                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3527                                    op_data, NULL);
3528
3529                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3530                         rc = -EFAULT;
3531
3532                 ll_finish_md_op_data(op_data);
3533                 OBD_FREE_PTR(hca);
3534                 RETURN(rc);
3535         }
3536         case LL_IOC_SET_LEASE_OLD: {
3537                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3538
3539                 RETURN(ll_file_set_lease(file, &ioc, 0));
3540         }
3541         case LL_IOC_SET_LEASE: {
3542                 struct ll_ioc_lease ioc;
3543
3544                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3545                         RETURN(-EFAULT);
3546
3547                 RETURN(ll_file_set_lease(file, &ioc, arg));
3548         }
3549         case LL_IOC_GET_LEASE: {
3550                 struct ll_inode_info *lli = ll_i2info(inode);
3551                 struct ldlm_lock *lock = NULL;
3552                 fmode_t fmode = 0;
3553
3554                 mutex_lock(&lli->lli_och_mutex);
3555                 if (fd->fd_lease_och != NULL) {
3556                         struct obd_client_handle *och = fd->fd_lease_och;
3557
3558                         lock = ldlm_handle2lock(&och->och_lease_handle);
3559                         if (lock != NULL) {
3560                                 lock_res_and_lock(lock);
3561                                 if (!ldlm_is_cancel(lock))
3562                                         fmode = och->och_flags;
3563
3564                                 unlock_res_and_lock(lock);
3565                                 LDLM_LOCK_PUT(lock);
3566                         }
3567                 }
3568                 mutex_unlock(&lli->lli_och_mutex);
3569
3570                 RETURN(ll_lease_type_from_fmode(fmode));
3571         }
3572         case LL_IOC_HSM_IMPORT: {
3573                 struct hsm_user_import *hui;
3574
3575                 OBD_ALLOC_PTR(hui);
3576                 if (hui == NULL)
3577                         RETURN(-ENOMEM);
3578
3579                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3580                         OBD_FREE_PTR(hui);
3581                         RETURN(-EFAULT);
3582                 }
3583
3584                 rc = ll_hsm_import(inode, file, hui);
3585
3586                 OBD_FREE_PTR(hui);
3587                 RETURN(rc);
3588         }
3589         case LL_IOC_FUTIMES_3: {
3590                 struct ll_futimes_3 lfu;
3591
3592                 if (copy_from_user(&lfu,
3593                                    (const struct ll_futimes_3 __user *)arg,
3594                                    sizeof(lfu)))
3595                         RETURN(-EFAULT);
3596
3597                 RETURN(ll_file_futimes_3(file, &lfu));
3598         }
3599         case LL_IOC_LADVISE: {
3600                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3601                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3602                 int i;
3603                 int num_advise;
3604                 int alloc_size = sizeof(*k_ladvise_hdr);
3605
3606                 rc = 0;
3607                 u_ladvise_hdr = (void __user *)arg;
3608                 OBD_ALLOC_PTR(k_ladvise_hdr);
3609                 if (k_ladvise_hdr == NULL)
3610                         RETURN(-ENOMEM);
3611
3612                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3613                         GOTO(out_ladvise, rc = -EFAULT);
3614
3615                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3616                     k_ladvise_hdr->lah_count < 1)
3617                         GOTO(out_ladvise, rc = -EINVAL);
3618
3619                 num_advise = k_ladvise_hdr->lah_count;
3620                 if (num_advise >= LAH_COUNT_MAX)
3621                         GOTO(out_ladvise, rc = -EFBIG);
3622
3623                 OBD_FREE_PTR(k_ladvise_hdr);
3624                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3625                                       lah_advise[num_advise]);
3626                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3627                 if (k_ladvise_hdr == NULL)
3628                         RETURN(-ENOMEM);
3629
3630                 /*
3631                  * TODO: submit multiple advices to one server in a single RPC
3632                  */
3633                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3634                         GOTO(out_ladvise, rc = -EFAULT);
3635
3636                 for (i = 0; i < num_advise; i++) {
3637                         struct llapi_lu_ladvise *k_ladvise =
3638                                         &k_ladvise_hdr->lah_advise[i];
3639                         struct llapi_lu_ladvise __user *u_ladvise =
3640                                         &u_ladvise_hdr->lah_advise[i];
3641
3642                         rc = ll_ladvise_sanity(inode, k_ladvise);
3643                         if (rc)
3644                                 GOTO(out_ladvise, rc);
3645
3646                         switch (k_ladvise->lla_advice) {
3647                         case LU_LADVISE_LOCKNOEXPAND:
3648                                 rc = ll_lock_noexpand(file,
3649                                                k_ladvise->lla_peradvice_flags);
3650                                 GOTO(out_ladvise, rc);
3651                         case LU_LADVISE_LOCKAHEAD:
3652
3653                                 rc = ll_file_lock_ahead(file, k_ladvise);
3654
3655                                 if (rc < 0)
3656                                         GOTO(out_ladvise, rc);
3657
3658                                 if (put_user(rc,
3659                                              &u_ladvise->lla_lockahead_result))
3660                                         GOTO(out_ladvise, rc = -EFAULT);
3661                                 break;
3662                         default:
3663                                 rc = ll_ladvise(inode, file,
3664                                                 k_ladvise_hdr->lah_flags,
3665                                                 k_ladvise);
3666                                 if (rc)
3667                                         GOTO(out_ladvise, rc);
3668                                 break;
3669                         }
3670
3671                 }
3672
3673 out_ladvise:
3674                 OBD_FREE(k_ladvise_hdr, alloc_size);
3675                 RETURN(rc);
3676         }
3677         case LL_IOC_FLR_SET_MIRROR: {
3678                 /* mirror I/O must be direct to avoid polluting page cache
3679                  * by stale data. */
3680                 if (!(file->f_flags & O_DIRECT))
3681                         RETURN(-EINVAL);
3682
3683                 fd->fd_designated_mirror = (__u32)arg;
3684                 RETURN(0);
3685         }
3686         case LL_IOC_FSGETXATTR:
3687                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3688         case LL_IOC_FSSETXATTR:
3689                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3690         case BLKSSZGET:
3691                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3692         case LL_IOC_HEAT_GET: {
3693                 struct lu_heat uheat;
3694                 struct lu_heat *heat;
3695                 int size;
3696
3697                 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3698                         RETURN(-EFAULT);
3699
3700                 if (uheat.lh_count > OBD_HEAT_COUNT)
3701                         uheat.lh_count = OBD_HEAT_COUNT;
3702
3703                 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3704                 OBD_ALLOC(heat, size);
3705                 if (heat == NULL)
3706                         RETURN(-ENOMEM);
3707
3708                 heat->lh_count = uheat.lh_count;
3709                 ll_heat_get(inode, heat);
3710                 rc = copy_to_user((char __user *)arg, heat, size);
3711                 OBD_FREE(heat, size);
3712                 RETURN(rc ? -EFAULT : 0);
3713         }
3714         case LL_IOC_HEAT_SET: {
3715                 __u64 flags;
3716
3717                 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3718                         RETURN(-EFAULT);
3719
3720                 rc = ll_heat_set(inode, flags);
3721                 RETURN(rc);
3722         }
3723         default:
3724                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3725                                      (void __user *)arg));
3726         }
3727 }
3728
3729 #ifndef HAVE_FILE_LLSEEK_SIZE
3730 static inline loff_t
3731 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3732 {
3733         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3734                 return -EINVAL;
3735         if (offset > maxsize)
3736                 return -EINVAL;
3737
3738         if (offset != file->f_pos) {
3739                 file->f_pos = offset;
3740                 file->f_version = 0;
3741         }
3742         return offset;
3743 }
3744
3745 static loff_t
3746 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3747                 loff_t maxsize, loff_t eof)
3748 {
3749         struct inode *inode = file_inode(file);
3750
3751         switch (origin) {
3752         case SEEK_END:
3753                 offset += eof;
3754                 break;
3755         case SEEK_CUR:
3756                 /*
3757                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3758                  * position-querying operation.  Avoid rewriting the "same"
3759                  * f_pos value back to the file because a concurrent read(),
3760                  * write() or lseek() might have altered it
3761                  */
3762                 if (offset == 0)
3763                         return file->f_pos;
3764                 /*
3765                  * f_lock protects against read/modify/write race with other
3766                  * SEEK_CURs. Note that parallel writes and reads behave
3767                  * like SEEK_SET.
3768                  */
3769                 inode_lock(inode);
3770                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3771                 inode_unlock(inode);
3772                 return offset;
3773         case SEEK_DATA:
3774                 /*
3775                  * In the generic case the entire file is data, so as long as
3776                  * offset isn't at the end of the file then the offset is data.
3777                  */
3778                 if (offset >= eof)
3779                         return -ENXIO;
3780                 break;
3781         case SEEK_HOLE:
3782                 /*
3783                  * There is a virtual hole at the end of the file, so as long as
3784                  * offset isn't i_size or larger, return i_size.
3785                  */
3786                 if (offset >= eof)
3787                         return -ENXIO;
3788                 offset = eof;
3789                 break;
3790         }
3791
3792         return llseek_execute(file, offset, maxsize);
3793 }
3794 #endif
3795
3796 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3797 {
3798         struct inode *inode = file_inode(file);
3799         loff_t retval, eof = 0;
3800
3801         ENTRY;
3802         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3803                            (origin == SEEK_CUR) ? file->f_pos : 0);
3804         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3805                PFID(ll_inode2fid(inode)), inode, retval, retval,
3806                origin);
3807         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3808
3809         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3810                 retval = ll_glimpse_size(inode);
3811                 if (retval != 0)
3812                         RETURN(retval);
3813                 eof = i_size_read(inode);
3814         }
3815
3816         retval = ll_generic_file_llseek_size(file, offset, origin,
3817                                           ll_file_maxbytes(inode), eof);
3818         RETURN(retval);
3819 }
3820
3821 static int ll_flush(struct file *file, fl_owner_t id)
3822 {
3823         struct inode *inode = file_inode(file);
3824         struct ll_inode_info *lli = ll_i2info(inode);
3825         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3826         int rc, err;
3827
3828         LASSERT(!S_ISDIR(inode->i_mode));
3829
3830         /* catch async errors that were recorded back when async writeback
3831          * failed for pages in this mapping. */
3832         rc = lli->lli_async_rc;
3833         lli->lli_async_rc = 0;
3834         if (lli->lli_clob != NULL) {
3835                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3836                 if (rc == 0)
3837                         rc = err;
3838         }
3839
3840         /* The application has been told write failure already.
3841          * Do not report failure again. */
3842         if (fd->fd_write_failed)
3843                 return 0;
3844         return rc ? -EIO : 0;
3845 }
3846
3847 /**
3848  * Called to make sure a portion of file has been written out.
3849  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3850  *
3851  * Return how many pages have been written.
3852  */
3853 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3854                        enum cl_fsync_mode mode, int ignore_layout)
3855 {
3856         struct lu_env *env;
3857         struct cl_io *io;
3858         struct cl_fsync_io *fio;
3859         int result;
3860         __u16 refcheck;
3861         ENTRY;
3862
3863         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3864             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3865                 RETURN(-EINVAL);
3866
3867         env = cl_env_get(&refcheck);
3868         if (IS_ERR(env))
3869                 RETURN(PTR_ERR(env));
3870
3871         io = vvp_env_thread_io(env);
3872         io->ci_obj = ll_i2info(inode)->lli_clob;
3873         io->ci_ignore_layout = ignore_layout;
3874
3875         /* initialize parameters for sync */
3876         fio = &io->u.ci_fsync;
3877         fio->fi_start = start;
3878         fio->fi_end = end;
3879         fio->fi_fid = ll_inode2fid(inode);
3880         fio->fi_mode = mode;
3881         fio->fi_nr_written = 0;
3882
3883         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3884                 result = cl_io_loop(env, io);
3885         else
3886                 result = io->ci_result;
3887         if (result == 0)
3888                 result = fio->fi_nr_written;
3889         cl_io_fini(env, io);
3890         cl_env_put(env, &refcheck);
3891
3892         RETURN(result);
3893 }
3894
3895 /*
3896  * When dentry is provided (the 'else' case), file_dentry() may be
3897  * null and dentry must be used directly rather than pulled from
3898  * file_dentry() as is done otherwise.
3899  */
3900
3901 #ifdef HAVE_FILE_FSYNC_4ARGS
3902 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3903 {
3904         struct dentry *dentry = file_dentry(file);
3905 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3906 int ll_fsync(struct file *file, int datasync)
3907 {
3908         struct dentry *dentry = file_dentry(file);
3909         loff_t start = 0;
3910         loff_t end = LLONG_MAX;
3911 #else
3912 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3913 {
3914         loff_t start = 0;
3915         loff_t end = LLONG_MAX;
3916 #endif
3917         struct inode *inode = dentry->d_inode;
3918         struct ll_inode_info *lli = ll_i2info(inode);
3919         struct ptlrpc_request *req;
3920         int rc, err;
3921         ENTRY;
3922
3923         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3924                PFID(ll_inode2fid(inode)), inode);
3925         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3926
3927 #ifdef HAVE_FILE_FSYNC_4ARGS
3928         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3929         inode_lock(inode);
3930 #else
3931         /* fsync's caller has already called _fdata{sync,write}, we want
3932          * that IO to finish before calling the osc and mdc sync methods */
3933         rc = filemap_fdatawait(inode->i_mapping);
3934 #endif
3935
3936         /* catch async errors that were recorded back when async writeback
3937          * failed for pages in this mapping. */
3938         if (!S_ISDIR(inode->i_mode)) {
3939                 err = lli->lli_async_rc;
3940                 lli->lli_async_rc = 0;
3941                 if (rc == 0)
3942                         rc = err;
3943                 if (lli->lli_clob != NULL) {
3944                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3945                         if (rc == 0)
3946                                 rc = err;
3947                 }
3948         }
3949
3950         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3951         if (!rc)
3952                 rc = err;
3953         if (!err)
3954                 ptlrpc_req_finished(req);
3955
3956         if (S_ISREG(inode->i_mode)) {
3957                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3958
3959                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3960                 if (rc == 0 && err < 0)
3961                         rc = err;
3962                 if (rc < 0)
3963                         fd->fd_write_failed = true;
3964                 else
3965                         fd->fd_write_failed = false;
3966         }
3967
3968 #ifdef HAVE_FILE_FSYNC_4ARGS
3969         inode_unlock(inode);
3970 #endif
3971         RETURN(rc);
3972 }
3973
3974 static int
3975 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3976 {
3977         struct inode *inode = file_inode(file);
3978         struct ll_sb_info *sbi = ll_i2sbi(inode);
3979         struct ldlm_enqueue_info einfo = {
3980                 .ei_type        = LDLM_FLOCK,
3981                 .ei_cb_cp       = ldlm_flock_completion_ast,
3982                 .ei_cbdata      = file_lock,
3983         };
3984         struct md_op_data *op_data;
3985         struct lustre_handle lockh = { 0 };
3986         union ldlm_policy_data flock = { { 0 } };
3987         int fl_type = file_lock->fl_type;
3988         __u64 flags = 0;
3989         int rc;
3990         int rc2 = 0;
3991         ENTRY;
3992
3993         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3994                PFID(ll_inode2fid(inode)), file_lock);
3995
3996         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3997
3998         if (file_lock->fl_flags & FL_FLOCK) {
3999                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4000                 /* flocks are whole-file locks */
4001                 flock.l_flock.end = OFFSET_MAX;
4002                 /* For flocks owner is determined by the local file desctiptor*/
4003                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4004         } else if (file_lock->fl_flags & FL_POSIX) {
4005                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4006                 flock.l_flock.start = file_lock->fl_start;
4007                 flock.l_flock.end = file_lock->fl_end;
4008         } else {
4009                 RETURN(-EINVAL);
4010         }
4011         flock.l_flock.pid = file_lock->fl_pid;
4012
4013         /* Somewhat ugly workaround for svc lockd.
4014          * lockd installs custom fl_lmops->lm_compare_owner that checks
4015          * for the fl_owner to be the same (which it always is on local node
4016          * I guess between lockd processes) and then compares pid.
4017          * As such we assign pid to the owner field to make it all work,
4018          * conflict with normal locks is unlikely since pid space and
4019          * pointer space for current->files are not intersecting */
4020         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4021                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4022
4023         switch (fl_type) {
4024         case F_RDLCK:
4025                 einfo.ei_mode = LCK_PR;
4026                 break;
4027         case F_UNLCK:
4028                 /* An unlock request may or may not have any relation to
4029                  * existing locks so we may not be able to pass a lock handle
4030                  * via a normal ldlm_lock_cancel() request. The request may even
4031                  * unlock a byte range in the middle of an existing lock. In
4032                  * order to process an unlock request we need all of the same
4033                  * information that is given with a normal read or write record
4034                  * lock request. To avoid creating another ldlm unlock (cancel)
4035                  * message we'll treat a LCK_NL flock request as an unlock. */
4036                 einfo.ei_mode = LCK_NL;
4037                 break;
4038         case F_WRLCK:
4039                 einfo.ei_mode = LCK_PW;
4040                 break;
4041         default:
4042                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4043                 RETURN (-ENOTSUPP);
4044         }
4045
4046         switch (cmd) {
4047         case F_SETLKW:
4048 #ifdef F_SETLKW64
4049         case F_SETLKW64:
4050 #endif
4051                 flags = 0;
4052                 break;
4053         case F_SETLK:
4054 #ifdef F_SETLK64
4055         case F_SETLK64:
4056 #endif
4057                 flags = LDLM_FL_BLOCK_NOWAIT;
4058                 break;
4059         case F_GETLK:
4060 #ifdef F_GETLK64
4061         case F_GETLK64:
4062 #endif
4063                 flags = LDLM_FL_TEST_LOCK;
4064                 break;
4065         default:
4066                 CERROR("unknown fcntl lock command: %d\n", cmd);
4067                 RETURN (-EINVAL);
4068         }
4069
4070         /* Save the old mode so that if the mode in the lock changes we
4071          * can decrement the appropriate reader or writer refcount. */
4072         file_lock->fl_type = einfo.ei_mode;
4073
4074         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4075                                      LUSTRE_OPC_ANY, NULL);
4076         if (IS_ERR(op_data))
4077                 RETURN(PTR_ERR(op_data));
4078
4079         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4080                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4081                flock.l_flock.pid, flags, einfo.ei_mode,
4082                flock.l_flock.start, flock.l_flock.end);
4083
4084         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4085                         flags);
4086
4087         /* Restore the file lock type if not TEST lock. */
4088         if (!(flags & LDLM_FL_TEST_LOCK))
4089                 file_lock->fl_type = fl_type;
4090
4091 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4092         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4093             !(flags & LDLM_FL_TEST_LOCK))
4094                 rc2  = locks_lock_file_wait(file, file_lock);
4095 #else
4096         if ((file_lock->fl_flags & FL_FLOCK) &&
4097             (rc == 0 || file_lock->fl_type == F_UNLCK))
4098                 rc2  = flock_lock_file_wait(file, file_lock);
4099         if ((file_lock->fl_flags & FL_POSIX) &&
4100             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4101             !(flags & LDLM_FL_TEST_LOCK))
4102                 rc2  = posix_lock_file_wait(file, file_lock);
4103 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4104
4105         if (rc2 && file_lock->fl_type != F_UNLCK) {
4106                 einfo.ei_mode = LCK_NL;
4107                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4108                            &lockh, flags);
4109                 rc = rc2;
4110         }
4111
4112         ll_finish_md_op_data(op_data);
4113
4114         RETURN(rc);
4115 }
4116
4117 int ll_get_fid_by_name(struct inode *parent, const char *name,
4118                        int namelen, struct lu_fid *fid,
4119                        struct inode **inode)
4120 {
4121         struct md_op_data       *op_data = NULL;
4122         struct mdt_body         *body;
4123         struct ptlrpc_request   *req;
4124         int                     rc;
4125         ENTRY;
4126
4127         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4128                                      LUSTRE_OPC_ANY, NULL);
4129         if (IS_ERR(op_data))
4130                 RETURN(PTR_ERR(op_data));
4131
4132         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4133         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4134         ll_finish_md_op_data(op_data);
4135         if (rc < 0)
4136                 RETURN(rc);
4137
4138         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4139         if (body == NULL)
4140                 GOTO(out_req, rc = -EFAULT);
4141         if (fid != NULL)
4142                 *fid = body->mbo_fid1;
4143
4144         if (inode != NULL)
4145                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4146 out_req:
4147         ptlrpc_req_finished(req);
4148         RETURN(rc);
4149 }
4150
/**
 * Migrate the entry \a name found under \a parent, as described by \a lum.
 *
 * The child inode is located first (dcache lookup on the dentry of \a file,
 * then ll_get_fid_by_name() on \a parent). The migration itself is sent as an
 * md_rename() of \a name onto the same name, with CLI_MIGRATE | CLI_SET_MEA
 * set and \a lum attached as op_data. For regular files a write lease is
 * opened and the data version is sampled before each attempt, and the
 * request is repeated for as long as it returns -EAGAIN.
 *
 * \param parent [IN]  directory inode containing the entry
 * \param file [IN]    open file whose dentry is used for the dcache lookup
 * \param lum [IN]     LMV user md; passed through lustre_swab_lmv_user_md()
 *                     when lum_magic matches neither little-endian magic
 * \param name [IN]    NUL-terminated name of the entry to migrate
 *
 * \retval 0 on success (nlink of the child inode is then cleared)
 * \retval negative errno on failure
 */
int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
               const char *name)
{
        struct dentry *dchild = NULL;
        struct inode *child_inode = NULL;
        struct md_op_data *op_data;
        struct ptlrpc_request *request = NULL;
        struct obd_client_handle *och = NULL;
        struct qstr qstr;
        struct mdt_body *body;
        __u64 data_version = 0;
        size_t namelen = strlen(name);
        /* NOTE(review): lumlen is derived from lum_stripe_count/lum_magic
         * before the conditional swab below - confirm this is intended */
        int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
        int rc;
        ENTRY;

        CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
               PFID(ll_inode2fid(parent)), name,
               lum->lum_stripe_offset, lum->lum_stripe_count);

        if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
            lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
                lustre_swab_lmv_user_md(lum);

        /* Get child FID first: try the dcache, taking an inode reference
         * with igrab() if the dentry is positive */
        qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
        qstr.name = name;
        qstr.len = namelen;
        dchild = d_lookup(file_dentry(file), &qstr);
        if (dchild) {
                if (dchild->d_inode)
                        child_inode = igrab(dchild->d_inode);
                dput(dchild);
        }

        /* Not cached (or igrab() failed): look the name up by request */
        if (!child_inode) {
                rc = ll_get_fid_by_name(parent, name, namelen, NULL,
                                        &child_inode);
                if (rc)
                        RETURN(rc);
        }

        if (!child_inode)
                RETURN(-ENOENT);

        /* Without OBD_CONNECT2_DIR_MIGRATE on the MD export, refuse a
         * stripe count above one and any child that has lli_lsm_md set */
        if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
              OBD_CONNECT2_DIR_MIGRATE)) {
                if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
                    ll_i2info(child_inode)->lli_lsm_md) {
                        CERROR("%s: MDT doesn't support stripe directory "
                               "migration!\n", ll_i2sbi(parent)->ll_fsname);
                        GOTO(out_iput, rc = -EOPNOTSUPP);
                }
        }

        /*
         * lfs migrate command needs to be blocked on the client
         * by checking the migrate FID against the FID of the
         * filesystem root.
         */
        if (child_inode == parent->i_sb->s_root->d_inode)
                GOTO(out_iput, rc = -EINVAL);

        op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
                                     child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out_iput, rc = PTR_ERR(op_data));

        /* The child stays locked until out_unlock */
        inode_lock(child_inode);
        op_data->op_fid3 = *ll_inode2fid(child_inode);
        if (!fid_is_sane(&op_data->op_fid3)) {
                CERROR("%s: migrate %s, but FID "DFID" is insane\n",
                       ll_i2sbi(parent)->ll_fsname, name,
                       PFID(&op_data->op_fid3));
                GOTO(out_unlock, rc = -EINVAL);
        }

        op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
        op_data->op_data = lum;
        op_data->op_data_size = lumlen;

again:
        /* Regular files: open a write lease and sample the data version,
         * then pass both handles and the version along with the request */
        if (S_ISREG(child_inode->i_mode)) {
                /* NOTE(review): on a retry, och is overwritten here; if the
                 * previous och was not freed below it would be lost -
                 * confirm -EAGAIN always comes with CLOSE_INTENT_EXECED */
                och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
                if (IS_ERR(och)) {
                        rc = PTR_ERR(och);
                        och = NULL;
                        GOTO(out_unlock, rc);
                }

                rc = ll_data_version(child_inode, &data_version,
                                     LL_DV_WR_FLUSH);
                if (rc != 0)
                        GOTO(out_close, rc);

                op_data->op_open_handle = och->och_open_handle;
                op_data->op_data_version = data_version;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_bias |= MDS_CLOSE_MIGRATE;

                /* Clear rq_replay on the lease's open request, under the
                 * request lock */
                spin_lock(&och->och_mod->mod_open_req->rq_lock);
                och->och_mod->mod_open_req->rq_replay = 0;
                spin_unlock(&och->och_mod->mod_open_req->rq_lock);
        }

        /* The migration is a rename of the entry onto its own name */
        rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
                       name, namelen, &request);
        if (rc == 0) {
                LASSERT(request != NULL);
                ll_update_times(request, parent);
        }

        if (rc == 0 || rc == -EAGAIN) {
                /* NOTE(review): request is only asserted non-NULL for
                 * rc == 0, yet it is dereferenced for -EAGAIN too */
                body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
                LASSERT(body != NULL);

                /* If the server does release layout lock, then we cleanup
                 * the client och here, otherwise release it in out_close: */
                if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
                        obd_mod_put(och->och_mod);
                        md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
                                                  och);
                        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
                        OBD_FREE_PTR(och);
                        och = NULL;
                }
        }

        /* Reset request so that a retry starts from a clean pointer */
        if (request != NULL) {
                ptlrpc_req_finished(request);
                request = NULL;
        }

        /* Try again if the lease has cancelled. */
        if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
                goto again;

out_close:
        if (och)
                ll_lease_close(och, child_inode, NULL);
        /* On success the local inode is marked as having no links */
        if (!rc)
                clear_nlink(child_inode);
out_unlock:
        inode_unlock(child_inode);
        ll_finish_md_op_data(op_data);
out_iput:
        /* Drop the reference taken by igrab()/ll_get_fid_by_name() */
        iput(child_inode);
        RETURN(rc);
}
4300
4301 static int
4302 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4303 {
4304         ENTRY;
4305
4306         RETURN(-ENOSYS);
4307 }
4308
4309 /**
4310  * test if some locks matching bits and l_req_mode are acquired
4311  * - bits can be in different locks
4312  * - if found clear the common lock bits in *bits
4313  * - the bits not found, are kept in *bits
4314  * \param inode [IN]
4315  * \param bits [IN] searched lock bits [IN]
4316  * \param l_req_mode [IN] searched lock mode
4317  * \retval boolean, true iff all bits are found
4318  */
4319 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4320 {
4321         struct lustre_handle lockh;
4322         union ldlm_policy_data policy;
4323         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4324                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4325         struct lu_fid *fid;
4326         __u64 flags;
4327         int i;
4328         ENTRY;
4329
4330         if (!inode)
4331                RETURN(0);
4332
4333         fid = &ll_i2info(inode)->lli_fid;
4334         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4335                ldlm_lockname[mode]);
4336
4337         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4338         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4339                 policy.l_inodebits.bits = *bits & (1 << i);
4340                 if (policy.l_inodebits.bits == 0)
4341                         continue;
4342
4343                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4344                                   &policy, mode, &lockh)) {
4345                         struct ldlm_lock *lock;
4346
4347                         lock = ldlm_handle2lock(&lockh);
4348                         if (lock) {
4349                                 *bits &=
4350                                       ~(lock->l_policy_data.l_inodebits.bits);
4351                                 LDLM_LOCK_PUT(lock);
4352                         } else {
4353                                 *bits &= ~policy.l_inodebits.bits;
4354                         }
4355                 }
4356         }
4357         RETURN(*bits == 0);
4358 }
4359
4360 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4361                                struct lustre_handle *lockh, __u64 flags,
4362                                enum ldlm_mode mode)
4363 {
4364         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4365         struct lu_fid *fid;
4366         enum ldlm_mode rc;
4367         ENTRY;
4368
4369         fid = &ll_i2info(inode)->lli_fid;
4370         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4371
4372         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4373                            fid, LDLM_IBITS, &policy, mode, lockh);
4374
4375         RETURN(rc);
4376 }
4377
4378 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4379 {
4380         /* Already unlinked. Just update nlink and return success */
4381         if (rc == -ENOENT) {
4382                 clear_nlink(inode);
4383                 /* If it is striped directory, and there is bad stripe
4384                  * Let's revalidate the dentry again, instead of returning
4385                  * error */
4386                 if (S_ISDIR(inode->i_mode) &&
4387                     ll_i2info(inode)->lli_lsm_md != NULL)
4388                         return 0;
4389
4390                 /* This path cannot be hit for regular files unless in
4391                  * case of obscure races, so no need to to validate
4392                  * size. */
4393                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4394                         return 0;
4395         } else if (rc != 0) {
4396                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4397                              "%s: revalidate FID "DFID" error: rc = %d\n",
4398                              ll_i2sbi(inode)->ll_fsname,
4399                              PFID(ll_inode2fid(inode)), rc);
4400         }
4401
4402         return rc;
4403 }
4404
4405 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4406 {
4407         struct inode *inode = dentry->d_inode;
4408         struct obd_export *exp = ll_i2mdexp(inode);
4409         struct lookup_intent oit = {
4410                 .it_op = op,
4411         };
4412         struct ptlrpc_request *req = NULL;
4413         struct md_op_data *op_data;
4414         int rc = 0;
4415         ENTRY;
4416
4417         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4418                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4419
4420         /* Call getattr by fid, so do not provide name at all. */
4421         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4422                                      LUSTRE_OPC_ANY, NULL);
4423         if (IS_ERR(op_data))
4424                 RETURN(PTR_ERR(op_data));
4425
4426         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4427         ll_finish_md_op_data(op_data);
4428         if (rc < 0) {
4429                 rc = ll_inode_revalidate_fini(inode, rc);
4430                 GOTO(out, rc);
4431         }
4432
4433         rc = ll_revalidate_it_finish(req, &oit, dentry);
4434         if (rc != 0) {
4435                 ll_intent_release(&oit);
4436                 GOTO(out, rc);
4437         }
4438
4439         /* Unlinked? Unhash dentry, so it is not picked up later by
4440          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4441          * here to preserve get_cwd functionality on 2.6.
4442          * Bug 10503 */
4443         if (!dentry->d_inode->i_nlink) {
4444                 ll_lock_dcache(inode);
4445                 d_lustre_invalidate(dentry, 0);
4446                 ll_unlock_dcache(inode);
4447         }
4448
4449         ll_lookup_finish_locks(&oit, dentry);
4450 out:
4451         ptlrpc_req_finished(req);
4452
4453         return rc;
4454 }
4455
4456 static int ll_merge_md_attr(struct inode *inode)
4457 {
4458         struct ll_inode_info *lli = ll_i2info(inode);
4459         struct cl_attr attr = { 0 };
4460         int rc;
4461
4462         LASSERT(lli->lli_lsm_md != NULL);
4463         down_read(&lli->lli_lsm_sem);
4464         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4465                            &attr, ll_md_blocking_ast);
4466         up_read(&lli->lli_lsm_sem);
4467         if (rc != 0)
4468                 RETURN(rc);
4469
4470         set_nlink(inode, attr.cat_nlink);
4471         inode->i_blocks = attr.cat_blocks;
4472         i_size_write(inode, attr.cat_size);
4473
4474         ll_i2info(inode)->lli_atime = attr.cat_atime;
4475         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4476         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4477
4478         RETURN(0);
4479 }
4480
4481 static inline dev_t ll_compat_encode_dev(dev_t dev)
4482 {
4483         /* The compat_sys_*stat*() syscalls will fail unless the
4484          * device majors and minors are both less than 256. Note that
4485          * the value returned here will be passed through
4486          * old_encode_dev() in cp_compat_stat(). And so we are not
4487          * trying to return a valid compat (u16) device number, just
4488          * one that will pass the old_valid_dev() check. */
4489
4490         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4491 }
4492
4493 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4494 int ll_getattr(const struct path *path, struct kstat *stat,
4495                u32 request_mask, unsigned int flags)
4496 {
4497         struct dentry *de = path->dentry;
4498 #else
4499 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4500 {
4501 #endif
4502         struct inode *inode = de->d_inode;
4503         struct ll_sb_info *sbi = ll_i2sbi(inode);
4504         struct ll_inode_info *lli = ll_i2info(inode);
4505         int rc;
4506
4507         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4508
4509         rc = ll_inode_revalidate(de, IT_GETATTR);
4510         if (rc < 0)
4511                 RETURN(rc);
4512
4513         if (S_ISREG(inode->i_mode)) {
4514                 /* In case of restore, the MDT has the right size and has
4515                  * already send it back without granting the layout lock,
4516                  * inode is up-to-date so glimpse is useless.
4517                  * Also to glimpse we need the layout, in case of a running
4518                  * restore the MDT holds the layout lock so the glimpse will
4519                  * block up to the end of restore (getattr will block)
4520                  */
4521                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4522                         rc = ll_glimpse_size(inode);
4523                         if (rc < 0)
4524                                 RETURN(rc);
4525                 }
4526         } else {
4527                 /* If object isn't regular a file then don't validate size. */
4528                 if (S_ISDIR(inode->i_mode) &&
4529                     lli->lli_lsm_md != NULL) {
4530                         rc = ll_merge_md_attr(inode);
4531                         if (rc < 0)
4532                                 RETURN(rc);
4533                 }
4534
4535                 inode->i_atime.tv_sec = lli->lli_atime;
4536                 inode->i_mtime.tv_sec = lli->lli_mtime;
4537                 inode->i_ctime.tv_sec = lli->lli_ctime;
4538         }
4539
4540         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4541
4542         if (ll_need_32bit_api(sbi)) {
4543                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4544                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4545                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4546         } else {
4547                 stat->ino = inode->i_ino;
4548                 stat->dev = inode->i_sb->s_dev;
4549                 stat->rdev = inode->i_rdev;
4550         }
4551
4552         stat->mode = inode->i_mode;
4553         stat->uid = inode->i_uid;
4554         stat->gid = inode->i_gid;
4555         stat->atime = inode->i_atime;
4556         stat->mtime = inode->i_mtime;
4557         stat->ctime = inode->i_ctime;
4558         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4559
4560         stat->nlink = inode->i_nlink;
4561         stat->size = i_size_read(inode);
4562         stat->blocks = inode->i_blocks;
4563
4564         return 0;
4565 }
4566
4567 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4568                      __u64 start, __u64 len)
4569 {
4570         int             rc;
4571         size_t          num_bytes;
4572         struct fiemap   *fiemap;
4573         unsigned int    extent_count = fieinfo->fi_extents_max;
4574
4575         num_bytes = sizeof(*fiemap) + (extent_count *
4576                                        sizeof(struct fiemap_extent));
4577         OBD_ALLOC_LARGE(fiemap, num_bytes);
4578
4579         if (fiemap == NULL)
4580                 RETURN(-ENOMEM);
4581
4582         fiemap->fm_flags = fieinfo->fi_flags;
4583         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4584         fiemap->fm_start = start;
4585         fiemap->fm_length = len;
4586         if (extent_count > 0 &&
4587             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4588                            sizeof(struct fiemap_extent)) != 0)
4589                 GOTO(out, rc = -EFAULT);
4590
4591         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4592
4593         fieinfo->fi_flags = fiemap->fm_flags;
4594         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4595         if (extent_count > 0 &&
4596             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4597                          fiemap->fm_mapped_extents *
4598                          sizeof(struct fiemap_extent)) != 0)
4599                 GOTO(out, rc = -EFAULT);
4600 out:
4601         OBD_FREE_LARGE(fiemap, num_bytes);
4602         return rc;
4603 }
4604
4605 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4606 {
4607         struct ll_inode_info *lli = ll_i2info(inode);
4608         struct posix_acl *acl = NULL;
4609         ENTRY;
4610
4611         spin_lock(&lli->lli_lock);
4612         /* VFS' acl_permission_check->check_acl will release the refcount */
4613         acl = posix_acl_dup(lli->lli_posix_acl);
4614         spin_unlock(&lli->lli_lock);
4615
4616         RETURN(acl);
4617 }
4618
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * Store or remove a POSIX ACL on \a inode through md_setxattr().
 *
 * With an ACL to store, it is converted to its xattr form and sent with
 * OBD_MD_FLXATTR; with none, the xattr is removed with OBD_MD_FLXATTRRM.
 * The cached ACL is updated on success and forgotten on failure.
 *
 * \param inode [IN]  inode to change
 * \param acl [IN]    ACL to store, or NULL to remove it
 * \param type [IN]   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0 on success
 * \retval -EINVAL for an unknown \a type
 * \retval -EACCES for a default ACL on a non-directory
 * \retval -ENOMEM if the xattr buffer cannot be allocated
 * \retval other negative errno from the helpers or md_setxattr()
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *request = NULL;
        const char *xattr_name = NULL;
        char *xattr_buf = NULL;
        size_t xattr_len = 0;
        int rc = 0;
        ENTRY;

        if (type == ACL_TYPE_ACCESS) {
                xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
                /* acl is passed by address here and may be changed */
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
        } else if (type == ACL_TYPE_DEFAULT) {
                xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* setting a default ACL is refused on non-directories;
                 * removing one is accepted */
                if (acl && !S_ISDIR(inode->i_mode))
                        rc = -EACCES;
        } else {
                rc = -EINVAL;
        }
        if (rc)
                return rc;

        if (acl) {
                xattr_len = posix_acl_xattr_size(acl->a_count);
                xattr_buf = kmalloc(xattr_len, GFP_NOFS);
                if (xattr_buf == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, xattr_buf,
                                        xattr_len);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* no buffer means there is nothing to store: remove the xattr */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         xattr_buf != NULL ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         xattr_name, xattr_buf, xattr_len, 0, 0, &request);

        ptlrpc_req_finished(request);
out_value:
        kfree(xattr_buf);
out:
        if (rc == 0)
                set_cached_acl(inode, type, acl);
        else
                forget_cached_acl(inode, type);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4678
4679 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4680 static int
4681 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4682 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4683 # else
4684 ll_check_acl(struct inode *inode, int mask)
4685 # endif
4686 {
4687 # ifdef CONFIG_FS_POSIX_ACL
4688         struct posix_acl *acl;
4689         int rc;
4690         ENTRY;
4691
4692 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4693         if (flags & IPERM_FLAG_RCU)
4694                 return -ECHILD;
4695 #  endif
4696         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4697
4698         if (!acl)
4699                 RETURN(-EAGAIN);
4700
4701         rc = posix_acl_permission(inode, acl, mask);
4702         posix_acl_release(acl);
4703
4704         RETURN(rc);
4705 # else /* !CONFIG_FS_POSIX_ACL */
4706         return -EAGAIN;
4707 # endif /* CONFIG_FS_POSIX_ACL */
4708 }
4709 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4710
4711 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4712 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4713 #else
4714 # ifdef HAVE_INODE_PERMISION_2ARGS
4715 int ll_inode_permission(struct inode *inode, int mask)
4716 # else
4717 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4718 # endif
4719 #endif
4720 {
4721         int rc = 0;
4722         struct ll_sb_info *sbi;
4723         struct root_squash_info *squash;
4724         struct cred *cred = NULL;
4725         const struct cred *old_cred = NULL;
4726         cfs_cap_t cap;
4727         bool squash_id = false;
4728         ENTRY;
4729
4730 #ifdef MAY_NOT_BLOCK
4731         if (mask & MAY_NOT_BLOCK)
4732                 return -ECHILD;
4733 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4734         if (flags & IPERM_FLAG_RCU)
4735                 return -ECHILD;
4736 #endif
4737
4738        /* as root inode are NOT getting validated in lookup operation,
4739         * need to do it before permission check. */
4740
4741         if (inode == inode->i_sb->s_root->d_inode) {
4742                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4743                 if (rc)
4744                         RETURN(rc);
4745         }
4746
4747         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4748                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4749
4750         /* squash fsuid/fsgid if needed */
4751         sbi = ll_i2sbi(inode);
4752         squash = &sbi->ll_squash;
4753         if (unlikely(squash->rsi_uid != 0 &&
4754                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4755                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4756                         squash_id = true;
4757         }
4758         if (squash_id) {
4759                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4760                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4761                        squash->rsi_uid, squash->rsi_gid);
4762
4763                 /* update current process's credentials
4764                  * and FS capability */
4765                 cred = prepare_creds();
4766                 if (cred == NULL)
4767                         RETURN(-ENOMEM);
4768
4769                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4770                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4771                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4772                         if ((1 << cap) & CFS_CAP_FS_MASK)
4773                                 cap_lower(cred->cap_effective, cap);
4774                 }
4775                 old_cred = override_creds(cred);
4776         }
4777
4778         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4779         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4780         /* restore current process's credentials and FS capability */
4781         if (squash_id) {
4782                 revert_creds(old_cred);
4783                 put_cred(cred);
4784         }
4785
4786         RETURN(rc);
4787 }
4788
4789 /* -o localflock - only provides locally consistent flock locks */
4790 struct file_operations ll_file_operations = {
4791 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4792 # ifdef HAVE_SYNC_READ_WRITE
4793         .read           = new_sync_read,
4794         .write          = new_sync_write,
4795 # endif
4796         .read_iter      = ll_file_read_iter,
4797         .write_iter     = ll_file_write_iter,
4798 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4799         .read           = ll_file_read,
4800         .aio_read       = ll_file_aio_read,
4801         .write          = ll_file_write,
4802         .aio_write      = ll_file_aio_write,
4803 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4804         .unlocked_ioctl = ll_file_ioctl,
4805         .open           = ll_file_open,
4806         .release        = ll_file_release,
4807         .mmap           = ll_file_mmap,
4808         .llseek         = ll_file_seek,
4809         .splice_read    = ll_file_splice_read,
4810         .fsync          = ll_fsync,
4811         .flush          = ll_flush
4812 };
4813
4814 struct file_operations ll_file_operations_flock = {
4815 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4816 # ifdef HAVE_SYNC_READ_WRITE
4817         .read           = new_sync_read,
4818         .write          = new_sync_write,
4819 # endif /* HAVE_SYNC_READ_WRITE */
4820         .read_iter      = ll_file_read_iter,
4821         .write_iter     = ll_file_write_iter,
4822 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4823         .read           = ll_file_read,
4824         .aio_read       = ll_file_aio_read,
4825         .write          = ll_file_write,
4826         .aio_write      = ll_file_aio_write,
4827 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4828         .unlocked_ioctl = ll_file_ioctl,
4829         .open           = ll_file_open,
4830         .release        = ll_file_release,
4831         .mmap           = ll_file_mmap,
4832         .llseek         = ll_file_seek,
4833         .splice_read    = ll_file_splice_read,
4834         .fsync          = ll_fsync,
4835         .flush          = ll_flush,
4836         .flock          = ll_file_flock,
4837         .lock           = ll_file_flock
4838 };
4839
4840 /* These are for -o noflock - to return ENOSYS on flock calls */
4841 struct file_operations ll_file_operations_noflock = {
4842 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4843 # ifdef HAVE_SYNC_READ_WRITE
4844         .read           = new_sync_read,
4845         .write          = new_sync_write,
4846 # endif /* HAVE_SYNC_READ_WRITE */
4847         .read_iter      = ll_file_read_iter,
4848         .write_iter     = ll_file_write_iter,
4849 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4850         .read           = ll_file_read,
4851         .aio_read       = ll_file_aio_read,
4852         .write          = ll_file_write,
4853         .aio_write      = ll_file_aio_write,
4854 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4855         .unlocked_ioctl = ll_file_ioctl,
4856         .open           = ll_file_open,
4857         .release        = ll_file_release,
4858         .mmap           = ll_file_mmap,
4859         .llseek         = ll_file_seek,
4860         .splice_read    = ll_file_splice_read,
4861         .fsync          = ll_fsync,
4862         .flush          = ll_flush,
4863         .flock          = ll_file_noflock,
4864         .lock           = ll_file_noflock
4865 };
4866
4867 struct inode_operations ll_file_inode_operations = {
4868         .setattr        = ll_setattr,
4869         .getattr        = ll_getattr,
4870         .permission     = ll_inode_permission,
4871 #ifdef HAVE_IOP_XATTR
4872         .setxattr       = ll_setxattr,
4873         .getxattr       = ll_getxattr,
4874         .removexattr    = ll_removexattr,
4875 #endif
4876         .listxattr      = ll_listxattr,
4877         .fiemap         = ll_fiemap,
4878 #ifdef HAVE_IOP_GET_ACL
4879         .get_acl        = ll_get_acl,
4880 #endif
4881 #ifdef HAVE_IOP_SET_ACL
4882         .set_acl        = ll_set_acl,
4883 #endif
4884 };
4885
4886 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4887 {
4888         struct ll_inode_info *lli = ll_i2info(inode);
4889         struct cl_object *obj = lli->lli_clob;
4890         struct lu_env *env;
4891         int rc;
4892         __u16 refcheck;
4893         ENTRY;
4894
4895         if (obj == NULL)
4896                 RETURN(0);
4897
4898         env = cl_env_get(&refcheck);
4899         if (IS_ERR(env))
4900                 RETURN(PTR_ERR(env));
4901
4902         rc = cl_conf_set(env, lli->lli_clob, conf);
4903         if (rc < 0)
4904                 GOTO(out, rc);
4905
4906         if (conf->coc_opc == OBJECT_CONF_SET) {
4907                 struct ldlm_lock *lock = conf->coc_lock;
4908                 struct cl_layout cl = {
4909                         .cl_layout_gen = 0,
4910                 };
4911
4912                 LASSERT(lock != NULL);
4913                 LASSERT(ldlm_has_layout(lock));
4914
4915                 /* it can only be allowed to match after layout is
4916                  * applied to inode otherwise false layout would be
4917                  * seen. Applying layout shoud happen before dropping
4918                  * the intent lock. */
4919                 ldlm_lock_allow_match(lock);
4920
4921                 rc = cl_object_layout_get(env, obj, &cl);
4922                 if (rc < 0)
4923                         GOTO(out, rc);
4924
4925                 CDEBUG(D_VFSTRACE,
4926                        DFID": layout version change: %u -> %u\n",
4927                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4928                        cl.cl_layout_gen);
4929                 ll_layout_version_set(lli, cl.cl_layout_gen);
4930         }
4931
4932 out:
4933         cl_env_put(env, &refcheck);
4934
4935         RETURN(rc);
4936 }
4937
4938 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4939 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4940
4941 {
4942         struct ll_sb_info *sbi = ll_i2sbi(inode);
4943         struct ptlrpc_request *req;
4944         void *lvbdata;
4945         void *lmm;
4946         int lmmsize;
4947         int rc;
4948         ENTRY;
4949
4950         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4951                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4952                lock->l_lvb_data, lock->l_lvb_len);
4953
4954         if (lock->l_lvb_data != NULL)
4955                 RETURN(0);
4956
4957         /* if layout lock was granted right away, the layout is returned
4958          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4959          * blocked and then granted via completion ast, we have to fetch
4960          * layout here. Please note that we can't use the LVB buffer in
4961          * completion AST because it doesn't have a large enough buffer */
4962         rc = ll_get_default_mdsize(sbi, &lmmsize);
4963         if (rc < 0)
4964                 RETURN(rc);
4965
4966         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4967                          XATTR_NAME_LOV, lmmsize, &req);
4968         if (rc < 0) {
4969                 if (rc == -ENODATA)
4970                         GOTO(out, rc = 0); /* empty layout */
4971                 else
4972                         RETURN(rc);
4973         }
4974
4975         lmmsize = rc;
4976         rc = 0;
4977         if (lmmsize == 0) /* empty layout */
4978                 GOTO(out, rc = 0);
4979
4980         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4981         if (lmm == NULL)
4982                 GOTO(out, rc = -EFAULT);
4983
4984         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4985         if (lvbdata == NULL)
4986                 GOTO(out, rc = -ENOMEM);
4987
4988         memcpy(lvbdata, lmm, lmmsize);
4989         lock_res_and_lock(lock);
4990         if (unlikely(lock->l_lvb_data == NULL)) {
4991                 lock->l_lvb_type = LVB_T_LAYOUT;
4992                 lock->l_lvb_data = lvbdata;
4993                 lock->l_lvb_len = lmmsize;
4994                 lvbdata = NULL;
4995         }
4996         unlock_res_and_lock(lock);
4997
4998         if (lvbdata)
4999                 OBD_FREE_LARGE(lvbdata, lmmsize);
5000
5001         EXIT;
5002
5003 out:
5004         ptlrpc_req_finished(req);
5005         return rc;
5006 }
5007
5008 /**
5009  * Apply the layout to the inode. Layout lock is held and will be released
5010  * in this function.
5011  */
5012 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5013                               struct inode *inode)
5014 {
5015         struct ll_inode_info *lli = ll_i2info(inode);
5016         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5017         struct ldlm_lock *lock;
5018         struct cl_object_conf conf;
5019         int rc = 0;
5020         bool lvb_ready;
5021         bool wait_layout = false;
5022         ENTRY;
5023
5024         LASSERT(lustre_handle_is_used(lockh));
5025
5026         lock = ldlm_handle2lock(lockh);
5027         LASSERT(lock != NULL);
5028         LASSERT(ldlm_has_layout(lock));
5029
5030         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5031                    PFID(&lli->lli_fid), inode);
5032
5033         /* in case this is a caching lock and reinstate with new inode */
5034         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5035
5036         lock_res_and_lock(lock);
5037         lvb_ready = ldlm_is_lvb_ready(lock);
5038         unlock_res_and_lock(lock);
5039
5040         /* checking lvb_ready is racy but this is okay. The worst case is
5041          * that multi processes may configure the file on the same time. */
5042         if (lvb_ready)
5043                 GOTO(out, rc = 0);
5044
5045         rc = ll_layout_fetch(inode, lock);
5046         if (rc < 0)
5047                 GOTO(out, rc);
5048
5049         /* for layout lock, lmm is stored in lock's lvb.
5050          * lvb_data is immutable if the lock is held so it's safe to access it
5051          * without res lock.
5052          *
5053          * set layout to file. Unlikely this will fail as old layout was
5054          * surely eliminated */
5055         memset(&conf, 0, sizeof conf);
5056         conf.coc_opc = OBJECT_CONF_SET;
5057         conf.coc_inode = inode;
5058         conf.coc_lock = lock;
5059         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5060         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5061         rc = ll_layout_conf(inode, &conf);
5062
5063         /* refresh layout failed, need to wait */
5064         wait_layout = rc == -EBUSY;
5065         EXIT;
5066 out:
5067         LDLM_LOCK_PUT(lock);
5068         ldlm_lock_decref(lockh, mode);
5069
5070         /* wait for IO to complete if it's still being used. */
5071         if (wait_layout) {
5072                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5073                        sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5074
5075                 memset(&conf, 0, sizeof conf);
5076                 conf.coc_opc = OBJECT_CONF_WAIT;
5077                 conf.coc_inode = inode;
5078                 rc = ll_layout_conf(inode, &conf);
5079                 if (rc == 0)
5080                         rc = -EAGAIN;
5081
5082                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5083                        sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5084         }
5085         RETURN(rc);
5086 }
5087
5088 /**
5089  * Issue layout intent RPC to MDS.
5090  * \param inode [in]    file inode
5091  * \param intent [in]   layout intent
5092  *
5093  * \retval 0    on success
5094  * \retval < 0  error code
5095  */
5096 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5097 {
5098         struct ll_inode_info  *lli = ll_i2info(inode);
5099         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5100         struct md_op_data     *op_data;
5101         struct lookup_intent it;
5102         struct ptlrpc_request *req;
5103         int rc;
5104         ENTRY;
5105
5106         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5107                                      0, 0, LUSTRE_OPC_ANY, NULL);
5108         if (IS_ERR(op_data))
5109                 RETURN(PTR_ERR(op_data));
5110
5111         op_data->op_data = intent;
5112         op_data->op_data_size = sizeof(*intent);
5113
5114         memset(&it, 0, sizeof(it));
5115         it.it_op = IT_LAYOUT;
5116         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5117             intent->li_opc == LAYOUT_INTENT_TRUNC)
5118                 it.it_flags = FMODE_WRITE;
5119
5120         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5121                           sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5122
5123         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5124                             &ll_md_blocking_ast, 0);
5125         if (it.it_request != NULL)
5126                 ptlrpc_req_finished(it.it_request);
5127         it.it_request = NULL;
5128
5129         ll_finish_md_op_data(op_data);
5130
5131         /* set lock data in case this is a new lock */
5132         if (!rc)
5133                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5134
5135         ll_intent_drop_lock(&it);
5136
5137         RETURN(rc);
5138 }
5139
5140 /**
5141  * This function checks if there exists a LAYOUT lock on the client side,
5142  * or enqueues it if it doesn't have one in cache.
5143  *
5144  * This function will not hold layout lock so it may be revoked any time after
5145  * this function returns. Any operations depend on layout should be redone
5146  * in that case.
5147  *
5148  * This function should be called before lov_io_init() to get an uptodate
5149  * layout version, the caller should save the version number and after IO
5150  * is finished, this function should be called again to verify that layout
5151  * is not changed during IO time.
5152  */
5153 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5154 {
5155         struct ll_inode_info    *lli = ll_i2info(inode);
5156         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5157         struct lustre_handle lockh;
5158         struct layout_intent intent = {
5159                 .li_opc = LAYOUT_INTENT_ACCESS,
5160         };
5161         enum ldlm_mode mode;
5162         int rc;
5163         ENTRY;
5164
5165         *gen = ll_layout_version_get(lli);
5166         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5167                 RETURN(0);
5168
5169         /* sanity checks */
5170         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5171         LASSERT(S_ISREG(inode->i_mode));
5172
5173         /* take layout lock mutex to enqueue layout lock exclusively. */
5174         mutex_lock(&lli->lli_layout_mutex);
5175
5176         while (1) {
5177                 /* mostly layout lock is caching on the local side, so try to
5178                  * match it before grabbing layout lock mutex. */
5179                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5180                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5181                 if (mode != 0) { /* hit cached lock */
5182                         rc = ll_layout_lock_set(&lockh, mode, inode);
5183                         if (rc == -EAGAIN)
5184                                 continue;
5185                         break;
5186                 }
5187
5188                 rc = ll_layout_intent(inode, &intent);
5189                 if (rc != 0)
5190                         break;
5191         }
5192
5193         if (rc == 0)
5194                 *gen = ll_layout_version_get(lli);
5195         mutex_unlock(&lli->lli_layout_mutex);
5196
5197         RETURN(rc);
5198 }
5199
5200 /**
5201  * Issue layout intent RPC indicating where in a file an IO is about to write.
5202  *
5203  * \param[in] inode     file inode.
5204  * \param[in] ext       write range with start offset of fille in bytes where
5205  *                      an IO is about to write, and exclusive end offset in
5206  *                      bytes.
5207  *
5208  * \retval 0    on success
5209  * \retval < 0  error code
5210  */
5211 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5212                            struct lu_extent *ext)
5213 {
5214         struct layout_intent intent = {
5215                 .li_opc = opc,
5216                 .li_extent.e_start = ext->e_start,
5217                 .li_extent.e_end = ext->e_end,
5218         };
5219         int rc;
5220         ENTRY;
5221
5222         rc = ll_layout_intent(inode, &intent);
5223
5224         RETURN(rc);
5225 }
5226
5227 /**
5228  *  This function send a restore request to the MDT
5229  */
5230 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5231 {
5232         struct hsm_user_request *hur;
5233         int                      len, rc;
5234         ENTRY;
5235
5236         len = sizeof(struct hsm_user_request) +
5237               sizeof(struct hsm_user_item);
5238         OBD_ALLOC(hur, len);
5239         if (hur == NULL)
5240                 RETURN(-ENOMEM);
5241
5242         hur->hur_request.hr_action = HUA_RESTORE;
5243         hur->hur_request.hr_archive_id = 0;
5244         hur->hur_request.hr_flags = 0;
5245         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5246                sizeof(hur->hur_user_item[0].hui_fid));
5247         hur->hur_user_item[0].hui_extent.offset = offset;
5248         hur->hur_user_item[0].hui_extent.length = length;
5249         hur->hur_request.hr_itemcount = 1;
5250         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5251                            len, hur, NULL);
5252         OBD_FREE(hur, len);
5253         RETURN(rc);
5254 }