4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
/* For HSM: if the inode data has been modified, pack it so that
* the MDT can set the data-dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
* If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to swap layouts with.
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
/* We leak the openhandle and request here on error, but there is not much
* to be done in the OOM case since the application won't retry the close on
* error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
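/* fallthrough */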
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
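/**
 * Close the open handle kept on the MDS for the given open mode (read,
 * write or exec), unless other local users of this inode still hold it,
 * which is tracked via *och_usecount under lli_och_mutex.
 */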
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
/* There are still users of this handle, so skip freeing it. */
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
/* There might be a race and this handle may already be closed. */
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
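/**
 * Per-file-descriptor close: drop a group lock or lease still held on this
 * descriptor, close fd_och if present, decrement the per-mode open counts,
 * and only talk to the MDS (ll_md_real_close()) if no cached OPEN lock
 * allows us to skip the close RPC.
 */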
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
/* Usually the lease is not released when the
* application crashes; we need to release it here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
/* Let's see if we have a good enough OPEN lock on the file so that
 we can skip talking to the MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
/* While this returns an error code, the fput() caller does not check it, so
* we need to make every effort to clean up all of our state here. Also,
* applications rarely check close errors, and even if an error is returned
* they will not retry the close call.
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
/* The last ref on @file, but maybe not from the owner pid of statahead,
* because parent and child processes can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
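/**
 * read_cache_page() filler for Data-on-MDT (DoM) inline data: copy the
 * inline buffer described by the niobuf_local into the page, zero-fill any
 * tail beyond lnb_len, and mark the page up to date.
 */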
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
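/**
 * If the open reply carried inline file data under a Data-on-MDT lock,
 * populate the page cache with it so that a subsequent read can be served
 * without any further RPC to the server.
 */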
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
/* LU-11595: The server may return the whole file, which is always OK, or it
* may return just the file tail, whose offset must be aligned with the client
* PAGE_SIZE to be usable on that client; if the server's PAGE_SIZE is smaller,
* the offset may be unaligned and that data is simply ignored.
451 if (rnb->rnb_offset % PAGE_SIZE)
/* The server returns the whole file or just the file tail if it fits in the
* reply buffer; in both cases the total size should equal the inode size.
457 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
458 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
459 ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
460 rnb->rnb_len, i_size_read(inode));
464 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
465 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
467 data = (char *)rnb + sizeof(*rnb);
469 lnb.lnb_file_offset = rnb->rnb_offset;
470 start = lnb.lnb_file_offset / PAGE_SIZE;
472 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
473 lnb.lnb_page_offset = 0;
475 lnb.lnb_data = data + (index << PAGE_SHIFT);
476 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
477 if (lnb.lnb_len > PAGE_SIZE)
478 lnb.lnb_len = PAGE_SIZE;
480 vmpage = read_cache_page(mapping, index + start,
481 ll_dom_readpage, &lnb);
482 if (IS_ERR(vmpage)) {
483 CWARN("%s: cannot fill page %lu for "DFID
484 " with data: rc = %li\n",
485 ll_get_fsname(inode->i_sb, NULL, 0),
486 index + start, PFID(lu_object_fid(&obj->co_lu)),
492 } while (rnb->rnb_len > (index << PAGE_SHIFT));
496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
497 struct lookup_intent *itp)
499 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
500 struct dentry *parent = de->d_parent;
501 const char *name = NULL;
503 struct md_op_data *op_data;
504 struct ptlrpc_request *req = NULL;
508 LASSERT(parent != NULL);
509 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
/* If the server supports open-by-FID, or the file name is invalid, don't pack
* the name in the open request */
513 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
514 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
515 name = de->d_name.name;
516 len = de->d_name.len;
519 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
520 name, len, 0, LUSTRE_OPC_ANY, NULL);
522 RETURN(PTR_ERR(op_data));
523 op_data->op_data = lmm;
524 op_data->op_data_size = lmmsize;
526 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
527 &ll_md_blocking_ast, 0);
528 ll_finish_md_op_data(op_data);
/* Reason for keeping our own exit path: don't flood the log
* with -ESTALE error messages.
533 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
534 it_open_error(DISP_OPEN_OPEN, itp))
536 ll_release_openhandle(de, itp);
540 if (it_disposition(itp, DISP_LOOKUP_NEG))
541 GOTO(out, rc = -ENOENT);
543 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
544 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
545 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
549 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
551 if (!rc && itp->it_lock_mode) {
552 ll_dom_finish_open(de->d_inode, req, itp);
553 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
557 ptlrpc_req_finished(req);
558 ll_intent_drop_lock(itp);
/* We did open by FID, but by the time we got to the server,
* the object had disappeared. If this is a create, we cannot really
* tell userspace that the file it was trying to create
* does not exist. Instead let's return -ESTALE, and the VFS will
* retry the create with LOOKUP_REVAL, which we are going to catch
* in ll_revalidate_dentry() and then fall back to lookup.
567 if (rc == -ENOENT && itp->it_op & IT_CREAT)
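/**
 * Fill an obd_client_handle from the MDT reply of an intent open: record the
 * open handle, FID, lease handle cookie and open flags, then register the
 * handle for open replay in case of MDS recovery.
 */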
573 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
574 struct obd_client_handle *och)
576 struct mdt_body *body;
578 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
579 och->och_open_handle = body->mbo_open_handle;
580 och->och_fid = body->mbo_fid1;
581 och->och_lease_handle.cookie = it->it_lock_handle;
582 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
583 och->och_flags = it->it_flags;
585 return md_set_open_replay_data(md_exp, och, it);
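/**
 * Record the client-side open state on the struct file: stash the
 * ll_file_data in LUSTRE_FPRIVATE, initialize readahead and the cl_context
 * bookkeeping, and remember the open mode. If @och is given, fill it from
 * the open intent reply first.
 */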
588 static int ll_local_open(struct file *file, struct lookup_intent *it,
589 struct ll_file_data *fd, struct obd_client_handle *och)
591 struct inode *inode = file_inode(file);
594 LASSERT(!LUSTRE_FPRIVATE(file));
601 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
606 LUSTRE_FPRIVATE(file) = fd;
607 ll_readahead_init(inode, &fd->fd_ras);
608 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
610 /* ll_cl_context initialize */
611 rwlock_init(&fd->fd_lock);
612 INIT_LIST_HEAD(&fd->fd_lccs);
617 /* Open a file, and (for the very first open) create objects on the OSTs at
618 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
619 * creation or open until ll_lov_setstripe() ioctl is called.
621 * If we already have the stripe MD locally then we don't request it in
622 * md_open(), by passing a lmm_size = 0.
624 * It is up to the application to ensure no other processes open this file
625 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
626 * used. We might be able to avoid races of that sort by getting lli_open_sem
627 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
628 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
630 int ll_file_open(struct inode *inode, struct file *file)
632 struct ll_inode_info *lli = ll_i2info(inode);
633 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
634 .it_flags = file->f_flags };
635 struct obd_client_handle **och_p = NULL;
636 __u64 *och_usecount = NULL;
637 struct ll_file_data *fd;
641 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
642 PFID(ll_inode2fid(inode)), inode, file->f_flags);
644 it = file->private_data; /* XXX: compat macro */
645 file->private_data = NULL; /* prevent ll_local_open assertion */
647 fd = ll_file_data_get();
649 GOTO(out_nofiledata, rc = -ENOMEM);
652 if (S_ISDIR(inode->i_mode))
653 ll_authorize_statahead(inode, fd);
655 if (inode->i_sb->s_root == file_dentry(file)) {
656 LUSTRE_FPRIVATE(file) = fd;
660 if (!it || !it->it_disposition) {
/* Convert f_flags into an access mode. We cannot use file->f_mode,
* because everything but the O_ACCMODE mask was stripped from there. */
664 if ((oit.it_flags + 1) & O_ACCMODE)
666 if (file->f_flags & O_TRUNC)
667 oit.it_flags |= FMODE_WRITE;
/* The kernel only calls f_op->open from dentry_open(); filp_open() calls
* dentry_open() after open_namei() has checked permissions.
* Only nfsd_open() calls dentry_open() directly without checking
* permissions, and because of that the code below is safe.
674 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
675 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
677 /* We do not want O_EXCL here, presumably we opened the file
678 * already? XXX - NFS implications? */
679 oit.it_flags &= ~O_EXCL;
/* bug20584: if "it_flags" contains O_CREAT, the file will be
* created if necessary, so "IT_CREAT" should be set to stay
* consistent with it */
684 if (oit.it_flags & O_CREAT)
685 oit.it_op |= IT_CREAT;
691 /* Let's see if we have file open on MDS already. */
692 if (it->it_flags & FMODE_WRITE) {
693 och_p = &lli->lli_mds_write_och;
694 och_usecount = &lli->lli_open_fd_write_count;
695 } else if (it->it_flags & FMODE_EXEC) {
696 och_p = &lli->lli_mds_exec_och;
697 och_usecount = &lli->lli_open_fd_exec_count;
699 och_p = &lli->lli_mds_read_och;
700 och_usecount = &lli->lli_open_fd_read_count;
703 mutex_lock(&lli->lli_och_mutex);
704 if (*och_p) { /* Open handle is present */
705 if (it_disposition(it, DISP_OPEN_OPEN)) {
/* Well, there's an extra open request that we do not need;
 let's close it somehow. This will decref the request. */
708 rc = it_open_error(DISP_OPEN_OPEN, it);
710 mutex_unlock(&lli->lli_och_mutex);
711 GOTO(out_openerr, rc);
714 ll_release_openhandle(file_dentry(file), it);
718 rc = ll_local_open(file, it, fd, NULL);
721 mutex_unlock(&lli->lli_och_mutex);
722 GOTO(out_openerr, rc);
725 LASSERT(*och_usecount == 0);
726 if (!it->it_disposition) {
727 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
/* We cannot just request the lock handle now: the new ELC code
 means that another of the OPEN locks for this file
 could be cancelled, and since the blocking AST handler
 would attempt to grab och_mutex as well, that would
 result in a deadlock */
733 mutex_unlock(&lli->lli_och_mutex);
* Normally called under two situations:
* 1. NFS export.
* 2. A race/condition on the MDS resulting in no open
* handle being returned from the LOOKUP|OPEN request,
* for example if the target entry was a symlink.
*
* Only fetch MDS_OPEN_LOCK if this is in the NFS path,
* marked by a bit set in ll_iget_for_nfs. Clear the
* bit so that it does not confuse later callers.
*
* NB: when ldd is NULL, it must have come via the normal
* lookup path only, since ll_iget_for_nfs always calls
* ll_d_init().
749 if (ldd && ldd->lld_nfs_dentry) {
750 ldd->lld_nfs_dentry = 0;
751 it->it_flags |= MDS_OPEN_LOCK;
* Always specify MDS_OPEN_BY_FID because we don't want
* to get a file with a different FID.
758 it->it_flags |= MDS_OPEN_BY_FID;
759 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
762 GOTO(out_openerr, rc);
766 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
768 GOTO(out_och_free, rc = -ENOMEM);
/* md_intent_lock() didn't get a request ref if there was an
* open error, so don't do cleanup on the request here */
/* XXX (green): Shouldn't we bail out on any error here, not
* just an open error? */
777 rc = it_open_error(DISP_OPEN_OPEN, it);
779 GOTO(out_och_free, rc);
781 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
782 "inode %p: disposition %x, status %d\n", inode,
783 it_disposition(it, ~0), it->it_status);
785 rc = ll_local_open(file, it, fd, *och_p);
787 GOTO(out_och_free, rc);
789 mutex_unlock(&lli->lli_och_mutex);
/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
 a different kind of OPEN lock for this same inode gets cancelled
 by ldlm_cancel_lru */
795 if (!S_ISREG(inode->i_mode))
796 GOTO(out_och_free, rc);
798 cl_lov_delay_create_clear(&file->f_flags);
799 GOTO(out_och_free, rc);
803 if (och_p && *och_p) {
804 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
805 *och_p = NULL; /* OBD_FREE writes some magic there */
808 mutex_unlock(&lli->lli_och_mutex);
811 if (lli->lli_opendir_key == fd)
812 ll_deauthorize_statahead(inode, fd);
814 ll_file_data_put(fd);
816 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
820 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
821 ptlrpc_req_finished(it->it_request);
822 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
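/**
 * Blocking AST for the lease DLM lock: on a blocking callback simply cancel
 * the lock asynchronously (this is what breaks the lease); the canceling
 * callback needs no extra work here.
 */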
828 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
829 struct ldlm_lock_desc *desc, void *data, int flag)
832 struct lustre_handle lockh;
836 case LDLM_CB_BLOCKING:
837 ldlm_lock2handle(lock, &lockh);
838 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
840 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
844 case LDLM_CB_CANCELING:
* When setting a lease on a file, we take ownership of the lli_mds_*_och
* and save it as fd->fd_och so as to force the client to reopen the file even
* if it already has an open lock in cache.
856 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
857 struct lustre_handle *old_open_handle)
859 struct ll_inode_info *lli = ll_i2info(inode);
860 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
861 struct obd_client_handle **och_p;
866 /* Get the openhandle of the file */
867 mutex_lock(&lli->lli_och_mutex);
868 if (fd->fd_lease_och != NULL)
869 GOTO(out_unlock, rc = -EBUSY);
871 if (fd->fd_och == NULL) {
872 if (file->f_mode & FMODE_WRITE) {
873 LASSERT(lli->lli_mds_write_och != NULL);
874 och_p = &lli->lli_mds_write_och;
875 och_usecount = &lli->lli_open_fd_write_count;
877 LASSERT(lli->lli_mds_read_och != NULL);
878 och_p = &lli->lli_mds_read_och;
879 och_usecount = &lli->lli_open_fd_read_count;
882 if (*och_usecount > 1)
883 GOTO(out_unlock, rc = -EBUSY);
890 *old_open_handle = fd->fd_och->och_open_handle;
894 mutex_unlock(&lli->lli_och_mutex);
899 * Release ownership on lli_mds_*_och when putting back a file lease.
901 static int ll_lease_och_release(struct inode *inode, struct file *file)
903 struct ll_inode_info *lli = ll_i2info(inode);
904 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
905 struct obd_client_handle **och_p;
906 struct obd_client_handle *old_och = NULL;
911 mutex_lock(&lli->lli_och_mutex);
912 if (file->f_mode & FMODE_WRITE) {
913 och_p = &lli->lli_mds_write_och;
914 och_usecount = &lli->lli_open_fd_write_count;
916 och_p = &lli->lli_mds_read_och;
917 och_usecount = &lli->lli_open_fd_read_count;
/* The file may have been opened by another process (broken lease), so
* *och_p is not NULL. In this case we should simply increase the usecount
* and close fd_och.
924 if (*och_p != NULL) {
925 old_och = fd->fd_och;
932 mutex_unlock(&lli->lli_och_mutex);
935 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
941 * Acquire a lease and open the file.
943 static struct obd_client_handle *
944 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
947 struct lookup_intent it = { .it_op = IT_OPEN };
948 struct ll_sb_info *sbi = ll_i2sbi(inode);
949 struct md_op_data *op_data;
950 struct ptlrpc_request *req = NULL;
951 struct lustre_handle old_open_handle = { 0 };
952 struct obd_client_handle *och = NULL;
957 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
958 RETURN(ERR_PTR(-EINVAL));
961 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
962 RETURN(ERR_PTR(-EPERM));
964 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
971 RETURN(ERR_PTR(-ENOMEM));
973 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
974 LUSTRE_OPC_ANY, NULL);
976 GOTO(out, rc = PTR_ERR(op_data));
978 /* To tell the MDT this openhandle is from the same owner */
979 op_data->op_open_handle = old_open_handle;
981 it.it_flags = fmode | open_flags;
982 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
983 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
984 &ll_md_blocking_lease_ast,
/* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
* it can be cancelled, which may mislead applications into thinking the
* lease is broken;
* LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
* open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
* doesn't deal with the openhandle, the normal openhandle would be leaked. */
991 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
992 ll_finish_md_op_data(op_data);
993 ptlrpc_req_finished(req);
995 GOTO(out_release_it, rc);
997 if (it_disposition(&it, DISP_LOOKUP_NEG))
998 GOTO(out_release_it, rc = -ENOENT);
1000 rc = it_open_error(DISP_OPEN_OPEN, &it);
1002 GOTO(out_release_it, rc);
1004 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1005 ll_och_fill(sbi->ll_md_exp, &it, och);
1007 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1008 GOTO(out_close, rc = -EOPNOTSUPP);
/* lease already acquired; handle the lease lock */
1011 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1012 if (it.it_lock_mode == 0 ||
1013 it.it_lock_bits != MDS_INODELOCK_OPEN) {
/* an open lock must be returned for a lease */
1015 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1016 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1018 GOTO(out_close, rc = -EPROTO);
1021 ll_intent_release(&it);
1025 /* Cancel open lock */
1026 if (it.it_lock_mode != 0) {
1027 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1029 it.it_lock_mode = 0;
1030 och->och_lease_handle.cookie = 0ULL;
1032 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1034 CERROR("%s: error closing file "DFID": %d\n",
1035 ll_get_fsname(inode->i_sb, NULL, 0),
1036 PFID(&ll_i2info(inode)->lli_fid), rc2);
1037 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1039 ll_intent_release(&it);
1043 RETURN(ERR_PTR(rc));
1047 * Check whether a layout swap can be done between two inodes.
1049 * \param[in] inode1 First inode to check
1050 * \param[in] inode2 Second inode to check
1052 * \retval 0 on success, layout swap can be performed between both inodes
1053 * \retval negative error code if requirements are not met
1055 static int ll_check_swap_layouts_validity(struct inode *inode1,
1056 struct inode *inode2)
1058 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1061 if (inode_permission(inode1, MAY_WRITE) ||
1062 inode_permission(inode2, MAY_WRITE))
1065 if (inode1->i_sb != inode2->i_sb)
1071 static int ll_swap_layouts_close(struct obd_client_handle *och,
1072 struct inode *inode, struct inode *inode2)
1074 const struct lu_fid *fid1 = ll_inode2fid(inode);
1075 const struct lu_fid *fid2;
1079 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1080 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1082 rc = ll_check_swap_layouts_validity(inode, inode2);
1084 GOTO(out_free_och, rc);
1086 /* We now know that inode2 is a lustre inode */
1087 fid2 = ll_inode2fid(inode2);
1089 rc = lu_fid_cmp(fid1, fid2);
1091 GOTO(out_free_och, rc = -EINVAL);
1093 /* Close the file and {swap,merge} layouts between inode & inode2.
1094 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1095 * because we still need it to pack l_remote_handle to MDT. */
1096 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1099 och = NULL; /* freed in ll_close_inode_openhandle() */
* Release the lease and close the file.
* It will check whether the lease has ever been broken.
1112 static int ll_lease_close_intent(struct obd_client_handle *och,
1113 struct inode *inode,
1114 bool *lease_broken, enum mds_op_bias bias,
1117 struct ldlm_lock *lock;
1118 bool cancelled = true;
1122 lock = ldlm_handle2lock(&och->och_lease_handle);
1124 lock_res_and_lock(lock);
1125 cancelled = ldlm_is_cancel(lock);
1126 unlock_res_and_lock(lock);
1127 LDLM_LOCK_PUT(lock);
1130 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1131 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1133 if (lease_broken != NULL)
1134 *lease_broken = cancelled;
1136 if (!cancelled && !bias)
1137 ldlm_cli_cancel(&och->och_lease_handle, 0);
if (cancelled) { /* no need to execute intent */
1144 rc = ll_close_inode_openhandle(inode, och, bias, data);
1148 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1151 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
* After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1157 static int ll_lease_file_resync(struct obd_client_handle *och,
1158 struct inode *inode, unsigned long arg)
1160 struct ll_sb_info *sbi = ll_i2sbi(inode);
1161 struct md_op_data *op_data;
1162 struct ll_ioc_lease_id ioc;
1163 __u64 data_version_unused;
1167 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1168 LUSTRE_OPC_ANY, NULL);
1169 if (IS_ERR(op_data))
1170 RETURN(PTR_ERR(op_data));
1172 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
/* Before starting file resync, it's necessary to clean up the page cache
* in client memory, otherwise once the layout version is increased,
* writing back cached data will be denied by the OSTs. */
1179 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1183 op_data->op_lease_handle = och->och_lease_handle;
1184 op_data->op_mirror_id = ioc.lil_mirror_id;
1185 rc = md_file_resync(sbi->ll_md_exp, op_data);
1191 ll_finish_md_op_data(op_data);
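/**
 * Merge the size, block count and timestamps cached in the cl_object (i.e.
 * obtained from the OSTs) into the VFS inode, under the inode size lock.
 */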
1195 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1197 struct ll_inode_info *lli = ll_i2info(inode);
1198 struct cl_object *obj = lli->lli_clob;
1199 struct cl_attr *attr = vvp_env_thread_attr(env);
1207 ll_inode_size_lock(inode);
/* Merge the timestamps most recently obtained from the MDS with the
* timestamps obtained from the OSTs.
*
* Do not overwrite the atime of the inode because it may be refreshed
* by the file_accessed() function. If the read was served from cached
* data, no RPC is sent, so the atime may not be
* transferred to the OSTs at all. The MDT only updates atime at close time
* if it's at least 'mdd.*.atime_diff' older.
* All in all, atime in Lustre does not strictly comply with
* POSIX. Solving this problem would require sending an RPC to the MDT for
* each read, which would hurt performance. */
1220 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1221 LTIME_S(inode->i_atime) = lli->lli_atime;
1222 lli->lli_update_atime = 0;
1224 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1225 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1227 atime = LTIME_S(inode->i_atime);
1228 mtime = LTIME_S(inode->i_mtime);
1229 ctime = LTIME_S(inode->i_ctime);
1231 cl_object_attr_lock(obj);
1232 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1235 rc = cl_object_attr_get(env, obj, attr);
1236 cl_object_attr_unlock(obj);
1239 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1241 if (atime < attr->cat_atime)
1242 atime = attr->cat_atime;
1244 if (ctime < attr->cat_ctime)
1245 ctime = attr->cat_ctime;
1247 if (mtime < attr->cat_mtime)
1248 mtime = attr->cat_mtime;
1250 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1251 PFID(&lli->lli_fid), attr->cat_size);
1253 i_size_write(inode, attr->cat_size);
1254 inode->i_blocks = attr->cat_blocks;
1256 LTIME_S(inode->i_atime) = atime;
1257 LTIME_S(inode->i_mtime) = mtime;
1258 LTIME_S(inode->i_ctime) = ctime;
1261 ll_inode_size_unlock(inode);
* Set the designated mirror for I/O.
*
* So far only read, write, and truncate can issue I/O to a
* designated mirror.
1272 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1274 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Clear the layout version for generic (non-resync) I/O in case it carries
* a stale layout version due to an I/O restart */
1278 io->ci_layout_version = 0;
1280 /* FLR: disable non-delay for designated mirror I/O because obviously
1281 * only one mirror is available */
1282 if (fd->fd_designated_mirror > 0) {
1284 io->ci_designated_mirror = fd->fd_designated_mirror;
1285 io->ci_layout_version = fd->fd_layout_version;
io->ci_pio = 0; /* doesn't have a mechanism to pass the mirror index */
CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1291 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1294 static bool file_is_noatime(const struct file *file)
1296 const struct vfsmount *mnt = file->f_path.mnt;
1297 const struct inode *inode = file_inode((struct file *)file);
1299 /* Adapted from file_accessed() and touch_atime().*/
1300 if (file->f_flags & O_NOATIME)
1303 if (inode->i_flags & S_NOATIME)
1306 if (IS_NOATIME(inode))
1309 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1312 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1315 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1321 static int ll_file_io_ptask(struct cfs_ptask *ptask);
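/**
 * Initialize a cl_io for a regular read or write: carry the relevant open
 * flags (O_APPEND, O_SYNC, O_DIRECT, O_NONBLOCK) into the io descriptor,
 * pick the DLM lock requirement (never for nolock files, mandatory for
 * O_APPEND), and set the FLR mirror selection.
 */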
1323 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1325 struct inode *inode = file_inode(file);
1326 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1328 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1329 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1330 io->u.ci_rw.rw_file = file;
1331 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1332 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1333 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1335 if (iot == CIT_WRITE) {
1336 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1337 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1338 file->f_flags & O_DIRECT ||
1341 io->ci_obj = ll_i2info(inode)->lli_clob;
1342 io->ci_lockreq = CILR_MAYBE;
1343 if (ll_file_nolock(file)) {
1344 io->ci_lockreq = CILR_NEVER;
1345 io->ci_no_srvlock = 1;
1346 } else if (file->f_flags & O_APPEND) {
1347 io->ci_lockreq = CILR_MANDATORY;
1349 io->ci_noatime = file_is_noatime(file);
1350 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1351 io->ci_pio = !io->u.ci_rw.rw_append;
/* FLR: only use non-delay I/O for reads, as there is only one
* available mirror for writes. */
1357 io->ci_ndelay = !(iot == CIT_WRITE);
1359 ll_io_set_mirror(io, file);
1362 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1364 struct cl_io_pt *pt = ptask->pt_cbdata;
1365 struct file *file = pt->cip_file;
1368 loff_t pos = pt->cip_pos;
1373 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1374 file_dentry(file)->d_name.name,
1375 pt->cip_iot == CIT_READ ? "read" : "write",
1376 pos, pos + pt->cip_count);
1378 env = cl_env_get(&refcheck);
1380 RETURN(PTR_ERR(env));
1382 io = vvp_env_thread_io(env);
1383 ll_io_init(io, file, pt->cip_iot);
1384 io->u.ci_rw.rw_iter = pt->cip_iter;
1385 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1386 io->ci_pio = 0; /* It's already in parallel task */
1388 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1389 pt->cip_count - pt->cip_result);
1391 struct vvp_io *vio = vvp_env_io(env);
1393 vio->vui_io_subtype = IO_NORMAL;
1394 vio->vui_fd = LUSTRE_FPRIVATE(file);
1396 ll_cl_add(file, env, io, LCC_RW);
1397 rc = cl_io_loop(env, io);
1398 ll_cl_remove(file, env);
1400 /* cl_io_rw_init() handled IO */
1404 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1410 if (io->ci_nob > 0) {
1411 pt->cip_result += io->ci_nob;
1412 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1414 pt->cip_iocb.ki_pos = pos;
1415 #ifdef HAVE_KIOCB_KI_LEFT
1416 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1417 #elif defined(HAVE_KI_NBYTES)
1418 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1422 cl_io_fini(env, io);
1423 cl_env_put(env, &refcheck);
1425 pt->cip_need_restart = io->ci_need_restart;
1427 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1428 file_dentry(file)->d_name.name,
1429 pt->cip_iot == CIT_READ ? "read" : "write",
1430 pt->cip_result, rc);
1432 RETURN(pt->cip_result > 0 ? 0 : rc);
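/**
 * Common back end for read, write and splice: set up the cl_io, take the
 * range lock for writes and O_DIRECT reads (see LU-6227), run the cl_io
 * loop, and restart the whole I/O when the layout changed underneath it
 * (io->ci_need_restart), preserving the FLR retry count across restarts.
 */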
1436 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1437 struct file *file, enum cl_io_type iot,
1438 loff_t *ppos, size_t count)
1440 struct range_lock range;
1441 struct vvp_io *vio = vvp_env_io(env);
1442 struct inode *inode = file_inode(file);
1443 struct ll_inode_info *lli = ll_i2info(inode);
1444 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1449 unsigned retried = 0;
1450 bool restarted = false;
1454 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1455 file_dentry(file)->d_name.name,
1456 iot == CIT_READ ? "read" : "write", pos, pos + count);
1459 io = vvp_env_thread_io(env);
1460 ll_io_init(io, file, iot);
1461 if (args->via_io_subtype == IO_NORMAL) {
1462 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1463 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1465 if (args->via_io_subtype != IO_NORMAL || restarted)
1467 io->ci_ndelay_tried = retried;
1469 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1470 bool range_locked = false;
1472 if (file->f_flags & O_APPEND)
1473 range_lock_init(&range, 0, LUSTRE_EOF);
1475 range_lock_init(&range, pos, pos + count - 1);
1477 vio->vui_fd = LUSTRE_FPRIVATE(file);
1478 vio->vui_io_subtype = args->via_io_subtype;
1480 switch (vio->vui_io_subtype) {
/* Direct I/O reads must also take the range lock,
* or multiple reads will try to work on the same pages;
* see LU-6227 for details. */
1485 if (((iot == CIT_WRITE) ||
1486 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1487 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1488 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1490 rc = range_lock(&lli->lli_write_tree, &range);
1494 range_locked = true;
1498 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1499 vio->u.splice.vui_flags = args->u.splice.via_flags;
1502 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1506 ll_cl_add(file, env, io, LCC_RW);
1507 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1508 !lli->lli_inode_locked) {
1510 lli->lli_inode_locked = 1;
1512 rc = cl_io_loop(env, io);
1513 if (lli->lli_inode_locked) {
1514 lli->lli_inode_locked = 0;
1515 inode_unlock(inode);
1517 ll_cl_remove(file, env);
1520 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1522 range_unlock(&lli->lli_write_tree, &range);
1525 /* cl_io_rw_init() handled IO */
1529 if (io->ci_nob > 0) {
1530 result += io->ci_nob;
1531 count -= io->ci_nob;
1533 if (args->via_io_subtype == IO_NORMAL) {
1534 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1536 /* CLIO is too complicated. See LU-11069. */
1537 if (cl_io_is_append(io))
1538 pos = io->u.ci_rw.rw_iocb.ki_pos;
1542 args->u.normal.via_iocb->ki_pos = pos;
1543 #ifdef HAVE_KIOCB_KI_LEFT
1544 args->u.normal.via_iocb->ki_left = count;
1545 #elif defined(HAVE_KI_NBYTES)
1546 args->u.normal.via_iocb->ki_nbytes = count;
1550 pos = io->u.ci_rw.rw_range.cir_pos;
1554 cl_io_fini(env, io);
1557 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1558 file->f_path.dentry->d_name.name,
1559 iot, rc, result, io->ci_need_restart);
1561 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1563 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1564 file_dentry(file)->d_name.name,
1565 iot == CIT_READ ? "read" : "write",
1566 pos, pos + count, result, rc);
1567 /* preserve the tried count for FLR */
1568 retried = io->ci_ndelay_tried;
1573 if (iot == CIT_READ) {
1575 ll_stats_ops_tally(ll_i2sbi(inode),
1576 LPROC_LL_READ_BYTES, result);
1577 } else if (iot == CIT_WRITE) {
1579 ll_stats_ops_tally(ll_i2sbi(inode),
1580 LPROC_LL_WRITE_BYTES, result);
1581 fd->fd_write_failed = false;
1582 } else if (result == 0 && rc == 0) {
1585 fd->fd_write_failed = true;
1587 fd->fd_write_failed = false;
1588 } else if (rc != -ERESTARTSYS) {
1589 fd->fd_write_failed = true;
1593 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1594 file_dentry(file)->d_name.name,
1595 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1599 RETURN(result > 0 ? result : rc);
1603 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1604 * especially for small I/O.
1606 * To serve a read request, CLIO has to create and initialize a cl_io and
* then request a DLM lock. This has turned out to have significant overhead
1608 * and affects the performance of small I/O dramatically.
* It's not necessary to create a cl_io for each I/O. With the help of
* readahead, most of the pages being read are already in the memory cache and
* we can read those pages directly: if a page exists, the corresponding DLM
* lock must exist, so the page content must be valid.
1615 * In fast read implementation, the llite speculatively finds and reads pages
1616 * in memory cache. There are three scenarios for fast read:
* - If the page exists and is uptodate, the kernel VM will provide the data
* and CLIO is not involved;
1619 * - If the page was brought into memory by read ahead, it will be exported
1620 * and read ahead parameters will be updated;
1621 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1622 * it will go back and invoke normal read, i.e., a cl_io will be created
1623 * and DLM lock will be requested.
* POSIX compliance: the POSIX standard states that read is intended to be
* atomic. The Lustre read implementation is in line with the Linux kernel read
* implementation, and neither of them complies with the POSIX standard in this
* matter. Fast read doesn't make the situation worse on a single node, but it
* may interleave write results from multiple nodes due to the short-read
* handling in ll_file_aio_read().
1631 * \param env - lu_env
1632 * \param iocb - kiocb from kernel
1633 * \param iter - user space buffers where the data will be copied
* \retval - number of bytes read, or an error code if an error occurred.
1638 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1642 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1645 /* NB: we can't do direct IO for fast read because it will need a lock
1646 * to make IO engine happy. */
1647 if (iocb->ki_filp->f_flags & O_DIRECT)
1650 result = generic_file_read_iter(iocb, iter);
/* If the first page is not in the cache, generic_file_read_iter() will
* return -ENODATA.
* See the corresponding code in ll_readpage(). */
1655 if (result == -ENODATA)
1659 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1660 LPROC_LL_READ_BYTES, result);
1666 * Read from a file (through the page cache).
1668 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1671 struct vvp_io_args *args;
1676 result = ll_do_fast_read(iocb, to);
1677 if (result < 0 || iov_iter_count(to) == 0)
1680 env = cl_env_get(&refcheck);
1682 return PTR_ERR(env);
1684 args = ll_env_args(env, IO_NORMAL);
1685 args->u.normal.via_iter = to;
1686 args->u.normal.via_iocb = iocb;
1688 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1689 &iocb->ki_pos, iov_iter_count(to));
1692 else if (result == 0)
1695 cl_env_put(env, &refcheck);
1701 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1702 * If a page is already in the page cache and dirty (and some other things -
1703 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1704 * write to it without doing a full I/O, because Lustre already knows about it
1705 * and will write it out. This saves a lot of processing time.
1707 * All writes here are within one page, so exclusion is handled by the page
1708 * lock on the vm page. We do not do tiny writes for writes which touch
* multiple pages because it's very unlikely that multiple sequential pages
* are already dirty.
*
* We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
* and are unlikely to land on already-dirty pages.
1715 * Attribute updates are important here, we do them in ll_tiny_write_end.
1717 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1719 ssize_t count = iov_iter_count(iter);
1720 struct file *file = iocb->ki_filp;
1721 struct inode *inode = file_inode(file);
1726 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1727 * of function for why.
1729 if (count >= PAGE_SIZE ||
1730 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1733 result = __generic_file_write_iter(iocb, iter);
1735 /* If the page is not already dirty, ll_tiny_write_begin returns
1736 * -ENODATA. We continue on to normal write.
1738 if (result == -ENODATA)
1742 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1744 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1747 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1753 * Write to a file (through the page cache).
1755 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1757 struct vvp_io_args *args;
1759 ssize_t rc_tiny = 0, rc_normal;
1764 /* NB: we can't do direct IO for tiny writes because they use the page
1765 * cache, we can't do sync writes because tiny writes can't flush
1766 * pages, and we can't do append writes because we can't guarantee the
1767 * required DLM locks are held to protect file size.
1769 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1770 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1771 rc_tiny = ll_do_tiny_write(iocb, from);
/* In case of error, go on and try the normal write; only stop if the tiny
* write completed the I/O.
1776 if (iov_iter_count(from) == 0)
1777 GOTO(out, rc_normal = rc_tiny);
1779 env = cl_env_get(&refcheck);
1781 return PTR_ERR(env);
1783 args = ll_env_args(env, IO_NORMAL);
1784 args->u.normal.via_iter = from;
1785 args->u.normal.via_iocb = iocb;
1787 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1788 &iocb->ki_pos, iov_iter_count(from));
1790 /* On success, combine bytes written. */
1791 if (rc_tiny >= 0 && rc_normal > 0)
1792 rc_normal += rc_tiny;
1793 /* On error, only return error from normal write if tiny write did not
1794 * write any bytes. Otherwise return bytes written by tiny write.
1796 else if (rc_tiny > 0)
1797 rc_normal = rc_tiny;
1799 cl_env_put(env, &refcheck);
1804 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1806 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1808 static int ll_file_get_iov_count(const struct iovec *iov,
1809 unsigned long *nr_segs, size_t *count)
1814 for (seg = 0; seg < *nr_segs; seg++) {
1815 const struct iovec *iv = &iov[seg];
1818 * If any segment has a negative length, or the cumulative
1819 * length ever wraps negative then return -EINVAL.
1822 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1824 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1829 cnt -= iv->iov_len; /* This segment is no good */
1836 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1837 unsigned long nr_segs, loff_t pos)
1844 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1848 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1849 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1850 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1851 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1852 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1854 result = ll_file_read_iter(iocb, &to);
1859 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1862 struct iovec iov = { .iov_base = buf, .iov_len = count };
1867 init_sync_kiocb(&kiocb, file);
1868 kiocb.ki_pos = *ppos;
1869 #ifdef HAVE_KIOCB_KI_LEFT
1870 kiocb.ki_left = count;
1871 #elif defined(HAVE_KI_NBYTES)
kiocb.ki_nbytes = count;
1875 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1876 *ppos = kiocb.ki_pos;
1882 * Write to a file (through the page cache).
1885 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1886 unsigned long nr_segs, loff_t pos)
1888 struct iov_iter from;
1893 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1897 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1898 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1899 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1900 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1901 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1903 result = ll_file_write_iter(iocb, &from);
1908 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1909 size_t count, loff_t *ppos)
1911 struct iovec iov = { .iov_base = (void __user *)buf,
1918 init_sync_kiocb(&kiocb, file);
1919 kiocb.ki_pos = *ppos;
1920 #ifdef HAVE_KIOCB_KI_LEFT
1921 kiocb.ki_left = count;
1922 #elif defined(HAVE_KI_NBYTES)
1923 kiocb.ki_nbytes = count;
1926 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1927 *ppos = kiocb.ki_pos;
1931 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1934 * Send file content (through pagecache) somewhere with helper
1936 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1937 struct pipe_inode_info *pipe, size_t count,
1941 struct vvp_io_args *args;
1946 env = cl_env_get(&refcheck);
1948 RETURN(PTR_ERR(env));
1950 args = ll_env_args(env, IO_SPLICE);
1951 args->u.splice.via_pipe = pipe;
1952 args->u.splice.via_flags = flags;
1954 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1955 cl_env_put(env, &refcheck);
1959 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1960 __u64 flags, struct lov_user_md *lum, int lum_size)
1962 struct lookup_intent oit = {
1964 .it_flags = flags | MDS_OPEN_BY_FID,
1969 ll_inode_size_lock(inode);
1970 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1972 GOTO(out_unlock, rc);
1974 ll_release_openhandle(dentry, &oit);
1977 ll_inode_size_unlock(inode);
1978 ll_intent_release(&oit);
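/**
 * Fetch the LOV EA (striping information) of @filename from the MDS via a
 * getattr-by-name RPC. The layout and the request are both returned so the
 * caller controls the buffer lifetime; the EA is swabbed to host endianness
 * when the client and server byte order differ.
 */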
1983 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1984 struct lov_mds_md **lmmp, int *lmm_size,
1985 struct ptlrpc_request **request)
1987 struct ll_sb_info *sbi = ll_i2sbi(inode);
1988 struct mdt_body *body;
1989 struct lov_mds_md *lmm = NULL;
1990 struct ptlrpc_request *req = NULL;
1991 struct md_op_data *op_data;
1994 rc = ll_get_default_mdsize(sbi, &lmmsize);
1998 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1999 strlen(filename), lmmsize,
2000 LUSTRE_OPC_ANY, NULL);
2001 if (IS_ERR(op_data))
2002 RETURN(PTR_ERR(op_data));
2004 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2005 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2006 ll_finish_md_op_data(op_data);
2008 CDEBUG(D_INFO, "md_getattr_name failed "
2009 "on %s: rc %d\n", filename, rc);
2013 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2014 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2016 lmmsize = body->mbo_eadatasize;
2018 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2020 GOTO(out, rc = -ENODATA);
2023 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2024 LASSERT(lmm != NULL);
2026 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2027 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2028 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2029 GOTO(out, rc = -EPROTO);
2032 * This is coming from the MDS, so is probably in
2033 * little endian. We convert it to host endian before
2034 * passing it to userspace.
2036 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2039 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2040 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2041 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2042 if (le32_to_cpu(lmm->lmm_pattern) &
2043 LOV_PATTERN_F_RELEASED)
/* If the function is called for a directory, we should
* avoid swabbing non-existent lsm objects */
2049 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2050 lustre_swab_lov_user_md_v1(
2051 (struct lov_user_md_v1 *)lmm);
2052 if (S_ISREG(body->mbo_mode))
2053 lustre_swab_lov_user_md_objects(
2054 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2056 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2057 lustre_swab_lov_user_md_v3(
2058 (struct lov_user_md_v3 *)lmm);
2059 if (S_ISREG(body->mbo_mode))
2060 lustre_swab_lov_user_md_objects(
2061 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2063 } else if (lmm->lmm_magic ==
2064 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2065 lustre_swab_lov_comp_md_v1(
2066 (struct lov_comp_md_v1 *)lmm);
2072 *lmm_size = lmmsize;
2077 static int ll_lov_setea(struct inode *inode, struct file *file,
2080 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2081 struct lov_user_md *lump;
2082 int lum_size = sizeof(struct lov_user_md) +
2083 sizeof(struct lov_user_ost_data);
2087 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2090 OBD_ALLOC_LARGE(lump, lum_size);
2094 if (copy_from_user(lump, arg, lum_size))
2095 GOTO(out_lump, rc = -EFAULT);
2097 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2099 cl_lov_delay_create_clear(&file->f_flags);
2102 OBD_FREE_LARGE(lump, lum_size);
2106 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2113 env = cl_env_get(&refcheck);
2115 RETURN(PTR_ERR(env));
2117 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2118 cl_env_put(env, &refcheck);
2122 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2125 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2126 struct lov_user_md *klum;
2128 __u64 flags = FMODE_WRITE;
2131 rc = ll_copy_user_md(lum, &klum);
2136 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2141 rc = put_user(0, &lum->lmm_stripe_count);
2145 rc = ll_layout_refresh(inode, &gen);
2149 rc = ll_file_getstripe(inode, arg, lum_size);
2151 cl_lov_delay_create_clear(&file->f_flags);
2154 OBD_FREE(klum, lum_size);
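/**
 * Take a group lock (group id passed in @arg) on behalf of this file
 * descriptor, instantiating the full layout first for composite (PFL) files
 * so the lock covers every OST object.
 *
 * Typically reached from userspace through the group-lock ioctl, roughly
 * (sketch only; see lustre_user.h and llapi_group_lock() for the canonical
 * interface):
 *
 *	long gid = 1234;
 *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);	// acquire
 *	... do I/O ...
 *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);	// release
 */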
2159 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2161 struct ll_inode_info *lli = ll_i2info(inode);
2162 struct cl_object *obj = lli->lli_clob;
2163 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2164 struct ll_grouplock grouplock;
2169 CWARN("group id for group lock must not be 0\n");
2173 if (ll_file_nolock(file))
2174 RETURN(-EOPNOTSUPP);
2176 spin_lock(&lli->lli_lock);
2177 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2178 CWARN("group lock already existed with gid %lu\n",
2179 fd->fd_grouplock.lg_gid);
2180 spin_unlock(&lli->lli_lock);
2183 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2184 spin_unlock(&lli->lli_lock);
* XXX: the group lock needs to protect all OST objects while PFL
* can add new OST objects during the I/O, so we instantiate
* all OST objects before taking the group lock.
2194 struct cl_layout cl = {
2195 .cl_is_composite = false,
2197 struct lu_extent ext = {
2199 .e_end = OBD_OBJECT_EOF,
2202 env = cl_env_get(&refcheck);
2204 RETURN(PTR_ERR(env));
2206 rc = cl_object_layout_get(env, obj, &cl);
2207 if (!rc && cl.cl_is_composite)
2208 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2211 cl_env_put(env, &refcheck);
2216 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2217 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2221 spin_lock(&lli->lli_lock);
2222 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2223 spin_unlock(&lli->lli_lock);
2224 CERROR("another thread just won the race\n");
2225 cl_put_grouplock(&grouplock);
2229 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2230 fd->fd_grouplock = grouplock;
2231 spin_unlock(&lli->lli_lock);
2233 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2237 static int ll_put_grouplock(struct inode *inode, struct file *file,
2240 struct ll_inode_info *lli = ll_i2info(inode);
2241 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2242 struct ll_grouplock grouplock;
2245 spin_lock(&lli->lli_lock);
2246 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2247 spin_unlock(&lli->lli_lock);
2248 CWARN("no group lock held\n");
2252 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2254 if (fd->fd_grouplock.lg_gid != arg) {
2255 CWARN("group lock %lu doesn't match current id %lu\n",
2256 arg, fd->fd_grouplock.lg_gid);
2257 spin_unlock(&lli->lli_lock);
2261 grouplock = fd->fd_grouplock;
2262 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2263 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2264 spin_unlock(&lli->lli_lock);
2266 cl_put_grouplock(&grouplock);
2267 CDEBUG(D_INFO, "group lock %lu released\n", arg);
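/*
 * A minimal userspace sketch of the group-lock ioctls handled here (for
 * illustration only; LL_IOC_GROUP_LOCK and LL_IOC_GROUP_UNLOCK take the
 * group id directly as the ioctl argument, as seen in ll_file_ioctl() below,
 * and are assumed to come from the userspace lustre_user.h header):
 *
 *	unsigned long gid = 1234;
 *
 *	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
 *		// ... I/O covered by the group lock ...
 *		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 *	}
 */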
2272 * Close inode open handle
2274 * \param dentry [in] dentry which contains the inode
2275 * \param it [in,out] intent which contains open info and result
2278 * \retval <0 failure
2280 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2282 struct inode *inode = dentry->d_inode;
2283 struct obd_client_handle *och;
2289 /* Root? Do nothing. */
2290 if (dentry->d_inode->i_sb->s_root == dentry)
2293 /* No open handle to close? Move away */
2294 if (!it_disposition(it, DISP_OPEN_OPEN))
2297 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2299 OBD_ALLOC(och, sizeof(*och));
2301 GOTO(out, rc = -ENOMEM);
2303 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2305 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2307 /* this one is in place of ll_file_open */
2308 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2309 ptlrpc_req_finished(it->it_request);
2310 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2316 * Get size for inode for which FIEMAP mapping is requested.
2317 * Make the FIEMAP get_info call and return the result.
2318 * \param fiemap kernel buffer to hold extents
2319 * \param num_bytes kernel buffer size
2321 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2327 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2330 /* Checks for fiemap flags */
2331 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2332 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2336 /* Check for FIEMAP_FLAG_SYNC */
2337 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2338 rc = filemap_fdatawrite(inode->i_mapping);
2343 env = cl_env_get(&refcheck);
2345 RETURN(PTR_ERR(env));
2347 if (i_size_read(inode) == 0) {
2348 rc = ll_glimpse_size(inode);
2353 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2354 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2355 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2357 /* If the file size is 0, there are no objects to map */
2358 if (fmkey.lfik_oa.o_size == 0) {
2359 fiemap->fm_mapped_extents = 0;
2363 fmkey.lfik_fiemap = *fiemap;
2365 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2366 &fmkey, fiemap, &num_bytes);
2368 cl_env_put(env, &refcheck);
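/*
 * A minimal userspace sketch of driving this path through the standard
 * FS_IOC_FIEMAP ioctl (for illustration only; it relies solely on the
 * generic <linux/fiemap.h> structures, error handling omitted):
 *
 *	unsigned int count = 32;
 *	struct fiemap *fm;
 *
 *	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_flags = FIEMAP_FLAG_SYNC;
 *	fm->fm_extent_count = count;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		printf("%u extents mapped\n", fm->fm_mapped_extents);
 */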
2372 int ll_fid2path(struct inode *inode, void __user *arg)
2374 struct obd_export *exp = ll_i2mdexp(inode);
2375 const struct getinfo_fid2path __user *gfin = arg;
2377 struct getinfo_fid2path *gfout;
2383 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2384 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2387 /* Only need to get the buflen */
2388 if (get_user(pathlen, &gfin->gf_pathlen))
2391 if (pathlen > PATH_MAX)
2394 outsize = sizeof(*gfout) + pathlen;
2395 OBD_ALLOC(gfout, outsize);
2399 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2400 GOTO(gf_free, rc = -EFAULT);
2401 /* Append the root FID after gfout to let the MDT know the root FID so
2402 * that it can look up the correct path; this is mainly for filesets.
2403 * An old server without fileset mount support will ignore this. */
2404 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2406 /* Call mdc_iocontrol */
2407 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2411 if (copy_to_user(arg, gfout, outsize))
2415 OBD_FREE(gfout, outsize);
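/*
 * A minimal userspace sketch of the OBD_IOC_FID2PATH ioctl serviced above
 * (for illustration only; gf_pathlen and gf_u are used above, while the
 * gf_fid member and the gf_u.gf_path output buffer are assumed from the
 * userspace lustre_user.h header, and gf_recno/gf_linkno handling is
 * omitted):
 *
 *	size_t plen = PATH_MAX;
 *	struct getinfo_fid2path *gf;
 *
 *	gf = calloc(1, sizeof(*gf) + plen);
 *	gf->gf_fid = fid;		// FID to resolve
 *	gf->gf_pathlen = plen;
 *	if (ioctl(mnt_fd, OBD_IOC_FID2PATH, gf) == 0)
 *		printf("path: %s\n", gf->gf_u.gf_path);
 */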
2420 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2422 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2430 ioc->idv_version = 0;
2431 ioc->idv_layout_version = UINT_MAX;
2433 /* If no file object has been initialized, consider its version to be 0. */
2437 env = cl_env_get(&refcheck);
2439 RETURN(PTR_ERR(env));
2441 io = vvp_env_thread_io(env);
2443 io->u.ci_data_version.dv_data_version = 0;
2444 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2445 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2448 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2449 result = cl_io_loop(env, io);
2451 result = io->ci_result;
2453 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2454 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2456 cl_io_fini(env, io);
2458 if (unlikely(io->ci_need_restart))
2461 cl_env_put(env, &refcheck);
2467 * Read the data_version for an inode.
2469 * This value is computed from the stripe object versions on the OSTs.
2470 * The version is computed using server-side locking.
2472 * @param flags whether to sync on the OST side;
2474 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2475 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2477 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2479 struct ioc_data_version ioc = { .idv_flags = flags };
2482 rc = ll_ioc_data_version(inode, &ioc);
2484 *data_version = ioc.idv_version;
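/*
 * A minimal userspace sketch of the LL_IOC_DATA_VERSION ioctl that ends up
 * here (for illustration only; struct ioc_data_version and the LL_DV_*_FLUSH
 * flags are the ones used above):
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	if (ioctl(fd, LL_IOC_DATA_VERSION, &idv) == 0)
 *		printf("data version %llu\n",
 *		       (unsigned long long)idv.idv_version);
 */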
2490 * Trigger an HSM release request for the provided inode.
2492 int ll_hsm_release(struct inode *inode)
2495 struct obd_client_handle *och = NULL;
2496 __u64 data_version = 0;
2501 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2502 ll_get_fsname(inode->i_sb, NULL, 0),
2503 PFID(&ll_i2info(inode)->lli_fid));
2505 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2507 GOTO(out, rc = PTR_ERR(och));
2509 /* Grab latest data_version and [am]time values */
2510 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2514 env = cl_env_get(&refcheck);
2516 GOTO(out, rc = PTR_ERR(env));
2518 rc = ll_merge_attr(env, inode);
2519 cl_env_put(env, &refcheck);
2521 /* If an error happened, we have the wrong size for the file.
2527 /* Release the file.
2528 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2529 * we still need it to pack l_remote_handle to MDT. */
2530 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2536 if (och != NULL && !IS_ERR(och)) /* close the file */
2537 ll_lease_close(och, inode, NULL);
2542 struct ll_swap_stack {
2545 struct inode *inode1;
2546 struct inode *inode2;
2551 static int ll_swap_layouts(struct file *file1, struct file *file2,
2552 struct lustre_swap_layouts *lsl)
2554 struct mdc_swap_layouts msl;
2555 struct md_op_data *op_data;
2558 struct ll_swap_stack *llss = NULL;
2561 OBD_ALLOC_PTR(llss);
2565 llss->inode1 = file_inode(file1);
2566 llss->inode2 = file_inode(file2);
2568 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2572 /* we use two bools because they are easier to swap than two bits */
2573 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2574 llss->check_dv1 = true;
2576 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2577 llss->check_dv2 = true;
2579 /* we cannot use lsl->sl_dvX directly because we may swap them */
2580 llss->dv1 = lsl->sl_dv1;
2581 llss->dv2 = lsl->sl_dv2;
2583 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2584 if (rc == 0) /* same file, done! */
2587 if (rc < 0) { /* sequentialize it */
2588 swap(llss->inode1, llss->inode2);
2590 swap(llss->dv1, llss->dv2);
2591 swap(llss->check_dv1, llss->check_dv2);
2595 if (gid != 0) { /* application asks to flush dirty cache */
2596 rc = ll_get_grouplock(llss->inode1, file1, gid);
2600 rc = ll_get_grouplock(llss->inode2, file2, gid);
2602 ll_put_grouplock(llss->inode1, file1, gid);
2607 /* final check: before swapping the layouts we check whether the
2608 * data version has changed (if requested) */
2609 if (llss->check_dv1) {
2610 rc = ll_data_version(llss->inode1, &dv, 0);
2613 if (dv != llss->dv1)
2614 GOTO(putgl, rc = -EAGAIN);
2617 if (llss->check_dv2) {
2618 rc = ll_data_version(llss->inode2, &dv, 0);
2621 if (dv != llss->dv2)
2622 GOTO(putgl, rc = -EAGAIN);
2625 /* struct md_op_data is used to send the swap args to the MDT;
2626 * only the flags are missing, so we pass struct mdc_swap_layouts
2627 * through md_op_data->op_data */
2628 /* flags from user space have to be converted before they are sent to
2629 * the server; no flag is sent today, they are only used on the client */
2632 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2633 0, LUSTRE_OPC_ANY, &msl);
2634 if (IS_ERR(op_data))
2635 GOTO(free, rc = PTR_ERR(op_data));
2637 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2638 sizeof(*op_data), op_data, NULL);
2639 ll_finish_md_op_data(op_data);
2646 ll_put_grouplock(llss->inode2, file2, gid);
2647 ll_put_grouplock(llss->inode1, file1, gid);
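/*
 * A minimal userspace sketch of the LL_IOC_LOV_SWAP_LAYOUTS ioctl that
 * drives ll_swap_layouts() (for illustration only; sl_fd, sl_flags, sl_dv1
 * and sl_dv2 appear in ll_file_ioctl() below, while the sl_gid member is
 * assumed from the userspace lustre_user.h header):
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd    = fd2,	// file to swap layouts with
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1 | SWAP_LAYOUTS_CHECK_DV2,
 *		.sl_gid   = 1234,	// non-zero: flush caches via group lock
 *		.sl_dv1   = dv1,
 *		.sl_dv2   = dv2,
 *	};
 *
 *	rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */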
2657 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2659 struct obd_export *exp = ll_i2mdexp(inode);
2660 struct md_op_data *op_data;
2664 /* Detect out-of-range masks */
2665 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2668 /* Non-root users are forbidden from setting or clearing flags which
2669 * are NOT defined in HSM_USER_MASK. */
2670 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2671 !cfs_capable(CFS_CAP_SYS_ADMIN))
2674 if (!exp_connect_archive_id_array(exp)) {
2675 /* Detect an out-of-range archive id */
2676 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2677 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2681 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2682 LUSTRE_OPC_ANY, hss);
2683 if (IS_ERR(op_data))
2684 RETURN(PTR_ERR(op_data));
2686 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2689 ll_finish_md_op_data(op_data);
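/*
 * A minimal userspace sketch of the LL_IOC_HSM_STATE_SET ioctl that reaches
 * ll_hsm_state_set() (for illustration only; the hss_* fields and HSS_*
 * flags are the ones used above, and HS_DIRTY is assumed to be one of the
 * flags within HSM_USER_MASK that unprivileged users may set):
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid     = HSS_SETMASK,
 *		.hss_setmask   = HS_DIRTY,
 *		.hss_clearmask = 0,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */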
2694 static int ll_hsm_import(struct inode *inode, struct file *file,
2695 struct hsm_user_import *hui)
2697 struct hsm_state_set *hss = NULL;
2698 struct iattr *attr = NULL;
2702 if (!S_ISREG(inode->i_mode))
2708 GOTO(out, rc = -ENOMEM);
2710 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2711 hss->hss_archive_id = hui->hui_archive_id;
2712 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2713 rc = ll_hsm_state_set(inode, hss);
2717 OBD_ALLOC_PTR(attr);
2719 GOTO(out, rc = -ENOMEM);
2721 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2722 attr->ia_mode |= S_IFREG;
2723 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2724 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2725 attr->ia_size = hui->hui_size;
2726 attr->ia_mtime.tv_sec = hui->hui_mtime;
2727 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2728 attr->ia_atime.tv_sec = hui->hui_atime;
2729 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2731 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2732 ATTR_UID | ATTR_GID |
2733 ATTR_MTIME | ATTR_MTIME_SET |
2734 ATTR_ATIME | ATTR_ATIME_SET;
2738 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2742 inode_unlock(inode);
2754 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2756 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2757 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2760 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2762 struct inode *inode = file_inode(file);
2764 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2765 ATTR_MTIME | ATTR_MTIME_SET |
2768 .tv_sec = lfu->lfu_atime_sec,
2769 .tv_nsec = lfu->lfu_atime_nsec,
2772 .tv_sec = lfu->lfu_mtime_sec,
2773 .tv_nsec = lfu->lfu_mtime_nsec,
2776 .tv_sec = lfu->lfu_ctime_sec,
2777 .tv_nsec = lfu->lfu_ctime_nsec,
2783 if (!capable(CAP_SYS_ADMIN))
2786 if (!S_ISREG(inode->i_mode))
2790 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2792 inode_unlock(inode);
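/*
 * A minimal userspace sketch of the LL_IOC_FUTIMES_3 ioctl handled by
 * ll_file_futimes_3() above (for illustration only; the lfu_* fields are the
 * ones copied into the iattr above, and the caller needs CAP_SYS_ADMIN):
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = at_sec, .lfu_atime_nsec = at_nsec,
 *		.lfu_mtime_sec = mt_sec, .lfu_mtime_nsec = mt_nsec,
 *		.lfu_ctime_sec = ct_sec, .lfu_ctime_nsec = ct_nsec,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */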
2797 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2800 case MODE_READ_USER:
2802 case MODE_WRITE_USER:
2809 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2811 /* Used to allow the upper layers of the client to request an LDLM lock
2812 * without doing an actual read or write.
2814 * Used for ladvise lockahead to manually request specific locks.
2816 * \param[in] file file this ladvise lock request is on
2817 * \param[in] ladvise ladvise struct describing this lock request
2819 * \retval 0 success, no detailed result available (sync requests
2820 * and requests sent to the server [not handled locally]
2821 * cannot return detailed results)
2822 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2823 * see definitions for details.
2824 * \retval negative negative errno on error
2826 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2828 struct lu_env *env = NULL;
2829 struct cl_io *io = NULL;
2830 struct cl_lock *lock = NULL;
2831 struct cl_lock_descr *descr = NULL;
2832 struct dentry *dentry = file->f_path.dentry;
2833 struct inode *inode = dentry->d_inode;
2834 enum cl_lock_mode cl_mode;
2835 off_t start = ladvise->lla_start;
2836 off_t end = ladvise->lla_end;
2842 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2843 "start=%llu, end=%llu\n", dentry->d_name.len,
2844 dentry->d_name.name, dentry->d_inode,
2845 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2848 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2850 GOTO(out, result = cl_mode);
2852 /* Get IO environment */
2853 result = cl_io_get(inode, &env, &io, &refcheck);
2857 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2860 * nothing to do for this io. This currently happens when
2861 * stripe sub-objects are not yet created.
2863 result = io->ci_result;
2864 } else if (result == 0) {
2865 lock = vvp_env_lock(env);
2866 descr = &lock->cll_descr;
2868 descr->cld_obj = io->ci_obj;
2869 /* Convert byte offsets to pages */
2870 descr->cld_start = cl_index(io->ci_obj, start);
2871 descr->cld_end = cl_index(io->ci_obj, end);
2872 descr->cld_mode = cl_mode;
2873 /* CEF_MUST is used because we do not want to convert a
2874 * lockahead request to a lockless lock */
2875 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2878 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2879 descr->cld_enq_flags |= CEF_SPECULATIVE;
2881 result = cl_lock_request(env, io, lock);
2883 /* On success, we need to release the lock */
2885 cl_lock_release(env, lock);
2887 cl_io_fini(env, io);
2888 cl_env_put(env, &refcheck);
2890 /* -ECANCELED indicates a matching lock with a different extent
2891 * was already present, and -EEXIST indicates a matching lock
2892 * on exactly the same extent was already present.
2893 * We convert them to positive values for userspace to make
2894 * recognizing true errors easier.
2895 * Note we can only return these detailed results on async requests,
2896 * as sync requests look the same as I/O requests for locking. */
2897 if (result == -ECANCELED)
2898 result = LLA_RESULT_DIFFERENT;
2899 else if (result == -EEXIST)
2900 result = LLA_RESULT_SAME;
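/*
 * A minimal sketch of a lockahead advice as it arrives from userspace via
 * LL_IOC_LADVISE (for illustration only; the lla_* fields, LF_ASYNC and the
 * LLA_RESULT_* values are the ones handled above, and the entry is carried
 * inside a struct llapi_ladvise_hdr as in ll_file_ioctl() below):
 *
 *	struct llapi_lu_ladvise adv = {
 *		.lla_advice          = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode  = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,
 *		.lla_start           = 0,
 *		.lla_end             = 1 << 20,
 *	};
 *
 * On an async request the detailed result is written back to
 * adv.lla_lockahead_result as LLA_RESULT_SAME or LLA_RESULT_DIFFERENT.
 */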
2905 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2907 static int ll_ladvise_sanity(struct inode *inode,
2908 struct llapi_lu_ladvise *ladvise)
2910 enum lu_ladvise_type advice = ladvise->lla_advice;
2911 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2912 * must be in the first 32 bits of enum ladvise_flags */
2913 __u32 flags = ladvise->lla_peradvice_flags;
2914 /* 3 lines at 80 characters per line, should be plenty */
2917 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2919 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2920 "last supported advice is %s (value '%d'): rc = %d\n",
2921 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2922 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2926 /* Per-advice checks */
2928 case LU_LADVISE_LOCKNOEXPAND:
2929 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2931 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2933 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2934 ladvise_names[advice], rc);
2938 case LU_LADVISE_LOCKAHEAD:
2939 /* Currently only READ and WRITE modes can be requested */
2940 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2941 ladvise->lla_lockahead_mode == 0) {
2943 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2945 ll_get_fsname(inode->i_sb, NULL, 0),
2946 ladvise->lla_lockahead_mode,
2947 ladvise_names[advice], rc);
2950 case LU_LADVISE_WILLREAD:
2951 case LU_LADVISE_DONTNEED:
2953 /* Note the fall-through above - these checks apply to all advice types
2954 * except LOCKNOEXPAND */
2955 if (flags & ~LF_DEFAULT_MASK) {
2957 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2959 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2960 ladvise_names[advice], rc);
2963 if (ladvise->lla_start >= ladvise->lla_end) {
2965 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2966 "for %s: rc = %d\n",
2967 ll_get_fsname(inode->i_sb, NULL, 0),
2968 ladvise->lla_start, ladvise->lla_end,
2969 ladvise_names[advice], rc);
2981 * Give file access advice
2983 * The ladvise interface is similar to the Linux fadvise() system call, except
2984 * that it forwards the advice directly from the Lustre client to the server.
2985 * The server-side code will apply appropriate read-ahead and caching
2986 * techniques for the corresponding files.
2988 * A typical workload for ladvise is e.g. a bunch of different clients
2989 * doing small random reads of a file, so prefetching pages into OSS cache
2990 * with big linear reads before the random IO is a net benefit. Fetching
2991 * all that data into each client cache with fadvise() may not be, due to
2992 * much more data being sent to the client.
2994 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2995 struct llapi_lu_ladvise *ladvise)
2999 struct cl_ladvise_io *lio;
3004 env = cl_env_get(&refcheck);
3006 RETURN(PTR_ERR(env));
3008 io = vvp_env_thread_io(env);
3009 io->ci_obj = ll_i2info(inode)->lli_clob;
3011 /* initialize parameters for ladvise */
3012 lio = &io->u.ci_ladvise;
3013 lio->li_start = ladvise->lla_start;
3014 lio->li_end = ladvise->lla_end;
3015 lio->li_fid = ll_inode2fid(inode);
3016 lio->li_advice = ladvise->lla_advice;
3017 lio->li_flags = flags;
3019 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3020 rc = cl_io_loop(env, io);
3024 cl_io_fini(env, io);
3025 cl_env_put(env, &refcheck);
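/*
 * A minimal userspace sketch of an LL_IOC_LADVISE request that reaches
 * ll_ladvise() (for illustration only; the lah_*/lla_* fields and
 * LADVISE_MAGIC are the ones validated in ll_file_ioctl() below):
 *
 *	size_t len = offsetof(struct llapi_ladvise_hdr, lah_advise[1]);
 *	struct llapi_ladvise_hdr *hdr = calloc(1, len);
 *
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start  = 0;
 *	hdr->lah_advise[0].lla_end    = 16 << 20;
 *	rc = ioctl(fd, LL_IOC_LADVISE, hdr);
 */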
3029 static int ll_lock_noexpand(struct file *file, int flags)
3031 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3033 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3038 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3041 struct fsxattr fsxattr;
3043 if (copy_from_user(&fsxattr,
3044 (const struct fsxattr __user *)arg,
3048 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3049 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3050 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3051 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3052 if (copy_to_user((struct fsxattr __user *)arg,
3053 &fsxattr, sizeof(fsxattr)))
3059 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3062 * Project Quota ID state is only allowed to change from within the init
3063 * namespace. Enforce that restriction only if we are trying to change
3064 * the quota ID state. Everything else is allowed in user namespaces.
3066 if (current_user_ns() == &init_user_ns)
3069 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3072 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3073 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3076 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3083 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3087 struct md_op_data *op_data;
3088 struct ptlrpc_request *req = NULL;
3090 struct fsxattr fsxattr;
3091 struct cl_object *obj;
3095 if (copy_from_user(&fsxattr,
3096 (const struct fsxattr __user *)arg,
3100 rc = ll_ioctl_check_project(inode, &fsxattr);
3104 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3105 LUSTRE_OPC_ANY, NULL);
3106 if (IS_ERR(op_data))
3107 RETURN(PTR_ERR(op_data));
3109 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3110 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3111 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3112 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3113 op_data->op_projid = fsxattr.fsx_projid;
3114 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3115 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3117 ptlrpc_req_finished(req);
3119 GOTO(out_fsxattr, rc);
3120 ll_update_inode_flags(inode, op_data->op_attr_flags);
3121 obj = ll_i2info(inode)->lli_clob;
3123 GOTO(out_fsxattr, rc);
3125 OBD_ALLOC_PTR(attr);
3127 GOTO(out_fsxattr, rc = -ENOMEM);
3129 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3130 fsxattr.fsx_xflags);
3133 ll_finish_md_op_data(op_data);
3137 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3140 struct inode *inode = file_inode(file);
3141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3142 struct ll_inode_info *lli = ll_i2info(inode);
3143 struct obd_client_handle *och = NULL;
3144 struct split_param sp;
3147 enum mds_op_bias bias = 0;
3148 struct file *layout_file = NULL;
3150 size_t data_size = 0;
3154 mutex_lock(&lli->lli_och_mutex);
3155 if (fd->fd_lease_och != NULL) {
3156 och = fd->fd_lease_och;
3157 fd->fd_lease_och = NULL;
3159 mutex_unlock(&lli->lli_och_mutex);
3162 GOTO(out, rc = -ENOLCK);
3164 fmode = och->och_flags;
3166 switch (ioc->lil_flags) {
3167 case LL_LEASE_RESYNC_DONE:
3168 if (ioc->lil_count > IOC_IDS_MAX)
3169 GOTO(out, rc = -EINVAL);
3171 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3172 OBD_ALLOC(data, data_size);
3174 GOTO(out, rc = -ENOMEM);
3176 if (copy_from_user(data, (void __user *)arg, data_size))
3177 GOTO(out, rc = -EFAULT);
3179 bias = MDS_CLOSE_RESYNC_DONE;
3181 case LL_LEASE_LAYOUT_MERGE: {
3184 if (ioc->lil_count != 1)
3185 GOTO(out, rc = -EINVAL);
3187 arg += sizeof(*ioc);
3188 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3189 GOTO(out, rc = -EFAULT);
3191 layout_file = fget(fd);
3193 GOTO(out, rc = -EBADF);
3195 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3196 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3197 GOTO(out, rc = -EPERM);
3199 data = file_inode(layout_file);
3200 bias = MDS_CLOSE_LAYOUT_MERGE;
3203 case LL_LEASE_LAYOUT_SPLIT: {
3207 if (ioc->lil_count != 2)
3208 GOTO(out, rc = -EINVAL);
3210 arg += sizeof(*ioc);
3211 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3212 GOTO(out, rc = -EFAULT);
3214 arg += sizeof(__u32);
3215 if (copy_from_user(&mirror_id, (void __user *)arg,
3217 GOTO(out, rc = -EFAULT);
3219 layout_file = fget(fdv);
3221 GOTO(out, rc = -EBADF);
3223 sp.sp_inode = file_inode(layout_file);
3224 sp.sp_mirror_id = (__u16)mirror_id;
3226 bias = MDS_CLOSE_LAYOUT_SPLIT;
3230 /* without close intent */
3234 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3238 rc = ll_lease_och_release(inode, file);
3247 switch (ioc->lil_flags) {
3248 case LL_LEASE_RESYNC_DONE:
3250 OBD_FREE(data, data_size);
3252 case LL_LEASE_LAYOUT_MERGE:
3253 case LL_LEASE_LAYOUT_SPLIT:
3260 rc = ll_lease_type_from_fmode(fmode);
3264 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3267 struct inode *inode = file_inode(file);
3268 struct ll_inode_info *lli = ll_i2info(inode);
3269 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3270 struct obd_client_handle *och = NULL;
3271 __u64 open_flags = 0;
3277 switch (ioc->lil_mode) {
3278 case LL_LEASE_WRLCK:
3279 if (!(file->f_mode & FMODE_WRITE))
3281 fmode = FMODE_WRITE;
3283 case LL_LEASE_RDLCK:
3284 if (!(file->f_mode & FMODE_READ))
3288 case LL_LEASE_UNLCK:
3289 RETURN(ll_file_unlock_lease(file, ioc, arg));
3294 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3296 /* apply for lease */
3297 if (ioc->lil_flags & LL_LEASE_RESYNC)
3298 open_flags = MDS_OPEN_RESYNC;
3299 och = ll_lease_open(inode, file, fmode, open_flags);
3301 RETURN(PTR_ERR(och));
3303 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3304 rc = ll_lease_file_resync(och, inode, arg);
3306 ll_lease_close(och, inode, NULL);
3309 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3311 ll_lease_close(och, inode, NULL);
3317 mutex_lock(&lli->lli_och_mutex);
3318 if (fd->fd_lease_och == NULL) {
3319 fd->fd_lease_och = och;
3322 mutex_unlock(&lli->lli_och_mutex);
3324 /* should not happen, since only exclusive leases are supported for now */
3325 ll_lease_close(och, inode, &lease_broken);
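/*
 * A minimal userspace sketch of the lease ioctls that end up in
 * ll_file_set_lease()/ll_file_unlock_lease() (for illustration only; the
 * lil_* fields and LL_LEASE_* modes are the ones used above):
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);	// take a write lease
 *	rc = ioctl(fd, LL_IOC_GET_LEASE);	// LL_LEASE_WRLCK while held
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);	// release the lease
 */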
3332 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3334 struct inode *inode = file_inode(file);
3335 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3339 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3340 PFID(ll_inode2fid(inode)), inode, cmd);
3341 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3343 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3344 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3348 case LL_IOC_GETFLAGS:
3349 /* Get the current value of the file flags */
3350 return put_user(fd->fd_flags, (int __user *)arg);
3351 case LL_IOC_SETFLAGS:
3352 case LL_IOC_CLRFLAGS:
3353 /* Set or clear specific file flags */
3354 /* XXX This probably needs checks to ensure the flags are
3355 * not abused, and to handle any flag side effects.
3357 if (get_user(flags, (int __user *) arg))
3360 if (cmd == LL_IOC_SETFLAGS) {
3361 if ((flags & LL_FILE_IGNORE_LOCK) &&
3362 !(file->f_flags & O_DIRECT)) {
3363 CERROR("%s: unable to disable locking on "
3364 "non-O_DIRECT file\n", current->comm);
3368 fd->fd_flags |= flags;
3370 fd->fd_flags &= ~flags;
3373 case LL_IOC_LOV_SETSTRIPE:
3374 case LL_IOC_LOV_SETSTRIPE_NEW:
3375 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3376 case LL_IOC_LOV_SETEA:
3377 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3378 case LL_IOC_LOV_SWAP_LAYOUTS: {
3380 struct lustre_swap_layouts lsl;
3382 if (copy_from_user(&lsl, (char __user *)arg,
3383 sizeof(struct lustre_swap_layouts)))
3386 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3389 file2 = fget(lsl.sl_fd);
3393 /* O_WRONLY or O_RDWR */
3394 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3395 GOTO(out, rc = -EPERM);
3397 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3398 struct inode *inode2;
3399 struct ll_inode_info *lli;
3400 struct obd_client_handle *och = NULL;
3402 lli = ll_i2info(inode);
3403 mutex_lock(&lli->lli_och_mutex);
3404 if (fd->fd_lease_och != NULL) {
3405 och = fd->fd_lease_och;
3406 fd->fd_lease_och = NULL;
3408 mutex_unlock(&lli->lli_och_mutex);
3410 GOTO(out, rc = -ENOLCK);
3411 inode2 = file_inode(file2);
3412 rc = ll_swap_layouts_close(och, inode, inode2);
3414 rc = ll_swap_layouts(file, file2, &lsl);
3420 case LL_IOC_LOV_GETSTRIPE:
3421 case LL_IOC_LOV_GETSTRIPE_NEW:
3422 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3423 case FS_IOC_GETFLAGS:
3424 case FS_IOC_SETFLAGS:
3425 RETURN(ll_iocontrol(inode, file, cmd, arg));
3426 case FSFILT_IOC_GETVERSION:
3427 case FS_IOC_GETVERSION:
3428 RETURN(put_user(inode->i_generation, (int __user *)arg));
3429 /* We need to special-case any other ioctls we want to handle,
3430 * to send them to the MDS/OST as appropriate and to properly
3431 * network-encode the arg field. */
3432 case FS_IOC_SETVERSION:
3435 case LL_IOC_GROUP_LOCK:
3436 RETURN(ll_get_grouplock(inode, file, arg));
3437 case LL_IOC_GROUP_UNLOCK:
3438 RETURN(ll_put_grouplock(inode, file, arg));
3439 case IOC_OBD_STATFS:
3440 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3442 case LL_IOC_FLUSHCTX:
3443 RETURN(ll_flush_ctx(inode));
3444 case LL_IOC_PATH2FID: {
3445 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3446 sizeof(struct lu_fid)))
3451 case LL_IOC_GETPARENT:
3452 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3454 case OBD_IOC_FID2PATH:
3455 RETURN(ll_fid2path(inode, (void __user *)arg));
3456 case LL_IOC_DATA_VERSION: {
3457 struct ioc_data_version idv;
3460 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3463 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3464 rc = ll_ioc_data_version(inode, &idv);
3467 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3473 case LL_IOC_GET_MDTIDX: {
3476 mdtidx = ll_get_mdt_idx(inode);
3480 if (put_user((int)mdtidx, (int __user *)arg))
3485 case OBD_IOC_GETDTNAME:
3486 case OBD_IOC_GETMDNAME:
3487 RETURN(ll_get_obd_name(inode, cmd, arg));
3488 case LL_IOC_HSM_STATE_GET: {
3489 struct md_op_data *op_data;
3490 struct hsm_user_state *hus;
3497 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3498 LUSTRE_OPC_ANY, hus);
3499 if (IS_ERR(op_data)) {
3501 RETURN(PTR_ERR(op_data));
3504 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3507 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3510 ll_finish_md_op_data(op_data);
3514 case LL_IOC_HSM_STATE_SET: {
3515 struct hsm_state_set *hss;
3522 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3527 rc = ll_hsm_state_set(inode, hss);
3532 case LL_IOC_HSM_ACTION: {
3533 struct md_op_data *op_data;
3534 struct hsm_current_action *hca;
3541 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3542 LUSTRE_OPC_ANY, hca);
3543 if (IS_ERR(op_data)) {
3545 RETURN(PTR_ERR(op_data));
3548 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3551 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3554 ll_finish_md_op_data(op_data);
3558 case LL_IOC_SET_LEASE_OLD: {
3559 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3561 RETURN(ll_file_set_lease(file, &ioc, 0));
3563 case LL_IOC_SET_LEASE: {
3564 struct ll_ioc_lease ioc;
3566 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3569 RETURN(ll_file_set_lease(file, &ioc, arg));
3571 case LL_IOC_GET_LEASE: {
3572 struct ll_inode_info *lli = ll_i2info(inode);
3573 struct ldlm_lock *lock = NULL;
3576 mutex_lock(&lli->lli_och_mutex);
3577 if (fd->fd_lease_och != NULL) {
3578 struct obd_client_handle *och = fd->fd_lease_och;
3580 lock = ldlm_handle2lock(&och->och_lease_handle);
3582 lock_res_and_lock(lock);
3583 if (!ldlm_is_cancel(lock))
3584 fmode = och->och_flags;
3586 unlock_res_and_lock(lock);
3587 LDLM_LOCK_PUT(lock);
3590 mutex_unlock(&lli->lli_och_mutex);
3592 RETURN(ll_lease_type_from_fmode(fmode));
3594 case LL_IOC_HSM_IMPORT: {
3595 struct hsm_user_import *hui;
3601 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3606 rc = ll_hsm_import(inode, file, hui);
3611 case LL_IOC_FUTIMES_3: {
3612 struct ll_futimes_3 lfu;
3614 if (copy_from_user(&lfu,
3615 (const struct ll_futimes_3 __user *)arg,
3619 RETURN(ll_file_futimes_3(file, &lfu));
3621 case LL_IOC_LADVISE: {
3622 struct llapi_ladvise_hdr *k_ladvise_hdr;
3623 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3626 int alloc_size = sizeof(*k_ladvise_hdr);
3629 u_ladvise_hdr = (void __user *)arg;
3630 OBD_ALLOC_PTR(k_ladvise_hdr);
3631 if (k_ladvise_hdr == NULL)
3634 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3635 GOTO(out_ladvise, rc = -EFAULT);
3637 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3638 k_ladvise_hdr->lah_count < 1)
3639 GOTO(out_ladvise, rc = -EINVAL);
3641 num_advise = k_ladvise_hdr->lah_count;
3642 if (num_advise >= LAH_COUNT_MAX)
3643 GOTO(out_ladvise, rc = -EFBIG);
3645 OBD_FREE_PTR(k_ladvise_hdr);
3646 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3647 lah_advise[num_advise]);
3648 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3649 if (k_ladvise_hdr == NULL)
3653 * TODO: submit multiple advices to one server in a single RPC
3655 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3656 GOTO(out_ladvise, rc = -EFAULT);
3658 for (i = 0; i < num_advise; i++) {
3659 struct llapi_lu_ladvise *k_ladvise =
3660 &k_ladvise_hdr->lah_advise[i];
3661 struct llapi_lu_ladvise __user *u_ladvise =
3662 &u_ladvise_hdr->lah_advise[i];
3664 rc = ll_ladvise_sanity(inode, k_ladvise);
3666 GOTO(out_ladvise, rc);
3668 switch (k_ladvise->lla_advice) {
3669 case LU_LADVISE_LOCKNOEXPAND:
3670 rc = ll_lock_noexpand(file,
3671 k_ladvise->lla_peradvice_flags);
3672 GOTO(out_ladvise, rc);
3673 case LU_LADVISE_LOCKAHEAD:
3675 rc = ll_file_lock_ahead(file, k_ladvise);
3678 GOTO(out_ladvise, rc);
3681 &u_ladvise->lla_lockahead_result))
3682 GOTO(out_ladvise, rc = -EFAULT);
3685 rc = ll_ladvise(inode, file,
3686 k_ladvise_hdr->lah_flags,
3689 GOTO(out_ladvise, rc);
3696 OBD_FREE(k_ladvise_hdr, alloc_size);
3699 case LL_IOC_FLR_SET_MIRROR: {
3700 /* mirror I/O must be direct to avoid polluting page cache
3702 if (!(file->f_flags & O_DIRECT))
3705 fd->fd_designated_mirror = (__u32)arg;
3708 case LL_IOC_FSGETXATTR:
3709 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3710 case LL_IOC_FSSETXATTR:
3711 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3713 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3715 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3716 (void __user *)arg));
3720 #ifndef HAVE_FILE_LLSEEK_SIZE
3721 static inline loff_t
3722 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3724 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3726 if (offset > maxsize)
3729 if (offset != file->f_pos) {
3730 file->f_pos = offset;
3731 file->f_version = 0;
3737 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3738 loff_t maxsize, loff_t eof)
3740 struct inode *inode = file_inode(file);
3748 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3749 * position-querying operation. Avoid rewriting the "same"
3750 * f_pos value back to the file because a concurrent read(),
3751 * write() or lseek() might have altered it
3756 * f_lock protects against read/modify/write race with other
3757 * SEEK_CURs. Note that parallel writes and reads behave
3761 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3762 inode_unlock(inode);
3766 * In the generic case the entire file is data, so as long as
3767 * offset isn't at the end of the file then the offset is data.
3774 * There is a virtual hole at the end of the file, so as long as
3775 * offset isn't i_size or larger, return i_size.
3783 return llseek_execute(file, offset, maxsize);
3787 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3789 struct inode *inode = file_inode(file);
3790 loff_t retval, eof = 0;
3793 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3794 (origin == SEEK_CUR) ? file->f_pos : 0);
3795 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3796 PFID(ll_inode2fid(inode)), inode, retval, retval,
3798 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3800 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3801 retval = ll_glimpse_size(inode);
3804 eof = i_size_read(inode);
3807 retval = ll_generic_file_llseek_size(file, offset, origin,
3808 ll_file_maxbytes(inode), eof);
3812 static int ll_flush(struct file *file, fl_owner_t id)
3814 struct inode *inode = file_inode(file);
3815 struct ll_inode_info *lli = ll_i2info(inode);
3816 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3819 LASSERT(!S_ISDIR(inode->i_mode));
3821 /* catch async errors that were recorded back when async writeback
3822 * failed for pages in this mapping. */
3823 rc = lli->lli_async_rc;
3824 lli->lli_async_rc = 0;
3825 if (lli->lli_clob != NULL) {
3826 err = lov_read_and_clear_async_rc(lli->lli_clob);
3831 /* The application has already been told about the write failure.
3832 * Do not report the failure again. */
3833 if (fd->fd_write_failed)
3835 return rc ? -EIO : 0;
3839 * Called to make sure a portion of the file has been written out.
3840 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3842 * Return how many pages have been written.
3844 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3845 enum cl_fsync_mode mode, int ignore_layout)
3849 struct cl_fsync_io *fio;
3854 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3855 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3858 env = cl_env_get(&refcheck);
3860 RETURN(PTR_ERR(env));
3862 io = vvp_env_thread_io(env);
3863 io->ci_obj = ll_i2info(inode)->lli_clob;
3864 io->ci_ignore_layout = ignore_layout;
3866 /* initialize parameters for sync */
3867 fio = &io->u.ci_fsync;
3868 fio->fi_start = start;
3870 fio->fi_fid = ll_inode2fid(inode);
3871 fio->fi_mode = mode;
3872 fio->fi_nr_written = 0;
3874 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3875 result = cl_io_loop(env, io);
3877 result = io->ci_result;
3879 result = fio->fi_nr_written;
3880 cl_io_fini(env, io);
3881 cl_env_put(env, &refcheck);
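/*
 * A minimal in-kernel usage sketch (mirroring the CL_FSYNC_ALL call made
 * from ll_fsync() below); a positive return value is the number of pages
 * written:
 *
 *	rc = cl_sync_file_range(inode, 0, LLONG_MAX, CL_FSYNC_ALL, 0);
 *	if (rc > 0)
 *		rc = 0;
 */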
3887 * When dentry is provided (the 'else' case), file_dentry() may be
3888 * null and dentry must be used directly rather than pulled from
3889 * file_dentry() as is done otherwise.
3892 #ifdef HAVE_FILE_FSYNC_4ARGS
3893 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3895 struct dentry *dentry = file_dentry(file);
3897 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3898 int ll_fsync(struct file *file, int datasync)
3900 struct dentry *dentry = file_dentry(file);
3902 loff_t end = LLONG_MAX;
3904 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3907 loff_t end = LLONG_MAX;
3909 struct inode *inode = dentry->d_inode;
3910 struct ll_inode_info *lli = ll_i2info(inode);
3911 struct ptlrpc_request *req;
3915 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3916 PFID(ll_inode2fid(inode)), inode);
3917 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3919 #ifdef HAVE_FILE_FSYNC_4ARGS
3920 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3921 lock_inode = !lli->lli_inode_locked;
3925 /* fsync's caller has already called _fdata{sync,write}, we want
3926 * that IO to finish before calling the osc and mdc sync methods */
3927 rc = filemap_fdatawait(inode->i_mapping);
3930 /* catch async errors that were recorded back when async writeback
3931 * failed for pages in this mapping. */
3932 if (!S_ISDIR(inode->i_mode)) {
3933 err = lli->lli_async_rc;
3934 lli->lli_async_rc = 0;
3937 if (lli->lli_clob != NULL) {
3938 err = lov_read_and_clear_async_rc(lli->lli_clob);
3944 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3948 ptlrpc_req_finished(req);
3950 if (S_ISREG(inode->i_mode)) {
3951 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3953 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3954 if (rc == 0 && err < 0)
3957 fd->fd_write_failed = true;
3959 fd->fd_write_failed = false;
3962 #ifdef HAVE_FILE_FSYNC_4ARGS
3964 inode_unlock(inode);
3970 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3972 struct inode *inode = file_inode(file);
3973 struct ll_sb_info *sbi = ll_i2sbi(inode);
3974 struct ldlm_enqueue_info einfo = {
3975 .ei_type = LDLM_FLOCK,
3976 .ei_cb_cp = ldlm_flock_completion_ast,
3977 .ei_cbdata = file_lock,
3979 struct md_op_data *op_data;
3980 struct lustre_handle lockh = { 0 };
3981 union ldlm_policy_data flock = { { 0 } };
3982 int fl_type = file_lock->fl_type;
3988 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3989 PFID(ll_inode2fid(inode)), file_lock);
3991 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3993 if (file_lock->fl_flags & FL_FLOCK) {
3994 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3995 /* flocks are whole-file locks */
3996 flock.l_flock.end = OFFSET_MAX;
3997 /* For flocks the owner is determined by the local file descriptor */
3998 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3999 } else if (file_lock->fl_flags & FL_POSIX) {
4000 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4001 flock.l_flock.start = file_lock->fl_start;
4002 flock.l_flock.end = file_lock->fl_end;
4006 flock.l_flock.pid = file_lock->fl_pid;
4008 /* Somewhat ugly workaround for svc lockd.
4009 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4010 * that the fl_owner is the same (which it always is on the local node,
4011 * I guess, between lockd processes) and then compares the pid.
4012 * As such we assign the pid to the owner field to make it all work;
4013 * a conflict with normal locks is unlikely since the pid space and
4014 * the pointer space for current->files do not intersect */
4015 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4016 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4020 einfo.ei_mode = LCK_PR;
4023 /* An unlock request may or may not have any relation to
4024 * existing locks so we may not be able to pass a lock handle
4025 * via a normal ldlm_lock_cancel() request. The request may even
4026 * unlock a byte range in the middle of an existing lock. In
4027 * order to process an unlock request we need all of the same
4028 * information that is given with a normal read or write record
4029 * lock request. To avoid creating another ldlm unlock (cancel)
4030 * message we'll treat a LCK_NL flock request as an unlock. */
4031 einfo.ei_mode = LCK_NL;
4034 einfo.ei_mode = LCK_PW;
4037 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4052 flags = LDLM_FL_BLOCK_NOWAIT;
4058 flags = LDLM_FL_TEST_LOCK;
4061 CERROR("unknown fcntl lock command: %d\n", cmd);
4065 /* Save the old mode so that if the mode in the lock changes we
4066 * can decrement the appropriate reader or writer refcount. */
4067 file_lock->fl_type = einfo.ei_mode;
4069 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4070 LUSTRE_OPC_ANY, NULL);
4071 if (IS_ERR(op_data))
4072 RETURN(PTR_ERR(op_data));
4074 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4075 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4076 flock.l_flock.pid, flags, einfo.ei_mode,
4077 flock.l_flock.start, flock.l_flock.end);
4079 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4082 /* Restore the file lock type if not TEST lock. */
4083 if (!(flags & LDLM_FL_TEST_LOCK))
4084 file_lock->fl_type = fl_type;
4086 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4087 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4088 !(flags & LDLM_FL_TEST_LOCK))
4089 rc2 = locks_lock_file_wait(file, file_lock);
4091 if ((file_lock->fl_flags & FL_FLOCK) &&
4092 (rc == 0 || file_lock->fl_type == F_UNLCK))
4093 rc2 = flock_lock_file_wait(file, file_lock);
4094 if ((file_lock->fl_flags & FL_POSIX) &&
4095 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4096 !(flags & LDLM_FL_TEST_LOCK))
4097 rc2 = posix_lock_file_wait(file, file_lock);
4098 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4100 if (rc2 && file_lock->fl_type != F_UNLCK) {
4101 einfo.ei_mode = LCK_NL;
4102 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4107 ll_finish_md_op_data(op_data);
4112 int ll_get_fid_by_name(struct inode *parent, const char *name,
4113 int namelen, struct lu_fid *fid,
4114 struct inode **inode)
4116 struct md_op_data *op_data = NULL;
4117 struct mdt_body *body;
4118 struct ptlrpc_request *req;
4122 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4123 LUSTRE_OPC_ANY, NULL);
4124 if (IS_ERR(op_data))
4125 RETURN(PTR_ERR(op_data));
4127 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4128 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4129 ll_finish_md_op_data(op_data);
4133 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4135 GOTO(out_req, rc = -EFAULT);
4137 *fid = body->mbo_fid1;
4140 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4142 ptlrpc_req_finished(req);
4146 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4149 struct dentry *dchild = NULL;
4150 struct inode *child_inode = NULL;
4151 struct md_op_data *op_data;
4152 struct ptlrpc_request *request = NULL;
4153 struct obd_client_handle *och = NULL;
4155 struct mdt_body *body;
4156 __u64 data_version = 0;
4157 size_t namelen = strlen(name);
4158 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4162 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4163 PFID(ll_inode2fid(parent)), name,
4164 lum->lum_stripe_offset, lum->lum_stripe_count);
4166 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4167 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4168 lustre_swab_lmv_user_md(lum);
4170 /* Get child FID first */
4171 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4174 dchild = d_lookup(file_dentry(file), &qstr);
4176 if (dchild->d_inode)
4177 child_inode = igrab(dchild->d_inode);
4182 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4191 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4192 OBD_CONNECT2_DIR_MIGRATE)) {
4193 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4194 ll_i2info(child_inode)->lli_lsm_md) {
4195 CERROR("%s: MDT doesn't support stripe directory "
4197 ll_get_fsname(parent->i_sb, NULL, 0));
4198 GOTO(out_iput, rc = -EOPNOTSUPP);
4203 * lfs migrate command needs to be blocked on the client
4204 * by checking the migrate FID against the FID of the
4207 if (child_inode == parent->i_sb->s_root->d_inode)
4208 GOTO(out_iput, rc = -EINVAL);
4210 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4211 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4212 if (IS_ERR(op_data))
4213 GOTO(out_iput, rc = PTR_ERR(op_data));
4215 inode_lock(child_inode);
4216 op_data->op_fid3 = *ll_inode2fid(child_inode);
4217 if (!fid_is_sane(&op_data->op_fid3)) {
4218 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4219 ll_get_fsname(parent->i_sb, NULL, 0), name,
4220 PFID(&op_data->op_fid3));
4221 GOTO(out_unlock, rc = -EINVAL);
4224 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4225 op_data->op_data = lum;
4226 op_data->op_data_size = lumlen;
4229 if (S_ISREG(child_inode->i_mode)) {
4230 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4234 GOTO(out_unlock, rc);
4237 rc = ll_data_version(child_inode, &data_version,
4240 GOTO(out_close, rc);
4242 op_data->op_open_handle = och->och_open_handle;
4243 op_data->op_data_version = data_version;
4244 op_data->op_lease_handle = och->och_lease_handle;
4245 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4247 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4248 och->och_mod->mod_open_req->rq_replay = 0;
4249 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4252 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4253 name, namelen, &request);
4255 LASSERT(request != NULL);
4256 ll_update_times(request, parent);
4258 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4259 LASSERT(body != NULL);
4261 /* If the server does release the layout lock, then we clean up
4262 * the client och here; otherwise release it in out_close: */
4263 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4264 obd_mod_put(och->och_mod);
4265 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4267 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4273 if (request != NULL) {
4274 ptlrpc_req_finished(request);
4278 /* Try again if the file layout has changed. */
4279 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4284 ll_lease_close(och, child_inode, NULL);
4286 clear_nlink(child_inode);
4288 inode_unlock(child_inode);
4289 ll_finish_md_op_data(op_data);
4296 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4304 * Test if some locks matching bits and l_req_mode are acquired
4305 * - bits can be in different locks
4306 * - if found, clear the common lock bits in *bits
4307 * - the bits not found are kept in *bits
4309 * \param bits [IN] searched lock bits
4310 * \param l_req_mode [IN] searched lock mode
4311 * \retval boolean, true iff all bits are found
4313 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4315 struct lustre_handle lockh;
4316 union ldlm_policy_data policy;
4317 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4318 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4327 fid = &ll_i2info(inode)->lli_fid;
4328 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4329 ldlm_lockname[mode]);
4331 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4332 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4333 policy.l_inodebits.bits = *bits & (1 << i);
4334 if (policy.l_inodebits.bits == 0)
4337 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4338 &policy, mode, &lockh)) {
4339 struct ldlm_lock *lock;
4341 lock = ldlm_handle2lock(&lockh);
4344 ~(lock->l_policy_data.l_inodebits.bits);
4345 LDLM_LOCK_PUT(lock);
4347 *bits &= ~policy.l_inodebits.bits;
4354 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4355 struct lustre_handle *lockh, __u64 flags,
4356 enum ldlm_mode mode)
4358 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4363 fid = &ll_i2info(inode)->lli_fid;
4364 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4366 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4367 fid, LDLM_IBITS, &policy, mode, lockh);
4372 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4374 /* Already unlinked. Just update nlink and return success */
4375 if (rc == -ENOENT) {
4377 /* If it is a striped directory and there is a bad stripe,
4378 * let's revalidate the dentry again, instead of returning
4380 if (S_ISDIR(inode->i_mode) &&
4381 ll_i2info(inode)->lli_lsm_md != NULL)
4384 /* This path cannot be hit for regular files unless in
4385 * case of obscure races, so no need to validate
4387 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4389 } else if (rc != 0) {
4390 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4391 "%s: revalidate FID "DFID" error: rc = %d\n",
4392 ll_get_fsname(inode->i_sb, NULL, 0),
4393 PFID(ll_inode2fid(inode)), rc);
4399 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4401 struct inode *inode = dentry->d_inode;
4402 struct obd_export *exp = ll_i2mdexp(inode);
4403 struct lookup_intent oit = {
4406 struct ptlrpc_request *req = NULL;
4407 struct md_op_data *op_data;
4411 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4412 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4414 /* Call getattr by FID, so do not provide a name at all. */
4415 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4416 LUSTRE_OPC_ANY, NULL);
4417 if (IS_ERR(op_data))
4418 RETURN(PTR_ERR(op_data));
4420 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4421 ll_finish_md_op_data(op_data);
4423 rc = ll_inode_revalidate_fini(inode, rc);
4427 rc = ll_revalidate_it_finish(req, &oit, dentry);
4429 ll_intent_release(&oit);
4433 /* Unlinked? Unhash dentry, so it is not picked up later by
4434 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4435 * here to preserve get_cwd functionality on 2.6.
4437 if (!dentry->d_inode->i_nlink) {
4438 ll_lock_dcache(inode);
4439 d_lustre_invalidate(dentry, 0);
4440 ll_unlock_dcache(inode);
4443 ll_lookup_finish_locks(&oit, dentry);
4445 ptlrpc_req_finished(req);
4450 static int ll_merge_md_attr(struct inode *inode)
4452 struct ll_inode_info *lli = ll_i2info(inode);
4453 struct cl_attr attr = { 0 };
4456 LASSERT(lli->lli_lsm_md != NULL);
4457 down_read(&lli->lli_lsm_sem);
4458 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4459 &attr, ll_md_blocking_ast);
4460 up_read(&lli->lli_lsm_sem);
4464 set_nlink(inode, attr.cat_nlink);
4465 inode->i_blocks = attr.cat_blocks;
4466 i_size_write(inode, attr.cat_size);
4468 ll_i2info(inode)->lli_atime = attr.cat_atime;
4469 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4470 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4475 static inline dev_t ll_compat_encode_dev(dev_t dev)
4477 /* The compat_sys_*stat*() syscalls will fail unless the
4478 * device majors and minors are both less than 256. Note that
4479 * the value returned here will be passed through
4480 * old_encode_dev() in cp_compat_stat(). And so we are not
4481 * trying to return a valid compat (u16) device number, just
4482 * one that will pass the old_valid_dev() check. */
4484 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4487 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4488 int ll_getattr(const struct path *path, struct kstat *stat,
4489 u32 request_mask, unsigned int flags)
4491 struct dentry *de = path->dentry;
4493 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4496 struct inode *inode = de->d_inode;
4497 struct ll_sb_info *sbi = ll_i2sbi(inode);
4498 struct ll_inode_info *lli = ll_i2info(inode);
4501 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4503 rc = ll_inode_revalidate(de, IT_GETATTR);
4507 if (S_ISREG(inode->i_mode)) {
4508 /* In case of restore, the MDT has the right size and has
4509 * already sent it back without granting the layout lock;
4510 * the inode is up-to-date, so a glimpse is useless.
4511 * Also, to glimpse we need the layout; in case of a running
4512 * restore the MDT holds the layout lock, so the glimpse will
4513 * block until the end of the restore (getattr will block)
4515 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4516 rc = ll_glimpse_size(inode);
4521 /* If the object isn't a regular file then don't validate its size. */
4522 if (S_ISDIR(inode->i_mode) &&
4523 lli->lli_lsm_md != NULL) {
4524 rc = ll_merge_md_attr(inode);
4529 LTIME_S(inode->i_atime) = lli->lli_atime;
4530 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4531 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4534 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4536 if (ll_need_32bit_api(sbi)) {
4537 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4538 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4539 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4541 stat->ino = inode->i_ino;
4542 stat->dev = inode->i_sb->s_dev;
4543 stat->rdev = inode->i_rdev;
4546 stat->mode = inode->i_mode;
4547 stat->uid = inode->i_uid;
4548 stat->gid = inode->i_gid;
4549 stat->atime = inode->i_atime;
4550 stat->mtime = inode->i_mtime;
4551 stat->ctime = inode->i_ctime;
4552 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4554 stat->nlink = inode->i_nlink;
4555 stat->size = i_size_read(inode);
4556 stat->blocks = inode->i_blocks;
4561 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4562 __u64 start, __u64 len)
4566 struct fiemap *fiemap;
4567 unsigned int extent_count = fieinfo->fi_extents_max;
4569 num_bytes = sizeof(*fiemap) + (extent_count *
4570 sizeof(struct fiemap_extent));
4571 OBD_ALLOC_LARGE(fiemap, num_bytes);
4576 fiemap->fm_flags = fieinfo->fi_flags;
4577 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4578 fiemap->fm_start = start;
4579 fiemap->fm_length = len;
4580 if (extent_count > 0 &&
4581 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4582 sizeof(struct fiemap_extent)) != 0)
4583 GOTO(out, rc = -EFAULT);
4585 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4587 fieinfo->fi_flags = fiemap->fm_flags;
4588 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4589 if (extent_count > 0 &&
4590 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4591 fiemap->fm_mapped_extents *
4592 sizeof(struct fiemap_extent)) != 0)
4593 GOTO(out, rc = -EFAULT);
4595 OBD_FREE_LARGE(fiemap, num_bytes);
4599 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4601 struct ll_inode_info *lli = ll_i2info(inode);
4602 struct posix_acl *acl = NULL;
4605 spin_lock(&lli->lli_lock);
4606 /* VFS' acl_permission_check->check_acl will release the refcount */
4607 acl = posix_acl_dup(lli->lli_posix_acl);
4608 spin_unlock(&lli->lli_lock);
4613 #ifdef HAVE_IOP_SET_ACL
4614 #ifdef CONFIG_FS_POSIX_ACL
4615 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4617 struct ll_sb_info *sbi = ll_i2sbi(inode);
4618 struct ptlrpc_request *req = NULL;
4619 const char *name = NULL;
4621 size_t value_size = 0;
4626 case ACL_TYPE_ACCESS:
4627 name = XATTR_NAME_POSIX_ACL_ACCESS;
4629 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4632 case ACL_TYPE_DEFAULT:
4633 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4634 if (!S_ISDIR(inode->i_mode))
4635 rc = acl ? -EACCES : 0;
4646 value_size = posix_acl_xattr_size(acl->a_count);
4647 value = kmalloc(value_size, GFP_NOFS);
4649 GOTO(out, rc = -ENOMEM);
4651 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4653 GOTO(out_value, rc);
4656 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4657 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4658 name, value, value_size, 0, 0, &req);
4660 ptlrpc_req_finished(req);
4665 forget_cached_acl(inode, type);
4667 set_cached_acl(inode, type, acl);
4670 #endif /* CONFIG_FS_POSIX_ACL */
4671 #endif /* HAVE_IOP_SET_ACL */
4673 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4675 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4676 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4678 ll_check_acl(struct inode *inode, int mask)
4681 # ifdef CONFIG_FS_POSIX_ACL
4682 struct posix_acl *acl;
4686 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4687 if (flags & IPERM_FLAG_RCU)
4690 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4695 rc = posix_acl_permission(inode, acl, mask);
4696 posix_acl_release(acl);
4699 # else /* !CONFIG_FS_POSIX_ACL */
4701 # endif /* CONFIG_FS_POSIX_ACL */
4703 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4705 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4706 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4708 # ifdef HAVE_INODE_PERMISION_2ARGS
4709 int ll_inode_permission(struct inode *inode, int mask)
4711 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4716 struct ll_sb_info *sbi;
4717 struct root_squash_info *squash;
4718 struct cred *cred = NULL;
4719 const struct cred *old_cred = NULL;
4721 bool squash_id = false;
4724 #ifdef MAY_NOT_BLOCK
4725 if (mask & MAY_NOT_BLOCK)
4727 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4728 if (flags & IPERM_FLAG_RCU)
4732 /* as the root inode is NOT validated in the lookup operation,
4733 * we need to do it before the permission check. */
4735 if (inode == inode->i_sb->s_root->d_inode) {
4736 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4741 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4742 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4744 /* squash fsuid/fsgid if needed */
4745 sbi = ll_i2sbi(inode);
4746 squash = &sbi->ll_squash;
4747 if (unlikely(squash->rsi_uid != 0 &&
4748 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4749 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4753 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4754 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4755 squash->rsi_uid, squash->rsi_gid);
4757 /* update the current process's credentials
4758 * and FS capabilities */
4759 cred = prepare_creds();
4763 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4764 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4765 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4766 if ((1 << cap) & CFS_CAP_FS_MASK)
4767 cap_lower(cred->cap_effective, cap);
4769 old_cred = override_creds(cred);
4772 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4773 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4774 /* restore the current process's credentials and FS capabilities */
4776 revert_creds(old_cred);
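/*
 * Worked example (illustrative values): with root squash configured as
 * 99:99 for this file system, a process running as root (fsuid 0 / fsgid 0)
 * performs the permission check above with
 *
 *	fsuid 0 -> 99, fsgid 0 -> 99, and the filesystem capabilities in
 *	CFS_CAP_FS_MASK lowered,
 *
 * so it is treated like an ordinary unprivileged user unless
 * LL_SBI_NOROOTSQUASH is set for this client.
 */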
4783 /* -o localflock - only provides locally consistent flock locks */
4784 struct file_operations ll_file_operations = {
4785 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4786 # ifdef HAVE_SYNC_READ_WRITE
4787 .read = new_sync_read,
4788 .write = new_sync_write,
4790 .read_iter = ll_file_read_iter,
4791 .write_iter = ll_file_write_iter,
4792 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4793 .read = ll_file_read,
4794 .aio_read = ll_file_aio_read,
4795 .write = ll_file_write,
4796 .aio_write = ll_file_aio_write,
4797 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4798 .unlocked_ioctl = ll_file_ioctl,
4799 .open = ll_file_open,
4800 .release = ll_file_release,
4801 .mmap = ll_file_mmap,
4802 .llseek = ll_file_seek,
4803 .splice_read = ll_file_splice_read,
4808 struct file_operations ll_file_operations_flock = {
4809 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4810 # ifdef HAVE_SYNC_READ_WRITE
4811 .read = new_sync_read,
4812 .write = new_sync_write,
4813 # endif /* HAVE_SYNC_READ_WRITE */
4814 .read_iter = ll_file_read_iter,
4815 .write_iter = ll_file_write_iter,
4816 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4817 .read = ll_file_read,
4818 .aio_read = ll_file_aio_read,
4819 .write = ll_file_write,
4820 .aio_write = ll_file_aio_write,
4821 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4822 .unlocked_ioctl = ll_file_ioctl,
4823 .open = ll_file_open,
4824 .release = ll_file_release,
4825 .mmap = ll_file_mmap,
4826 .llseek = ll_file_seek,
4827 .splice_read = ll_file_splice_read,
4830 .flock = ll_file_flock,
4831 .lock = ll_file_flock
4834 /* These are for -o noflock - to return ENOSYS on flock calls */
4835 struct file_operations ll_file_operations_noflock = {
4836 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4837 # ifdef HAVE_SYNC_READ_WRITE
4838 .read = new_sync_read,
4839 .write = new_sync_write,
4840 # endif /* HAVE_SYNC_READ_WRITE */
4841 .read_iter = ll_file_read_iter,
4842 .write_iter = ll_file_write_iter,
4843 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4844 .read = ll_file_read,
4845 .aio_read = ll_file_aio_read,
4846 .write = ll_file_write,
4847 .aio_write = ll_file_aio_write,
4848 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4849 .unlocked_ioctl = ll_file_ioctl,
4850 .open = ll_file_open,
4851 .release = ll_file_release,
4852 .mmap = ll_file_mmap,
4853 .llseek = ll_file_seek,
4854 .splice_read = ll_file_splice_read,
4857 .flock = ll_file_noflock,
4858 .lock = ll_file_noflock
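/*
 * Illustrative usage (not part of the original source): which of the three
 * operation tables a client gets is selected by its mount options, e.g.
 * (the MGS NID, fsname and mount point below are placeholders):
 *
 *	mount -t lustre -o flock      mgs@tcp:/lustre /mnt/lustre
 *	mount -t lustre -o localflock mgs@tcp:/lustre /mnt/lustre
 *	mount -t lustre -o noflock    mgs@tcp:/lustre /mnt/lustre
 *
 * -o flock installs ll_file_operations_flock (cluster-coherent flock/fcntl
 * locks), -o localflock installs ll_file_operations (locally consistent
 * locks only), and -o noflock installs ll_file_operations_noflock so that
 * flock/fcntl lock calls fail with ENOSYS.
 */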
4861 struct inode_operations ll_file_inode_operations = {
4862 .setattr = ll_setattr,
4863 .getattr = ll_getattr,
4864 .permission = ll_inode_permission,
4865 #ifdef HAVE_IOP_XATTR
4866 .setxattr = ll_setxattr,
4867 .getxattr = ll_getxattr,
4868 .removexattr = ll_removexattr,
4870 .listxattr = ll_listxattr,
4871 .fiemap = ll_fiemap,
4872 #ifdef HAVE_IOP_GET_ACL
4873 .get_acl = ll_get_acl,
4875 #ifdef HAVE_IOP_SET_ACL
4876 .set_acl = ll_set_acl,
4880 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4882 struct ll_inode_info *lli = ll_i2info(inode);
4883 struct cl_object *obj = lli->lli_clob;
4892 env = cl_env_get(&refcheck);
4894 RETURN(PTR_ERR(env));
4896 rc = cl_conf_set(env, lli->lli_clob, conf);
4900 if (conf->coc_opc == OBJECT_CONF_SET) {
4901 struct ldlm_lock *lock = conf->coc_lock;
4902 struct cl_layout cl = {
4906 LASSERT(lock != NULL);
4907 LASSERT(ldlm_has_layout(lock));
4909 /* it can only be allowed to match after the layout has been
4910 * applied to the inode, otherwise a false layout would be
4911 * seen. Applying the layout should happen before dropping
4912 * the intent lock. */
4913 ldlm_lock_allow_match(lock);
4915 rc = cl_object_layout_get(env, obj, &cl);
4920 DFID": layout version change: %u -> %u\n",
4921 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4923 ll_layout_version_set(lli, cl.cl_layout_gen);
4927 cl_env_put(env, &refcheck);
4932 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4933 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4936 struct ll_sb_info *sbi = ll_i2sbi(inode);
4937 struct ptlrpc_request *req;
4944 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4945 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4946 lock->l_lvb_data, lock->l_lvb_len);
4948 if (lock->l_lvb_data != NULL)
4951 /* if the layout lock was granted right away, the layout is returned
4952 * within the DLM LVB of the dlm reply; otherwise, if the lock was ever
4953 * blocked and then granted via a completion AST, we have to fetch the
4954 * layout here. Note that we can't use the LVB buffer in the completion
4955 * AST because it is not large enough. */
4956 rc = ll_get_default_mdsize(sbi, &lmmsize);
4960 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4961 XATTR_NAME_LOV, lmmsize, &req);
4964 GOTO(out, rc = 0); /* empty layout */
4971 if (lmmsize == 0) /* empty layout */
4974 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4976 GOTO(out, rc = -EFAULT);
4978 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4979 if (lvbdata == NULL)
4980 GOTO(out, rc = -ENOMEM);
4982 memcpy(lvbdata, lmm, lmmsize);
4983 lock_res_and_lock(lock);
4984 if (unlikely(lock->l_lvb_data == NULL)) {
4985 lock->l_lvb_type = LVB_T_LAYOUT;
4986 lock->l_lvb_data = lvbdata;
4987 lock->l_lvb_len = lmmsize;
4990 unlock_res_and_lock(lock);
4993 OBD_FREE_LARGE(lvbdata, lmmsize);
4998 ptlrpc_req_finished(req);
5003 * Apply the layout to the inode. Layout lock is held and will be released
5006 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5007 struct inode *inode)
5009 struct ll_inode_info *lli = ll_i2info(inode);
5010 struct ll_sb_info *sbi = ll_i2sbi(inode);
5011 struct ldlm_lock *lock;
5012 struct cl_object_conf conf;
5015 bool wait_layout = false;
5018 LASSERT(lustre_handle_is_used(lockh));
5020 lock = ldlm_handle2lock(lockh);
5021 LASSERT(lock != NULL);
5022 LASSERT(ldlm_has_layout(lock));
5024 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5025 PFID(&lli->lli_fid), inode);
5027 /* in case this is a cached lock, reinstate it with the new inode */
5028 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5030 lock_res_and_lock(lock);
5031 lvb_ready = ldlm_is_lvb_ready(lock);
5032 unlock_res_and_lock(lock);
5034 /* checking lvb_ready is racy but this is okay. The worst case is
5035 * that multiple processes may configure the file at the same time. */
5039 rc = ll_layout_fetch(inode, lock);
5043 /* for a layout lock, the lmm is stored in the lock's lvb.
5044 * lvb_data is immutable while the lock is held, so it is safe to access it.
5047 * Set the layout on the file. This is unlikely to fail, as the old layout
5048 * has surely been eliminated. */
5049 memset(&conf, 0, sizeof conf);
5050 conf.coc_opc = OBJECT_CONF_SET;
5051 conf.coc_inode = inode;
5052 conf.coc_lock = lock;
5053 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5054 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5055 rc = ll_layout_conf(inode, &conf);
5057 /* refresh layout failed, need to wait */
5058 wait_layout = rc == -EBUSY;
5061 LDLM_LOCK_PUT(lock);
5062 ldlm_lock_decref(lockh, mode);
5064 /* wait for in-flight IO to complete if the old layout is still in use. */
5066 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5067 ll_get_fsname(inode->i_sb, NULL, 0),
5068 PFID(&lli->lli_fid), inode);
5070 memset(&conf, 0, sizeof conf);
5071 conf.coc_opc = OBJECT_CONF_WAIT;
5072 conf.coc_inode = inode;
5073 rc = ll_layout_conf(inode, &conf);
5077 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5078 ll_get_fsname(inode->i_sb, NULL, 0),
5079 PFID(&lli->lli_fid), rc);
5085 * Issue layout intent RPC to MDS.
5086 * \param inode [in] file inode
5087 * \param intent [in] layout intent
5089 * \retval 0 on success
5090 * \retval < 0 error code
5092 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5094 struct ll_inode_info *lli = ll_i2info(inode);
5095 struct ll_sb_info *sbi = ll_i2sbi(inode);
5096 struct md_op_data *op_data;
5097 struct lookup_intent it;
5098 struct ptlrpc_request *req;
5102 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5103 0, 0, LUSTRE_OPC_ANY, NULL);
5104 if (IS_ERR(op_data))
5105 RETURN(PTR_ERR(op_data));
5107 op_data->op_data = intent;
5108 op_data->op_data_size = sizeof(*intent);
5110 memset(&it, 0, sizeof(it));
5111 it.it_op = IT_LAYOUT;
5112 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5113 intent->li_opc == LAYOUT_INTENT_TRUNC)
5114 it.it_flags = FMODE_WRITE;
5116 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5117 ll_get_fsname(inode->i_sb, NULL, 0),
5118 PFID(&lli->lli_fid), inode);
5120 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5121 &ll_md_blocking_ast, 0);
5122 if (it.it_request != NULL)
5123 ptlrpc_req_finished(it.it_request);
5124 it.it_request = NULL;
5126 ll_finish_md_op_data(op_data);
5128 /* set lock data in case this is a new lock */
5130 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5132 ll_intent_drop_lock(&it);
5138 * This function checks if there exists a LAYOUT lock on the client side,
5139 * or enqueues it if the client doesn't have one cached.
5141 * This function does not hold the layout lock, so the lock may be revoked at
5142 * any time after this function returns; any operation that depends on the layout should then be redone.
5145 * This function should be called before lov_io_init() to get an up-to-date
5146 * layout version; the caller should save the version number and, after the IO
5147 * has finished, call this function again to verify that the layout was not
5148 * changed while the IO was in flight.
5150 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5152 struct ll_inode_info *lli = ll_i2info(inode);
5153 struct ll_sb_info *sbi = ll_i2sbi(inode);
5154 struct lustre_handle lockh;
5155 struct layout_intent intent = {
5156 .li_opc = LAYOUT_INTENT_ACCESS,
5158 enum ldlm_mode mode;
5162 *gen = ll_layout_version_get(lli);
5163 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5167 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5168 LASSERT(S_ISREG(inode->i_mode));
5170 /* take layout lock mutex to enqueue layout lock exclusively. */
5171 mutex_lock(&lli->lli_layout_mutex);
5174 /* the layout lock is usually cached on the local side, so try to
5175 * match it before grabbing the layout lock mutex. */
5176 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5177 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5178 if (mode != 0) { /* hit cached lock */
5179 rc = ll_layout_lock_set(&lockh, mode, inode);
5185 rc = ll_layout_intent(inode, &intent);
5191 *gen = ll_layout_version_get(lli);
5192 mutex_unlock(&lli->lli_layout_mutex);
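/*
 * Illustrative caller sketch for the usage described above
 * ll_layout_refresh() (the -ESTALE return on a generation mismatch is only
 * an example policy, not what any particular caller does):
 *
 *	__u32 gen_before, gen_after;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	if (rc != 0)
 *		return rc;
 *
 *	// ... set up and run the IO against this layout version ...
 *
 *	rc = ll_layout_refresh(inode, &gen_after);
 *	if (rc == 0 && gen_after != gen_before)
 *		rc = -ESTALE;	// layout changed during the IO: redo it
 *	return rc;
 */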
5198 * Issue a layout intent RPC indicating where in a file an IO is about to write.
5200 * \param[in] inode file inode.
5201 * \param[in] ext write range, with the start offset into the file in bytes
5202 * where an IO is about to write, and the exclusive end offset in bytes.
5205 * \retval 0 on success
5206 * \retval < 0 error code
5208 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5209 struct lu_extent *ext)
5211 struct layout_intent intent = {
5213 .li_extent.e_start = ext->e_start,
5214 .li_extent.e_end = ext->e_end,
5219 rc = ll_layout_intent(inode, &intent);
5225 * This function sends a restore request to the MDT.
5227 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5229 struct hsm_user_request *hur;
5233 len = sizeof(struct hsm_user_request) +
5234 sizeof(struct hsm_user_item);
5235 OBD_ALLOC(hur, len);
5239 hur->hur_request.hr_action = HUA_RESTORE;
5240 hur->hur_request.hr_archive_id = 0;
5241 hur->hur_request.hr_flags = 0;
5242 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5243 sizeof(hur->hur_user_item[0].hui_fid));
5244 hur->hur_user_item[0].hui_extent.offset = offset;
5245 hur->hur_user_item[0].hui_extent.length = length;
5246 hur->hur_request.hr_itemcount = 1;
5247 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,