lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_get_fsname(inode->i_sb, NULL, 0),
 143                        PFID(&lli->lli_fid));
 144                 GOTO(out, rc = 0);
 145         }
 146
 147         OBD_ALLOC_PTR(op_data);
 148         /* We leak openhandle and request here on error, but not much to be
 149          * done in OOM case since app won't retry close on error either. */
 150         if (op_data == NULL)
 151                 GOTO(out, rc = -ENOMEM);
 152
 153         ll_prepare_close(inode, op_data, och);
 154         switch (bias) {
 155         case MDS_CLOSE_LAYOUT_MERGE:
 156                 /* merge blocks from the victim inode */
 157                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 158                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 159                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 160         case MDS_CLOSE_LAYOUT_SPLIT:
 161         case MDS_CLOSE_LAYOUT_SWAP: {
 162                 struct split_param *sp = data;
 163
 164                 LASSERT(data != NULL);
 165                 op_data->op_bias |= bias;
 166                 op_data->op_data_version = 0;
 167                 op_data->op_lease_handle = och->och_lease_handle;
 168                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 169                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 170                         op_data->op_mirror_id = sp->sp_mirror_id;
 171                 } else {
 172                         op_data->op_fid2 = *ll_inode2fid(data);
 173                 }
 174                 break;
 175         }
 176
 177         case MDS_CLOSE_RESYNC_DONE: {
 178                 struct ll_ioc_lease *ioc = data;
 179
 180                 LASSERT(data != NULL);
 181                 op_data->op_attr_blocks +=
 182                         ioc->lil_count * op_data->op_attr_blocks;
 183                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 184                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 185                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 186
 187                 op_data->op_lease_handle = och->och_lease_handle;
 188                 op_data->op_data = &ioc->lil_ids[0];
 189                 op_data->op_data_size =
 190                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 191                 break;
 192         }
 193
 194         case MDS_HSM_RELEASE:
 195                 LASSERT(data != NULL);
 196                 op_data->op_bias |= MDS_HSM_RELEASE;
 197                 op_data->op_data_version = *(__u64 *)data;
 198                 op_data->op_lease_handle = och->och_lease_handle;
 199                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 200                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 201                 break;
 202
 203         default:
 204                 LASSERT(data == NULL);
 205                 break;
 206         }
 207
 208         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 209                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 210         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 211                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 212
 213         rc = md_close(md_exp, op_data, och->och_mod, &req);
 214         if (rc != 0 && rc != -EINTR)
 215                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 216                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 217
 218         if (rc == 0 && op_data->op_bias & bias) {
 219                 struct mdt_body *body;
 220
 221                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 222                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 223                         rc = -EBUSY;
 224         }
 225
 226         ll_finish_md_op_data(op_data);
 227         EXIT;
 228 out:
 229
 230         md_clear_open_replay_data(md_exp, och);
 231         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 232         OBD_FREE_PTR(och);
 233
 234         ptlrpc_req_finished(req);       /* This is close request */
 235         return rc;
 236 }
 237
 238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 239 {
 240         struct ll_inode_info *lli = ll_i2info(inode);
 241         struct obd_client_handle **och_p;
 242         struct obd_client_handle *och;
 243         __u64 *och_usecount;
 244         int rc = 0;
 245         ENTRY;
 246
 247         if (fmode & FMODE_WRITE) {
 248                 och_p = &lli->lli_mds_write_och;
 249                 och_usecount = &lli->lli_open_fd_write_count;
 250         } else if (fmode & FMODE_EXEC) {
 251                 och_p = &lli->lli_mds_exec_och;
 252                 och_usecount = &lli->lli_open_fd_exec_count;
 253         } else {
 254                 LASSERT(fmode & FMODE_READ);
 255                 och_p = &lli->lli_mds_read_och;
 256                 och_usecount = &lli->lli_open_fd_read_count;
 257         }
 258
 259         mutex_lock(&lli->lli_och_mutex);
 260         if (*och_usecount > 0) {
 261                 /* There are still users of this handle, so skip
 262                  * freeing it. */
 263                 mutex_unlock(&lli->lli_och_mutex);
 264                 RETURN(0);
 265         }
 266
 267         och = *och_p;
 268         *och_p = NULL;
 269         mutex_unlock(&lli->lli_och_mutex);
 270
 271         if (och != NULL) {
 272                 /* There might be a race and this handle may already
 273                  * be closed. */
 274                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 275         }
 276
 277         RETURN(rc);
 278 }
 279
 280 static int ll_md_close(struct inode *inode, struct file *file)
 281 {
 282         union ldlm_policy_data policy = {
 283                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 284         };
 285         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 286         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 287         struct ll_inode_info *lli = ll_i2info(inode);
 288         struct lustre_handle lockh;
 289         enum ldlm_mode lockmode;
 290         int rc = 0;
 291         ENTRY;
 292
 293         /* clear group lock, if present */
 294         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 295                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 296
 297         if (fd->fd_lease_och != NULL) {
 298                 bool lease_broken;
 299
 300                 /* Usually the lease is not released when the
 301                  * application crashed, we need to release here. */
 302                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 303                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 304                         PFID(&lli->lli_fid), rc, lease_broken);
 305
 306                 fd->fd_lease_och = NULL;
 307         }
 308
 309         if (fd->fd_och != NULL) {
 310                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 311                 fd->fd_och = NULL;
 312                 GOTO(out, rc);
 313         }
 314
 315         /* Let's see if we have good enough OPEN lock on the file and if
 316            we can skip talking to MDS */
 317         mutex_lock(&lli->lli_och_mutex);
 318         if (fd->fd_omode & FMODE_WRITE) {
 319                 lockmode = LCK_CW;
 320                 LASSERT(lli->lli_open_fd_write_count);
 321                 lli->lli_open_fd_write_count--;
 322         } else if (fd->fd_omode & FMODE_EXEC) {
 323                 lockmode = LCK_PR;
 324                 LASSERT(lli->lli_open_fd_exec_count);
 325                 lli->lli_open_fd_exec_count--;
 326         } else {
 327                 lockmode = LCK_CR;
 328                 LASSERT(lli->lli_open_fd_read_count);
 329                 lli->lli_open_fd_read_count--;
 330         }
 331         mutex_unlock(&lli->lli_och_mutex);
 332
 333         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 334                            LDLM_IBITS, &policy, lockmode, &lockh))
 335                 rc = ll_md_real_close(inode, fd->fd_omode);
 336
 337 out:
 338         LUSTRE_FPRIVATE(file) = NULL;
 339         ll_file_data_put(fd);
 340
 341         RETURN(rc);
 342 }
 343
 344 /* While this returns an error code, fput() the caller does not, so we need
 345  * to make every effort to clean up all of our state here.  Also, applications
 346  * rarely check close errors and even if an error is returned they will not
 347  * re-try the close call.
 348  */
 349 int ll_file_release(struct inode *inode, struct file *file)
 350 {
 351         struct ll_file_data *fd;
 352         struct ll_sb_info *sbi = ll_i2sbi(inode);
 353         struct ll_inode_info *lli = ll_i2info(inode);
 354         int rc;
 355         ENTRY;
 356
 357         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 358                PFID(ll_inode2fid(inode)), inode);
 359
 360         if (inode->i_sb->s_root != file_dentry(file))
 361                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 362         fd = LUSTRE_FPRIVATE(file);
 363         LASSERT(fd != NULL);
 364
 365         /* The last ref on @file, maybe not the the owner pid of statahead,
 366          * because parent and child process can share the same file handle. */
 367         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 368                 ll_deauthorize_statahead(inode, fd);
 369
 370         if (inode->i_sb->s_root == file_dentry(file)) {
 371                 LUSTRE_FPRIVATE(file) = NULL;
 372                 ll_file_data_put(fd);
 373                 RETURN(0);
 374         }
 375
 376         if (!S_ISDIR(inode->i_mode)) {
 377                 if (lli->lli_clob != NULL)
 378                         lov_read_and_clear_async_rc(lli->lli_clob);
 379                 lli->lli_async_rc = 0;
 380         }
 381
 382         rc = ll_md_close(inode, file);
 383
 384         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 385                 libcfs_debug_dumplog();
 386
 387         RETURN(rc);
 388 }
 389
 390 static inline int ll_dom_readpage(void *data, struct page *page)
 391 {
 392         struct niobuf_local *lnb = data;
 393         void *kaddr;
 394
 395         kaddr = ll_kmap_atomic(page, KM_USER0);
 396         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 397         if (lnb->lnb_len < PAGE_SIZE)
 398                 memset(kaddr + lnb->lnb_len, 0,
 399                        PAGE_SIZE - lnb->lnb_len);
 400         flush_dcache_page(page);
 401         SetPageUptodate(page);
 402         ll_kunmap_atomic(kaddr, KM_USER0);
 403         unlock_page(page);
 404
 405         return 0;
 406 }
 407
 408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 409                         struct lookup_intent *it)
 410 {
 411         struct ll_inode_info *lli = ll_i2info(inode);
 412         struct cl_object *obj = lli->lli_clob;
 413         struct address_space *mapping = inode->i_mapping;
 414         struct page *vmpage;
 415         struct niobuf_remote *rnb;
 416         char *data;
 417         struct lustre_handle lockh;
 418         struct ldlm_lock *lock;
 419         unsigned long index, start;
 420         struct niobuf_local lnb;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
 431                 if (lock != NULL)
 432                         dom_lock = ldlm_has_dom(lock);
 433                 LDLM_LOCK_PUT(lock);
 434         }
 435         if (!dom_lock)
 436                 RETURN_EXIT;
 437
 438         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 439                                    RCL_SERVER))
 440                 RETURN_EXIT;
 441
 442         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 443         if (rnb == NULL || rnb->rnb_len == 0)
 444                 RETURN_EXIT;
 445
 446         /* LU-11595: Server may return whole file and that is OK always or
 447          * it may return just file tail and its offset must be aligned with
 448          * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
 449          * smaller then offset may be not aligned and that data is just ignored.
 450          */
 451         if (rnb->rnb_offset % PAGE_SIZE)
 452                 RETURN_EXIT;
 453
 454         /* Server returns whole file or just file tail if it fills in
 455          * reply buffer, in both cases total size should be inode size.
 456          */
 457         if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
 458                 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
 459                        ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
 460                        rnb->rnb_len, i_size_read(inode));
 461                 RETURN_EXIT;
 462         }
 463
 464         CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
 465                rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
 466
 467         data = (char *)rnb + sizeof(*rnb);
 468
 469         lnb.lnb_file_offset = rnb->rnb_offset;
 470         start = lnb.lnb_file_offset / PAGE_SIZE;
 471         index = 0;
 472         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 473         lnb.lnb_page_offset = 0;
 474         do {
 475                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 476                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 477                 if (lnb.lnb_len > PAGE_SIZE)
 478                         lnb.lnb_len = PAGE_SIZE;
 479
 480                 vmpage = read_cache_page(mapping, index + start,
 481                                          ll_dom_readpage, &lnb);
 482                 if (IS_ERR(vmpage)) {
 483                         CWARN("%s: cannot fill page %lu for "DFID
 484                               " with data: rc = %li\n",
 485                               ll_get_fsname(inode->i_sb, NULL, 0),
 486                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 487                               PTR_ERR(vmpage));
 488                         break;
 489                 }
 490                 put_page(vmpage);
 491                 index++;
 492         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 493         EXIT;
 494 }
 495
 496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 497                                 struct lookup_intent *itp)
 498 {
 499         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 500         struct dentry *parent = de->d_parent;
 501         const char *name = NULL;
 502         int len = 0;
 503         struct md_op_data *op_data;
 504         struct ptlrpc_request *req = NULL;
 505         int rc;
 506         ENTRY;
 507
 508         LASSERT(parent != NULL);
 509         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 510
 511         /* if server supports open-by-fid, or file name is invalid, don't pack
 512          * name in open request */
 513         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 514             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 515                 name = de->d_name.name;
 516                 len = de->d_name.len;
 517         }
 518
 519         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 520                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 521         if (IS_ERR(op_data))
 522                 RETURN(PTR_ERR(op_data));
 523         op_data->op_data = lmm;
 524         op_data->op_data_size = lmmsize;
 525
 526         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 527                             &ll_md_blocking_ast, 0);
 528         ll_finish_md_op_data(op_data);
 529         if (rc == -ESTALE) {
 530                 /* reason for keep own exit path - don`t flood log
 531                  * with messages with -ESTALE errors.
 532                  */
 533                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 534                      it_open_error(DISP_OPEN_OPEN, itp))
 535                         GOTO(out, rc);
 536                 ll_release_openhandle(de, itp);
 537                 GOTO(out, rc);
 538         }
 539
 540         if (it_disposition(itp, DISP_LOOKUP_NEG))
 541                 GOTO(out, rc = -ENOENT);
 542
 543         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 544                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 545                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 546                 GOTO(out, rc);
 547         }
 548
 549         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 550
 551         if (!rc && itp->it_lock_mode) {
 552                 ll_dom_finish_open(de->d_inode, req, itp);
 553                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 554         }
 555
 556 out:
 557         ptlrpc_req_finished(req);
 558         ll_intent_drop_lock(itp);
 559
 560         /* We did open by fid, but by the time we got to the server,
 561          * the object disappeared. If this is a create, we cannot really
 562          * tell the userspace that the file it was trying to create
 563          * does not exist. Instead let's return -ESTALE, and the VFS will
 564          * retry the create with LOOKUP_REVAL that we are going to catch
 565          * in ll_revalidate_dentry() and use lookup then.
 566          */
 567         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 568                 rc = -ESTALE;
 569
 570         RETURN(rc);
 571 }
 572
 573 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 574                        struct obd_client_handle *och)
 575 {
 576         struct mdt_body *body;
 577
 578         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 579         och->och_open_handle = body->mbo_open_handle;
 580         och->och_fid = body->mbo_fid1;
 581         och->och_lease_handle.cookie = it->it_lock_handle;
 582         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 583         och->och_flags = it->it_flags;
 584
 585         return md_set_open_replay_data(md_exp, och, it);
 586 }
 587
 588 static int ll_local_open(struct file *file, struct lookup_intent *it,
 589                          struct ll_file_data *fd, struct obd_client_handle *och)
 590 {
 591         struct inode *inode = file_inode(file);
 592         ENTRY;
 593
 594         LASSERT(!LUSTRE_FPRIVATE(file));
 595
 596         LASSERT(fd != NULL);
 597
 598         if (och) {
 599                 int rc;
 600
 601                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 602                 if (rc != 0)
 603                         RETURN(rc);
 604         }
 605
 606         LUSTRE_FPRIVATE(file) = fd;
 607         ll_readahead_init(inode, &fd->fd_ras);
 608         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 609
 610         /* ll_cl_context initialize */
 611         rwlock_init(&fd->fd_lock);
 612         INIT_LIST_HEAD(&fd->fd_lccs);
 613
 614         RETURN(0);
 615 }
 616
 617 /* Open a file, and (for the very first open) create objects on the OSTs at
 618  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 619  * creation or open until ll_lov_setstripe() ioctl is called.
 620  *
 621  * If we already have the stripe MD locally then we don't request it in
 622  * md_open(), by passing a lmm_size = 0.
 623  *
 624  * It is up to the application to ensure no other processes open this file
 625  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 626  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 627  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 628  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 629  */
 630 int ll_file_open(struct inode *inode, struct file *file)
 631 {
 632         struct ll_inode_info *lli = ll_i2info(inode);
 633         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 634                                           .it_flags = file->f_flags };
 635         struct obd_client_handle **och_p = NULL;
 636         __u64 *och_usecount = NULL;
 637         struct ll_file_data *fd;
 638         int rc = 0;
 639         ENTRY;
 640
 641         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 642                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 643
 644         it = file->private_data; /* XXX: compat macro */
 645         file->private_data = NULL; /* prevent ll_local_open assertion */
 646
 647         fd = ll_file_data_get();
 648         if (fd == NULL)
 649                 GOTO(out_nofiledata, rc = -ENOMEM);
 650
 651         fd->fd_file = file;
 652         if (S_ISDIR(inode->i_mode))
 653                 ll_authorize_statahead(inode, fd);
 654
 655         if (inode->i_sb->s_root == file_dentry(file)) {
 656                 LUSTRE_FPRIVATE(file) = fd;
 657                 RETURN(0);
 658         }
 659
 660         if (!it || !it->it_disposition) {
 661                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 662                  * because everything but O_ACCMODE mask was stripped from
 663                  * there */
 664                 if ((oit.it_flags + 1) & O_ACCMODE)
 665                         oit.it_flags++;
 666                 if (file->f_flags & O_TRUNC)
 667                         oit.it_flags |= FMODE_WRITE;
 668
 669                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 670                  * dentry_open after call to open_namei that checks permissions.
 671                  * Only nfsd_open call dentry_open directly without checking
 672                  * permissions and because of that this code below is safe.
 673                  */
 674                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 675                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 676
 677                 /* We do not want O_EXCL here, presumably we opened the file
 678                  * already? XXX - NFS implications? */
 679                 oit.it_flags &= ~O_EXCL;
 680
 681                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 682                  * created if necessary, then "IT_CREAT" should be set to keep
 683                  * consistent with it */
 684                 if (oit.it_flags & O_CREAT)
 685                         oit.it_op |= IT_CREAT;
 686
 687                 it = &oit;
 688         }
 689
 690 restart:
 691         /* Let's see if we have file open on MDS already. */
 692         if (it->it_flags & FMODE_WRITE) {
 693                 och_p = &lli->lli_mds_write_och;
 694                 och_usecount = &lli->lli_open_fd_write_count;
 695         } else if (it->it_flags & FMODE_EXEC) {
 696                 och_p = &lli->lli_mds_exec_och;
 697                 och_usecount = &lli->lli_open_fd_exec_count;
 698          } else {
 699                 och_p = &lli->lli_mds_read_och;
 700                 och_usecount = &lli->lli_open_fd_read_count;
 701         }
 702
 703         mutex_lock(&lli->lli_och_mutex);
 704         if (*och_p) { /* Open handle is present */
 705                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 706                         /* Well, there's extra open request that we do not need,
 707                            let's close it somehow. This will decref request. */
 708                         rc = it_open_error(DISP_OPEN_OPEN, it);
 709                         if (rc) {
 710                                 mutex_unlock(&lli->lli_och_mutex);
 711                                 GOTO(out_openerr, rc);
 712                         }
 713
 714                         ll_release_openhandle(file_dentry(file), it);
 715                 }
 716                 (*och_usecount)++;
 717
 718                 rc = ll_local_open(file, it, fd, NULL);
 719                 if (rc) {
 720                         (*och_usecount)--;
 721                         mutex_unlock(&lli->lli_och_mutex);
 722                         GOTO(out_openerr, rc);
 723                 }
 724         } else {
 725                 LASSERT(*och_usecount == 0);
 726                 if (!it->it_disposition) {
 727                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 728                         /* We cannot just request lock handle now, new ELC code
 729                            means that one of other OPEN locks for this file
 730                            could be cancelled, and since blocking ast handler
 731                            would attempt to grab och_mutex as well, that would
 732                            result in a deadlock */
 733                         mutex_unlock(&lli->lli_och_mutex);
 734                         /*
 735                          * Normally called under two situations:
 736                          * 1. NFS export.
 737                          * 2. A race/condition on MDS resulting in no open
 738                          *    handle to be returned from LOOKUP|OPEN request,
 739                          *    for example if the target entry was a symlink.
 740                          *
 741                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 742                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 743                          *  bit so that it's not confusing later callers.
 744                          *
 745                          *  NB; when ldd is NULL, it must have come via normal
 746                          *  lookup path only, since ll_iget_for_nfs always calls
 747                          *  ll_d_init().
 748                          */
 749                         if (ldd && ldd->lld_nfs_dentry) {
 750                                 ldd->lld_nfs_dentry = 0;
 751                                 it->it_flags |= MDS_OPEN_LOCK;
 752                         }
 753
 754                          /*
 755                          * Always specify MDS_OPEN_BY_FID because we don't want
 756                          * to get file with different fid.
 757                          */
 758                         it->it_flags |= MDS_OPEN_BY_FID;
 759                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 760                                                  it);
 761                         if (rc)
 762                                 GOTO(out_openerr, rc);
 763
 764                         goto restart;
 765                 }
 766                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 767                 if (!*och_p)
 768                         GOTO(out_och_free, rc = -ENOMEM);
 769
 770                 (*och_usecount)++;
 771
 772                 /* md_intent_lock() didn't get a request ref if there was an
 773                  * open error, so don't do cleanup on the request here
 774                  * (bug 3430) */
 775                 /* XXX (green): Should not we bail out on any error here, not
 776                  * just open error? */
 777                 rc = it_open_error(DISP_OPEN_OPEN, it);
 778                 if (rc != 0)
 779                         GOTO(out_och_free, rc);
 780
 781                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 782                          "inode %p: disposition %x, status %d\n", inode,
 783                          it_disposition(it, ~0), it->it_status);
 784
 785                 rc = ll_local_open(file, it, fd, *och_p);
 786                 if (rc)
 787                         GOTO(out_och_free, rc);
 788         }
 789         mutex_unlock(&lli->lli_och_mutex);
 790         fd = NULL;
 791
 792         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 793            different kind of OPEN lock for this same inode gets cancelled
 794            by ldlm_cancel_lru */
 795         if (!S_ISREG(inode->i_mode))
 796                 GOTO(out_och_free, rc);
 797
 798         cl_lov_delay_create_clear(&file->f_flags);
 799         GOTO(out_och_free, rc);
 800
 801 out_och_free:
 802         if (rc) {
 803                 if (och_p && *och_p) {
 804                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 805                         *och_p = NULL; /* OBD_FREE writes some magic there */
 806                         (*och_usecount)--;
 807                 }
 808                 mutex_unlock(&lli->lli_och_mutex);
 809
 810 out_openerr:
 811                 if (lli->lli_opendir_key == fd)
 812                         ll_deauthorize_statahead(inode, fd);
 813                 if (fd != NULL)
 814                         ll_file_data_put(fd);
 815         } else {
 816                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 817         }
 818
 819 out_nofiledata:
 820         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 821                 ptlrpc_req_finished(it->it_request);
 822                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 823         }
 824
 825         return rc;
 826 }
 827
 828 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 829                         struct ldlm_lock_desc *desc, void *data, int flag)
 830 {
 831         int rc;
 832         struct lustre_handle lockh;
 833         ENTRY;
 834
 835         switch (flag) {
 836         case LDLM_CB_BLOCKING:
 837                 ldlm_lock2handle(lock, &lockh);
 838                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 839                 if (rc < 0) {
 840                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 841                         RETURN(rc);
 842                 }
 843                 break;
 844         case LDLM_CB_CANCELING:
 845                 /* do nothing */
 846                 break;
 847         }
 848         RETURN(0);
 849 }
 850
 851 /**
 852  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 853  * and save it as fd->fd_och so as to force client to reopen the file even
 854  * if it has an open lock in cache already.
 855  */
 856 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 857                                 struct lustre_handle *old_open_handle)
 858 {
 859         struct ll_inode_info *lli = ll_i2info(inode);
 860         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 861         struct obd_client_handle **och_p;
 862         __u64 *och_usecount;
 863         int rc = 0;
 864         ENTRY;
 865
 866         /* Get the openhandle of the file */
 867         mutex_lock(&lli->lli_och_mutex);
 868         if (fd->fd_lease_och != NULL)
 869                 GOTO(out_unlock, rc = -EBUSY);
 870
 871         if (fd->fd_och == NULL) {
 872                 if (file->f_mode & FMODE_WRITE) {
 873                         LASSERT(lli->lli_mds_write_och != NULL);
 874                         och_p = &lli->lli_mds_write_och;
 875                         och_usecount = &lli->lli_open_fd_write_count;
 876                 } else {
 877                         LASSERT(lli->lli_mds_read_och != NULL);
 878                         och_p = &lli->lli_mds_read_och;
 879                         och_usecount = &lli->lli_open_fd_read_count;
 880                 }
 881
 882                 if (*och_usecount > 1)
 883                         GOTO(out_unlock, rc = -EBUSY);
 884
 885                 fd->fd_och = *och_p;
 886                 *och_usecount = 0;
 887                 *och_p = NULL;
 888         }
 889
 890         *old_open_handle = fd->fd_och->och_open_handle;
 891
 892         EXIT;
 893 out_unlock:
 894         mutex_unlock(&lli->lli_och_mutex);
 895         return rc;
 896 }
 897
 898 /**
 899  * Release ownership on lli_mds_*_och when putting back a file lease.
 900  */
 901 static int ll_lease_och_release(struct inode *inode, struct file *file)
 902 {
 903         struct ll_inode_info *lli = ll_i2info(inode);
 904         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 905         struct obd_client_handle **och_p;
 906         struct obd_client_handle *old_och = NULL;
 907         __u64 *och_usecount;
 908         int rc = 0;
 909         ENTRY;
 910
 911         mutex_lock(&lli->lli_och_mutex);
 912         if (file->f_mode & FMODE_WRITE) {
 913                 och_p = &lli->lli_mds_write_och;
 914                 och_usecount = &lli->lli_open_fd_write_count;
 915         } else {
 916                 och_p = &lli->lli_mds_read_och;
 917                 och_usecount = &lli->lli_open_fd_read_count;
 918         }
 919
 920         /* The file may have been open by another process (broken lease) so
 921          * *och_p is not NULL. In this case we should simply increase usecount
 922          * and close fd_och.
 923          */
 924         if (*och_p != NULL) {
 925                 old_och = fd->fd_och;
 926                 (*och_usecount)++;
 927         } else {
 928                 *och_p = fd->fd_och;
 929                 *och_usecount = 1;
 930         }
 931         fd->fd_och = NULL;
 932         mutex_unlock(&lli->lli_och_mutex);
 933
 934         if (old_och != NULL)
 935                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 936
 937         RETURN(rc);
 938 }
 939
 940 /**
 941  * Acquire a lease and open the file.
 942  */
 943 static struct obd_client_handle *
 944 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 945               __u64 open_flags)
 946 {
 947         struct lookup_intent it = { .it_op = IT_OPEN };
 948         struct ll_sb_info *sbi = ll_i2sbi(inode);
 949         struct md_op_data *op_data;
 950         struct ptlrpc_request *req = NULL;
 951         struct lustre_handle old_open_handle = { 0 };
 952         struct obd_client_handle *och = NULL;
 953         int rc;
 954         int rc2;
 955         ENTRY;
 956
 957         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 958                 RETURN(ERR_PTR(-EINVAL));
 959
 960         if (file != NULL) {
 961                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 962                         RETURN(ERR_PTR(-EPERM));
 963
 964                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
 965                 if (rc)
 966                         RETURN(ERR_PTR(rc));
 967         }
 968
 969         OBD_ALLOC_PTR(och);
 970         if (och == NULL)
 971                 RETURN(ERR_PTR(-ENOMEM));
 972
 973         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 974                                         LUSTRE_OPC_ANY, NULL);
 975         if (IS_ERR(op_data))
 976                 GOTO(out, rc = PTR_ERR(op_data));
 977
 978         /* To tell the MDT this openhandle is from the same owner */
 979         op_data->op_open_handle = old_open_handle;
 980
 981         it.it_flags = fmode | open_flags;
 982         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 983         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 984                             &ll_md_blocking_lease_ast,
 985         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 986          * it can be cancelled which may mislead applications that the lease is
 987          * broken;
 988          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 989          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 990          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 991                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 992         ll_finish_md_op_data(op_data);
 993         ptlrpc_req_finished(req);
 994         if (rc < 0)
 995                 GOTO(out_release_it, rc);
 996
 997         if (it_disposition(&it, DISP_LOOKUP_NEG))
 998                 GOTO(out_release_it, rc = -ENOENT);
 999
1000         rc = it_open_error(DISP_OPEN_OPEN, &it);
1001         if (rc)
1002                 GOTO(out_release_it, rc);
1003
1004         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1005         ll_och_fill(sbi->ll_md_exp, &it, och);
1006
1007         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1008                 GOTO(out_close, rc = -EOPNOTSUPP);
1009
1010         /* already get lease, handle lease lock */
1011         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1012         if (it.it_lock_mode == 0 ||
1013             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1014                 /* open lock must return for lease */
1015                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1016                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1017                         it.it_lock_bits);
1018                 GOTO(out_close, rc = -EPROTO);
1019         }
1020
1021         ll_intent_release(&it);
1022         RETURN(och);
1023
1024 out_close:
1025         /* Cancel open lock */
1026         if (it.it_lock_mode != 0) {
1027                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1028                                             it.it_lock_mode);
1029                 it.it_lock_mode = 0;
1030                 och->och_lease_handle.cookie = 0ULL;
1031         }
1032         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1033         if (rc2 < 0)
1034                 CERROR("%s: error closing file "DFID": %d\n",
1035                        ll_get_fsname(inode->i_sb, NULL, 0),
1036                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1037         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1038 out_release_it:
1039         ll_intent_release(&it);
1040 out:
1041         if (och != NULL)
1042                 OBD_FREE_PTR(och);
1043         RETURN(ERR_PTR(rc));
1044 }
1045
1046 /**
1047  * Check whether a layout swap can be done between two inodes.
1048  *
1049  * \param[in] inode1  First inode to check
1050  * \param[in] inode2  Second inode to check
1051  *
1052  * \retval 0 on success, layout swap can be performed between both inodes
1053  * \retval negative error code if requirements are not met
1054  */
1055 static int ll_check_swap_layouts_validity(struct inode *inode1,
1056                                           struct inode *inode2)
1057 {
1058         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1059                 return -EINVAL;
1060
1061         if (inode_permission(inode1, MAY_WRITE) ||
1062             inode_permission(inode2, MAY_WRITE))
1063                 return -EPERM;
1064
1065         if (inode1->i_sb != inode2->i_sb)
1066                 return -EXDEV;
1067
1068         return 0;
1069 }
1070
1071 static int ll_swap_layouts_close(struct obd_client_handle *och,
1072                                  struct inode *inode, struct inode *inode2)
1073 {
1074         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1075         const struct lu_fid     *fid2;
1076         int                      rc;
1077         ENTRY;
1078
1079         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1080                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1081
1082         rc = ll_check_swap_layouts_validity(inode, inode2);
1083         if (rc < 0)
1084                 GOTO(out_free_och, rc);
1085
1086         /* We now know that inode2 is a lustre inode */
1087         fid2 = ll_inode2fid(inode2);
1088
1089         rc = lu_fid_cmp(fid1, fid2);
1090         if (rc == 0)
1091                 GOTO(out_free_och, rc = -EINVAL);
1092
1093         /* Close the file and {swap,merge} layouts between inode & inode2.
1094          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1095          * because we still need it to pack l_remote_handle to MDT. */
1096         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1097                                        inode2);
1098
1099         och = NULL; /* freed in ll_close_inode_openhandle() */
1100
1101 out_free_och:
1102         if (och != NULL)
1103                 OBD_FREE_PTR(och);
1104
1105         RETURN(rc);
1106 }
1107
1108 /**
1109  * Release lease and close the file.
1110  * It will check if the lease has ever broken.
1111  */
1112 static int ll_lease_close_intent(struct obd_client_handle *och,
1113                                  struct inode *inode,
1114                                  bool *lease_broken, enum mds_op_bias bias,
1115                                  void *data)
1116 {
1117         struct ldlm_lock *lock;
1118         bool cancelled = true;
1119         int rc;
1120         ENTRY;
1121
1122         lock = ldlm_handle2lock(&och->och_lease_handle);
1123         if (lock != NULL) {
1124                 lock_res_and_lock(lock);
1125                 cancelled = ldlm_is_cancel(lock);
1126                 unlock_res_and_lock(lock);
1127                 LDLM_LOCK_PUT(lock);
1128         }
1129
1130         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1131                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1132
1133         if (lease_broken != NULL)
1134                 *lease_broken = cancelled;
1135
1136         if (!cancelled && !bias)
1137                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1138
1139         if (cancelled) { /* no need to excute intent */
1140                 bias = 0;
1141                 data = NULL;
1142         }
1143
1144         rc = ll_close_inode_openhandle(inode, och, bias, data);
1145         RETURN(rc);
1146 }
1147
1148 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1149                           bool *lease_broken)
1150 {
1151         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1152 }
1153
1154 /**
1155  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1156  */
1157 static int ll_lease_file_resync(struct obd_client_handle *och,
1158                                 struct inode *inode, unsigned long arg)
1159 {
1160         struct ll_sb_info *sbi = ll_i2sbi(inode);
1161         struct md_op_data *op_data;
1162         struct ll_ioc_lease_id ioc;
1163         __u64 data_version_unused;
1164         int rc;
1165         ENTRY;
1166
1167         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1168                                      LUSTRE_OPC_ANY, NULL);
1169         if (IS_ERR(op_data))
1170                 RETURN(PTR_ERR(op_data));
1171
1172         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1173                            sizeof(ioc)))
1174                 RETURN(-EFAULT);
1175
1176         /* before starting file resync, it's necessary to clean up page cache
1177          * in client memory, otherwise once the layout version is increased,
1178          * writing back cached data will be denied the OSTs. */
1179         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1180         if (rc)
1181                 GOTO(out, rc);
1182
1183         op_data->op_lease_handle = och->och_lease_handle;
1184         op_data->op_mirror_id = ioc.lil_mirror_id;
1185         rc = md_file_resync(sbi->ll_md_exp, op_data);
1186         if (rc)
1187                 GOTO(out, rc);
1188
1189         EXIT;
1190 out:
1191         ll_finish_md_op_data(op_data);
1192         return rc;
1193 }
1194
1195 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1196 {
1197         struct ll_inode_info *lli = ll_i2info(inode);
1198         struct cl_object *obj = lli->lli_clob;
1199         struct cl_attr *attr = vvp_env_thread_attr(env);
1200         s64 atime;
1201         s64 mtime;
1202         s64 ctime;
1203         int rc = 0;
1204
1205         ENTRY;
1206
1207         ll_inode_size_lock(inode);
1208
1209         /* Merge timestamps the most recently obtained from MDS with
1210          * timestamps obtained from OSTs.
1211          *
1212          * Do not overwrite atime of inode because it may be refreshed
1213          * by file_accessed() function. If the read was served by cache
1214          * data, there is no RPC to be sent so that atime may not be
1215          * transferred to OSTs at all. MDT only updates atime at close time
1216          * if it's at least 'mdd.*.atime_diff' older.
1217          * All in all, the atime in Lustre does not strictly comply with
1218          * POSIX. Solving this problem needs to send an RPC to MDT for each
1219          * read, this will hurt performance. */
1220         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1221                 LTIME_S(inode->i_atime) = lli->lli_atime;
1222                 lli->lli_update_atime = 0;
1223         }
1224         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1225         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1226
1227         atime = LTIME_S(inode->i_atime);
1228         mtime = LTIME_S(inode->i_mtime);
1229         ctime = LTIME_S(inode->i_ctime);
1230
1231         cl_object_attr_lock(obj);
1232         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1233                 rc = -EINVAL;
1234         else
1235                 rc = cl_object_attr_get(env, obj, attr);
1236         cl_object_attr_unlock(obj);
1237
1238         if (rc != 0)
1239                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1240
1241         if (atime < attr->cat_atime)
1242                 atime = attr->cat_atime;
1243
1244         if (ctime < attr->cat_ctime)
1245                 ctime = attr->cat_ctime;
1246
1247         if (mtime < attr->cat_mtime)
1248                 mtime = attr->cat_mtime;
1249
1250         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1251                PFID(&lli->lli_fid), attr->cat_size);
1252
1253         i_size_write(inode, attr->cat_size);
1254         inode->i_blocks = attr->cat_blocks;
1255
1256         LTIME_S(inode->i_atime) = atime;
1257         LTIME_S(inode->i_mtime) = mtime;
1258         LTIME_S(inode->i_ctime) = ctime;
1259
1260 out_size_unlock:
1261         ll_inode_size_unlock(inode);
1262
1263         RETURN(rc);
1264 }
1265
1266 /**
1267  * Set designated mirror for I/O.
1268  *
1269  * So far only read, write, and truncated can support to issue I/O to
1270  * designated mirror.
1271  */
1272 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1273 {
1274         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1275
1276         /* clear layout version for generic(non-resync) I/O in case it carries
1277          * stale layout version due to I/O restart */
1278         io->ci_layout_version = 0;
1279
1280         /* FLR: disable non-delay for designated mirror I/O because obviously
1281          * only one mirror is available */
1282         if (fd->fd_designated_mirror > 0) {
1283                 io->ci_ndelay = 0;
1284                 io->ci_designated_mirror = fd->fd_designated_mirror;
1285                 io->ci_layout_version = fd->fd_layout_version;
1286                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1287                                  * io to ptasks */
1288         }
1289
1290         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1291                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1292 }
1293
1294 static bool file_is_noatime(const struct file *file)
1295 {
1296         const struct vfsmount *mnt = file->f_path.mnt;
1297         const struct inode *inode = file_inode((struct file *)file);
1298
1299         /* Adapted from file_accessed() and touch_atime().*/
1300         if (file->f_flags & O_NOATIME)
1301                 return true;
1302
1303         if (inode->i_flags & S_NOATIME)
1304                 return true;
1305
1306         if (IS_NOATIME(inode))
1307                 return true;
1308
1309         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1310                 return true;
1311
1312         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1313                 return true;
1314
1315         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1316                 return true;
1317
1318         return false;
1319 }
1320
1321 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1322
1323 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1324 {
1325         struct inode *inode = file_inode(file);
1326         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1327
1328         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1329         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1330         io->u.ci_rw.rw_file = file;
1331         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1332         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1333         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1334
1335         if (iot == CIT_WRITE) {
1336                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1337                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1338                                            file->f_flags & O_DIRECT ||
1339                                            IS_SYNC(inode));
1340         }
1341         io->ci_obj = ll_i2info(inode)->lli_clob;
1342         io->ci_lockreq = CILR_MAYBE;
1343         if (ll_file_nolock(file)) {
1344                 io->ci_lockreq = CILR_NEVER;
1345                 io->ci_no_srvlock = 1;
1346         } else if (file->f_flags & O_APPEND) {
1347                 io->ci_lockreq = CILR_MANDATORY;
1348         }
1349         io->ci_noatime = file_is_noatime(file);
1350         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1351                 io->ci_pio = !io->u.ci_rw.rw_append;
1352         else
1353                 io->ci_pio = 0;
1354
1355         /* FLR: only use non-delay I/O for read as there is only one
1356          * avaliable mirror for write. */
1357         io->ci_ndelay = !(iot == CIT_WRITE);
1358
1359         ll_io_set_mirror(io, file);
1360 }
1361
1362 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1363 {
1364         struct cl_io_pt *pt = ptask->pt_cbdata;
1365         struct file *file = pt->cip_file;
1366         struct lu_env *env;
1367         struct cl_io *io;
1368         loff_t pos = pt->cip_pos;
1369         int rc;
1370         __u16 refcheck;
1371         ENTRY;
1372
1373         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1374                 file_dentry(file)->d_name.name,
1375                 pt->cip_iot == CIT_READ ? "read" : "write",
1376                 pos, pos + pt->cip_count);
1377
1378         env = cl_env_get(&refcheck);
1379         if (IS_ERR(env))
1380                 RETURN(PTR_ERR(env));
1381
1382         io = vvp_env_thread_io(env);
1383         ll_io_init(io, file, pt->cip_iot);
1384         io->u.ci_rw.rw_iter = pt->cip_iter;
1385         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1386         io->ci_pio = 0; /* It's already in parallel task */
1387
1388         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1389                            pt->cip_count - pt->cip_result);
1390         if (!rc) {
1391                 struct vvp_io *vio = vvp_env_io(env);
1392
1393                 vio->vui_io_subtype = IO_NORMAL;
1394                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1395
1396                 ll_cl_add(file, env, io, LCC_RW);
1397                 rc = cl_io_loop(env, io);
1398                 ll_cl_remove(file, env);
1399         } else {
1400                 /* cl_io_rw_init() handled IO */
1401                 rc = io->ci_result;
1402         }
1403
1404         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1405                 if (io->ci_nob > 0)
1406                         io->ci_nob /= 2;
1407                 rc = -EIO;
1408         }
1409
1410         if (io->ci_nob > 0) {
1411                 pt->cip_result += io->ci_nob;
1412                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1413                 pos += io->ci_nob;
1414                 pt->cip_iocb.ki_pos = pos;
1415 #ifdef HAVE_KIOCB_KI_LEFT
1416                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1417 #elif defined(HAVE_KI_NBYTES)
1418                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1419 #endif
1420         }
1421
1422         cl_io_fini(env, io);
1423         cl_env_put(env, &refcheck);
1424
1425         pt->cip_need_restart = io->ci_need_restart;
1426
1427         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1428                 file_dentry(file)->d_name.name,
1429                 pt->cip_iot == CIT_READ ? "read" : "write",
1430                 pt->cip_result, rc);
1431
1432         RETURN(pt->cip_result > 0 ? 0 : rc);
1433 }
1434
1435 static ssize_t
1436 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1437                    struct file *file, enum cl_io_type iot,
1438                    loff_t *ppos, size_t count)
1439 {
1440         struct range_lock       range;
1441         struct vvp_io           *vio = vvp_env_io(env);
1442         struct inode            *inode = file_inode(file);
1443         struct ll_inode_info    *lli = ll_i2info(inode);
1444         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1445         struct cl_io            *io;
1446         loff_t                  pos = *ppos;
1447         ssize_t                 result = 0;
1448         int                     rc = 0;
1449         unsigned                retried = 0;
1450         bool                    restarted = false;
1451
1452         ENTRY;
1453
1454         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1455                 file_dentry(file)->d_name.name,
1456                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1457
1458 restart:
1459         io = vvp_env_thread_io(env);
1460         ll_io_init(io, file, iot);
1461         if (args->via_io_subtype == IO_NORMAL) {
1462                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1463                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1464         }
1465         if (args->via_io_subtype != IO_NORMAL || restarted)
1466                 io->ci_pio = 0;
1467         io->ci_ndelay_tried = retried;
1468
1469         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1470                 bool range_locked = false;
1471
1472                 if (file->f_flags & O_APPEND)
1473                         range_lock_init(&range, 0, LUSTRE_EOF);
1474                 else
1475                         range_lock_init(&range, pos, pos + count - 1);
1476
1477                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1478                 vio->vui_io_subtype = args->via_io_subtype;
1479
1480                 switch (vio->vui_io_subtype) {
1481                 case IO_NORMAL:
1482                         /* Direct IO reads must also take range lock,
1483                          * or multiple reads will try to work on the same pages
1484                          * See LU-6227 for details. */
1485                         if (((iot == CIT_WRITE) ||
1486                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1487                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1488                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1489                                        RL_PARA(&range));
1490                                 rc = range_lock(&lli->lli_write_tree, &range);
1491                                 if (rc < 0)
1492                                         GOTO(out, rc);
1493
1494                                 range_locked = true;
1495                         }
1496                         break;
1497                 case IO_SPLICE:
1498                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1499                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1500                         break;
1501                 default:
1502                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1503                         LBUG();
1504                 }
1505
1506                 ll_cl_add(file, env, io, LCC_RW);
1507                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1508                     !lli->lli_inode_locked) {
1509                         inode_lock(inode);
1510                         lli->lli_inode_locked = 1;
1511                 }
1512                 rc = cl_io_loop(env, io);
1513                 if (lli->lli_inode_locked) {
1514                         lli->lli_inode_locked = 0;
1515                         inode_unlock(inode);
1516                 }
1517                 ll_cl_remove(file, env);
1518
1519                 if (range_locked) {
1520                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1521                                RL_PARA(&range));
1522                         range_unlock(&lli->lli_write_tree, &range);
1523                 }
1524         } else {
1525                 /* cl_io_rw_init() handled IO */
1526                 rc = io->ci_result;
1527         }
1528
1529         if (io->ci_nob > 0) {
1530                 result += io->ci_nob;
1531                 count  -= io->ci_nob;
1532
1533                 if (args->via_io_subtype == IO_NORMAL) {
1534                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1535
1536                         /* CLIO is too complicated. See LU-11069. */
1537                         if (cl_io_is_append(io))
1538                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1539                         else
1540                                 pos += io->ci_nob;
1541
1542                         args->u.normal.via_iocb->ki_pos = pos;
1543 #ifdef HAVE_KIOCB_KI_LEFT
1544                         args->u.normal.via_iocb->ki_left = count;
1545 #elif defined(HAVE_KI_NBYTES)
1546                         args->u.normal.via_iocb->ki_nbytes = count;
1547 #endif
1548                 } else {
1549                         /* for splice */
1550                         pos = io->u.ci_rw.rw_range.cir_pos;
1551                 }
1552         }
1553 out:
1554         cl_io_fini(env, io);
1555
1556         CDEBUG(D_VFSTRACE,
1557                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1558                file->f_path.dentry->d_name.name,
1559                iot, rc, result, io->ci_need_restart);
1560
1561         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1562                 CDEBUG(D_VFSTRACE,
1563                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1564                         file_dentry(file)->d_name.name,
1565                         iot == CIT_READ ? "read" : "write",
1566                         pos, pos + count, result, rc);
1567                 /* preserve the tried count for FLR */
1568                 retried = io->ci_ndelay_tried;
1569                 restarted = true;
1570                 goto restart;
1571         }
1572
1573         if (iot == CIT_READ) {
1574                 if (result > 0)
1575                         ll_stats_ops_tally(ll_i2sbi(inode),
1576                                            LPROC_LL_READ_BYTES, result);
1577         } else if (iot == CIT_WRITE) {
1578                 if (result > 0) {
1579                         ll_stats_ops_tally(ll_i2sbi(inode),
1580                                            LPROC_LL_WRITE_BYTES, result);
1581                         fd->fd_write_failed = false;
1582                 } else if (result == 0 && rc == 0) {
1583                         rc = io->ci_result;
1584                         if (rc < 0)
1585                                 fd->fd_write_failed = true;
1586                         else
1587                                 fd->fd_write_failed = false;
1588                 } else if (rc != -ERESTARTSYS) {
1589                         fd->fd_write_failed = true;
1590                 }
1591         }
1592
1593         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1594                 file_dentry(file)->d_name.name,
1595                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1596
1597         *ppos = pos;
1598
1599         RETURN(result > 0 ? result : rc);
1600 }
1601
1602 /**
1603  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1604  * especially for small I/O.
1605  *
1606  * To serve a read request, CLIO has to create and initialize a cl_io and
1607  * then request DLM lock. This has turned out to have siginificant overhead
1608  * and affects the performance of small I/O dramatically.
1609  *
1610  * It's not necessary to create a cl_io for each I/O. Under the help of read
1611  * ahead, most of the pages being read are already in memory cache and we can
1612  * read those pages directly because if the pages exist, the corresponding DLM
1613  * lock must exist so that page content must be valid.
1614  *
1615  * In fast read implementation, the llite speculatively finds and reads pages
1616  * in memory cache. There are three scenarios for fast read:
1617  *   - If the page exists and is uptodate, kernel VM will provide the data and
1618  *     CLIO won't be intervened;
1619  *   - If the page was brought into memory by read ahead, it will be exported
1620  *     and read ahead parameters will be updated;
1621  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1622  *     it will go back and invoke normal read, i.e., a cl_io will be created
1623  *     and DLM lock will be requested.
1624  *
1625  * POSIX compliance: posix standard states that read is intended to be atomic.
1626  * Lustre read implementation is in line with Linux kernel read implementation
1627  * and neither of them complies with POSIX standard in this matter. Fast read
1628  * doesn't make the situation worse on single node but it may interleave write
1629  * results from multiple nodes due to short read handling in ll_file_aio_read().
1630  *
1631  * \param env - lu_env
1632  * \param iocb - kiocb from kernel
1633  * \param iter - user space buffers where the data will be copied
1634  *
1635  * \retval - number of bytes have been read, or error code if error occurred.
1636  */
1637 static ssize_t
1638 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1639 {
1640         ssize_t result;
1641
1642         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1643                 return 0;
1644
1645         /* NB: we can't do direct IO for fast read because it will need a lock
1646          * to make IO engine happy. */
1647         if (iocb->ki_filp->f_flags & O_DIRECT)
1648                 return 0;
1649
1650         result = generic_file_read_iter(iocb, iter);
1651
1652         /* If the first page is not in cache, generic_file_aio_read() will be
1653          * returned with -ENODATA.
1654          * See corresponding code in ll_readpage(). */
1655         if (result == -ENODATA)
1656                 result = 0;
1657
1658         if (result > 0)
1659                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1660                                 LPROC_LL_READ_BYTES, result);
1661
1662         return result;
1663 }
1664
1665 /*
1666  * Read from a file (through the page cache).
1667  */
1668 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1669 {
1670         struct lu_env *env;
1671         struct vvp_io_args *args;
1672         ssize_t result;
1673         ssize_t rc2;
1674         __u16 refcheck;
1675
1676         result = ll_do_fast_read(iocb, to);
1677         if (result < 0 || iov_iter_count(to) == 0)
1678                 GOTO(out, result);
1679
1680         env = cl_env_get(&refcheck);
1681         if (IS_ERR(env))
1682                 return PTR_ERR(env);
1683
1684         args = ll_env_args(env, IO_NORMAL);
1685         args->u.normal.via_iter = to;
1686         args->u.normal.via_iocb = iocb;
1687
1688         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1689                                  &iocb->ki_pos, iov_iter_count(to));
1690         if (rc2 > 0)
1691                 result += rc2;
1692         else if (result == 0)
1693                 result = rc2;
1694
1695         cl_env_put(env, &refcheck);
1696 out:
1697         return result;
1698 }
1699
1700 /**
1701  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1702  * If a page is already in the page cache and dirty (and some other things -
1703  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1704  * write to it without doing a full I/O, because Lustre already knows about it
1705  * and will write it out.  This saves a lot of processing time.
1706  *
1707  * All writes here are within one page, so exclusion is handled by the page
1708  * lock on the vm page.  We do not do tiny writes for writes which touch
1709  * multiple pages because it's very unlikely multiple sequential pages are
1710  * are already dirty.
1711  *
1712  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1713  * and are unlikely to be to already dirty pages.
1714  *
1715  * Attribute updates are important here, we do them in ll_tiny_write_end.
1716  */
1717 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1718 {
1719         ssize_t count = iov_iter_count(iter);
1720         struct file *file = iocb->ki_filp;
1721         struct inode *inode = file_inode(file);
1722         ssize_t result = 0;
1723
1724         ENTRY;
1725
1726         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1727          * of function for why.
1728          */
1729         if (count >= PAGE_SIZE ||
1730             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1731                 RETURN(0);
1732
1733         result = __generic_file_write_iter(iocb, iter);
1734
1735         /* If the page is not already dirty, ll_tiny_write_begin returns
1736          * -ENODATA.  We continue on to normal write.
1737          */
1738         if (result == -ENODATA)
1739                 result = 0;
1740
1741         if (result > 0) {
1742                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1743                                    result);
1744                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1745         }
1746
1747         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1748
1749         RETURN(result);
1750 }
1751
1752 /*
1753  * Write to a file (through the page cache).
1754  */
1755 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1756 {
1757         struct vvp_io_args *args;
1758         struct lu_env *env;
1759         ssize_t rc_tiny = 0, rc_normal;
1760         __u16 refcheck;
1761
1762         ENTRY;
1763
1764         /* NB: we can't do direct IO for tiny writes because they use the page
1765          * cache, we can't do sync writes because tiny writes can't flush
1766          * pages, and we can't do append writes because we can't guarantee the
1767          * required DLM locks are held to protect file size.
1768          */
1769         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1770             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1771                 rc_tiny = ll_do_tiny_write(iocb, from);
1772
1773         /* In case of error, go on and try normal write - Only stop if tiny
1774          * write completed I/O.
1775          */
1776         if (iov_iter_count(from) == 0)
1777                 GOTO(out, rc_normal = rc_tiny);
1778
1779         env = cl_env_get(&refcheck);
1780         if (IS_ERR(env))
1781                 return PTR_ERR(env);
1782
1783         args = ll_env_args(env, IO_NORMAL);
1784         args->u.normal.via_iter = from;
1785         args->u.normal.via_iocb = iocb;
1786
1787         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1788                                     &iocb->ki_pos, iov_iter_count(from));
1789
1790         /* On success, combine bytes written. */
1791         if (rc_tiny >= 0 && rc_normal > 0)
1792                 rc_normal += rc_tiny;
1793         /* On error, only return error from normal write if tiny write did not
1794          * write any bytes.  Otherwise return bytes written by tiny write.
1795          */
1796         else if (rc_tiny > 0)
1797                 rc_normal = rc_tiny;
1798
1799         cl_env_put(env, &refcheck);
1800 out:
1801         RETURN(rc_normal);
1802 }
1803
1804 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1805 /*
1806  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1807  */
1808 static int ll_file_get_iov_count(const struct iovec *iov,
1809                                  unsigned long *nr_segs, size_t *count)
1810 {
1811         size_t cnt = 0;
1812         unsigned long seg;
1813
1814         for (seg = 0; seg < *nr_segs; seg++) {
1815                 const struct iovec *iv = &iov[seg];
1816
1817                 /*
1818                  * If any segment has a negative length, or the cumulative
1819                  * length ever wraps negative then return -EINVAL.
1820                  */
1821                 cnt += iv->iov_len;
1822                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1823                         return -EINVAL;
1824                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1825                         continue;
1826                 if (seg == 0)
1827                         return -EFAULT;
1828                 *nr_segs = seg;
1829                 cnt -= iv->iov_len;     /* This segment is no good */
1830                 break;
1831         }
1832         *count = cnt;
1833         return 0;
1834 }
1835
1836 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1837                                 unsigned long nr_segs, loff_t pos)
1838 {
1839         struct iov_iter to;
1840         size_t iov_count;
1841         ssize_t result;
1842         ENTRY;
1843
1844         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1845         if (result)
1846                 RETURN(result);
1847
1848 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1849         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1850 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1851         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1852 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1853
1854         result = ll_file_read_iter(iocb, &to);
1855
1856         RETURN(result);
1857 }
1858
1859 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1860                             loff_t *ppos)
1861 {
1862         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1863         struct kiocb   kiocb;
1864         ssize_t        result;
1865         ENTRY;
1866
1867         init_sync_kiocb(&kiocb, file);
1868         kiocb.ki_pos = *ppos;
1869 #ifdef HAVE_KIOCB_KI_LEFT
1870         kiocb.ki_left = count;
1871 #elif defined(HAVE_KI_NBYTES)
1872         kiocb.i_nbytes = count;
1873 #endif
1874
1875         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1876         *ppos = kiocb.ki_pos;
1877
1878         RETURN(result);
1879 }
1880
1881 /*
1882  * Write to a file (through the page cache).
1883  * AIO stuff
1884  */
1885 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1886                                  unsigned long nr_segs, loff_t pos)
1887 {
1888         struct iov_iter from;
1889         size_t iov_count;
1890         ssize_t result;
1891         ENTRY;
1892
1893         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1894         if (result)
1895                 RETURN(result);
1896
1897 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1898         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1899 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1900         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1901 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1902
1903         result = ll_file_write_iter(iocb, &from);
1904
1905         RETURN(result);
1906 }
1907
1908 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1909                              size_t count, loff_t *ppos)
1910 {
1911         struct iovec   iov = { .iov_base = (void __user *)buf,
1912                                .iov_len = count };
1913         struct kiocb   kiocb;
1914         ssize_t        result;
1915
1916         ENTRY;
1917
1918         init_sync_kiocb(&kiocb, file);
1919         kiocb.ki_pos = *ppos;
1920 #ifdef HAVE_KIOCB_KI_LEFT
1921         kiocb.ki_left = count;
1922 #elif defined(HAVE_KI_NBYTES)
1923         kiocb.ki_nbytes = count;
1924 #endif
1925
1926         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1927         *ppos = kiocb.ki_pos;
1928
1929         RETURN(result);
1930 }
1931 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1932
1933 /*
1934  * Send file content (through pagecache) somewhere with helper
1935  */
1936 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1937                                    struct pipe_inode_info *pipe, size_t count,
1938                                    unsigned int flags)
1939 {
1940         struct lu_env      *env;
1941         struct vvp_io_args *args;
1942         ssize_t             result;
1943         __u16               refcheck;
1944         ENTRY;
1945
1946         env = cl_env_get(&refcheck);
1947         if (IS_ERR(env))
1948                 RETURN(PTR_ERR(env));
1949
1950         args = ll_env_args(env, IO_SPLICE);
1951         args->u.splice.via_pipe = pipe;
1952         args->u.splice.via_flags = flags;
1953
1954         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1955         cl_env_put(env, &refcheck);
1956         RETURN(result);
1957 }
1958
1959 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1960                              __u64 flags, struct lov_user_md *lum, int lum_size)
1961 {
1962         struct lookup_intent oit = {
1963                 .it_op = IT_OPEN,
1964                 .it_flags = flags | MDS_OPEN_BY_FID,
1965         };
1966         int rc;
1967         ENTRY;
1968
1969         ll_inode_size_lock(inode);
1970         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1971         if (rc < 0)
1972                 GOTO(out_unlock, rc);
1973
1974         ll_release_openhandle(dentry, &oit);
1975
1976 out_unlock:
1977         ll_inode_size_unlock(inode);
1978         ll_intent_release(&oit);
1979
1980         RETURN(rc);
1981 }
1982
1983 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1984                              struct lov_mds_md **lmmp, int *lmm_size,
1985                              struct ptlrpc_request **request)
1986 {
1987         struct ll_sb_info *sbi = ll_i2sbi(inode);
1988         struct mdt_body  *body;
1989         struct lov_mds_md *lmm = NULL;
1990         struct ptlrpc_request *req = NULL;
1991         struct md_op_data *op_data;
1992         int rc, lmmsize;
1993
1994         rc = ll_get_default_mdsize(sbi, &lmmsize);
1995         if (rc)
1996                 RETURN(rc);
1997
1998         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1999                                      strlen(filename), lmmsize,
2000                                      LUSTRE_OPC_ANY, NULL);
2001         if (IS_ERR(op_data))
2002                 RETURN(PTR_ERR(op_data));
2003
2004         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2005         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2006         ll_finish_md_op_data(op_data);
2007         if (rc < 0) {
2008                 CDEBUG(D_INFO, "md_getattr_name failed "
2009                        "on %s: rc %d\n", filename, rc);
2010                 GOTO(out, rc);
2011         }
2012
2013         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2014         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2015
2016         lmmsize = body->mbo_eadatasize;
2017
2018         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2019                         lmmsize == 0) {
2020                 GOTO(out, rc = -ENODATA);
2021         }
2022
2023         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2024         LASSERT(lmm != NULL);
2025
2026         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2027             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2028             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2029                 GOTO(out, rc = -EPROTO);
2030
2031         /*
2032          * This is coming from the MDS, so is probably in
2033          * little endian.  We convert it to host endian before
2034          * passing it to userspace.
2035          */
2036         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2037                 int stripe_count;
2038
2039                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2040                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2041                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2042                         if (le32_to_cpu(lmm->lmm_pattern) &
2043                             LOV_PATTERN_F_RELEASED)
2044                                 stripe_count = 0;
2045                 }
2046
2047                 /* if function called for directory - we should
2048                  * avoid swab not existent lsm objects */
2049                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2050                         lustre_swab_lov_user_md_v1(
2051                                         (struct lov_user_md_v1 *)lmm);
2052                         if (S_ISREG(body->mbo_mode))
2053                                 lustre_swab_lov_user_md_objects(
2054                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2055                                     stripe_count);
2056                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2057                         lustre_swab_lov_user_md_v3(
2058                                         (struct lov_user_md_v3 *)lmm);
2059                         if (S_ISREG(body->mbo_mode))
2060                                 lustre_swab_lov_user_md_objects(
2061                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2062                                     stripe_count);
2063                 } else if (lmm->lmm_magic ==
2064                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2065                         lustre_swab_lov_comp_md_v1(
2066                                         (struct lov_comp_md_v1 *)lmm);
2067                 }
2068         }
2069
2070 out:
2071         *lmmp = lmm;
2072         *lmm_size = lmmsize;
2073         *request = req;
2074         return rc;
2075 }
2076
2077 static int ll_lov_setea(struct inode *inode, struct file *file,
2078                         void __user *arg)
2079 {
2080         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2081         struct lov_user_md      *lump;
2082         int                      lum_size = sizeof(struct lov_user_md) +
2083                                             sizeof(struct lov_user_ost_data);
2084         int                      rc;
2085         ENTRY;
2086
2087         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2088                 RETURN(-EPERM);
2089
2090         OBD_ALLOC_LARGE(lump, lum_size);
2091         if (lump == NULL)
2092                 RETURN(-ENOMEM);
2093
2094         if (copy_from_user(lump, arg, lum_size))
2095                 GOTO(out_lump, rc = -EFAULT);
2096
2097         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2098                                       lum_size);
2099         cl_lov_delay_create_clear(&file->f_flags);
2100
2101 out_lump:
2102         OBD_FREE_LARGE(lump, lum_size);
2103         RETURN(rc);
2104 }
2105
2106 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2107 {
2108         struct lu_env   *env;
2109         __u16           refcheck;
2110         int             rc;
2111         ENTRY;
2112
2113         env = cl_env_get(&refcheck);
2114         if (IS_ERR(env))
2115                 RETURN(PTR_ERR(env));
2116
2117         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2118         cl_env_put(env, &refcheck);
2119         RETURN(rc);
2120 }
2121
2122 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2123                             void __user *arg)
2124 {
2125         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2126         struct lov_user_md        *klum;
2127         int                        lum_size, rc;
2128         __u64                      flags = FMODE_WRITE;
2129         ENTRY;
2130
2131         rc = ll_copy_user_md(lum, &klum);
2132         if (rc < 0)
2133                 RETURN(rc);
2134
2135         lum_size = rc;
2136         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2137                                       lum_size);
2138         if (!rc) {
2139                 __u32 gen;
2140
2141                 rc = put_user(0, &lum->lmm_stripe_count);
2142                 if (rc)
2143                         GOTO(out, rc);
2144
2145                 rc = ll_layout_refresh(inode, &gen);
2146                 if (rc)
2147                         GOTO(out, rc);
2148
2149                 rc = ll_file_getstripe(inode, arg, lum_size);
2150         }
2151         cl_lov_delay_create_clear(&file->f_flags);
2152
2153 out:
2154         OBD_FREE(klum, lum_size);
2155         RETURN(rc);
2156 }
2157
2158 static int
2159 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2160 {
2161         struct ll_inode_info *lli = ll_i2info(inode);
2162         struct cl_object *obj = lli->lli_clob;
2163         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2164         struct ll_grouplock grouplock;
2165         int rc;
2166         ENTRY;
2167
2168         if (arg == 0) {
2169                 CWARN("group id for group lock must not be 0\n");
2170                 RETURN(-EINVAL);
2171         }
2172
2173         if (ll_file_nolock(file))
2174                 RETURN(-EOPNOTSUPP);
2175
2176         spin_lock(&lli->lli_lock);
2177         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2178                 CWARN("group lock already existed with gid %lu\n",
2179                       fd->fd_grouplock.lg_gid);
2180                 spin_unlock(&lli->lli_lock);
2181                 RETURN(-EINVAL);
2182         }
2183         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2184         spin_unlock(&lli->lli_lock);
2185
2186         /**
2187          * XXX: group lock needs to protect all OST objects while PFL
2188          * can add new OST objects during the IO, so we'd instantiate
2189          * all OST objects before getting its group lock.
2190          */
2191         if (obj) {
2192                 struct lu_env *env;
2193                 __u16 refcheck;
2194                 struct cl_layout cl = {
2195                         .cl_is_composite = false,
2196                 };
2197                 struct lu_extent ext = {
2198                         .e_start = 0,
2199                         .e_end = OBD_OBJECT_EOF,
2200                 };
2201
2202                 env = cl_env_get(&refcheck);
2203                 if (IS_ERR(env))
2204                         RETURN(PTR_ERR(env));
2205
2206                 rc = cl_object_layout_get(env, obj, &cl);
2207                 if (!rc && cl.cl_is_composite)
2208                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2209                                                     &ext);
2210
2211                 cl_env_put(env, &refcheck);
2212                 if (rc)
2213                         RETURN(rc);
2214         }
2215
2216         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2217                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2218         if (rc)
2219                 RETURN(rc);
2220
2221         spin_lock(&lli->lli_lock);
2222         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2223                 spin_unlock(&lli->lli_lock);
2224                 CERROR("another thread just won the race\n");
2225                 cl_put_grouplock(&grouplock);
2226                 RETURN(-EINVAL);
2227         }
2228
2229         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2230         fd->fd_grouplock = grouplock;
2231         spin_unlock(&lli->lli_lock);
2232
2233         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2234         RETURN(0);
2235 }
2236
2237 static int ll_put_grouplock(struct inode *inode, struct file *file,
2238                             unsigned long arg)
2239 {
2240         struct ll_inode_info   *lli = ll_i2info(inode);
2241         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2242         struct ll_grouplock     grouplock;
2243         ENTRY;
2244
2245         spin_lock(&lli->lli_lock);
2246         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2247                 spin_unlock(&lli->lli_lock);
2248                 CWARN("no group lock held\n");
2249                 RETURN(-EINVAL);
2250         }
2251
2252         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2253
2254         if (fd->fd_grouplock.lg_gid != arg) {
2255                 CWARN("group lock %lu doesn't match current id %lu\n",
2256                       arg, fd->fd_grouplock.lg_gid);
2257                 spin_unlock(&lli->lli_lock);
2258                 RETURN(-EINVAL);
2259         }
2260
2261         grouplock = fd->fd_grouplock;
2262         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2263         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2264         spin_unlock(&lli->lli_lock);
2265
2266         cl_put_grouplock(&grouplock);
2267         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2268         RETURN(0);
2269 }
2270
2271 /**
2272  * Close inode open handle
2273  *
2274  * \param dentry [in]     dentry which contains the inode
2275  * \param it     [in,out] intent which contains open info and result
2276  *
2277  * \retval 0     success
2278  * \retval <0    failure
2279  */
2280 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2281 {
2282         struct inode *inode = dentry->d_inode;
2283         struct obd_client_handle *och;
2284         int rc;
2285         ENTRY;
2286
2287         LASSERT(inode);
2288
2289         /* Root ? Do nothing. */
2290         if (dentry->d_inode->i_sb->s_root == dentry)
2291                 RETURN(0);
2292
2293         /* No open handle to close? Move away */
2294         if (!it_disposition(it, DISP_OPEN_OPEN))
2295                 RETURN(0);
2296
2297         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2298
2299         OBD_ALLOC(och, sizeof(*och));
2300         if (!och)
2301                 GOTO(out, rc = -ENOMEM);
2302
2303         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2304
2305         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2306 out:
2307         /* this one is in place of ll_file_open */
2308         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2309                 ptlrpc_req_finished(it->it_request);
2310                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2311         }
2312         RETURN(rc);
2313 }
2314
2315 /**
2316  * Get size for inode for which FIEMAP mapping is requested.
2317  * Make the FIEMAP get_info call and returns the result.
2318  * \param fiemap        kernel buffer to hold extens
2319  * \param num_bytes     kernel buffer size
2320  */
2321 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2322                         size_t num_bytes)
2323 {
2324         struct lu_env                   *env;
2325         __u16                           refcheck;
2326         int                             rc = 0;
2327         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2328         ENTRY;
2329
2330         /* Checks for fiemap flags */
2331         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2332                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2333                 return -EBADR;
2334         }
2335
2336         /* Check for FIEMAP_FLAG_SYNC */
2337         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2338                 rc = filemap_fdatawrite(inode->i_mapping);
2339                 if (rc)
2340                         return rc;
2341         }
2342
2343         env = cl_env_get(&refcheck);
2344         if (IS_ERR(env))
2345                 RETURN(PTR_ERR(env));
2346
2347         if (i_size_read(inode) == 0) {
2348                 rc = ll_glimpse_size(inode);
2349                 if (rc)
2350                         GOTO(out, rc);
2351         }
2352
2353         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2354         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2355         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2356
2357         /* If filesize is 0, then there would be no objects for mapping */
2358         if (fmkey.lfik_oa.o_size == 0) {
2359                 fiemap->fm_mapped_extents = 0;
2360                 GOTO(out, rc = 0);
2361         }
2362
2363         fmkey.lfik_fiemap = *fiemap;
2364
2365         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2366                               &fmkey, fiemap, &num_bytes);
2367 out:
2368         cl_env_put(env, &refcheck);
2369         RETURN(rc);
2370 }
2371
2372 int ll_fid2path(struct inode *inode, void __user *arg)
2373 {
2374         struct obd_export       *exp = ll_i2mdexp(inode);
2375         const struct getinfo_fid2path __user *gfin = arg;
2376         __u32                    pathlen;
2377         struct getinfo_fid2path *gfout;
2378         size_t                   outsize;
2379         int                      rc;
2380
2381         ENTRY;
2382
2383         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2384             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2385                 RETURN(-EPERM);
2386
2387         /* Only need to get the buflen */
2388         if (get_user(pathlen, &gfin->gf_pathlen))
2389                 RETURN(-EFAULT);
2390
2391         if (pathlen > PATH_MAX)
2392                 RETURN(-EINVAL);
2393
2394         outsize = sizeof(*gfout) + pathlen;
2395         OBD_ALLOC(gfout, outsize);
2396         if (gfout == NULL)
2397                 RETURN(-ENOMEM);
2398
2399         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2400                 GOTO(gf_free, rc = -EFAULT);
2401         /* append root FID after gfout to let MDT know the root FID so that it
2402          * can lookup the correct path, this is mainly for fileset.
2403          * old server without fileset mount support will ignore this. */
2404         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2405
2406         /* Call mdc_iocontrol */
2407         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2408         if (rc != 0)
2409                 GOTO(gf_free, rc);
2410
2411         if (copy_to_user(arg, gfout, outsize))
2412                 rc = -EFAULT;
2413
2414 gf_free:
2415         OBD_FREE(gfout, outsize);
2416         RETURN(rc);
2417 }
2418
2419 static int
2420 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2421 {
2422         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2423         struct lu_env *env;
2424         struct cl_io *io;
2425         __u16  refcheck;
2426         int result;
2427
2428         ENTRY;
2429
2430         ioc->idv_version = 0;
2431         ioc->idv_layout_version = UINT_MAX;
2432
2433         /* If no file object initialized, we consider its version is 0. */
2434         if (obj == NULL)
2435                 RETURN(0);
2436
2437         env = cl_env_get(&refcheck);
2438         if (IS_ERR(env))
2439                 RETURN(PTR_ERR(env));
2440
2441         io = vvp_env_thread_io(env);
2442         io->ci_obj = obj;
2443         io->u.ci_data_version.dv_data_version = 0;
2444         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2445         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2446
2447 restart:
2448         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2449                 result = cl_io_loop(env, io);
2450         else
2451                 result = io->ci_result;
2452
2453         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2454         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2455
2456         cl_io_fini(env, io);
2457
2458         if (unlikely(io->ci_need_restart))
2459                 goto restart;
2460
2461         cl_env_put(env, &refcheck);
2462
2463         RETURN(result);
2464 }
2465
2466 /*
2467  * Read the data_version for inode.
2468  *
2469  * This value is computed using stripe object version on OST.
2470  * Version is computed using server side locking.
2471  *
2472  * @param flags if do sync on the OST side;
2473  *              0: no sync
2474  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2475  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2476  */
2477 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2478 {
2479         struct ioc_data_version ioc = { .idv_flags = flags };
2480         int rc;
2481
2482         rc = ll_ioc_data_version(inode, &ioc);
2483         if (!rc)
2484                 *data_version = ioc.idv_version;
2485
2486         return rc;
2487 }
2488
2489 /*
2490  * Trigger a HSM release request for the provided inode.
2491  */
2492 int ll_hsm_release(struct inode *inode)
2493 {
2494         struct lu_env *env;
2495         struct obd_client_handle *och = NULL;
2496         __u64 data_version = 0;
2497         int rc;
2498         __u16 refcheck;
2499         ENTRY;
2500
2501         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2502                ll_get_fsname(inode->i_sb, NULL, 0),
2503                PFID(&ll_i2info(inode)->lli_fid));
2504
2505         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2506         if (IS_ERR(och))
2507                 GOTO(out, rc = PTR_ERR(och));
2508
2509         /* Grab latest data_version and [am]time values */
2510         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2511         if (rc != 0)
2512                 GOTO(out, rc);
2513
2514         env = cl_env_get(&refcheck);
2515         if (IS_ERR(env))
2516                 GOTO(out, rc = PTR_ERR(env));
2517
2518         rc = ll_merge_attr(env, inode);
2519         cl_env_put(env, &refcheck);
2520
2521         /* If error happen, we have the wrong size for a file.
2522          * Don't release it.
2523          */
2524         if (rc != 0)
2525                 GOTO(out, rc);
2526
2527         /* Release the file.
2528          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2529          * we still need it to pack l_remote_handle to MDT. */
2530         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2531                                        &data_version);
2532         och = NULL;
2533
2534         EXIT;
2535 out:
2536         if (och != NULL && !IS_ERR(och)) /* close the file */
2537                 ll_lease_close(och, inode, NULL);
2538
2539         return rc;
2540 }
2541
2542 struct ll_swap_stack {
2543         __u64                    dv1;
2544         __u64                    dv2;
2545         struct inode            *inode1;
2546         struct inode            *inode2;
2547         bool                     check_dv1;
2548         bool                     check_dv2;
2549 };
2550
2551 static int ll_swap_layouts(struct file *file1, struct file *file2,
2552                            struct lustre_swap_layouts *lsl)
2553 {
2554         struct mdc_swap_layouts  msl;
2555         struct md_op_data       *op_data;
2556         __u32                    gid;
2557         __u64                    dv;
2558         struct ll_swap_stack    *llss = NULL;
2559         int                      rc;
2560
2561         OBD_ALLOC_PTR(llss);
2562         if (llss == NULL)
2563                 RETURN(-ENOMEM);
2564
2565         llss->inode1 = file_inode(file1);
2566         llss->inode2 = file_inode(file2);
2567
2568         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2569         if (rc < 0)
2570                 GOTO(free, rc);
2571
2572         /* we use 2 bool because it is easier to swap than 2 bits */
2573         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2574                 llss->check_dv1 = true;
2575
2576         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2577                 llss->check_dv2 = true;
2578
2579         /* we cannot use lsl->sl_dvX directly because we may swap them */
2580         llss->dv1 = lsl->sl_dv1;
2581         llss->dv2 = lsl->sl_dv2;
2582
2583         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2584         if (rc == 0) /* same file, done! */
2585                 GOTO(free, rc);
2586
2587         if (rc < 0) { /* sequentialize it */
2588                 swap(llss->inode1, llss->inode2);
2589                 swap(file1, file2);
2590                 swap(llss->dv1, llss->dv2);
2591                 swap(llss->check_dv1, llss->check_dv2);
2592         }
2593
2594         gid = lsl->sl_gid;
2595         if (gid != 0) { /* application asks to flush dirty cache */
2596                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2597                 if (rc < 0)
2598                         GOTO(free, rc);
2599
2600                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2601                 if (rc < 0) {
2602                         ll_put_grouplock(llss->inode1, file1, gid);
2603                         GOTO(free, rc);
2604                 }
2605         }
2606
2607         /* ultimate check, before swaping the layouts we check if
2608          * dataversion has changed (if requested) */
2609         if (llss->check_dv1) {
2610                 rc = ll_data_version(llss->inode1, &dv, 0);
2611                 if (rc)
2612                         GOTO(putgl, rc);
2613                 if (dv != llss->dv1)
2614                         GOTO(putgl, rc = -EAGAIN);
2615         }
2616
2617         if (llss->check_dv2) {
2618                 rc = ll_data_version(llss->inode2, &dv, 0);
2619                 if (rc)
2620                         GOTO(putgl, rc);
2621                 if (dv != llss->dv2)
2622                         GOTO(putgl, rc = -EAGAIN);
2623         }
2624
2625         /* struct md_op_data is used to send the swap args to the mdt
2626          * only flags is missing, so we use struct mdc_swap_layouts
2627          * through the md_op_data->op_data */
2628         /* flags from user space have to be converted before they are send to
2629          * server, no flag is sent today, they are only used on the client */
2630         msl.msl_flags = 0;
2631         rc = -ENOMEM;
2632         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2633                                      0, LUSTRE_OPC_ANY, &msl);
2634         if (IS_ERR(op_data))
2635                 GOTO(free, rc = PTR_ERR(op_data));
2636
2637         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2638                            sizeof(*op_data), op_data, NULL);
2639         ll_finish_md_op_data(op_data);
2640
2641         if (rc < 0)
2642                 GOTO(putgl, rc);
2643
2644 putgl:
2645         if (gid != 0) {
2646                 ll_put_grouplock(llss->inode2, file2, gid);
2647                 ll_put_grouplock(llss->inode1, file1, gid);
2648         }
2649
2650 free:
2651         if (llss != NULL)
2652                 OBD_FREE_PTR(llss);
2653
2654         RETURN(rc);
2655 }
2656
2657 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2658 {
2659         struct md_op_data       *op_data;
2660         int                      rc;
2661         ENTRY;
2662
2663         /* Detect out-of range masks */
2664         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2665                 RETURN(-EINVAL);
2666
2667         /* Non-root users are forbidden to set or clear flags which are
2668          * NOT defined in HSM_USER_MASK. */
2669         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2670             !cfs_capable(CFS_CAP_SYS_ADMIN))
2671                 RETURN(-EPERM);
2672
2673         /* Detect out-of range archive id */
2674         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2675             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2676                 RETURN(-EINVAL);
2677
2678         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2679                                      LUSTRE_OPC_ANY, hss);
2680         if (IS_ERR(op_data))
2681                 RETURN(PTR_ERR(op_data));
2682
2683         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2684                            sizeof(*op_data), op_data, NULL);
2685
2686         ll_finish_md_op_data(op_data);
2687
2688         RETURN(rc);
2689 }
2690
2691 static int ll_hsm_import(struct inode *inode, struct file *file,
2692                          struct hsm_user_import *hui)
2693 {
2694         struct hsm_state_set    *hss = NULL;
2695         struct iattr            *attr = NULL;
2696         int                      rc;
2697         ENTRY;
2698
2699         if (!S_ISREG(inode->i_mode))
2700                 RETURN(-EINVAL);
2701
2702         /* set HSM flags */
2703         OBD_ALLOC_PTR(hss);
2704         if (hss == NULL)
2705                 GOTO(out, rc = -ENOMEM);
2706
2707         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2708         hss->hss_archive_id = hui->hui_archive_id;
2709         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2710         rc = ll_hsm_state_set(inode, hss);
2711         if (rc != 0)
2712                 GOTO(out, rc);
2713
2714         OBD_ALLOC_PTR(attr);
2715         if (attr == NULL)
2716                 GOTO(out, rc = -ENOMEM);
2717
2718         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2719         attr->ia_mode |= S_IFREG;
2720         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2721         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2722         attr->ia_size = hui->hui_size;
2723         attr->ia_mtime.tv_sec = hui->hui_mtime;
2724         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2725         attr->ia_atime.tv_sec = hui->hui_atime;
2726         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2727
2728         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2729                          ATTR_UID | ATTR_GID |
2730                          ATTR_MTIME | ATTR_MTIME_SET |
2731                          ATTR_ATIME | ATTR_ATIME_SET;
2732
2733         inode_lock(inode);
2734
2735         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2736         if (rc == -ENODATA)
2737                 rc = 0;
2738
2739         inode_unlock(inode);
2740
2741 out:
2742         if (hss != NULL)
2743                 OBD_FREE_PTR(hss);
2744
2745         if (attr != NULL)
2746                 OBD_FREE_PTR(attr);
2747
2748         RETURN(rc);
2749 }
2750
2751 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2752 {
2753         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2754                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2755 }
2756
2757 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2758 {
2759         struct inode *inode = file_inode(file);
2760         struct iattr ia = {
2761                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2762                             ATTR_MTIME | ATTR_MTIME_SET |
2763                             ATTR_CTIME,
2764                 .ia_atime = {
2765                         .tv_sec = lfu->lfu_atime_sec,
2766                         .tv_nsec = lfu->lfu_atime_nsec,
2767                 },
2768                 .ia_mtime = {
2769                         .tv_sec = lfu->lfu_mtime_sec,
2770                         .tv_nsec = lfu->lfu_mtime_nsec,
2771                 },
2772                 .ia_ctime = {
2773                         .tv_sec = lfu->lfu_ctime_sec,
2774                         .tv_nsec = lfu->lfu_ctime_nsec,
2775                 },
2776         };
2777         int rc;
2778         ENTRY;
2779
2780         if (!capable(CAP_SYS_ADMIN))
2781                 RETURN(-EPERM);
2782
2783         if (!S_ISREG(inode->i_mode))
2784                 RETURN(-EINVAL);
2785
2786         inode_lock(inode);
2787         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2788                             false);
2789         inode_unlock(inode);
2790
2791         RETURN(rc);
2792 }
2793
2794 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2795 {
2796         switch (mode) {
2797         case MODE_READ_USER:
2798                 return CLM_READ;
2799         case MODE_WRITE_USER:
2800                 return CLM_WRITE;
2801         default:
2802                 return -EINVAL;
2803         }
2804 }
2805
2806 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2807
2808 /* Used to allow the upper layers of the client to request an LDLM lock
2809  * without doing an actual read or write.
2810  *
2811  * Used for ladvise lockahead to manually request specific locks.
2812  *
2813  * \param[in] file      file this ladvise lock request is on
2814  * \param[in] ladvise   ladvise struct describing this lock request
2815  *
2816  * \retval 0            success, no detailed result available (sync requests
2817  *                      and requests sent to the server [not handled locally]
2818  *                      cannot return detailed results)
2819  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2820  *                                       see definitions for details.
2821  * \retval negative     negative errno on error
2822  */
2823 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2824 {
2825         struct lu_env *env = NULL;
2826         struct cl_io *io  = NULL;
2827         struct cl_lock *lock = NULL;
2828         struct cl_lock_descr *descr = NULL;
2829         struct dentry *dentry = file->f_path.dentry;
2830         struct inode *inode = dentry->d_inode;
2831         enum cl_lock_mode cl_mode;
2832         off_t start = ladvise->lla_start;
2833         off_t end = ladvise->lla_end;
2834         int result;
2835         __u16 refcheck;
2836
2837         ENTRY;
2838
2839         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2840                "start=%llu, end=%llu\n", dentry->d_name.len,
2841                dentry->d_name.name, dentry->d_inode,
2842                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2843                (__u64) end);
2844
2845         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2846         if (cl_mode < 0)
2847                 GOTO(out, result = cl_mode);
2848
2849         /* Get IO environment */
2850         result = cl_io_get(inode, &env, &io, &refcheck);
2851         if (result <= 0)
2852                 GOTO(out, result);
2853
2854         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2855         if (result > 0) {
2856                 /*
2857                  * nothing to do for this io. This currently happens when
2858                  * stripe sub-object's are not yet created.
2859                  */
2860                 result = io->ci_result;
2861         } else if (result == 0) {
2862                 lock = vvp_env_lock(env);
2863                 descr = &lock->cll_descr;
2864
2865                 descr->cld_obj   = io->ci_obj;
2866                 /* Convert byte offsets to pages */
2867                 descr->cld_start = cl_index(io->ci_obj, start);
2868                 descr->cld_end   = cl_index(io->ci_obj, end);
2869                 descr->cld_mode  = cl_mode;
2870                 /* CEF_MUST is used because we do not want to convert a
2871                  * lockahead request to a lockless lock */
2872                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2873                                        CEF_NONBLOCK;
2874
2875                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2876                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2877
2878                 result = cl_lock_request(env, io, lock);
2879
2880                 /* On success, we need to release the lock */
2881                 if (result >= 0)
2882                         cl_lock_release(env, lock);
2883         }
2884         cl_io_fini(env, io);
2885         cl_env_put(env, &refcheck);
2886
2887         /* -ECANCELED indicates a matching lock with a different extent
2888          * was already present, and -EEXIST indicates a matching lock
2889          * on exactly the same extent was already present.
2890          * We convert them to positive values for userspace to make
2891          * recognizing true errors easier.
2892          * Note we can only return these detailed results on async requests,
2893          * as sync requests look the same as i/o requests for locking. */
2894         if (result == -ECANCELED)
2895                 result = LLA_RESULT_DIFFERENT;
2896         else if (result == -EEXIST)
2897                 result = LLA_RESULT_SAME;
2898
2899 out:
2900         RETURN(result);
2901 }
2902 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2903
2904 static int ll_ladvise_sanity(struct inode *inode,
2905                              struct llapi_lu_ladvise *ladvise)
2906 {
2907         enum lu_ladvise_type advice = ladvise->lla_advice;
2908         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2909          * be in the first 32 bits of enum ladvise_flags */
2910         __u32 flags = ladvise->lla_peradvice_flags;
2911         /* 3 lines at 80 characters per line, should be plenty */
2912         int rc = 0;
2913
2914         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2915                 rc = -EINVAL;
2916                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2917                        "last supported advice is %s (value '%d'): rc = %d\n",
2918                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2919                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2920                 GOTO(out, rc);
2921         }
2922
2923         /* Per-advice checks */
2924         switch (advice) {
2925         case LU_LADVISE_LOCKNOEXPAND:
2926                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2927                         rc = -EINVAL;
2928                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2929                                "rc = %d\n",
2930                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2931                                ladvise_names[advice], rc);
2932                         GOTO(out, rc);
2933                 }
2934                 break;
2935         case LU_LADVISE_LOCKAHEAD:
2936                 /* Currently only READ and WRITE modes can be requested */
2937                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2938                     ladvise->lla_lockahead_mode == 0) {
2939                         rc = -EINVAL;
2940                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2941                                "rc = %d\n",
2942                                ll_get_fsname(inode->i_sb, NULL, 0),
2943                                ladvise->lla_lockahead_mode,
2944                                ladvise_names[advice], rc);
2945                         GOTO(out, rc);
2946                 }
2947         case LU_LADVISE_WILLREAD:
2948         case LU_LADVISE_DONTNEED:
2949         default:
2950                 /* Note fall through above - These checks apply to all advices
2951                  * except LOCKNOEXPAND */
2952                 if (flags & ~LF_DEFAULT_MASK) {
2953                         rc = -EINVAL;
2954                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2955                                "rc = %d\n",
2956                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2957                                ladvise_names[advice], rc);
2958                         GOTO(out, rc);
2959                 }
2960                 if (ladvise->lla_start >= ladvise->lla_end) {
2961                         rc = -EINVAL;
2962                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2963                                "for %s: rc = %d\n",
2964                                ll_get_fsname(inode->i_sb, NULL, 0),
2965                                ladvise->lla_start, ladvise->lla_end,
2966                                ladvise_names[advice], rc);
2967                         GOTO(out, rc);
2968                 }
2969                 break;
2970         }
2971
2972 out:
2973         return rc;
2974 }
2975 #undef ERRSIZE
2976
2977 /*
2978  * Give file access advices
2979  *
2980  * The ladvise interface is similar to Linux fadvise() system call, except it
2981  * forwards the advices directly from Lustre client to server. The server side
2982  * codes will apply appropriate read-ahead and caching techniques for the
2983  * corresponding files.
2984  *
2985  * A typical workload for ladvise is e.g. a bunch of different clients are
2986  * doing small random reads of a file, so prefetching pages into OSS cache
2987  * with big linear reads before the random IO is a net benefit. Fetching
2988  * all that data into each client cache with fadvise() may not be, due to
2989  * much more data being sent to the client.
2990  */
2991 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2992                       struct llapi_lu_ladvise *ladvise)
2993 {
2994         struct lu_env *env;
2995         struct cl_io *io;
2996         struct cl_ladvise_io *lio;
2997         int rc;
2998         __u16 refcheck;
2999         ENTRY;
3000
3001         env = cl_env_get(&refcheck);
3002         if (IS_ERR(env))
3003                 RETURN(PTR_ERR(env));
3004
3005         io = vvp_env_thread_io(env);
3006         io->ci_obj = ll_i2info(inode)->lli_clob;
3007
3008         /* initialize parameters for ladvise */
3009         lio = &io->u.ci_ladvise;
3010         lio->li_start = ladvise->lla_start;
3011         lio->li_end = ladvise->lla_end;
3012         lio->li_fid = ll_inode2fid(inode);
3013         lio->li_advice = ladvise->lla_advice;
3014         lio->li_flags = flags;
3015
3016         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3017                 rc = cl_io_loop(env, io);
3018         else
3019                 rc = io->ci_result;
3020
3021         cl_io_fini(env, io);
3022         cl_env_put(env, &refcheck);
3023         RETURN(rc);
3024 }
3025
3026 static int ll_lock_noexpand(struct file *file, int flags)
3027 {
3028         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3029
3030         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3031
3032         return 0;
3033 }
3034
3035 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3036                         unsigned long arg)
3037 {
3038         struct fsxattr fsxattr;
3039
3040         if (copy_from_user(&fsxattr,
3041                            (const struct fsxattr __user *)arg,
3042                            sizeof(fsxattr)))
3043                 RETURN(-EFAULT);
3044
3045         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3046         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3047                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3048         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3049         if (copy_to_user((struct fsxattr __user *)arg,
3050                          &fsxattr, sizeof(fsxattr)))
3051                 RETURN(-EFAULT);
3052
3053         RETURN(0);
3054 }
3055
3056 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3057 {
3058         /*
3059          * Project Quota ID state is only allowed to change from within the init
3060          * namespace. Enforce that restriction only if we are trying to change
3061          * the quota ID state. Everything else is allowed in user namespaces.
3062          */
3063         if (current_user_ns() == &init_user_ns)
3064                 return 0;
3065
3066         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3067                 return -EINVAL;
3068
3069         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3070                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3071                         return -EINVAL;
3072         } else {
3073                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3074                         return -EINVAL;
3075         }
3076
3077         return 0;
3078 }
3079
3080 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3081                         unsigned long arg)
3082 {
3083
3084         struct md_op_data *op_data;
3085         struct ptlrpc_request *req = NULL;
3086         int rc = 0;
3087         struct fsxattr fsxattr;
3088         struct cl_object *obj;
3089         struct iattr *attr;
3090         int flags;
3091
3092         if (copy_from_user(&fsxattr,
3093                            (const struct fsxattr __user *)arg,
3094                            sizeof(fsxattr)))
3095                 RETURN(-EFAULT);
3096
3097         rc = ll_ioctl_check_project(inode, &fsxattr);
3098         if (rc)
3099                 RETURN(rc);
3100
3101         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3102                                      LUSTRE_OPC_ANY, NULL);
3103         if (IS_ERR(op_data))
3104                 RETURN(PTR_ERR(op_data));
3105
3106         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3107         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3108         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3110         op_data->op_projid = fsxattr.fsx_projid;
3111         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3112         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3113                         0, &req);
3114         ptlrpc_req_finished(req);
3115         if (rc)
3116                 GOTO(out_fsxattr, rc);
3117         ll_update_inode_flags(inode, op_data->op_attr_flags);
3118         obj = ll_i2info(inode)->lli_clob;
3119         if (obj == NULL)
3120                 GOTO(out_fsxattr, rc);
3121
3122         OBD_ALLOC_PTR(attr);
3123         if (attr == NULL)
3124                 GOTO(out_fsxattr, rc = -ENOMEM);
3125
3126         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3127                             fsxattr.fsx_xflags);
3128         OBD_FREE_PTR(attr);
3129 out_fsxattr:
3130         ll_finish_md_op_data(op_data);
3131         RETURN(rc);
3132 }
3133
3134 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3135                                  unsigned long arg)
3136 {
3137         struct inode            *inode = file_inode(file);
3138         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3139         struct ll_inode_info    *lli = ll_i2info(inode);
3140         struct obd_client_handle *och = NULL;
3141         struct split_param sp;
3142         bool lease_broken;
3143         fmode_t fmode = 0;
3144         enum mds_op_bias bias = 0;
3145         struct file *layout_file = NULL;
3146         void *data = NULL;
3147         size_t data_size = 0;
3148         long rc;
3149         ENTRY;
3150
3151         mutex_lock(&lli->lli_och_mutex);
3152         if (fd->fd_lease_och != NULL) {
3153                 och = fd->fd_lease_och;
3154                 fd->fd_lease_och = NULL;
3155         }
3156         mutex_unlock(&lli->lli_och_mutex);
3157
3158         if (och == NULL)
3159                 GOTO(out, rc = -ENOLCK);
3160
3161         fmode = och->och_flags;
3162
3163         switch (ioc->lil_flags) {
3164         case LL_LEASE_RESYNC_DONE:
3165                 if (ioc->lil_count > IOC_IDS_MAX)
3166                         GOTO(out, rc = -EINVAL);
3167
3168                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3169                 OBD_ALLOC(data, data_size);
3170                 if (!data)
3171                         GOTO(out, rc = -ENOMEM);
3172
3173                 if (copy_from_user(data, (void __user *)arg, data_size))
3174                         GOTO(out, rc = -EFAULT);
3175
3176                 bias = MDS_CLOSE_RESYNC_DONE;
3177                 break;
3178         case LL_LEASE_LAYOUT_MERGE: {
3179                 int fd;
3180
3181                 if (ioc->lil_count != 1)
3182                         GOTO(out, rc = -EINVAL);
3183
3184                 arg += sizeof(*ioc);
3185                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3186                         GOTO(out, rc = -EFAULT);
3187
3188                 layout_file = fget(fd);
3189                 if (!layout_file)
3190                         GOTO(out, rc = -EBADF);
3191
3192                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3193                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3194                         GOTO(out, rc = -EPERM);
3195
3196                 data = file_inode(layout_file);
3197                 bias = MDS_CLOSE_LAYOUT_MERGE;
3198                 break;
3199         }
3200         case LL_LEASE_LAYOUT_SPLIT: {
3201                 int fdv;
3202                 int mirror_id;
3203
3204                 if (ioc->lil_count != 2)
3205                         GOTO(out, rc = -EINVAL);
3206
3207                 arg += sizeof(*ioc);
3208                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3209                         GOTO(out, rc = -EFAULT);
3210
3211                 arg += sizeof(__u32);
3212                 if (copy_from_user(&mirror_id, (void __user *)arg,
3213                                    sizeof(__u32)))
3214                         GOTO(out, rc = -EFAULT);
3215
3216                 layout_file = fget(fdv);
3217                 if (!layout_file)
3218                         GOTO(out, rc = -EBADF);
3219
3220                 sp.sp_inode = file_inode(layout_file);
3221                 sp.sp_mirror_id = (__u16)mirror_id;
3222                 data = &sp;
3223                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3224                 break;
3225         }
3226         default:
3227                 /* without close intent */
3228                 break;
3229         }
3230
3231         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3232         if (rc < 0)
3233                 GOTO(out, rc);
3234
3235         rc = ll_lease_och_release(inode, file);
3236         if (rc < 0)
3237                 GOTO(out, rc);
3238
3239         if (lease_broken)
3240                 fmode = 0;
3241         EXIT;
3242
3243 out:
3244         switch (ioc->lil_flags) {
3245         case LL_LEASE_RESYNC_DONE:
3246                 if (data)
3247                         OBD_FREE(data, data_size);
3248                 break;
3249         case LL_LEASE_LAYOUT_MERGE:
3250         case LL_LEASE_LAYOUT_SPLIT:
3251                 if (layout_file)
3252                         fput(layout_file);
3253                 break;
3254         }
3255
3256         if (!rc)
3257                 rc = ll_lease_type_from_fmode(fmode);
3258         RETURN(rc);
3259 }
3260
3261 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3262                               unsigned long arg)
3263 {
3264         struct inode *inode = file_inode(file);
3265         struct ll_inode_info *lli = ll_i2info(inode);
3266         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3267         struct obd_client_handle *och = NULL;
3268         __u64 open_flags = 0;
3269         bool lease_broken;
3270         fmode_t fmode;
3271         long rc;
3272         ENTRY;
3273
3274         switch (ioc->lil_mode) {
3275         case LL_LEASE_WRLCK:
3276                 if (!(file->f_mode & FMODE_WRITE))
3277                         RETURN(-EPERM);
3278                 fmode = FMODE_WRITE;
3279                 break;
3280         case LL_LEASE_RDLCK:
3281                 if (!(file->f_mode & FMODE_READ))
3282                         RETURN(-EPERM);
3283                 fmode = FMODE_READ;
3284                 break;
3285         case LL_LEASE_UNLCK:
3286                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3287         default:
3288                 RETURN(-EINVAL);
3289         }
3290
3291         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3292
3293         /* apply for lease */
3294         if (ioc->lil_flags & LL_LEASE_RESYNC)
3295                 open_flags = MDS_OPEN_RESYNC;
3296         och = ll_lease_open(inode, file, fmode, open_flags);
3297         if (IS_ERR(och))
3298                 RETURN(PTR_ERR(och));
3299
3300         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3301                 rc = ll_lease_file_resync(och, inode, arg);
3302                 if (rc) {
3303                         ll_lease_close(och, inode, NULL);
3304                         RETURN(rc);
3305                 }
3306                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3307                 if (rc) {
3308                         ll_lease_close(och, inode, NULL);
3309                         RETURN(rc);
3310                 }
3311         }
3312
3313         rc = 0;
3314         mutex_lock(&lli->lli_och_mutex);
3315         if (fd->fd_lease_och == NULL) {
3316                 fd->fd_lease_och = och;
3317                 och = NULL;
3318         }
3319         mutex_unlock(&lli->lli_och_mutex);
3320         if (och != NULL) {
3321                 /* impossible now that only excl is supported for now */
3322                 ll_lease_close(och, inode, &lease_broken);
3323                 rc = -EBUSY;
3324         }
3325         RETURN(rc);
3326 }
3327
3328 static long
3329 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3330 {
3331         struct inode            *inode = file_inode(file);
3332         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3333         int                      flags, rc;
3334         ENTRY;
3335
3336         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3337                PFID(ll_inode2fid(inode)), inode, cmd);
3338         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3339
3340         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3341         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3342                 RETURN(-ENOTTY);
3343
3344         switch (cmd) {
3345         case LL_IOC_GETFLAGS:
3346                 /* Get the current value of the file flags */
3347                 return put_user(fd->fd_flags, (int __user *)arg);
3348         case LL_IOC_SETFLAGS:
3349         case LL_IOC_CLRFLAGS:
3350                 /* Set or clear specific file flags */
3351                 /* XXX This probably needs checks to ensure the flags are
3352                  *     not abused, and to handle any flag side effects.
3353                  */
3354                 if (get_user(flags, (int __user *) arg))
3355                         RETURN(-EFAULT);
3356
3357                 if (cmd == LL_IOC_SETFLAGS) {
3358                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3359                             !(file->f_flags & O_DIRECT)) {
3360                                 CERROR("%s: unable to disable locking on "
3361                                        "non-O_DIRECT file\n", current->comm);
3362                                 RETURN(-EINVAL);
3363                         }
3364
3365                         fd->fd_flags |= flags;
3366                 } else {
3367                         fd->fd_flags &= ~flags;
3368                 }
3369                 RETURN(0);
3370         case LL_IOC_LOV_SETSTRIPE:
3371         case LL_IOC_LOV_SETSTRIPE_NEW:
3372                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3373         case LL_IOC_LOV_SETEA:
3374                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3375         case LL_IOC_LOV_SWAP_LAYOUTS: {
3376                 struct file *file2;
3377                 struct lustre_swap_layouts lsl;
3378
3379                 if (copy_from_user(&lsl, (char __user *)arg,
3380                                    sizeof(struct lustre_swap_layouts)))
3381                         RETURN(-EFAULT);
3382
3383                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3384                         RETURN(-EPERM);
3385
3386                 file2 = fget(lsl.sl_fd);
3387                 if (file2 == NULL)
3388                         RETURN(-EBADF);
3389
3390                 /* O_WRONLY or O_RDWR */
3391                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3392                         GOTO(out, rc = -EPERM);
3393
3394                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3395                         struct inode                    *inode2;
3396                         struct ll_inode_info            *lli;
3397                         struct obd_client_handle        *och = NULL;
3398
3399                         lli = ll_i2info(inode);
3400                         mutex_lock(&lli->lli_och_mutex);
3401                         if (fd->fd_lease_och != NULL) {
3402                                 och = fd->fd_lease_och;
3403                                 fd->fd_lease_och = NULL;
3404                         }
3405                         mutex_unlock(&lli->lli_och_mutex);
3406                         if (och == NULL)
3407                                 GOTO(out, rc = -ENOLCK);
3408                         inode2 = file_inode(file2);
3409                         rc = ll_swap_layouts_close(och, inode, inode2);
3410                 } else {
3411                         rc = ll_swap_layouts(file, file2, &lsl);
3412                 }
3413 out:
3414                 fput(file2);
3415                 RETURN(rc);
3416         }
3417         case LL_IOC_LOV_GETSTRIPE:
3418         case LL_IOC_LOV_GETSTRIPE_NEW:
3419                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3420         case FS_IOC_GETFLAGS:
3421         case FS_IOC_SETFLAGS:
3422                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3423         case FSFILT_IOC_GETVERSION:
3424         case FS_IOC_GETVERSION:
3425                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3426         /* We need to special case any other ioctls we want to handle,
3427          * to send them to the MDS/OST as appropriate and to properly
3428          * network encode the arg field. */
3429         case FS_IOC_SETVERSION:
3430                 RETURN(-ENOTSUPP);
3431
3432         case LL_IOC_GROUP_LOCK:
3433                 RETURN(ll_get_grouplock(inode, file, arg));
3434         case LL_IOC_GROUP_UNLOCK:
3435                 RETURN(ll_put_grouplock(inode, file, arg));
3436         case IOC_OBD_STATFS:
3437                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3438
3439         case LL_IOC_FLUSHCTX:
3440                 RETURN(ll_flush_ctx(inode));
3441         case LL_IOC_PATH2FID: {
3442                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3443                                  sizeof(struct lu_fid)))
3444                         RETURN(-EFAULT);
3445
3446                 RETURN(0);
3447         }
3448         case LL_IOC_GETPARENT:
3449                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3450
3451         case OBD_IOC_FID2PATH:
3452                 RETURN(ll_fid2path(inode, (void __user *)arg));
3453         case LL_IOC_DATA_VERSION: {
3454                 struct ioc_data_version idv;
3455                 int rc;
3456
3457                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3458                         RETURN(-EFAULT);
3459
3460                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3461                 rc = ll_ioc_data_version(inode, &idv);
3462
3463                 if (rc == 0 &&
3464                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3465                         RETURN(-EFAULT);
3466
3467                 RETURN(rc);
3468         }
3469
3470         case LL_IOC_GET_MDTIDX: {
3471                 int mdtidx;
3472
3473                 mdtidx = ll_get_mdt_idx(inode);
3474                 if (mdtidx < 0)
3475                         RETURN(mdtidx);
3476
3477                 if (put_user((int)mdtidx, (int __user *)arg))
3478                         RETURN(-EFAULT);
3479
3480                 RETURN(0);
3481         }
3482         case OBD_IOC_GETDTNAME:
3483         case OBD_IOC_GETMDNAME:
3484                 RETURN(ll_get_obd_name(inode, cmd, arg));
3485         case LL_IOC_HSM_STATE_GET: {
3486                 struct md_op_data       *op_data;
3487                 struct hsm_user_state   *hus;
3488                 int                      rc;
3489
3490                 OBD_ALLOC_PTR(hus);
3491                 if (hus == NULL)
3492                         RETURN(-ENOMEM);
3493
3494                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3495                                              LUSTRE_OPC_ANY, hus);
3496                 if (IS_ERR(op_data)) {
3497                         OBD_FREE_PTR(hus);
3498                         RETURN(PTR_ERR(op_data));
3499                 }
3500
3501                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3502                                    op_data, NULL);
3503
3504                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3505                         rc = -EFAULT;
3506
3507                 ll_finish_md_op_data(op_data);
3508                 OBD_FREE_PTR(hus);
3509                 RETURN(rc);
3510         }
3511         case LL_IOC_HSM_STATE_SET: {
3512                 struct hsm_state_set    *hss;
3513                 int                      rc;
3514
3515                 OBD_ALLOC_PTR(hss);
3516                 if (hss == NULL)
3517                         RETURN(-ENOMEM);
3518
3519                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3520                         OBD_FREE_PTR(hss);
3521                         RETURN(-EFAULT);
3522                 }
3523
3524                 rc = ll_hsm_state_set(inode, hss);
3525
3526                 OBD_FREE_PTR(hss);
3527                 RETURN(rc);
3528         }
3529         case LL_IOC_HSM_ACTION: {
3530                 struct md_op_data               *op_data;
3531                 struct hsm_current_action       *hca;
3532                 int                              rc;
3533
3534                 OBD_ALLOC_PTR(hca);
3535                 if (hca == NULL)
3536                         RETURN(-ENOMEM);
3537
3538                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3539                                              LUSTRE_OPC_ANY, hca);
3540                 if (IS_ERR(op_data)) {
3541                         OBD_FREE_PTR(hca);
3542                         RETURN(PTR_ERR(op_data));
3543                 }
3544
3545                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3546                                    op_data, NULL);
3547
3548                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3549                         rc = -EFAULT;
3550
3551                 ll_finish_md_op_data(op_data);
3552                 OBD_FREE_PTR(hca);
3553                 RETURN(rc);
3554         }
3555         case LL_IOC_SET_LEASE_OLD: {
3556                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3557
3558                 RETURN(ll_file_set_lease(file, &ioc, 0));
3559         }
3560         case LL_IOC_SET_LEASE: {
3561                 struct ll_ioc_lease ioc;
3562
3563                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3564                         RETURN(-EFAULT);
3565
3566                 RETURN(ll_file_set_lease(file, &ioc, arg));
3567         }
3568         case LL_IOC_GET_LEASE: {
3569                 struct ll_inode_info *lli = ll_i2info(inode);
3570                 struct ldlm_lock *lock = NULL;
3571                 fmode_t fmode = 0;
3572
3573                 mutex_lock(&lli->lli_och_mutex);
3574                 if (fd->fd_lease_och != NULL) {
3575                         struct obd_client_handle *och = fd->fd_lease_och;
3576
3577                         lock = ldlm_handle2lock(&och->och_lease_handle);
3578                         if (lock != NULL) {
3579                                 lock_res_and_lock(lock);
3580                                 if (!ldlm_is_cancel(lock))
3581                                         fmode = och->och_flags;
3582
3583                                 unlock_res_and_lock(lock);
3584                                 LDLM_LOCK_PUT(lock);
3585                         }
3586                 }
3587                 mutex_unlock(&lli->lli_och_mutex);
3588
3589                 RETURN(ll_lease_type_from_fmode(fmode));
3590         }
3591         case LL_IOC_HSM_IMPORT: {
3592                 struct hsm_user_import *hui;
3593
3594                 OBD_ALLOC_PTR(hui);
3595                 if (hui == NULL)
3596                         RETURN(-ENOMEM);
3597
3598                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3599                         OBD_FREE_PTR(hui);
3600                         RETURN(-EFAULT);
3601                 }
3602
3603                 rc = ll_hsm_import(inode, file, hui);
3604
3605                 OBD_FREE_PTR(hui);
3606                 RETURN(rc);
3607         }
3608         case LL_IOC_FUTIMES_3: {
3609                 struct ll_futimes_3 lfu;
3610
3611                 if (copy_from_user(&lfu,
3612                                    (const struct ll_futimes_3 __user *)arg,
3613                                    sizeof(lfu)))
3614                         RETURN(-EFAULT);
3615
3616                 RETURN(ll_file_futimes_3(file, &lfu));
3617         }
3618         case LL_IOC_LADVISE: {
3619                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3620                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3621                 int i;
3622                 int num_advise;
3623                 int alloc_size = sizeof(*k_ladvise_hdr);
3624
3625                 rc = 0;
3626                 u_ladvise_hdr = (void __user *)arg;
3627                 OBD_ALLOC_PTR(k_ladvise_hdr);
3628                 if (k_ladvise_hdr == NULL)
3629                         RETURN(-ENOMEM);
3630
3631                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3632                         GOTO(out_ladvise, rc = -EFAULT);
3633
3634                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3635                     k_ladvise_hdr->lah_count < 1)
3636                         GOTO(out_ladvise, rc = -EINVAL);
3637
3638                 num_advise = k_ladvise_hdr->lah_count;
3639                 if (num_advise >= LAH_COUNT_MAX)
3640                         GOTO(out_ladvise, rc = -EFBIG);
3641
3642                 OBD_FREE_PTR(k_ladvise_hdr);
3643                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3644                                       lah_advise[num_advise]);
3645                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3646                 if (k_ladvise_hdr == NULL)
3647                         RETURN(-ENOMEM);
3648
3649                 /*
3650                  * TODO: submit multiple advices to one server in a single RPC
3651                  */
3652                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3653                         GOTO(out_ladvise, rc = -EFAULT);
3654
3655                 for (i = 0; i < num_advise; i++) {
3656                         struct llapi_lu_ladvise *k_ladvise =
3657                                         &k_ladvise_hdr->lah_advise[i];
3658                         struct llapi_lu_ladvise __user *u_ladvise =
3659                                         &u_ladvise_hdr->lah_advise[i];
3660
3661                         rc = ll_ladvise_sanity(inode, k_ladvise);
3662                         if (rc)
3663                                 GOTO(out_ladvise, rc);
3664
3665                         switch (k_ladvise->lla_advice) {
3666                         case LU_LADVISE_LOCKNOEXPAND:
3667                                 rc = ll_lock_noexpand(file,
3668                                                k_ladvise->lla_peradvice_flags);
3669                                 GOTO(out_ladvise, rc);
3670                         case LU_LADVISE_LOCKAHEAD:
3671
3672                                 rc = ll_file_lock_ahead(file, k_ladvise);
3673
3674                                 if (rc < 0)
3675                                         GOTO(out_ladvise, rc);
3676
3677                                 if (put_user(rc,
3678                                              &u_ladvise->lla_lockahead_result))
3679                                         GOTO(out_ladvise, rc = -EFAULT);
3680                                 break;
3681                         default:
3682                                 rc = ll_ladvise(inode, file,
3683                                                 k_ladvise_hdr->lah_flags,
3684                                                 k_ladvise);
3685                                 if (rc)
3686                                         GOTO(out_ladvise, rc);
3687                                 break;
3688                         }
3689
3690                 }
3691
3692 out_ladvise:
3693                 OBD_FREE(k_ladvise_hdr, alloc_size);
3694                 RETURN(rc);
3695         }
3696         case LL_IOC_FLR_SET_MIRROR: {
3697                 /* mirror I/O must be direct to avoid polluting page cache
3698                  * by stale data. */
3699                 if (!(file->f_flags & O_DIRECT))
3700                         RETURN(-EINVAL);
3701
3702                 fd->fd_designated_mirror = (__u32)arg;
3703                 RETURN(0);
3704         }
3705         case LL_IOC_FSGETXATTR:
3706                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3707         case LL_IOC_FSSETXATTR:
3708                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3709         case BLKSSZGET:
3710                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3711         default:
3712                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3713                                      (void __user *)arg));
3714         }
3715 }
3716
3717 #ifndef HAVE_FILE_LLSEEK_SIZE
3718 static inline loff_t
3719 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3720 {
3721         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3722                 return -EINVAL;
3723         if (offset > maxsize)
3724                 return -EINVAL;
3725
3726         if (offset != file->f_pos) {
3727                 file->f_pos = offset;
3728                 file->f_version = 0;
3729         }
3730         return offset;
3731 }
3732
3733 static loff_t
3734 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3735                 loff_t maxsize, loff_t eof)
3736 {
3737         struct inode *inode = file_inode(file);
3738
3739         switch (origin) {
3740         case SEEK_END:
3741                 offset += eof;
3742                 break;
3743         case SEEK_CUR:
3744                 /*
3745                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3746                  * position-querying operation.  Avoid rewriting the "same"
3747                  * f_pos value back to the file because a concurrent read(),
3748                  * write() or lseek() might have altered it
3749                  */
3750                 if (offset == 0)
3751                         return file->f_pos;
3752                 /*
3753                  * f_lock protects against read/modify/write race with other
3754                  * SEEK_CURs. Note that parallel writes and reads behave
3755                  * like SEEK_SET.
3756                  */
3757                 inode_lock(inode);
3758                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3759                 inode_unlock(inode);
3760                 return offset;
3761         case SEEK_DATA:
3762                 /*
3763                  * In the generic case the entire file is data, so as long as
3764                  * offset isn't at the end of the file then the offset is data.
3765                  */
3766                 if (offset >= eof)
3767                         return -ENXIO;
3768                 break;
3769         case SEEK_HOLE:
3770                 /*
3771                  * There is a virtual hole at the end of the file, so as long as
3772                  * offset isn't i_size or larger, return i_size.
3773                  */
3774                 if (offset >= eof)
3775                         return -ENXIO;
3776                 offset = eof;
3777                 break;
3778         }
3779
3780         return llseek_execute(file, offset, maxsize);
3781 }
3782 #endif
3783
3784 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3785 {
3786         struct inode *inode = file_inode(file);
3787         loff_t retval, eof = 0;
3788
3789         ENTRY;
3790         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3791                            (origin == SEEK_CUR) ? file->f_pos : 0);
3792         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3793                PFID(ll_inode2fid(inode)), inode, retval, retval,
3794                origin);
3795         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3796
3797         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3798                 retval = ll_glimpse_size(inode);
3799                 if (retval != 0)
3800                         RETURN(retval);
3801                 eof = i_size_read(inode);
3802         }
3803
3804         retval = ll_generic_file_llseek_size(file, offset, origin,
3805                                           ll_file_maxbytes(inode), eof);
3806         RETURN(retval);
3807 }
3808
3809 static int ll_flush(struct file *file, fl_owner_t id)
3810 {
3811         struct inode *inode = file_inode(file);
3812         struct ll_inode_info *lli = ll_i2info(inode);
3813         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3814         int rc, err;
3815
3816         LASSERT(!S_ISDIR(inode->i_mode));
3817
3818         /* catch async errors that were recorded back when async writeback
3819          * failed for pages in this mapping. */
3820         rc = lli->lli_async_rc;
3821         lli->lli_async_rc = 0;
3822         if (lli->lli_clob != NULL) {
3823                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3824                 if (rc == 0)
3825                         rc = err;
3826         }
3827
3828         /* The application has been told write failure already.
3829          * Do not report failure again. */
3830         if (fd->fd_write_failed)
3831                 return 0;
3832         return rc ? -EIO : 0;
3833 }
3834
3835 /**
3836  * Called to make sure a portion of file has been written out.
3837  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3838  *
3839  * Return how many pages have been written.
3840  */
3841 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3842                        enum cl_fsync_mode mode, int ignore_layout)
3843 {
3844         struct lu_env *env;
3845         struct cl_io *io;
3846         struct cl_fsync_io *fio;
3847         int result;
3848         __u16 refcheck;
3849         ENTRY;
3850
3851         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3852             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3853                 RETURN(-EINVAL);
3854
3855         env = cl_env_get(&refcheck);
3856         if (IS_ERR(env))
3857                 RETURN(PTR_ERR(env));
3858
3859         io = vvp_env_thread_io(env);
3860         io->ci_obj = ll_i2info(inode)->lli_clob;
3861         io->ci_ignore_layout = ignore_layout;
3862
3863         /* initialize parameters for sync */
3864         fio = &io->u.ci_fsync;
3865         fio->fi_start = start;
3866         fio->fi_end = end;
3867         fio->fi_fid = ll_inode2fid(inode);
3868         fio->fi_mode = mode;
3869         fio->fi_nr_written = 0;
3870
3871         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3872                 result = cl_io_loop(env, io);
3873         else
3874                 result = io->ci_result;
3875         if (result == 0)
3876                 result = fio->fi_nr_written;
3877         cl_io_fini(env, io);
3878         cl_env_put(env, &refcheck);
3879
3880         RETURN(result);
3881 }
3882
3883 /*
3884  * When dentry is provided (the 'else' case), file_dentry() may be
3885  * null and dentry must be used directly rather than pulled from
3886  * file_dentry() as is done otherwise.
3887  */
3888
3889 #ifdef HAVE_FILE_FSYNC_4ARGS
3890 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3891 {
3892         struct dentry *dentry = file_dentry(file);
3893         bool lock_inode;
3894 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3895 int ll_fsync(struct file *file, int datasync)
3896 {
3897         struct dentry *dentry = file_dentry(file);
3898         loff_t start = 0;
3899         loff_t end = LLONG_MAX;
3900 #else
3901 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3902 {
3903         loff_t start = 0;
3904         loff_t end = LLONG_MAX;
3905 #endif
3906         struct inode *inode = dentry->d_inode;
3907         struct ll_inode_info *lli = ll_i2info(inode);
3908         struct ptlrpc_request *req;
3909         int rc, err;
3910         ENTRY;
3911
3912         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3913                PFID(ll_inode2fid(inode)), inode);
3914         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3915
3916 #ifdef HAVE_FILE_FSYNC_4ARGS
3917         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3918         lock_inode = !lli->lli_inode_locked;
3919         if (lock_inode)
3920                 inode_lock(inode);
3921 #else
3922         /* fsync's caller has already called _fdata{sync,write}, we want
3923          * that IO to finish before calling the osc and mdc sync methods */
3924         rc = filemap_fdatawait(inode->i_mapping);
3925 #endif
3926
3927         /* catch async errors that were recorded back when async writeback
3928          * failed for pages in this mapping. */
3929         if (!S_ISDIR(inode->i_mode)) {
3930                 err = lli->lli_async_rc;
3931                 lli->lli_async_rc = 0;
3932                 if (rc == 0)
3933                         rc = err;
3934                 if (lli->lli_clob != NULL) {
3935                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3936                         if (rc == 0)
3937                                 rc = err;
3938                 }
3939         }
3940
3941         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3942         if (!rc)
3943                 rc = err;
3944         if (!err)
3945                 ptlrpc_req_finished(req);
3946
3947         if (S_ISREG(inode->i_mode)) {
3948                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3949
3950                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3951                 if (rc == 0 && err < 0)
3952                         rc = err;
3953                 if (rc < 0)
3954                         fd->fd_write_failed = true;
3955                 else
3956                         fd->fd_write_failed = false;
3957         }
3958
3959 #ifdef HAVE_FILE_FSYNC_4ARGS
3960         if (lock_inode)
3961                 inode_unlock(inode);
3962 #endif
3963         RETURN(rc);
3964 }
3965
3966 static int
3967 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3968 {
3969         struct inode *inode = file_inode(file);
3970         struct ll_sb_info *sbi = ll_i2sbi(inode);
3971         struct ldlm_enqueue_info einfo = {
3972                 .ei_type        = LDLM_FLOCK,
3973                 .ei_cb_cp       = ldlm_flock_completion_ast,
3974                 .ei_cbdata      = file_lock,
3975         };
3976         struct md_op_data *op_data;
3977         struct lustre_handle lockh = { 0 };
3978         union ldlm_policy_data flock = { { 0 } };
3979         int fl_type = file_lock->fl_type;
3980         __u64 flags = 0;
3981         int rc;
3982         int rc2 = 0;
3983         ENTRY;
3984
3985         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3986                PFID(ll_inode2fid(inode)), file_lock);
3987
3988         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3989
3990         if (file_lock->fl_flags & FL_FLOCK) {
3991                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3992                 /* flocks are whole-file locks */
3993                 flock.l_flock.end = OFFSET_MAX;
3994                 /* For flocks owner is determined by the local file desctiptor*/
3995                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3996         } else if (file_lock->fl_flags & FL_POSIX) {
3997                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3998                 flock.l_flock.start = file_lock->fl_start;
3999                 flock.l_flock.end = file_lock->fl_end;
4000         } else {
4001                 RETURN(-EINVAL);
4002         }
4003         flock.l_flock.pid = file_lock->fl_pid;
4004
4005         /* Somewhat ugly workaround for svc lockd.
4006          * lockd installs custom fl_lmops->lm_compare_owner that checks
4007          * for the fl_owner to be the same (which it always is on local node
4008          * I guess between lockd processes) and then compares pid.
4009          * As such we assign pid to the owner field to make it all work,
4010          * conflict with normal locks is unlikely since pid space and
4011          * pointer space for current->files are not intersecting */
4012         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4013                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4014
4015         switch (fl_type) {
4016         case F_RDLCK:
4017                 einfo.ei_mode = LCK_PR;
4018                 break;
4019         case F_UNLCK:
4020                 /* An unlock request may or may not have any relation to
4021                  * existing locks so we may not be able to pass a lock handle
4022                  * via a normal ldlm_lock_cancel() request. The request may even
4023                  * unlock a byte range in the middle of an existing lock. In
4024                  * order to process an unlock request we need all of the same
4025                  * information that is given with a normal read or write record
4026                  * lock request. To avoid creating another ldlm unlock (cancel)
4027                  * message we'll treat a LCK_NL flock request as an unlock. */
4028                 einfo.ei_mode = LCK_NL;
4029                 break;
4030         case F_WRLCK:
4031                 einfo.ei_mode = LCK_PW;
4032                 break;
4033         default:
4034                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4035                 RETURN (-ENOTSUPP);
4036         }
4037
4038         switch (cmd) {
4039         case F_SETLKW:
4040 #ifdef F_SETLKW64
4041         case F_SETLKW64:
4042 #endif
4043                 flags = 0;
4044                 break;
4045         case F_SETLK:
4046 #ifdef F_SETLK64
4047         case F_SETLK64:
4048 #endif
4049                 flags = LDLM_FL_BLOCK_NOWAIT;
4050                 break;
4051         case F_GETLK:
4052 #ifdef F_GETLK64
4053         case F_GETLK64:
4054 #endif
4055                 flags = LDLM_FL_TEST_LOCK;
4056                 break;
4057         default:
4058                 CERROR("unknown fcntl lock command: %d\n", cmd);
4059                 RETURN (-EINVAL);
4060         }
4061
4062         /* Save the old mode so that if the mode in the lock changes we
4063          * can decrement the appropriate reader or writer refcount. */
4064         file_lock->fl_type = einfo.ei_mode;
4065
4066         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4067                                      LUSTRE_OPC_ANY, NULL);
4068         if (IS_ERR(op_data))
4069                 RETURN(PTR_ERR(op_data));
4070
4071         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4072                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4073                flock.l_flock.pid, flags, einfo.ei_mode,
4074                flock.l_flock.start, flock.l_flock.end);
4075
4076         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4077                         flags);
4078
4079         /* Restore the file lock type if not TEST lock. */
4080         if (!(flags & LDLM_FL_TEST_LOCK))
4081                 file_lock->fl_type = fl_type;
4082
4083 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4084         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4085             !(flags & LDLM_FL_TEST_LOCK))
4086                 rc2  = locks_lock_file_wait(file, file_lock);
4087 #else
4088         if ((file_lock->fl_flags & FL_FLOCK) &&
4089             (rc == 0 || file_lock->fl_type == F_UNLCK))
4090                 rc2  = flock_lock_file_wait(file, file_lock);
4091         if ((file_lock->fl_flags & FL_POSIX) &&
4092             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4093             !(flags & LDLM_FL_TEST_LOCK))
4094                 rc2  = posix_lock_file_wait(file, file_lock);
4095 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4096
4097         if (rc2 && file_lock->fl_type != F_UNLCK) {
4098                 einfo.ei_mode = LCK_NL;
4099                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4100                            &lockh, flags);
4101                 rc = rc2;
4102         }
4103
4104         ll_finish_md_op_data(op_data);
4105
4106         RETURN(rc);
4107 }
4108
4109 int ll_get_fid_by_name(struct inode *parent, const char *name,
4110                        int namelen, struct lu_fid *fid,
4111                        struct inode **inode)
4112 {
4113         struct md_op_data       *op_data = NULL;
4114         struct mdt_body         *body;
4115         struct ptlrpc_request   *req;
4116         int                     rc;
4117         ENTRY;
4118
4119         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4120                                      LUSTRE_OPC_ANY, NULL);
4121         if (IS_ERR(op_data))
4122                 RETURN(PTR_ERR(op_data));
4123
4124         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4125         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4126         ll_finish_md_op_data(op_data);
4127         if (rc < 0)
4128                 RETURN(rc);
4129
4130         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4131         if (body == NULL)
4132                 GOTO(out_req, rc = -EFAULT);
4133         if (fid != NULL)
4134                 *fid = body->mbo_fid1;
4135
4136         if (inode != NULL)
4137                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4138 out_req:
4139         ptlrpc_req_finished(req);
4140         RETURN(rc);
4141 }
4142
4143 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4144                const char *name)
4145 {
4146         struct dentry *dchild = NULL;
4147         struct inode *child_inode = NULL;
4148         struct md_op_data *op_data;
4149         struct ptlrpc_request *request = NULL;
4150         struct obd_client_handle *och = NULL;
4151         struct qstr qstr;
4152         struct mdt_body *body;
4153         __u64 data_version = 0;
4154         size_t namelen = strlen(name);
4155         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4156         int rc;
4157         ENTRY;
4158
4159         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4160                PFID(ll_inode2fid(parent)), name,
4161                lum->lum_stripe_offset, lum->lum_stripe_count);
4162
4163         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4164             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4165                 lustre_swab_lmv_user_md(lum);
4166
4167         /* Get child FID first */
4168         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4169         qstr.name = name;
4170         qstr.len = namelen;
4171         dchild = d_lookup(file_dentry(file), &qstr);
4172         if (dchild) {
4173                 if (dchild->d_inode)
4174                         child_inode = igrab(dchild->d_inode);
4175                 dput(dchild);
4176         }
4177
4178         if (!child_inode) {
4179                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4180                                         &child_inode);
4181                 if (rc)
4182                         RETURN(rc);
4183         }
4184
4185         if (!child_inode)
4186                 RETURN(-ENOENT);
4187
4188         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4189               OBD_CONNECT2_DIR_MIGRATE)) {
4190                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4191                     ll_i2info(child_inode)->lli_lsm_md) {
4192                         CERROR("%s: MDT doesn't support stripe directory "
4193                                "migration!\n",
4194                                ll_get_fsname(parent->i_sb, NULL, 0));
4195                         GOTO(out_iput, rc = -EOPNOTSUPP);
4196                 }
4197         }
4198
4199         /*
4200          * lfs migrate command needs to be blocked on the client
4201          * by checking the migrate FID against the FID of the
4202          * filesystem root.
4203          */
4204         if (child_inode == parent->i_sb->s_root->d_inode)
4205                 GOTO(out_iput, rc = -EINVAL);
4206
4207         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4208                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4209         if (IS_ERR(op_data))
4210                 GOTO(out_iput, rc = PTR_ERR(op_data));
4211
4212         inode_lock(child_inode);
4213         op_data->op_fid3 = *ll_inode2fid(child_inode);
4214         if (!fid_is_sane(&op_data->op_fid3)) {
4215                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4216                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4217                        PFID(&op_data->op_fid3));
4218                 GOTO(out_unlock, rc = -EINVAL);
4219         }
4220
4221         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4222         op_data->op_data = lum;
4223         op_data->op_data_size = lumlen;
4224
4225 again:
4226         if (S_ISREG(child_inode->i_mode)) {
4227                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4228                 if (IS_ERR(och)) {
4229                         rc = PTR_ERR(och);
4230                         och = NULL;
4231                         GOTO(out_unlock, rc);
4232                 }
4233
4234                 rc = ll_data_version(child_inode, &data_version,
4235                                      LL_DV_WR_FLUSH);
4236                 if (rc != 0)
4237                         GOTO(out_close, rc);
4238
4239                 op_data->op_open_handle = och->och_open_handle;
4240                 op_data->op_data_version = data_version;
4241                 op_data->op_lease_handle = och->och_lease_handle;
4242                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4243
4244                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4245                 och->och_mod->mod_open_req->rq_replay = 0;
4246                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4247         }
4248
4249         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4250                        name, namelen, &request);
4251         if (rc == 0) {
4252                 LASSERT(request != NULL);
4253                 ll_update_times(request, parent);
4254
4255                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4256                 LASSERT(body != NULL);
4257
4258                 /* If the server does release layout lock, then we cleanup
4259                  * the client och here, otherwise release it in out_close: */
4260                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4261                         obd_mod_put(och->och_mod);
4262                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4263                                                   och);
4264                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4265                         OBD_FREE_PTR(och);
4266                         och = NULL;
4267                 }
4268         }
4269
4270         if (request != NULL) {
4271                 ptlrpc_req_finished(request);
4272                 request = NULL;
4273         }
4274
4275         /* Try again if the file layout has changed. */
4276         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4277                 goto again;
4278
4279 out_close:
4280         if (och)
4281                 ll_lease_close(och, child_inode, NULL);
4282         if (!rc)
4283                 clear_nlink(child_inode);
4284 out_unlock:
4285         inode_unlock(child_inode);
4286         ll_finish_md_op_data(op_data);
4287 out_iput:
4288         iput(child_inode);
4289         RETURN(rc);
4290 }
4291
4292 static int
4293 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4294 {
4295         ENTRY;
4296
4297         RETURN(-ENOSYS);
4298 }
4299
4300 /**
4301  * test if some locks matching bits and l_req_mode are acquired
4302  * - bits can be in different locks
4303  * - if found clear the common lock bits in *bits
4304  * - the bits not found, are kept in *bits
4305  * \param inode [IN]
4306  * \param bits [IN] searched lock bits [IN]
4307  * \param l_req_mode [IN] searched lock mode
4308  * \retval boolean, true iff all bits are found
4309  */
4310 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4311 {
4312         struct lustre_handle lockh;
4313         union ldlm_policy_data policy;
4314         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4315                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4316         struct lu_fid *fid;
4317         __u64 flags;
4318         int i;
4319         ENTRY;
4320
4321         if (!inode)
4322                RETURN(0);
4323
4324         fid = &ll_i2info(inode)->lli_fid;
4325         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4326                ldlm_lockname[mode]);
4327
4328         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4329         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4330                 policy.l_inodebits.bits = *bits & (1 << i);
4331                 if (policy.l_inodebits.bits == 0)
4332                         continue;
4333
4334                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4335                                   &policy, mode, &lockh)) {
4336                         struct ldlm_lock *lock;
4337
4338                         lock = ldlm_handle2lock(&lockh);
4339                         if (lock) {
4340                                 *bits &=
4341                                       ~(lock->l_policy_data.l_inodebits.bits);
4342                                 LDLM_LOCK_PUT(lock);
4343                         } else {
4344                                 *bits &= ~policy.l_inodebits.bits;
4345                         }
4346                 }
4347         }
4348         RETURN(*bits == 0);
4349 }
4350
4351 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4352                                struct lustre_handle *lockh, __u64 flags,
4353                                enum ldlm_mode mode)
4354 {
4355         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4356         struct lu_fid *fid;
4357         enum ldlm_mode rc;
4358         ENTRY;
4359
4360         fid = &ll_i2info(inode)->lli_fid;
4361         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4362
4363         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4364                            fid, LDLM_IBITS, &policy, mode, lockh);
4365
4366         RETURN(rc);
4367 }
4368
4369 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4370 {
4371         /* Already unlinked. Just update nlink and return success */
4372         if (rc == -ENOENT) {
4373                 clear_nlink(inode);
4374                 /* If it is striped directory, and there is bad stripe
4375                  * Let's revalidate the dentry again, instead of returning
4376                  * error */
4377                 if (S_ISDIR(inode->i_mode) &&
4378                     ll_i2info(inode)->lli_lsm_md != NULL)
4379                         return 0;
4380
4381                 /* This path cannot be hit for regular files unless in
4382                  * case of obscure races, so no need to to validate
4383                  * size. */
4384                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4385                         return 0;
4386         } else if (rc != 0) {
4387                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4388                              "%s: revalidate FID "DFID" error: rc = %d\n",
4389                              ll_get_fsname(inode->i_sb, NULL, 0),
4390                              PFID(ll_inode2fid(inode)), rc);
4391         }
4392
4393         return rc;
4394 }
4395
4396 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4397 {
4398         struct inode *inode = dentry->d_inode;
4399         struct obd_export *exp = ll_i2mdexp(inode);
4400         struct lookup_intent oit = {
4401                 .it_op = op,
4402         };
4403         struct ptlrpc_request *req = NULL;
4404         struct md_op_data *op_data;
4405         int rc = 0;
4406         ENTRY;
4407
4408         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4409                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4410
4411         /* Call getattr by fid, so do not provide name at all. */
4412         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4413                                      LUSTRE_OPC_ANY, NULL);
4414         if (IS_ERR(op_data))
4415                 RETURN(PTR_ERR(op_data));
4416
4417         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4418         ll_finish_md_op_data(op_data);
4419         if (rc < 0) {
4420                 rc = ll_inode_revalidate_fini(inode, rc);
4421                 GOTO(out, rc);
4422         }
4423
4424         rc = ll_revalidate_it_finish(req, &oit, dentry);
4425         if (rc != 0) {
4426                 ll_intent_release(&oit);
4427                 GOTO(out, rc);
4428         }
4429
4430         /* Unlinked? Unhash dentry, so it is not picked up later by
4431          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4432          * here to preserve get_cwd functionality on 2.6.
4433          * Bug 10503 */
4434         if (!dentry->d_inode->i_nlink) {
4435                 ll_lock_dcache(inode);
4436                 d_lustre_invalidate(dentry, 0);
4437                 ll_unlock_dcache(inode);
4438         }
4439
4440         ll_lookup_finish_locks(&oit, dentry);
4441 out:
4442         ptlrpc_req_finished(req);
4443
4444         return rc;
4445 }
4446
4447 static int ll_merge_md_attr(struct inode *inode)
4448 {
4449         struct ll_inode_info *lli = ll_i2info(inode);
4450         struct cl_attr attr = { 0 };
4451         int rc;
4452
4453         LASSERT(lli->lli_lsm_md != NULL);
4454         down_read(&lli->lli_lsm_sem);
4455         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4456                            &attr, ll_md_blocking_ast);
4457         up_read(&lli->lli_lsm_sem);
4458         if (rc != 0)
4459                 RETURN(rc);
4460
4461         set_nlink(inode, attr.cat_nlink);
4462         inode->i_blocks = attr.cat_blocks;
4463         i_size_write(inode, attr.cat_size);
4464
4465         ll_i2info(inode)->lli_atime = attr.cat_atime;
4466         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4467         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4468
4469         RETURN(0);
4470 }
4471
4472 static inline dev_t ll_compat_encode_dev(dev_t dev)
4473 {
4474         /* The compat_sys_*stat*() syscalls will fail unless the
4475          * device majors and minors are both less than 256. Note that
4476          * the value returned here will be passed through
4477          * old_encode_dev() in cp_compat_stat(). And so we are not
4478          * trying to return a valid compat (u16) device number, just
4479          * one that will pass the old_valid_dev() check. */
4480
4481         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4482 }
4483
4484 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4485 int ll_getattr(const struct path *path, struct kstat *stat,
4486                u32 request_mask, unsigned int flags)
4487 {
4488         struct dentry *de = path->dentry;
4489 #else
4490 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4491 {
4492 #endif
4493         struct inode *inode = de->d_inode;
4494         struct ll_sb_info *sbi = ll_i2sbi(inode);
4495         struct ll_inode_info *lli = ll_i2info(inode);
4496         int rc;
4497
4498         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4499
4500         rc = ll_inode_revalidate(de, IT_GETATTR);
4501         if (rc < 0)
4502                 RETURN(rc);
4503
4504         if (S_ISREG(inode->i_mode)) {
4505                 /* In case of restore, the MDT has the right size and has
4506                  * already send it back without granting the layout lock,
4507                  * inode is up-to-date so glimpse is useless.
4508                  * Also to glimpse we need the layout, in case of a running
4509                  * restore the MDT holds the layout lock so the glimpse will
4510                  * block up to the end of restore (getattr will block)
4511                  */
4512                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4513                         rc = ll_glimpse_size(inode);
4514                         if (rc < 0)
4515                                 RETURN(rc);
4516                 }
4517         } else {
4518                 /* If object isn't regular a file then don't validate size. */
4519                 if (S_ISDIR(inode->i_mode) &&
4520                     lli->lli_lsm_md != NULL) {
4521                         rc = ll_merge_md_attr(inode);
4522                         if (rc < 0)
4523                                 RETURN(rc);
4524                 }
4525
4526                 LTIME_S(inode->i_atime) = lli->lli_atime;
4527                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4528                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4529         }
4530
4531         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4532
4533         if (ll_need_32bit_api(sbi)) {
4534                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4535                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4536                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4537         } else {
4538                 stat->ino = inode->i_ino;
4539                 stat->dev = inode->i_sb->s_dev;
4540                 stat->rdev = inode->i_rdev;
4541         }
4542
4543         stat->mode = inode->i_mode;
4544         stat->uid = inode->i_uid;
4545         stat->gid = inode->i_gid;
4546         stat->atime = inode->i_atime;
4547         stat->mtime = inode->i_mtime;
4548         stat->ctime = inode->i_ctime;
4549         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4550
4551         stat->nlink = inode->i_nlink;
4552         stat->size = i_size_read(inode);
4553         stat->blocks = inode->i_blocks;
4554
4555         return 0;
4556 }
4557
4558 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4559                      __u64 start, __u64 len)
4560 {
4561         int             rc;
4562         size_t          num_bytes;
4563         struct fiemap   *fiemap;
4564         unsigned int    extent_count = fieinfo->fi_extents_max;
4565
4566         num_bytes = sizeof(*fiemap) + (extent_count *
4567                                        sizeof(struct fiemap_extent));
4568         OBD_ALLOC_LARGE(fiemap, num_bytes);
4569
4570         if (fiemap == NULL)
4571                 RETURN(-ENOMEM);
4572
4573         fiemap->fm_flags = fieinfo->fi_flags;
4574         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4575         fiemap->fm_start = start;
4576         fiemap->fm_length = len;
4577         if (extent_count > 0 &&
4578             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4579                            sizeof(struct fiemap_extent)) != 0)
4580                 GOTO(out, rc = -EFAULT);
4581
4582         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4583
4584         fieinfo->fi_flags = fiemap->fm_flags;
4585         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4586         if (extent_count > 0 &&
4587             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4588                          fiemap->fm_mapped_extents *
4589                          sizeof(struct fiemap_extent)) != 0)
4590                 GOTO(out, rc = -EFAULT);
4591 out:
4592         OBD_FREE_LARGE(fiemap, num_bytes);
4593         return rc;
4594 }
4595
4596 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4597 {
4598         struct ll_inode_info *lli = ll_i2info(inode);
4599         struct posix_acl *acl = NULL;
4600         ENTRY;
4601
4602         spin_lock(&lli->lli_lock);
4603         /* VFS' acl_permission_check->check_acl will release the refcount */
4604         acl = posix_acl_dup(lli->lli_posix_acl);
4605         spin_unlock(&lli->lli_lock);
4606
4607         RETURN(acl);
4608 }
4609
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * set_acl inode operation: store (or remove, when there is nothing to
 * store) a POSIX ACL as an extended attribute on the MDS.
 *
 * \param inode [IN] inode the ACL belongs to
 * \param acl   [IN] ACL to set, or NULL to remove the attribute
 * \param type  [IN] ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0        on success; the ACL cache is updated with \a acl
 * \retval -EINVAL  unknown \a type
 * \retval -EACCES  default ACL given for a non-directory
 * \retval -ENOMEM  no memory for the xattr value
 * \retval negative other errors; the cached ACL is forgotten for those
 *                  that occur after the type checks
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req = NULL;
        const char *xattr_name;
        size_t buf_len = 0;
        char *buf = NULL;
        int rc = 0;
        ENTRY;

        if (type == ACL_TYPE_ACCESS) {
                xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
                if (acl) {
                        /* may update i_mode and may reset acl */
                        rc = posix_acl_update_mode(inode, &inode->i_mode,
                                                   &acl);
                        if (rc)
                                return rc;
                }
        } else if (type == ACL_TYPE_DEFAULT) {
                xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* Only directories carry a default ACL; removing one
                 * from anything else is accepted. */
                if (!S_ISDIR(inode->i_mode) && acl)
                        return -EACCES;
        } else {
                return -EINVAL;
        }

        if (acl) {
                buf_len = posix_acl_xattr_size(acl->a_count);
                buf = kmalloc(buf_len, GFP_NOFS);
                if (!buf)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, buf, buf_len);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* With no value to send, the attribute is removed instead. */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         buf ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         xattr_name, buf, buf_len, 0, 0, &req);

        ptlrpc_req_finished(req);
out_value:
        kfree(buf);
out:
        if (rc == 0)
                set_cached_acl(inode, type, acl);
        else
                forget_cached_acl(inode, type);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4669
4670 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4671 static int
4672 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4673 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4674 # else
4675 ll_check_acl(struct inode *inode, int mask)
4676 # endif
4677 {
4678 # ifdef CONFIG_FS_POSIX_ACL
4679         struct posix_acl *acl;
4680         int rc;
4681         ENTRY;
4682
4683 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4684         if (flags & IPERM_FLAG_RCU)
4685                 return -ECHILD;
4686 #  endif
4687         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4688
4689         if (!acl)
4690                 RETURN(-EAGAIN);
4691
4692         rc = posix_acl_permission(inode, acl, mask);
4693         posix_acl_release(acl);
4694
4695         RETURN(rc);
4696 # else /* !CONFIG_FS_POSIX_ACL */
4697         return -EAGAIN;
4698 # endif /* CONFIG_FS_POSIX_ACL */
4699 }
4700 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4701
4702 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4703 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4704 #else
4705 # ifdef HAVE_INODE_PERMISION_2ARGS
4706 int ll_inode_permission(struct inode *inode, int mask)
4707 # else
4708 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4709 # endif
4710 #endif
4711 {
4712         int rc = 0;
4713         struct ll_sb_info *sbi;
4714         struct root_squash_info *squash;
4715         struct cred *cred = NULL;
4716         const struct cred *old_cred = NULL;
4717         cfs_cap_t cap;
4718         bool squash_id = false;
4719         ENTRY;
4720
4721 #ifdef MAY_NOT_BLOCK
4722         if (mask & MAY_NOT_BLOCK)
4723                 return -ECHILD;
4724 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4725         if (flags & IPERM_FLAG_RCU)
4726                 return -ECHILD;
4727 #endif
4728
4729        /* as root inode are NOT getting validated in lookup operation,
4730         * need to do it before permission check. */
4731
4732         if (inode == inode->i_sb->s_root->d_inode) {
4733                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4734                 if (rc)
4735                         RETURN(rc);
4736         }
4737
4738         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4739                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4740
4741         /* squash fsuid/fsgid if needed */
4742         sbi = ll_i2sbi(inode);
4743         squash = &sbi->ll_squash;
4744         if (unlikely(squash->rsi_uid != 0 &&
4745                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4746                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4747                         squash_id = true;
4748         }
4749         if (squash_id) {
4750                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4751                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4752                        squash->rsi_uid, squash->rsi_gid);
4753
4754                 /* update current process's credentials
4755                  * and FS capability */
4756                 cred = prepare_creds();
4757                 if (cred == NULL)
4758                         RETURN(-ENOMEM);
4759
4760                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4761                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4762                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4763                         if ((1 << cap) & CFS_CAP_FS_MASK)
4764                                 cap_lower(cred->cap_effective, cap);
4765                 }
4766                 old_cred = override_creds(cred);
4767         }
4768
4769         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4770         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4771         /* restore current process's credentials and FS capability */
4772         if (squash_id) {
4773                 revert_creds(old_cred);
4774                 put_cred(cred);
4775         }
4776
4777         RETURN(rc);
4778 }
4779
4780 /* -o localflock - only provides locally consistent flock locks */
4781 struct file_operations ll_file_operations = {
4782 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4783 # ifdef HAVE_SYNC_READ_WRITE
4784         .read           = new_sync_read,
4785         .write          = new_sync_write,
4786 # endif
4787         .read_iter      = ll_file_read_iter,
4788         .write_iter     = ll_file_write_iter,
4789 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4790         .read           = ll_file_read,
4791         .aio_read       = ll_file_aio_read,
4792         .write          = ll_file_write,
4793         .aio_write      = ll_file_aio_write,
4794 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4795         .unlocked_ioctl = ll_file_ioctl,
4796         .open           = ll_file_open,
4797         .release        = ll_file_release,
4798         .mmap           = ll_file_mmap,
4799         .llseek         = ll_file_seek,
4800         .splice_read    = ll_file_splice_read,
4801         .fsync          = ll_fsync,
4802         .flush          = ll_flush
4803 };
4804
4805 struct file_operations ll_file_operations_flock = {
4806 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4807 # ifdef HAVE_SYNC_READ_WRITE
4808         .read           = new_sync_read,
4809         .write          = new_sync_write,
4810 # endif /* HAVE_SYNC_READ_WRITE */
4811         .read_iter      = ll_file_read_iter,
4812         .write_iter     = ll_file_write_iter,
4813 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4814         .read           = ll_file_read,
4815         .aio_read       = ll_file_aio_read,
4816         .write          = ll_file_write,
4817         .aio_write      = ll_file_aio_write,
4818 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4819         .unlocked_ioctl = ll_file_ioctl,
4820         .open           = ll_file_open,
4821         .release        = ll_file_release,
4822         .mmap           = ll_file_mmap,
4823         .llseek         = ll_file_seek,
4824         .splice_read    = ll_file_splice_read,
4825         .fsync          = ll_fsync,
4826         .flush          = ll_flush,
4827         .flock          = ll_file_flock,
4828         .lock           = ll_file_flock
4829 };
4830
4831 /* These are for -o noflock - to return ENOSYS on flock calls */
4832 struct file_operations ll_file_operations_noflock = {
4833 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4834 # ifdef HAVE_SYNC_READ_WRITE
4835         .read           = new_sync_read,
4836         .write          = new_sync_write,
4837 # endif /* HAVE_SYNC_READ_WRITE */
4838         .read_iter      = ll_file_read_iter,
4839         .write_iter     = ll_file_write_iter,
4840 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4841         .read           = ll_file_read,
4842         .aio_read       = ll_file_aio_read,
4843         .write          = ll_file_write,
4844         .aio_write      = ll_file_aio_write,
4845 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4846         .unlocked_ioctl = ll_file_ioctl,
4847         .open           = ll_file_open,
4848         .release        = ll_file_release,
4849         .mmap           = ll_file_mmap,
4850         .llseek         = ll_file_seek,
4851         .splice_read    = ll_file_splice_read,
4852         .fsync          = ll_fsync,
4853         .flush          = ll_flush,
4854         .flock          = ll_file_noflock,
4855         .lock           = ll_file_noflock
4856 };
4857
4858 struct inode_operations ll_file_inode_operations = {
4859         .setattr        = ll_setattr,
4860         .getattr        = ll_getattr,
4861         .permission     = ll_inode_permission,
4862 #ifdef HAVE_IOP_XATTR
4863         .setxattr       = ll_setxattr,
4864         .getxattr       = ll_getxattr,
4865         .removexattr    = ll_removexattr,
4866 #endif
4867         .listxattr      = ll_listxattr,
4868         .fiemap         = ll_fiemap,
4869 #ifdef HAVE_IOP_GET_ACL
4870         .get_acl        = ll_get_acl,
4871 #endif
4872 #ifdef HAVE_IOP_SET_ACL
4873         .set_acl        = ll_set_acl,
4874 #endif
4875 };
4876
4877 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4878 {
4879         struct ll_inode_info *lli = ll_i2info(inode);
4880         struct cl_object *obj = lli->lli_clob;
4881         struct lu_env *env;
4882         int rc;
4883         __u16 refcheck;
4884         ENTRY;
4885
4886         if (obj == NULL)
4887                 RETURN(0);
4888
4889         env = cl_env_get(&refcheck);
4890         if (IS_ERR(env))
4891                 RETURN(PTR_ERR(env));
4892
4893         rc = cl_conf_set(env, lli->lli_clob, conf);
4894         if (rc < 0)
4895                 GOTO(out, rc);
4896
4897         if (conf->coc_opc == OBJECT_CONF_SET) {
4898                 struct ldlm_lock *lock = conf->coc_lock;
4899                 struct cl_layout cl = {
4900                         .cl_layout_gen = 0,
4901                 };
4902
4903                 LASSERT(lock != NULL);
4904                 LASSERT(ldlm_has_layout(lock));
4905
4906                 /* it can only be allowed to match after layout is
4907                  * applied to inode otherwise false layout would be
4908                  * seen. Applying layout shoud happen before dropping
4909                  * the intent lock. */
4910                 ldlm_lock_allow_match(lock);
4911
4912                 rc = cl_object_layout_get(env, obj, &cl);
4913                 if (rc < 0)
4914                         GOTO(out, rc);
4915
4916                 CDEBUG(D_VFSTRACE,
4917                        DFID": layout version change: %u -> %u\n",
4918                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4919                        cl.cl_layout_gen);
4920                 ll_layout_version_set(lli, cl.cl_layout_gen);
4921         }
4922
4923 out:
4924         cl_env_put(env, &refcheck);
4925
4926         RETURN(rc);
4927 }
4928
4929 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4930 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4931
4932 {
4933         struct ll_sb_info *sbi = ll_i2sbi(inode);
4934         struct ptlrpc_request *req;
4935         void *lvbdata;
4936         void *lmm;
4937         int lmmsize;
4938         int rc;
4939         ENTRY;
4940
4941         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4942                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4943                lock->l_lvb_data, lock->l_lvb_len);
4944
4945         if (lock->l_lvb_data != NULL)
4946                 RETURN(0);
4947
4948         /* if layout lock was granted right away, the layout is returned
4949          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4950          * blocked and then granted via completion ast, we have to fetch
4951          * layout here. Please note that we can't use the LVB buffer in
4952          * completion AST because it doesn't have a large enough buffer */
4953         rc = ll_get_default_mdsize(sbi, &lmmsize);
4954         if (rc < 0)
4955                 RETURN(rc);
4956
4957         rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4958                          XATTR_NAME_LOV, lmmsize, &req);
4959         if (rc < 0)
4960                 RETURN(rc);
4961
4962         lmmsize = rc;
4963         rc = 0;
4964         if (lmmsize == 0) /* empty layout */
4965                 GOTO(out, rc = 0);
4966
4967         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4968         if (lmm == NULL)
4969                 GOTO(out, rc = -EFAULT);
4970
4971         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4972         if (lvbdata == NULL)
4973                 GOTO(out, rc = -ENOMEM);
4974
4975         memcpy(lvbdata, lmm, lmmsize);
4976         lock_res_and_lock(lock);
4977         if (unlikely(lock->l_lvb_data == NULL)) {
4978                 lock->l_lvb_type = LVB_T_LAYOUT;
4979                 lock->l_lvb_data = lvbdata;
4980                 lock->l_lvb_len = lmmsize;
4981                 lvbdata = NULL;
4982         }
4983         unlock_res_and_lock(lock);
4984
4985         if (lvbdata)
4986                 OBD_FREE_LARGE(lvbdata, lmmsize);
4987
4988         EXIT;
4989
4990 out:
4991         ptlrpc_req_finished(req);
4992         return rc;
4993 }
4994
4995 /**
4996  * Apply the layout to the inode. Layout lock is held and will be released
4997  * in this function.
4998  */
4999 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5000                               struct inode *inode)
5001 {
5002         struct ll_inode_info *lli = ll_i2info(inode);
5003         struct ll_sb_info    *sbi = ll_i2sbi(inode);
5004         struct ldlm_lock *lock;
5005         struct cl_object_conf conf;
5006         int rc = 0;
5007         bool lvb_ready;
5008         bool wait_layout = false;
5009         ENTRY;
5010
5011         LASSERT(lustre_handle_is_used(lockh));
5012
5013         lock = ldlm_handle2lock(lockh);
5014         LASSERT(lock != NULL);
5015         LASSERT(ldlm_has_layout(lock));
5016
5017         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5018                    PFID(&lli->lli_fid), inode);
5019
5020         /* in case this is a caching lock and reinstate with new inode */
5021         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5022
5023         lock_res_and_lock(lock);
5024         lvb_ready = ldlm_is_lvb_ready(lock);
5025         unlock_res_and_lock(lock);
5026
5027         /* checking lvb_ready is racy but this is okay. The worst case is
5028          * that multi processes may configure the file on the same time. */
5029         if (lvb_ready)
5030                 GOTO(out, rc = 0);
5031
5032         rc = ll_layout_fetch(inode, lock);
5033         if (rc < 0)
5034                 GOTO(out, rc);
5035
5036         /* for layout lock, lmm is stored in lock's lvb.
5037          * lvb_data is immutable if the lock is held so it's safe to access it
5038          * without res lock.
5039          *
5040          * set layout to file. Unlikely this will fail as old layout was
5041          * surely eliminated */
5042         memset(&conf, 0, sizeof conf);
5043         conf.coc_opc = OBJECT_CONF_SET;
5044         conf.coc_inode = inode;
5045         conf.coc_lock = lock;
5046         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5047         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5048         rc = ll_layout_conf(inode, &conf);
5049
5050         /* refresh layout failed, need to wait */
5051         wait_layout = rc == -EBUSY;
5052         EXIT;
5053 out:
5054         LDLM_LOCK_PUT(lock);
5055         ldlm_lock_decref(lockh, mode);
5056
5057         /* wait for IO to complete if it's still being used. */
5058         if (wait_layout) {
5059                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5060                        ll_get_fsname(inode->i_sb, NULL, 0),
5061                        PFID(&lli->lli_fid), inode);
5062
5063                 memset(&conf, 0, sizeof conf);
5064                 conf.coc_opc = OBJECT_CONF_WAIT;
5065                 conf.coc_inode = inode;
5066                 rc = ll_layout_conf(inode, &conf);
5067                 if (rc == 0)
5068                         rc = -EAGAIN;
5069
5070                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5071                        ll_get_fsname(inode->i_sb, NULL, 0),
5072                        PFID(&lli->lli_fid), rc);
5073         }
5074         RETURN(rc);
5075 }
5076
5077 /**
5078  * Issue layout intent RPC to MDS.
5079  * \param inode [in]    file inode
5080  * \param intent [in]   layout intent
5081  *
5082  * \retval 0    on success
5083  * \retval < 0  error code
5084  */
5085 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5086 {
5087         struct ll_inode_info  *lli = ll_i2info(inode);
5088         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5089         struct md_op_data     *op_data;
5090         struct lookup_intent it;
5091         struct ptlrpc_request *req;
5092         int rc;
5093         ENTRY;
5094
5095         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5096                                      0, 0, LUSTRE_OPC_ANY, NULL);
5097         if (IS_ERR(op_data))
5098                 RETURN(PTR_ERR(op_data));
5099
5100         op_data->op_data = intent;
5101         op_data->op_data_size = sizeof(*intent);
5102
5103         memset(&it, 0, sizeof(it));
5104         it.it_op = IT_LAYOUT;
5105         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5106             intent->li_opc == LAYOUT_INTENT_TRUNC)
5107                 it.it_flags = FMODE_WRITE;
5108
5109         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5110                           ll_get_fsname(inode->i_sb, NULL, 0),
5111                           PFID(&lli->lli_fid), inode);
5112
5113         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5114                             &ll_md_blocking_ast, 0);
5115         if (it.it_request != NULL)
5116                 ptlrpc_req_finished(it.it_request);
5117         it.it_request = NULL;
5118
5119         ll_finish_md_op_data(op_data);
5120
5121         /* set lock data in case this is a new lock */
5122         if (!rc)
5123                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5124
5125         ll_intent_drop_lock(&it);
5126
5127         RETURN(rc);
5128 }
5129
5130 /**
5131  * This function checks if there exists a LAYOUT lock on the client side,
5132  * or enqueues it if it doesn't have one in cache.
5133  *
5134  * This function will not hold layout lock so it may be revoked any time after
5135  * this function returns. Any operations depend on layout should be redone
5136  * in that case.
5137  *
5138  * This function should be called before lov_io_init() to get an uptodate
5139  * layout version, the caller should save the version number and after IO
5140  * is finished, this function should be called again to verify that layout
5141  * is not changed during IO time.
5142  */
5143 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5144 {
5145         struct ll_inode_info    *lli = ll_i2info(inode);
5146         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5147         struct lustre_handle lockh;
5148         struct layout_intent intent = {
5149                 .li_opc = LAYOUT_INTENT_ACCESS,
5150         };
5151         enum ldlm_mode mode;
5152         int rc;
5153         ENTRY;
5154
5155         *gen = ll_layout_version_get(lli);
5156         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5157                 RETURN(0);
5158
5159         /* sanity checks */
5160         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5161         LASSERT(S_ISREG(inode->i_mode));
5162
5163         /* take layout lock mutex to enqueue layout lock exclusively. */
5164         mutex_lock(&lli->lli_layout_mutex);
5165
5166         while (1) {
5167                 /* mostly layout lock is caching on the local side, so try to
5168                  * match it before grabbing layout lock mutex. */
5169                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5170                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5171                 if (mode != 0) { /* hit cached lock */
5172                         rc = ll_layout_lock_set(&lockh, mode, inode);
5173                         if (rc == -EAGAIN)
5174                                 continue;
5175                         break;
5176                 }
5177
5178                 rc = ll_layout_intent(inode, &intent);
5179                 if (rc != 0)
5180                         break;
5181         }
5182
5183         if (rc == 0)
5184                 *gen = ll_layout_version_get(lli);
5185         mutex_unlock(&lli->lli_layout_mutex);
5186
5187         RETURN(rc);
5188 }
5189
5190 /**
5191  * Issue layout intent RPC indicating where in a file an IO is about to write.
5192  *
5193  * \param[in] inode     file inode.
5194  * \param[in] ext       write range with start offset of fille in bytes where
5195  *                      an IO is about to write, and exclusive end offset in
5196  *                      bytes.
5197  *
5198  * \retval 0    on success
5199  * \retval < 0  error code
5200  */
5201 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5202                            struct lu_extent *ext)
5203 {
5204         struct layout_intent intent = {
5205                 .li_opc = opc,
5206                 .li_extent.e_start = ext->e_start,
5207                 .li_extent.e_end = ext->e_end,
5208         };
5209         int rc;
5210         ENTRY;
5211
5212         rc = ll_layout_intent(inode, &intent);
5213
5214         RETURN(rc);
5215 }
5216
5217 /**
5218  *  This function send a restore request to the MDT
5219  */
5220 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5221 {
5222         struct hsm_user_request *hur;
5223         int                      len, rc;
5224         ENTRY;
5225
5226         len = sizeof(struct hsm_user_request) +
5227               sizeof(struct hsm_user_item);
5228         OBD_ALLOC(hur, len);
5229         if (hur == NULL)
5230                 RETURN(-ENOMEM);
5231
5232         hur->hur_request.hr_action = HUA_RESTORE;
5233         hur->hur_request.hr_archive_id = 0;
5234         hur->hur_request.hr_flags = 0;
5235         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5236                sizeof(hur->hur_user_item[0].hui_fid));
5237         hur->hur_user_item[0].hui_extent.offset = offset;
5238         hur->hur_user_item[0].hui_extent.length = length;
5239         hur->hur_request.hr_itemcount = 1;
5240         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5241                            len, hur, NULL);
5242         OBD_FREE(hur, len);
5243         RETURN(rc);
5244 }