lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
             /* Passed as the "data" argument of ll_close_inode_openhandle()
              * when the close bias is MDS_CLOSE_LAYOUT_SPLIT. */
  57         struct inode    *sp_inode;      /* its FID is packed as op_fid2 */
  58         __u16           sp_mirror_id;   /* copied into op_mirror_id */
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
     /* Forward declaration: ll_md_close() calls this to drop a group lock that
      * is still held on the file being closed. */
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
     /* Forward declaration: ll_md_close() calls this to release a lease that is
      * still attached to the file descriptor being closed. */
  66
  67 /* Allocate one ll_file_data from ll_file_data_slab; NULL if that fails. */
  68 static struct ll_file_data *ll_file_data_get(void)
  69 {
  70         struct ll_file_data *lfd;
  71
  72         OBD_SLAB_ALLOC_PTR_GFP(lfd, ll_file_data_slab, GFP_NOFS);
  73         if (lfd != NULL)
  74                 /* a fresh descriptor starts with no write failure recorded */
  75                 lfd->fd_write_failed = false;
  76
  77         return lfd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (!fd)        /* nothing was allocated, nothing to release */
  83                 return;
  84         OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  85 }
  85
  86 /**
  87  * Packs the attributes of @inode and the open handle of @och into @op_data
  88  * for the CLOSE rpc.
  89  */
  90 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  91                              struct obd_client_handle *och)
  92 {
  93         struct ll_inode_info *lli = ll_i2info(inode);
  94         ENTRY;
  95
  96         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  97                            0, 0, LUSTRE_OPC_ANY, NULL);
  98
  99         op_data->op_attr.ia_size = i_size_read(inode);
 100         op_data->op_attr.ia_mode = inode->i_mode;
 101         op_data->op_attr.ia_ctime = inode->i_ctime;
 102         op_data->op_attr.ia_mtime = inode->i_mtime;
 103         op_data->op_attr.ia_atime = inode->i_atime;
 104         op_data->op_attr_blocks = inode->i_blocks;
 105         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_CTIME |
 106                                      ATTR_MTIME | ATTR_MTIME_SET |
 107                                      ATTR_ATIME | ATTR_ATIME_SET;
 108         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 109         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 110         if (ll_file_test_flag(lli, LLIF_PROJECT_INHERIT))
 111                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 112         op_data->op_open_handle = och->och_open_handle;
 113         /* For HSM: if the data of a file opened for write has been modified,
 114          * pack that so the MDT can set the data dirty flag in the archive. */
 115         if ((och->och_flags & FMODE_WRITE) &&
 116             ll_file_test_and_clear_flag(lli, LLIF_DATA_MODIFIED))
 117                 op_data->op_bias |= MDS_DATA_MODIFIED;
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
      * If \a bias is MDS_CLOSE_LAYOUT_MERGE then \a data is a pointer to the
      * victim inode whose blocks are merged.
      * If \a bias is MDS_CLOSE_LAYOUT_SPLIT then \a data is a struct split_param.
      * If \a bias is MDS_CLOSE_RESYNC_DONE then \a data is a struct ll_ioc_lease.
      * For any other \a bias, \a data must be NULL.
      *
      * \a och is consumed on every path: its open replay data is cleared and it
      * is freed before returning.
      *
      * \retval 0 on success, negative errno on failure; -EBUSY if the MDT did not
      *         execute the close intent, -EPROTO if the reply carries no body.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_get_fsname(inode->i_sb, NULL, 0),
 143                        PFID(&lli->lli_fid));
 144                 GOTO(out, rc = 0);
 145         }
 146
 147         OBD_ALLOC_PTR(op_data);
 148         /* We leak openhandle and request here on error, but not much to be
 149          * done in OOM case since app won't retry close on error either. */
 150         if (op_data == NULL)
 151                 GOTO(out, rc = -ENOMEM);
 152
 153         ll_prepare_close(inode, op_data, och);
 154         switch (bias) {
 155         case MDS_CLOSE_LAYOUT_MERGE:
                     /* check before the dereference below, not only after the
                      * fallthrough */
                     LASSERT(data != NULL);
 156                 /* merge blocks from the victim inode */
 157                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 158                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 159                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
                     /* fallthrough: a merge also sets bias, lease and op_fid2 */
 160         case MDS_CLOSE_LAYOUT_SPLIT:
 161         case MDS_CLOSE_LAYOUT_SWAP: {
 162                 struct split_param *sp = data;
 163
 164                 LASSERT(data != NULL);
 165                 op_data->op_bias |= bias;
 166                 op_data->op_data_version = 0;
 167                 op_data->op_lease_handle = och->och_lease_handle;
 168                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 169                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 170                         op_data->op_mirror_id = sp->sp_mirror_id;
 171                 } else {
                             /* swap and merge pass the other inode directly */
 172                         op_data->op_fid2 = *ll_inode2fid(data);
 173                 }
 174                 break;
 175         }
 176
 177         case MDS_CLOSE_RESYNC_DONE: {
 178                 struct ll_ioc_lease *ioc = data;
 179
 180                 LASSERT(data != NULL);
 181                 op_data->op_attr_blocks +=
 182                         ioc->lil_count * op_data->op_attr_blocks;
 183                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 184                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 185                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 186
 187                 op_data->op_lease_handle = och->och_lease_handle;
                     /* the id array of the ioctl is sent as request payload */
 188                 op_data->op_data = &ioc->lil_ids[0];
 189                 op_data->op_data_size =
 190                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 191                 break;
 192         }
 193
 194         case MDS_HSM_RELEASE:
 195                 LASSERT(data != NULL);
 196                 op_data->op_bias |= MDS_HSM_RELEASE;
 197                 op_data->op_data_version = *(__u64 *)data;
 198                 op_data->op_lease_handle = och->och_lease_handle;
 199                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 200                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 201                 break;
 202
 203         default:
 204                 LASSERT(data == NULL);
 205                 break;
 206         }
 207
             /* size/blocks that no case above marked as valid are flagged lazy */
 208         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 209                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 210         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 211                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 212
 213         rc = md_close(md_exp, op_data, och->och_mod, &req);
 214         if (rc != 0 && rc != -EINTR)
 215                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 216                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 217
             /* for a biased close, check that the MDT executed the intent */
 218         if (rc == 0 && op_data->op_bias & bias) {
 219                 struct mdt_body *body;
 220
 221                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                     /* req_capsule_server_get() may return NULL, do not
                      * dereference a missing reply body */
                     if (body == NULL)
                             rc = -EPROTO;
 222                 else if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 223                         rc = -EBUSY;
 224         }
 225
 226         ll_finish_md_op_data(op_data);
 227         EXIT;
 228 out:
 229
             /* the open handle is gone whatever the outcome of the close */
 230         md_clear_open_replay_data(md_exp, och);
 231         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 232         OBD_FREE_PTR(och);
 233
 234         ptlrpc_req_finished(req);       /* This is close request */
 235         return rc;
 236 }
 237
 238 /* Close the MDS open handle cached in @inode for the access mode selected by
 239  * @fmode (write, else exec, else read), unless open files still use it.
 240  * Returns 0 when there was nothing to close, else the result of the close. */
 241 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 242 {
 243         struct ll_inode_info *lli = ll_i2info(inode);
 244         struct obd_client_handle **slot;
 245         struct obd_client_handle *handle = NULL;
 246         __u64 *users;
 247         ENTRY;
 248
 249         if (fmode & FMODE_WRITE) {
 250                 slot = &lli->lli_mds_write_och;
 251                 users = &lli->lli_open_fd_write_count;
 252         } else if (fmode & FMODE_EXEC) {
 253                 slot = &lli->lli_mds_exec_och;
 254                 users = &lli->lli_open_fd_exec_count;
 255         } else {
 256                 LASSERT(fmode & FMODE_READ);
 257                 slot = &lli->lli_mds_read_och;
 258                 users = &lli->lli_open_fd_read_count;
 259         }
 260
 261         /* Detach the handle only when no open file still uses it. */
 262         mutex_lock(&lli->lli_och_mutex);
 263         if (*users == 0) {
 264                 handle = *slot;
 265                 *slot = NULL;
 266         }
 267         mutex_unlock(&lli->lli_och_mutex);
 268
 269         /* There might be a race and the handle may already be closed. */
 270         if (handle == NULL)
 271                 RETURN(0);
 272
 273         RETURN(ll_close_inode_openhandle(inode, handle, 0, NULL));
 274 }
 279
 280 static int ll_md_close(struct inode *inode, struct file *file)
 281 {
             /* Tear down the per-open state of @file: drop a held group lock,
              * release a lease that is still attached, then either close the
              * open handle private to this descriptor (fd_och) or give back one
              * use of the open handle shared through the inode.  The
              * ll_file_data is always detached from @file and freed.
              * Returns the result of the last close attempted, 0 if none. */
 282         union ldlm_policy_data policy = {
 283                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 284         };
 285         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 286         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 287         struct ll_inode_info *lli = ll_i2info(inode);
 288         struct lustre_handle lockh;
 289         enum ldlm_mode lockmode;
 290         int rc = 0;
 291         ENTRY;
 292
 293         /* clear group lock, if present */
 294         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 295                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 296
 297         if (fd->fd_lease_och != NULL) {
 298                 bool lease_broken;
 299
 300                 /* Usually the lease is not released when the
 301                  * application crashed, we need to release here. */
 302                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
                     /* NOTE(review): lease_broken is printed even when
                      * ll_lease_close() fails; confirm it is always set. */
 303                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 304                         PFID(&lli->lli_fid), rc, lease_broken);
 305
 306                 fd->fd_lease_och = NULL;
 307         }
 308
             /* A descriptor with its own open handle closes it directly and
              * does not touch the per-inode use counts below. */
 309         if (fd->fd_och != NULL) {
 310                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 311                 fd->fd_och = NULL;
 312                 GOTO(out, rc);
 313         }
 314
 315         /* Let's see if we have good enough OPEN lock on the file and if
 316            we can skip talking to MDS */
 317         mutex_lock(&lli->lli_och_mutex);
             /* give back this descriptor's use of the shared handle and pick
              * the lock mode to look for, depending on the open mode */
 318         if (fd->fd_omode & FMODE_WRITE) {
 319                 lockmode = LCK_CW;
 320                 LASSERT(lli->lli_open_fd_write_count);
 321                 lli->lli_open_fd_write_count--;
 322         } else if (fd->fd_omode & FMODE_EXEC) {
 323                 lockmode = LCK_PR;
 324                 LASSERT(lli->lli_open_fd_exec_count);
 325                 lli->lli_open_fd_exec_count--;
 326         } else {
 327                 lockmode = LCK_CR;
 328                 LASSERT(lli->lli_open_fd_read_count);
 329                 lli->lli_open_fd_read_count--;
 330         }
 331         mutex_unlock(&lli->lli_och_mutex);
 332
             /* no matching OPEN lock: do the real close (which itself does
              * nothing while the use count is still positive) */
 333         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 334                            LDLM_IBITS, &policy, lockmode, &lockh))
 335                 rc = ll_md_real_close(inode, fd->fd_omode);
 336
 337 out:
 338         LUSTRE_FPRIVATE(file) = NULL;
 339         ll_file_data_put(fd);
 340
 341         RETURN(rc);
 342 }
 343
 344 /* Release @file: drop everything that was attached to it at open time.
 345  *
 346  * fput(), our caller, cannot pass the error code on, and applications rarely
 347  * check close() errors and never retry the call, so clean up all state here.
 348  */
 349 int ll_file_release(struct inode *inode, struct file *file)
 350 {
 351         struct ll_inode_info *lli = ll_i2info(inode);
 352         struct ll_sb_info *sbi = ll_i2sbi(inode);
 353         struct ll_file_data *fd;
 354         bool is_root;
 355         int rc;
 356         ENTRY;
 357
 358         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 359                PFID(ll_inode2fid(inode)), inode);
 360
 361         is_root = inode->i_sb->s_root == file_dentry(file);
 362         if (!is_root)
 363                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 364         fd = LUSTRE_FPRIVATE(file);
 365         LASSERT(fd != NULL);
 366
 367         /* The last ref on @file, maybe not the owner pid of statahead,
 368          * because parent and child process can share the same file handle. */
 369         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 370                 ll_deauthorize_statahead(inode, fd);
 371
 372         if (is_root) {
 373                 LUSTRE_FPRIVATE(file) = NULL;
 374                 ll_file_data_put(fd);
 375                 RETURN(0);
 376         }
 377
 378         if (!S_ISDIR(inode->i_mode)) {
 379                 if (lli->lli_clob != NULL)
 380                         lov_read_and_clear_async_rc(lli->lli_clob);
 381                 lli->lli_async_rc = 0;
 382         }
 383
 384         rc = ll_md_close(inode, file);
 385         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 386                 libcfs_debug_dumplog();
 387         RETURN(rc);
 388 }
 389
 390 /* read_cache_page() filler: populate @page from the niobuf_local in @data. */
 391 static inline int ll_dom_readpage(void *data, struct page *page)
 392 {
 393         struct niobuf_local *src = data;
 394         void *dst = ll_kmap_atomic(page, KM_USER0);
 395
 396         memcpy(dst, src->lnb_data, src->lnb_len);
 397         /* zero the part of the page that the data does not cover */
 398         if (src->lnb_len < PAGE_SIZE)
 399                 memset(dst + src->lnb_len, 0, PAGE_SIZE - src->lnb_len);
 400         flush_dcache_page(page);
 401         SetPageUptodate(page);
 402         ll_kunmap_atomic(dst, KM_USER0);
 403         unlock_page(page);
 404
 405         return 0;
 406 }
 407
 408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 409                         struct lookup_intent *it)
 410 {
             /* If the lock granted with @it is a DoM lock (ldlm_has_dom()) and
              * the open reply @req carries inline data (RMF_NIOBUF_INLINE),
              * copy that data into the page cache of @inode.  Nothing is
              * returned: a page that cannot be filled is logged and stops the
              * copy. */
 411         struct ll_inode_info *lli = ll_i2info(inode);
 412         struct cl_object *obj = lli->lli_clob;
 413         struct address_space *mapping = inode->i_mapping;
 414         struct page *vmpage;
 415         struct niobuf_remote *rnb;
 416         char *data;
 417         struct lustre_handle lockh;
 418         struct ldlm_lock *lock;
 419         unsigned long index, start;
 420         struct niobuf_local lnb;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
                     /* the handle may not resolve to a lock; only drop the
                      * reference when one was actually obtained */
 431                 if (lock != NULL) {
 432                         dom_lock = ldlm_has_dom(lock);
 433                         LDLM_LOCK_PUT(lock);
                     }
 434         }
 435         if (!dom_lock)
 436                 RETURN_EXIT;
 437
 438         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 439                                    RCL_SERVER))
 440                 RETURN_EXIT;
 441
 442         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 443         if (rnb == NULL || rnb->rnb_len == 0)
 444                 RETURN_EXIT;
 445
 446         CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
 447                rnb->rnb_len, i_size_read(inode));
 448
             /* the file data follows the niobuf_remote header in the reply */
 449         data = (char *)rnb + sizeof(*rnb);
 450
 451         lnb.lnb_file_offset = rnb->rnb_offset;
 452         start = lnb.lnb_file_offset / PAGE_SIZE;
 453         index = 0;
 454         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 455         lnb.lnb_page_offset = 0;
 456         do {
                     /* lnb describes the slice of the inline buffer that goes
                      * into page (index + start), at most PAGE_SIZE bytes */
 457                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 458                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 459                 if (lnb.lnb_len > PAGE_SIZE)
 460                         lnb.lnb_len = PAGE_SIZE;
 461
 462                 vmpage = read_cache_page(mapping, index + start,
 463                                          ll_dom_readpage, &lnb);
 464                 if (IS_ERR(vmpage)) {
 465                         CWARN("%s: cannot fill page %lu for "DFID
 466                               " with data: rc = %li\n",
 467                               ll_get_fsname(inode->i_sb, NULL, 0),
 468                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 469                               PTR_ERR(vmpage));
 470                         break;
 471                 }
 472                 put_page(vmpage);
 473                 index++;
 474         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 475         EXIT;
 476 }
 477
 478 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 479                                 struct lookup_intent *itp)
 480 {
             /* Send the open intent @itp for @de (MDS_OPEN_BY_FID must be set)
              * through md_intent_lock() and, on success, update the inode from
              * the reply.  @lmm/@lmmsize are handed over as the op_data payload.
              * The request reference and the intent lock are dropped before
              * returning.  Returns 0 or a negative errno; -ENOENT is reported
              * as -ESTALE for IT_CREAT intents. */
 481         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 482         struct dentry *parent = de->d_parent;
 483         const char *name = NULL;
 484         int len = 0;
 485         struct md_op_data *op_data;
 486         struct ptlrpc_request *req = NULL;
 487         int rc;
 488         ENTRY;
 489
 490         LASSERT(parent != NULL);
 491         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 492
 493         /* if server supports open-by-fid, or file name is invalid, don't pack
 494          * name in open request */
 495         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 496             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 497                 name = de->d_name.name;
 498                 len = de->d_name.len;
 499         }
 500
 501         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 502                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 503         if (IS_ERR(op_data))
 504                 RETURN(PTR_ERR(op_data));
 505         op_data->op_data = lmm;
 506         op_data->op_data_size = lmmsize;
 507
 508         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 509                             &ll_md_blocking_ast, 0);
 510         ll_finish_md_op_data(op_data);
 511         if (rc == -ESTALE) {
 512                 /* reason for keep own exit path - don`t flood log
 513                  * with messages with -ESTALE errors.
 514                  */
                     /* only when the open itself was executed without error is
                      * there an open handle to release before bailing out */
 515                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 516                      it_open_error(DISP_OPEN_OPEN, itp))
 517                         GOTO(out, rc);
 518                 ll_release_openhandle(de, itp);
 519                 GOTO(out, rc);
 520         }
 521
 522         if (it_disposition(itp, DISP_LOOKUP_NEG))
 523                 GOTO(out, rc = -ENOENT);
 524
             /* an enqueue error takes precedence over the open error */
 525         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 526                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 527                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 528                 GOTO(out, rc);
 529         }
 530
 531         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 532
             /* a lock was granted with the open: pull in inline data carried
              * by the reply, if any, and attach the inode to the lock */
 533         if (!rc && itp->it_lock_mode) {
 534                 ll_dom_finish_open(de->d_inode, req, itp);
 535                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 536         }
 537
 538 out:
 539         ptlrpc_req_finished(req);
 540         ll_intent_drop_lock(itp);
 541
 542         /* We did open by fid, but by the time we got to the server,
 543          * the object disappeared. If this is a create, we cannot really
 544          * tell the userspace that the file it was trying to create
 545          * does not exist. Instead let's return -ESTALE, and the VFS will
 546          * retry the create with LOOKUP_REVAL that we are going to catch
 547          * in ll_revalidate_dentry() and use lookup then.
 548          */
 549         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 550                 rc = -ESTALE;
 551
 552         RETURN(rc);
 553 }
 554
 555 /* Populate @och from the reply body and the lock handle found in @it. */
 556 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 557                        struct obd_client_handle *och)
 558 {
 559         struct mdt_body *reply =
 560                 req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 561
 562         och->och_fid = reply->mbo_fid1;
 563         och->och_open_handle = reply->mbo_open_handle;
 564         och->och_flags = it->it_flags;
 565         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 566         och->och_lease_handle.cookie = it->it_lock_handle;
 567         return md_set_open_replay_data(md_exp, och, it);
 568 }
 569
 570 /* Attach @fd to @file as its private data.  When @och is given it is filled
 571  * from @it first, and a failure there is returned with @file left untouched. */
 572 static int ll_local_open(struct file *file, struct lookup_intent *it,
 573                          struct ll_file_data *fd, struct obd_client_handle *och)
 574 {
 575         struct inode *inode = file_inode(file);
 576         int rc = 0;
 577         ENTRY;
 578
 579         LASSERT(!LUSTRE_FPRIVATE(file));
 580         LASSERT(fd != NULL);
 581
 582         if (och != NULL)
 583                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 584         if (rc != 0)
 585                 RETURN(rc);
 586
 587         LUSTRE_FPRIVATE(file) = fd;
 588         ll_readahead_init(inode, &fd->fd_ras);
 589         /* keep only the access mode bits of the intent flags */
 590         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 591
 592         /* ll_cl_context initialize */
 593         INIT_LIST_HEAD(&fd->fd_lccs);
 594         rwlock_init(&fd->fd_lock);
 595
 596         RETURN(0);
 597 }
 598
 599 /* Open a file, and (for the very first open) create objects on the OSTs at
 600  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 601  * creation or open until ll_lov_setstripe() ioctl is called.
 602  *
 603  * If we already have the stripe MD locally then we don't request it in
 604  * md_open(), by passing a lmm_size = 0.
 605  *
 606  * It is up to the application to ensure no other processes open this file
 607  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 608  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 609  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 610  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
      *
      * An intent found in file->private_data on entry is used; when there is
      * none, or it carries no disposition, a local IT_OPEN intent is built from
      * file->f_flags instead.
      *
      * Returns 0 on success or a negative errno.
 611  */
 612 int ll_file_open(struct inode *inode, struct file *file)
 613 {
 614         struct ll_inode_info *lli = ll_i2info(inode);
 615         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 616                                           .it_flags = file->f_flags };
 617         struct obd_client_handle **och_p = NULL;
 618         __u64 *och_usecount = NULL;
 619         struct ll_file_data *fd;
 620         int rc = 0;
 621         ENTRY;
 622
 623         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 624                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 625
 626         it = file->private_data; /* XXX: compat macro */
 627         file->private_data = NULL; /* prevent ll_local_open assertion */
 628
 629         fd = ll_file_data_get();
 630         if (fd == NULL)
 631                 GOTO(out_nofiledata, rc = -ENOMEM);
 632
 633         fd->fd_file = file;
 634         if (S_ISDIR(inode->i_mode))
 635                 ll_authorize_statahead(inode, fd);
 636
             /* The filesystem root only gets its private data, no open handle.
              * NOTE(review): this early RETURN skips the out_nofiledata cleanup
              * of a request reference held by @it; confirm the root is never
              * opened with such an intent. */
 637         if (inode->i_sb->s_root == file_dentry(file)) {
 638                 LUSTRE_FPRIVATE(file) = fd;
 639                 RETURN(0);
 640         }
 641
 642         if (!it || !it->it_disposition) {
 643                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 644                  * because everything but O_ACCMODE mask was stripped from
 645                  * there */
 646                 if ((oit.it_flags + 1) & O_ACCMODE)
 647                         oit.it_flags++;
 648                 if (file->f_flags & O_TRUNC)
 649                         oit.it_flags |= FMODE_WRITE;
 650
 651                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 652                  * dentry_open after call to open_namei that checks permissions.
 653                  * Only nfsd_open call dentry_open directly without checking
 654                  * permissions and because of that this code below is safe.
 655                  */
 656                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 657                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 658
 659                 /* We do not want O_EXCL here, presumably we opened the file
 660                  * already? XXX - NFS implications? */
 661                 oit.it_flags &= ~O_EXCL;
 662
 663                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 664                  * created if necessary, then "IT_CREAT" should be set to keep
 665                  * consistent with it */
 666                 if (oit.it_flags & O_CREAT)
 667                         oit.it_op |= IT_CREAT;
 668
 669                 it = &oit;
 670         }
 671
 672 restart:
 673         /* Let's see if we have file open on MDS already. */
 674         if (it->it_flags & FMODE_WRITE) {
 675                 och_p = &lli->lli_mds_write_och;
 676                 och_usecount = &lli->lli_open_fd_write_count;
 677         } else if (it->it_flags & FMODE_EXEC) {
 678                 och_p = &lli->lli_mds_exec_och;
 679                 och_usecount = &lli->lli_open_fd_exec_count;
 680          } else {
 681                 och_p = &lli->lli_mds_read_och;
 682                 och_usecount = &lli->lli_open_fd_read_count;
 683         }
 684
 685         mutex_lock(&lli->lli_och_mutex);
 686         if (*och_p) { /* Open handle is present */
 687                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 688                         /* Well, there's extra open request that we do not need,
 689                            let's close it somehow. This will decref request. */
 690                         rc = it_open_error(DISP_OPEN_OPEN, it);
 691                         if (rc) {
 692                                 mutex_unlock(&lli->lli_och_mutex);
 693                                 GOTO(out_openerr, rc);
 694                         }
 695
 696                         ll_release_openhandle(file_dentry(file), it);
 697                 }
                     /* share the existing handle: just account one more user */
 698                 (*och_usecount)++;
 699
 700                 rc = ll_local_open(file, it, fd, NULL);
 701                 if (rc) {
 702                         (*och_usecount)--;
 703                         mutex_unlock(&lli->lli_och_mutex);
 704                         GOTO(out_openerr, rc);
 705                 }
 706         } else {
 707                 LASSERT(*och_usecount == 0);
 708                 if (!it->it_disposition) {
 709                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 710                         /* We cannot just request lock handle now, new ELC code
 711                            means that one of other OPEN locks for this file
 712                            could be cancelled, and since blocking ast handler
 713                            would attempt to grab och_mutex as well, that would
 714                            result in a deadlock */
 715                         mutex_unlock(&lli->lli_och_mutex);
 716                         /*
 717                          * Normally called under two situations:
 718                          * 1. NFS export.
 719                          * 2. A race/condition on MDS resulting in no open
 720                          *    handle to be returned from LOOKUP|OPEN request,
 721                          *    for example if the target entry was a symlink.
 722                          *
 723                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 724                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 725                          *  bit so that it's not confusing later callers.
 726                          *
 727                          *  NB; when ldd is NULL, it must have come via normal
 728                          *  lookup path only, since ll_iget_for_nfs always calls
 729                          *  ll_d_init().
 730                          */
 731                         if (ldd && ldd->lld_nfs_dentry) {
 732                                 ldd->lld_nfs_dentry = 0;
 733                                 it->it_flags |= MDS_OPEN_LOCK;
 734                         }
 735
 736                         /*
 737                          * Always specify MDS_OPEN_BY_FID because we don't want
 738                          * to get file with different fid.
 739                          */
 740                         it->it_flags |= MDS_OPEN_BY_FID;
 741                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 742                                                  it);
 743                         if (rc)
 744                                 GOTO(out_openerr, rc);
 745
                             /* the intent now has a disposition: start over,
                              * the handle slot is looked up and locked again */
 746                         goto restart;
 747                 }
 748                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 749                 if (!*och_p)
 750                         GOTO(out_och_free, rc = -ENOMEM);
 751
 752                 (*och_usecount)++;
 753
 754                 /* md_intent_lock() didn't get a request ref if there was an
 755                  * open error, so don't do cleanup on the request here
 756                  * (bug 3430) */
 757                 /* XXX (green): Should not we bail out on any error here, not
 758                  * just open error? */
 759                 rc = it_open_error(DISP_OPEN_OPEN, it);
 760                 if (rc != 0)
 761                         GOTO(out_och_free, rc);
 762
 763                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 764                          "inode %p: disposition %x, status %d\n", inode,
 765                          it_disposition(it, ~0), it->it_status);
 766
 767                 rc = ll_local_open(file, it, fd, *och_p);
 768                 if (rc)
 769                         GOTO(out_och_free, rc);
 770         }
 771         mutex_unlock(&lli->lli_och_mutex);
             /* fd is now the private data of @file (set by ll_local_open());
              * clearing the local pointer keeps the code below from freeing it */
 772         fd = NULL;
 773
 774         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 775            different kind of OPEN lock for this same inode gets cancelled
 776            by ldlm_cancel_lru */
 777         if (!S_ISREG(inode->i_mode))
 778                 GOTO(out_och_free, rc);
 779
 780         cl_lov_delay_create_clear(&file->f_flags);
 781         GOTO(out_och_free, rc);
 782
 783 out_och_free:
             /* With rc != 0 this label is reached with lli_och_mutex held; a
              * handle allocated above is freed and its use count given back. */
 784         if (rc) {
 785                 if (och_p && *och_p) {
 786                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 787                         *och_p = NULL; /* OBD_FREE writes some magic there */
 788                         (*och_usecount)--;
 789                 }
 790                 mutex_unlock(&lli->lli_och_mutex);
 791
 792 out_openerr:
                     /* jumped to directly on errors hit without the mutex held,
                      * or after it has already been released */
 793                 if (lli->lli_opendir_key == fd)
 794                         ll_deauthorize_statahead(inode, fd);
 795                 if (fd != NULL)
 796                         ll_file_data_put(fd);
 797         } else {
 798                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 799         }
 800
 801 out_nofiledata:
             /* drop the request reference that is still held through @it */
 802         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 803                 ptlrpc_req_finished(it->it_request);
 804                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 805         }
 806
 807         return rc;
 808 }
 809
 810 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 811                         struct ldlm_lock_desc *desc, void *data, int flag)
 812 {
 813         int rc;
 814         struct lustre_handle lockh;
 815         ENTRY;
 816
 817         switch (flag) {
 818         case LDLM_CB_BLOCKING:
 819                 ldlm_lock2handle(lock, &lockh);
 820                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 821                 if (rc < 0) {
 822                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 823                         RETURN(rc);
 824                 }
 825                 break;
 826         case LDLM_CB_CANCELING:
 827                 /* do nothing */
 828                 break;
 829         }
 830         RETURN(0);
 831 }
 832
 833 /**
 834  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 835  * and save it as fd->fd_och so as to force client to reopen the file even
 836  * if it has an open lock in cache already.
 837  */
 838 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 839                                 struct lustre_handle *old_open_handle)
 840 {
 841         struct ll_inode_info *lli = ll_i2info(inode);
 842         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 843         struct obd_client_handle **och_p;
 844         __u64 *och_usecount;
 845         int rc = 0;
 846         ENTRY;
 847
 848         /* Get the openhandle of the file */
 849         mutex_lock(&lli->lli_och_mutex);
 850         if (fd->fd_lease_och != NULL)
 851                 GOTO(out_unlock, rc = -EBUSY);
 852
 853         if (fd->fd_och == NULL) {
 854                 if (file->f_mode & FMODE_WRITE) {
 855                         LASSERT(lli->lli_mds_write_och != NULL);
 856                         och_p = &lli->lli_mds_write_och;
 857                         och_usecount = &lli->lli_open_fd_write_count;
 858                 } else {
 859                         LASSERT(lli->lli_mds_read_och != NULL);
 860                         och_p = &lli->lli_mds_read_och;
 861                         och_usecount = &lli->lli_open_fd_read_count;
 862                 }
 863
 864                 if (*och_usecount > 1)
 865                         GOTO(out_unlock, rc = -EBUSY);
 866
 867                 fd->fd_och = *och_p;
 868                 *och_usecount = 0;
 869                 *och_p = NULL;
 870         }
 871
 872         *old_open_handle = fd->fd_och->och_open_handle;
 873
 874         EXIT;
 875 out_unlock:
 876         mutex_unlock(&lli->lli_och_mutex);
 877         return rc;
 878 }
 879
 880 /**
 881  * Release ownership on lli_mds_*_och when putting back a file lease.
 882  */
 883 static int ll_lease_och_release(struct inode *inode, struct file *file)
 884 {
 885         struct ll_inode_info *lli = ll_i2info(inode);
 886         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 887         struct obd_client_handle **och_p;
 888         struct obd_client_handle *old_och = NULL;
 889         __u64 *och_usecount;
 890         int rc = 0;
 891         ENTRY;
 892
 893         mutex_lock(&lli->lli_och_mutex);
 894         if (file->f_mode & FMODE_WRITE) {
 895                 och_p = &lli->lli_mds_write_och;
 896                 och_usecount = &lli->lli_open_fd_write_count;
 897         } else {
 898                 och_p = &lli->lli_mds_read_och;
 899                 och_usecount = &lli->lli_open_fd_read_count;
 900         }
 901
 902         /* The file may have been open by another process (broken lease) so
 903          * *och_p is not NULL. In this case we should simply increase usecount
 904          * and close fd_och.
 905          */
 906         if (*och_p != NULL) {
 907                 old_och = fd->fd_och;
 908                 (*och_usecount)++;
 909         } else {
 910                 *och_p = fd->fd_och;
 911                 *och_usecount = 1;
 912         }
 913         fd->fd_och = NULL;
 914         mutex_unlock(&lli->lli_och_mutex);
 915
 916         if (old_och != NULL)
 917                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 918
 919         RETURN(rc);
 920 }
 921
 922 /**
 923  * Acquire a lease and open the file.
 924  */
 925 static struct obd_client_handle *
 926 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 927               __u64 open_flags)
 928 {
 929         struct lookup_intent it = { .it_op = IT_OPEN };
 930         struct ll_sb_info *sbi = ll_i2sbi(inode);
 931         struct md_op_data *op_data;
 932         struct ptlrpc_request *req = NULL;
 933         struct lustre_handle old_open_handle = { 0 };
 934         struct obd_client_handle *och = NULL;
 935         int rc;
 936         int rc2;
 937         ENTRY;
 938
 939         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 940                 RETURN(ERR_PTR(-EINVAL));
 941
 942         if (file != NULL) {
 943                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 944                         RETURN(ERR_PTR(-EPERM));
 945
 946                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
 947                 if (rc)
 948                         RETURN(ERR_PTR(rc));
 949         }
 950
 951         OBD_ALLOC_PTR(och);
 952         if (och == NULL)
 953                 RETURN(ERR_PTR(-ENOMEM));
 954
 955         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 956                                         LUSTRE_OPC_ANY, NULL);
 957         if (IS_ERR(op_data))
 958                 GOTO(out, rc = PTR_ERR(op_data));
 959
 960         /* To tell the MDT this openhandle is from the same owner */
 961         op_data->op_open_handle = old_open_handle;
 962
 963         it.it_flags = fmode | open_flags;
 964         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 965         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 966                             &ll_md_blocking_lease_ast,
 967         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 968          * it can be cancelled which may mislead applications that the lease is
 969          * broken;
 970          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 971          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 972          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 973                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 974         ll_finish_md_op_data(op_data);
 975         ptlrpc_req_finished(req);
 976         if (rc < 0)
 977                 GOTO(out_release_it, rc);
 978
 979         if (it_disposition(&it, DISP_LOOKUP_NEG))
 980                 GOTO(out_release_it, rc = -ENOENT);
 981
 982         rc = it_open_error(DISP_OPEN_OPEN, &it);
 983         if (rc)
 984                 GOTO(out_release_it, rc);
 985
 986         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 987         ll_och_fill(sbi->ll_md_exp, &it, och);
 988
 989         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 990                 GOTO(out_close, rc = -EOPNOTSUPP);
 991
 992         /* already get lease, handle lease lock */
 993         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 994         if (it.it_lock_mode == 0 ||
 995             it.it_lock_bits != MDS_INODELOCK_OPEN) {
 996                 /* open lock must return for lease */
 997                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 998                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
 999                         it.it_lock_bits);
1000                 GOTO(out_close, rc = -EPROTO);
1001         }
1002
1003         ll_intent_release(&it);
1004         RETURN(och);
1005
1006 out_close:
1007         /* Cancel open lock */
1008         if (it.it_lock_mode != 0) {
1009                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1010                                             it.it_lock_mode);
1011                 it.it_lock_mode = 0;
1012                 och->och_lease_handle.cookie = 0ULL;
1013         }
1014         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1015         if (rc2 < 0)
1016                 CERROR("%s: error closing file "DFID": %d\n",
1017                        ll_get_fsname(inode->i_sb, NULL, 0),
1018                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1019         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1020 out_release_it:
1021         ll_intent_release(&it);
1022 out:
1023         if (och != NULL)
1024                 OBD_FREE_PTR(och);
1025         RETURN(ERR_PTR(rc));
1026 }
1027
1028 /**
1029  * Check whether a layout swap can be done between two inodes.
1030  *
1031  * \param[in] inode1  First inode to check
1032  * \param[in] inode2  Second inode to check
1033  *
1034  * \retval 0 on success, layout swap can be performed between both inodes
1035  * \retval negative error code if requirements are not met
1036  */
1037 static int ll_check_swap_layouts_validity(struct inode *inode1,
1038                                           struct inode *inode2)
1039 {
1040         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1041                 return -EINVAL;
1042
1043         if (inode_permission(inode1, MAY_WRITE) ||
1044             inode_permission(inode2, MAY_WRITE))
1045                 return -EPERM;
1046
1047         if (inode1->i_sb != inode2->i_sb)
1048                 return -EXDEV;
1049
1050         return 0;
1051 }
1052
1053 static int ll_swap_layouts_close(struct obd_client_handle *och,
1054                                  struct inode *inode, struct inode *inode2)
1055 {
1056         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1057         const struct lu_fid     *fid2;
1058         int                      rc;
1059         ENTRY;
1060
1061         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1062                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1063
1064         rc = ll_check_swap_layouts_validity(inode, inode2);
1065         if (rc < 0)
1066                 GOTO(out_free_och, rc);
1067
1068         /* We now know that inode2 is a lustre inode */
1069         fid2 = ll_inode2fid(inode2);
1070
1071         rc = lu_fid_cmp(fid1, fid2);
1072         if (rc == 0)
1073                 GOTO(out_free_och, rc = -EINVAL);
1074
1075         /* Close the file and {swap,merge} layouts between inode & inode2.
1076          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1077          * because we still need it to pack l_remote_handle to MDT. */
1078         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1079                                        inode2);
1080
1081         och = NULL; /* freed in ll_close_inode_openhandle() */
1082
1083 out_free_och:
1084         if (och != NULL)
1085                 OBD_FREE_PTR(och);
1086
1087         RETURN(rc);
1088 }
1089
1090 /**
1091  * Release lease and close the file.
1092  * It will check if the lease has ever broken.
1093  */
1094 static int ll_lease_close_intent(struct obd_client_handle *och,
1095                                  struct inode *inode,
1096                                  bool *lease_broken, enum mds_op_bias bias,
1097                                  void *data)
1098 {
1099         struct ldlm_lock *lock;
1100         bool cancelled = true;
1101         int rc;
1102         ENTRY;
1103
1104         lock = ldlm_handle2lock(&och->och_lease_handle);
1105         if (lock != NULL) {
1106                 lock_res_and_lock(lock);
1107                 cancelled = ldlm_is_cancel(lock);
1108                 unlock_res_and_lock(lock);
1109                 LDLM_LOCK_PUT(lock);
1110         }
1111
1112         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1113                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1114
1115         if (lease_broken != NULL)
1116                 *lease_broken = cancelled;
1117
1118         if (!cancelled && !bias)
1119                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1120
1121         if (cancelled) { /* no need to excute intent */
1122                 bias = 0;
1123                 data = NULL;
1124         }
1125
1126         rc = ll_close_inode_openhandle(inode, och, bias, data);
1127         RETURN(rc);
1128 }
1129
/**
 * Release a lease and close the file without any close intent.
 *
 * Thin wrapper around ll_lease_close_intent() with no bias and no data.
 *
 * \param[in]  och           open handle carrying the lease
 * \param[in]  inode         inode the lease was taken on
 * \param[out] lease_broken  forwarded to ll_lease_close_intent()
 *
 * \retval value returned by ll_lease_close_intent()
 */
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken)
{
        int rc;

        rc = ll_lease_close_intent(och, inode, lease_broken, 0, NULL);

        return rc;
}
1135
1136 /**
1137  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1138  */
1139 static int ll_lease_file_resync(struct obd_client_handle *och,
1140                                 struct inode *inode, unsigned long arg)
1141 {
1142         struct ll_sb_info *sbi = ll_i2sbi(inode);
1143         struct md_op_data *op_data;
1144         struct ll_ioc_lease_id ioc;
1145         __u64 data_version_unused;
1146         int rc;
1147         ENTRY;
1148
1149         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1150                                      LUSTRE_OPC_ANY, NULL);
1151         if (IS_ERR(op_data))
1152                 RETURN(PTR_ERR(op_data));
1153
1154         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1155                            sizeof(ioc)))
1156                 RETURN(-EFAULT);
1157
1158         /* before starting file resync, it's necessary to clean up page cache
1159          * in client memory, otherwise once the layout version is increased,
1160          * writing back cached data will be denied the OSTs. */
1161         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1162         if (rc)
1163                 GOTO(out, rc);
1164
1165         op_data->op_lease_handle = och->och_lease_handle;
1166         op_data->op_mirror_id = ioc.lil_mirror_id;
1167         rc = md_file_resync(sbi->ll_md_exp, op_data);
1168         if (rc)
1169                 GOTO(out, rc);
1170
1171         EXIT;
1172 out:
1173         ll_finish_md_op_data(op_data);
1174         return rc;
1175 }
1176
1177 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1178 {
1179         struct ll_inode_info *lli = ll_i2info(inode);
1180         struct cl_object *obj = lli->lli_clob;
1181         struct cl_attr *attr = vvp_env_thread_attr(env);
1182         s64 atime;
1183         s64 mtime;
1184         s64 ctime;
1185         int rc = 0;
1186
1187         ENTRY;
1188
1189         ll_inode_size_lock(inode);
1190
1191         /* Merge timestamps the most recently obtained from MDS with
1192          * timestamps obtained from OSTs.
1193          *
1194          * Do not overwrite atime of inode because it may be refreshed
1195          * by file_accessed() function. If the read was served by cache
1196          * data, there is no RPC to be sent so that atime may not be
1197          * transferred to OSTs at all. MDT only updates atime at close time
1198          * if it's at least 'mdd.*.atime_diff' older.
1199          * All in all, the atime in Lustre does not strictly comply with
1200          * POSIX. Solving this problem needs to send an RPC to MDT for each
1201          * read, this will hurt performance. */
1202         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1203                 LTIME_S(inode->i_atime) = lli->lli_atime;
1204                 lli->lli_update_atime = 0;
1205         }
1206         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1207         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1208
1209         atime = LTIME_S(inode->i_atime);
1210         mtime = LTIME_S(inode->i_mtime);
1211         ctime = LTIME_S(inode->i_ctime);
1212
1213         cl_object_attr_lock(obj);
1214         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1215                 rc = -EINVAL;
1216         else
1217                 rc = cl_object_attr_get(env, obj, attr);
1218         cl_object_attr_unlock(obj);
1219
1220         if (rc != 0)
1221                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1222
1223         if (atime < attr->cat_atime)
1224                 atime = attr->cat_atime;
1225
1226         if (ctime < attr->cat_ctime)
1227                 ctime = attr->cat_ctime;
1228
1229         if (mtime < attr->cat_mtime)
1230                 mtime = attr->cat_mtime;
1231
1232         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1233                PFID(&lli->lli_fid), attr->cat_size);
1234
1235         i_size_write(inode, attr->cat_size);
1236         inode->i_blocks = attr->cat_blocks;
1237
1238         LTIME_S(inode->i_atime) = atime;
1239         LTIME_S(inode->i_mtime) = mtime;
1240         LTIME_S(inode->i_ctime) = ctime;
1241
1242 out_size_unlock:
1243         ll_inode_size_unlock(inode);
1244
1245         RETURN(rc);
1246 }
1247
1248 /**
1249  * Set designated mirror for I/O.
1250  *
1251  * So far only read, write, and truncated can support to issue I/O to
1252  * designated mirror.
1253  */
1254 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1255 {
1256         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1257
1258         /* clear layout version for generic(non-resync) I/O in case it carries
1259          * stale layout version due to I/O restart */
1260         io->ci_layout_version = 0;
1261
1262         /* FLR: disable non-delay for designated mirror I/O because obviously
1263          * only one mirror is available */
1264         if (fd->fd_designated_mirror > 0) {
1265                 io->ci_ndelay = 0;
1266                 io->ci_designated_mirror = fd->fd_designated_mirror;
1267                 io->ci_layout_version = fd->fd_layout_version;
1268                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1269                                  * io to ptasks */
1270         }
1271
1272         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1273                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1274 }
1275
1276 static bool file_is_noatime(const struct file *file)
1277 {
1278         const struct vfsmount *mnt = file->f_path.mnt;
1279         const struct inode *inode = file_inode((struct file *)file);
1280
1281         /* Adapted from file_accessed() and touch_atime().*/
1282         if (file->f_flags & O_NOATIME)
1283                 return true;
1284
1285         if (inode->i_flags & S_NOATIME)
1286                 return true;
1287
1288         if (IS_NOATIME(inode))
1289                 return true;
1290
1291         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1292                 return true;
1293
1294         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1295                 return true;
1296
1297         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1298                 return true;
1299
1300         return false;
1301 }
1302
1303 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1304
1305 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1306 {
1307         struct inode *inode = file_inode(file);
1308         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1309
1310         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1311         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1312         io->u.ci_rw.rw_file = file;
1313         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1314         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1315         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1316
1317         if (iot == CIT_WRITE) {
1318                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1319                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1320                                            file->f_flags & O_DIRECT ||
1321                                            IS_SYNC(inode));
1322         }
1323         io->ci_obj = ll_i2info(inode)->lli_clob;
1324         io->ci_lockreq = CILR_MAYBE;
1325         if (ll_file_nolock(file)) {
1326                 io->ci_lockreq = CILR_NEVER;
1327                 io->ci_no_srvlock = 1;
1328         } else if (file->f_flags & O_APPEND) {
1329                 io->ci_lockreq = CILR_MANDATORY;
1330         }
1331         io->ci_noatime = file_is_noatime(file);
1332         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1333                 io->ci_pio = !io->u.ci_rw.rw_append;
1334         else
1335                 io->ci_pio = 0;
1336
1337         /* FLR: only use non-delay I/O for read as there is only one
1338          * avaliable mirror for write. */
1339         io->ci_ndelay = !(iot == CIT_WRITE);
1340
1341         ll_io_set_mirror(io, file);
1342 }
1343
1344 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1345 {
1346         struct cl_io_pt *pt = ptask->pt_cbdata;
1347         struct file *file = pt->cip_file;
1348         struct lu_env *env;
1349         struct cl_io *io;
1350         loff_t pos = pt->cip_pos;
1351         int rc;
1352         __u16 refcheck;
1353         ENTRY;
1354
1355         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1356                 file_dentry(file)->d_name.name,
1357                 pt->cip_iot == CIT_READ ? "read" : "write",
1358                 pos, pos + pt->cip_count);
1359
1360         env = cl_env_get(&refcheck);
1361         if (IS_ERR(env))
1362                 RETURN(PTR_ERR(env));
1363
1364         io = vvp_env_thread_io(env);
1365         ll_io_init(io, file, pt->cip_iot);
1366         io->u.ci_rw.rw_iter = pt->cip_iter;
1367         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1368         io->ci_pio = 0; /* It's already in parallel task */
1369
1370         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1371                            pt->cip_count - pt->cip_result);
1372         if (!rc) {
1373                 struct vvp_io *vio = vvp_env_io(env);
1374
1375                 vio->vui_io_subtype = IO_NORMAL;
1376                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1377
1378                 ll_cl_add(file, env, io, LCC_RW);
1379                 rc = cl_io_loop(env, io);
1380                 ll_cl_remove(file, env);
1381         } else {
1382                 /* cl_io_rw_init() handled IO */
1383                 rc = io->ci_result;
1384         }
1385
1386         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1387                 if (io->ci_nob > 0)
1388                         io->ci_nob /= 2;
1389                 rc = -EIO;
1390         }
1391
1392         if (io->ci_nob > 0) {
1393                 pt->cip_result += io->ci_nob;
1394                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1395                 pos += io->ci_nob;
1396                 pt->cip_iocb.ki_pos = pos;
1397 #ifdef HAVE_KIOCB_KI_LEFT
1398                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1399 #elif defined(HAVE_KI_NBYTES)
1400                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1401 #endif
1402         }
1403
1404         cl_io_fini(env, io);
1405         cl_env_put(env, &refcheck);
1406
1407         pt->cip_need_restart = io->ci_need_restart;
1408
1409         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1410                 file_dentry(file)->d_name.name,
1411                 pt->cip_iot == CIT_READ ? "read" : "write",
1412                 pt->cip_result, rc);
1413
1414         RETURN(pt->cip_result > 0 ? 0 : rc);
1415 }
1416
1417 static ssize_t
1418 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1419                    struct file *file, enum cl_io_type iot,
1420                    loff_t *ppos, size_t count)
1421 {
1422         struct range_lock       range;
1423         struct vvp_io           *vio = vvp_env_io(env);
1424         struct inode            *inode = file_inode(file);
1425         struct ll_inode_info    *lli = ll_i2info(inode);
1426         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1427         struct cl_io            *io;
1428         loff_t                  pos = *ppos;
1429         ssize_t                 result = 0;
1430         int                     rc = 0;
1431         unsigned                retried = 0;
1432         bool                    restarted = false;
1433
1434         ENTRY;
1435
1436         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1437                 file_dentry(file)->d_name.name,
1438                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1439
1440 restart:
1441         io = vvp_env_thread_io(env);
1442         ll_io_init(io, file, iot);
1443         if (args->via_io_subtype == IO_NORMAL) {
1444                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1445                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1446         }
1447         if (args->via_io_subtype != IO_NORMAL || restarted)
1448                 io->ci_pio = 0;
1449         io->ci_ndelay_tried = retried;
1450
1451         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1452                 bool range_locked = false;
1453
1454                 if (file->f_flags & O_APPEND)
1455                         range_lock_init(&range, 0, LUSTRE_EOF);
1456                 else
1457                         range_lock_init(&range, pos, pos + count - 1);
1458
1459                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1460                 vio->vui_io_subtype = args->via_io_subtype;
1461
1462                 switch (vio->vui_io_subtype) {
1463                 case IO_NORMAL:
1464                         /* Direct IO reads must also take range lock,
1465                          * or multiple reads will try to work on the same pages
1466                          * See LU-6227 for details. */
1467                         if (((iot == CIT_WRITE) ||
1468                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1469                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1470                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1471                                        RL_PARA(&range));
1472                                 rc = range_lock(&lli->lli_write_tree, &range);
1473                                 if (rc < 0)
1474                                         GOTO(out, rc);
1475
1476                                 range_locked = true;
1477                         }
1478                         break;
1479                 case IO_SPLICE:
1480                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1481                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1482                         break;
1483                 default:
1484                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1485                         LBUG();
1486                 }
1487
1488                 ll_cl_add(file, env, io, LCC_RW);
1489                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1490                     !lli->lli_inode_locked) {
1491                         inode_lock(inode);
1492                         lli->lli_inode_locked = 1;
1493                 }
1494                 rc = cl_io_loop(env, io);
1495                 if (lli->lli_inode_locked) {
1496                         lli->lli_inode_locked = 0;
1497                         inode_unlock(inode);
1498                 }
1499                 ll_cl_remove(file, env);
1500
1501                 if (range_locked) {
1502                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1503                                RL_PARA(&range));
1504                         range_unlock(&lli->lli_write_tree, &range);
1505                 }
1506         } else {
1507                 /* cl_io_rw_init() handled IO */
1508                 rc = io->ci_result;
1509         }
1510
1511         if (io->ci_nob > 0) {
1512                 result += io->ci_nob;
1513                 count  -= io->ci_nob;
1514
1515                 if (args->via_io_subtype == IO_NORMAL) {
1516                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1517
1518                         /* CLIO is too complicated. See LU-11069. */
1519                         if (cl_io_is_append(io))
1520                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1521                         else
1522                                 pos += io->ci_nob;
1523
1524                         args->u.normal.via_iocb->ki_pos = pos;
1525 #ifdef HAVE_KIOCB_KI_LEFT
1526                         args->u.normal.via_iocb->ki_left = count;
1527 #elif defined(HAVE_KI_NBYTES)
1528                         args->u.normal.via_iocb->ki_nbytes = count;
1529 #endif
1530                 } else {
1531                         /* for splice */
1532                         pos = io->u.ci_rw.rw_range.cir_pos;
1533                 }
1534         }
1535 out:
1536         cl_io_fini(env, io);
1537
1538         CDEBUG(D_VFSTRACE,
1539                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1540                file->f_path.dentry->d_name.name,
1541                iot, rc, result, io->ci_need_restart);
1542
1543         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1544                 CDEBUG(D_VFSTRACE,
1545                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1546                         file_dentry(file)->d_name.name,
1547                         iot == CIT_READ ? "read" : "write",
1548                         pos, pos + count, result, rc);
1549                 /* preserve the tried count for FLR */
1550                 retried = io->ci_ndelay_tried;
1551                 restarted = true;
1552                 goto restart;
1553         }
1554
1555         if (iot == CIT_READ) {
1556                 if (result > 0)
1557                         ll_stats_ops_tally(ll_i2sbi(inode),
1558                                            LPROC_LL_READ_BYTES, result);
1559         } else if (iot == CIT_WRITE) {
1560                 if (result > 0) {
1561                         ll_stats_ops_tally(ll_i2sbi(inode),
1562                                            LPROC_LL_WRITE_BYTES, result);
1563                         fd->fd_write_failed = false;
1564                 } else if (result == 0 && rc == 0) {
1565                         rc = io->ci_result;
1566                         if (rc < 0)
1567                                 fd->fd_write_failed = true;
1568                         else
1569                                 fd->fd_write_failed = false;
1570                 } else if (rc != -ERESTARTSYS) {
1571                         fd->fd_write_failed = true;
1572                 }
1573         }
1574
1575         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1576                 file_dentry(file)->d_name.name,
1577                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1578
1579         *ppos = pos;
1580
1581         RETURN(result > 0 ? result : rc);
1582 }
1583
1584 /**
1585  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1586  * especially for small I/O.
1587  *
1588  * To serve a read request, CLIO has to create and initialize a cl_io and
1589  * then request DLM lock. This has turned out to have siginificant overhead
1590  * and affects the performance of small I/O dramatically.
1591  *
1592  * It's not necessary to create a cl_io for each I/O. Under the help of read
1593  * ahead, most of the pages being read are already in memory cache and we can
1594  * read those pages directly because if the pages exist, the corresponding DLM
1595  * lock must exist so that page content must be valid.
1596  *
1597  * In fast read implementation, the llite speculatively finds and reads pages
1598  * in memory cache. There are three scenarios for fast read:
1599  *   - If the page exists and is uptodate, kernel VM will provide the data and
1600  *     CLIO won't be intervened;
1601  *   - If the page was brought into memory by read ahead, it will be exported
1602  *     and read ahead parameters will be updated;
1603  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1604  *     it will go back and invoke normal read, i.e., a cl_io will be created
1605  *     and DLM lock will be requested.
1606  *
1607  * POSIX compliance: posix standard states that read is intended to be atomic.
1608  * Lustre read implementation is in line with Linux kernel read implementation
1609  * and neither of them complies with POSIX standard in this matter. Fast read
1610  * doesn't make the situation worse on single node but it may interleave write
1611  * results from multiple nodes due to short read handling in ll_file_aio_read().
1612  *
1613  * \param env - lu_env
1614  * \param iocb - kiocb from kernel
1615  * \param iter - user space buffers where the data will be copied
1616  *
1617  * \retval - number of bytes have been read, or error code if error occurred.
1618  */
1619 static ssize_t
1620 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1621 {
1622         ssize_t result;
1623
1624         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1625                 return 0;
1626
1627         /* NB: we can't do direct IO for fast read because it will need a lock
1628          * to make IO engine happy. */
1629         if (iocb->ki_filp->f_flags & O_DIRECT)
1630                 return 0;
1631
1632         result = generic_file_read_iter(iocb, iter);
1633
1634         /* If the first page is not in cache, generic_file_aio_read() will be
1635          * returned with -ENODATA.
1636          * See corresponding code in ll_readpage(). */
1637         if (result == -ENODATA)
1638                 result = 0;
1639
1640         if (result > 0)
1641                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1642                                 LPROC_LL_READ_BYTES, result);
1643
1644         return result;
1645 }
1646
1647 /*
1648  * Read from a file (through the page cache).
1649  */
1650 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1651 {
1652         struct lu_env *env;
1653         struct vvp_io_args *args;
1654         ssize_t result;
1655         ssize_t rc2;
1656         __u16 refcheck;
1657
1658         result = ll_do_fast_read(iocb, to);
1659         if (result < 0 || iov_iter_count(to) == 0)
1660                 GOTO(out, result);
1661
1662         env = cl_env_get(&refcheck);
1663         if (IS_ERR(env))
1664                 return PTR_ERR(env);
1665
1666         args = ll_env_args(env, IO_NORMAL);
1667         args->u.normal.via_iter = to;
1668         args->u.normal.via_iocb = iocb;
1669
1670         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1671                                  &iocb->ki_pos, iov_iter_count(to));
1672         if (rc2 > 0)
1673                 result += rc2;
1674         else if (result == 0)
1675                 result = rc2;
1676
1677         cl_env_put(env, &refcheck);
1678 out:
1679         return result;
1680 }
1681
1682 /**
1683  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1684  * If a page is already in the page cache and dirty (and some other things -
1685  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1686  * write to it without doing a full I/O, because Lustre already knows about it
1687  * and will write it out.  This saves a lot of processing time.
1688  *
1689  * All writes here are within one page, so exclusion is handled by the page
1690  * lock on the vm page.  We do not do tiny writes for writes which touch
1691  * multiple pages because it's very unlikely multiple sequential pages are
1692  * are already dirty.
1693  *
1694  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1695  * and are unlikely to be to already dirty pages.
1696  *
1697  * Attribute updates are important here, we do them in ll_tiny_write_end.
1698  */
1699 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1700 {
1701         ssize_t count = iov_iter_count(iter);
1702         struct file *file = iocb->ki_filp;
1703         struct inode *inode = file_inode(file);
1704         ssize_t result = 0;
1705
1706         ENTRY;
1707
1708         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1709          * of function for why.
1710          */
1711         if (count >= PAGE_SIZE ||
1712             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1713                 RETURN(0);
1714
1715         result = __generic_file_write_iter(iocb, iter);
1716
1717         /* If the page is not already dirty, ll_tiny_write_begin returns
1718          * -ENODATA.  We continue on to normal write.
1719          */
1720         if (result == -ENODATA)
1721                 result = 0;
1722
1723         if (result > 0) {
1724                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1725                                    result);
1726                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1727         }
1728
1729         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1730
1731         RETURN(result);
1732 }
1733
1734 /*
1735  * Write to a file (through the page cache).
1736  */
1737 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1738 {
1739         struct vvp_io_args *args;
1740         struct lu_env *env;
1741         ssize_t rc_tiny = 0, rc_normal;
1742         __u16 refcheck;
1743
1744         ENTRY;
1745
1746         /* NB: we can't do direct IO for tiny writes because they use the page
1747          * cache, we can't do sync writes because tiny writes can't flush
1748          * pages, and we can't do append writes because we can't guarantee the
1749          * required DLM locks are held to protect file size.
1750          */
1751         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1752             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1753                 rc_tiny = ll_do_tiny_write(iocb, from);
1754
1755         /* In case of error, go on and try normal write - Only stop if tiny
1756          * write completed I/O.
1757          */
1758         if (iov_iter_count(from) == 0)
1759                 GOTO(out, rc_normal = rc_tiny);
1760
1761         env = cl_env_get(&refcheck);
1762         if (IS_ERR(env))
1763                 return PTR_ERR(env);
1764
1765         args = ll_env_args(env, IO_NORMAL);
1766         args->u.normal.via_iter = from;
1767         args->u.normal.via_iocb = iocb;
1768
1769         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1770                                     &iocb->ki_pos, iov_iter_count(from));
1771
1772         /* On success, combine bytes written. */
1773         if (rc_tiny >= 0 && rc_normal > 0)
1774                 rc_normal += rc_tiny;
1775         /* On error, only return error from normal write if tiny write did not
1776          * write any bytes.  Otherwise return bytes written by tiny write.
1777          */
1778         else if (rc_tiny > 0)
1779                 rc_normal = rc_tiny;
1780
1781         cl_env_put(env, &refcheck);
1782 out:
1783         RETURN(rc_normal);
1784 }
1785
1786 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1787 /*
1788  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1789  */
1790 static int ll_file_get_iov_count(const struct iovec *iov,
1791                                  unsigned long *nr_segs, size_t *count)
1792 {
1793         size_t cnt = 0;
1794         unsigned long seg;
1795
1796         for (seg = 0; seg < *nr_segs; seg++) {
1797                 const struct iovec *iv = &iov[seg];
1798
1799                 /*
1800                  * If any segment has a negative length, or the cumulative
1801                  * length ever wraps negative then return -EINVAL.
1802                  */
1803                 cnt += iv->iov_len;
1804                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1805                         return -EINVAL;
1806                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1807                         continue;
1808                 if (seg == 0)
1809                         return -EFAULT;
1810                 *nr_segs = seg;
1811                 cnt -= iv->iov_len;     /* This segment is no good */
1812                 break;
1813         }
1814         *count = cnt;
1815         return 0;
1816 }
1817
1818 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1819                                 unsigned long nr_segs, loff_t pos)
1820 {
1821         struct iov_iter to;
1822         size_t iov_count;
1823         ssize_t result;
1824         ENTRY;
1825
1826         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1827         if (result)
1828                 RETURN(result);
1829
1830 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1831         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1832 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1833         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1834 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1835
1836         result = ll_file_read_iter(iocb, &to);
1837
1838         RETURN(result);
1839 }
1840
1841 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1842                             loff_t *ppos)
1843 {
1844         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1845         struct kiocb   kiocb;
1846         ssize_t        result;
1847         ENTRY;
1848
1849         init_sync_kiocb(&kiocb, file);
1850         kiocb.ki_pos = *ppos;
1851 #ifdef HAVE_KIOCB_KI_LEFT
1852         kiocb.ki_left = count;
1853 #elif defined(HAVE_KI_NBYTES)
1854         kiocb.i_nbytes = count;
1855 #endif
1856
1857         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1858         *ppos = kiocb.ki_pos;
1859
1860         RETURN(result);
1861 }
1862
1863 /*
1864  * Write to a file (through the page cache).
1865  * AIO stuff
1866  */
1867 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1868                                  unsigned long nr_segs, loff_t pos)
1869 {
1870         struct iov_iter from;
1871         size_t iov_count;
1872         ssize_t result;
1873         ENTRY;
1874
1875         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1876         if (result)
1877                 RETURN(result);
1878
1879 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1880         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1881 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1882         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1883 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1884
1885         result = ll_file_write_iter(iocb, &from);
1886
1887         RETURN(result);
1888 }
1889
1890 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1891                              size_t count, loff_t *ppos)
1892 {
1893         struct iovec   iov = { .iov_base = (void __user *)buf,
1894                                .iov_len = count };
1895         struct kiocb   kiocb;
1896         ssize_t        result;
1897
1898         ENTRY;
1899
1900         init_sync_kiocb(&kiocb, file);
1901         kiocb.ki_pos = *ppos;
1902 #ifdef HAVE_KIOCB_KI_LEFT
1903         kiocb.ki_left = count;
1904 #elif defined(HAVE_KI_NBYTES)
1905         kiocb.ki_nbytes = count;
1906 #endif
1907
1908         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1909         *ppos = kiocb.ki_pos;
1910
1911         RETURN(result);
1912 }
1913 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1914
1915 /*
1916  * Send file content (through pagecache) somewhere with helper
1917  */
1918 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1919                                    struct pipe_inode_info *pipe, size_t count,
1920                                    unsigned int flags)
1921 {
1922         struct lu_env      *env;
1923         struct vvp_io_args *args;
1924         ssize_t             result;
1925         __u16               refcheck;
1926         ENTRY;
1927
1928         env = cl_env_get(&refcheck);
1929         if (IS_ERR(env))
1930                 RETURN(PTR_ERR(env));
1931
1932         args = ll_env_args(env, IO_SPLICE);
1933         args->u.splice.via_pipe = pipe;
1934         args->u.splice.via_flags = flags;
1935
1936         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1937         cl_env_put(env, &refcheck);
1938         RETURN(result);
1939 }
1940
1941 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1942                              __u64 flags, struct lov_user_md *lum, int lum_size)
1943 {
1944         struct lookup_intent oit = {
1945                 .it_op = IT_OPEN,
1946                 .it_flags = flags | MDS_OPEN_BY_FID,
1947         };
1948         int rc;
1949         ENTRY;
1950
1951         ll_inode_size_lock(inode);
1952         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1953         if (rc < 0)
1954                 GOTO(out_unlock, rc);
1955
1956         ll_release_openhandle(dentry, &oit);
1957
1958 out_unlock:
1959         ll_inode_size_unlock(inode);
1960         ll_intent_release(&oit);
1961
1962         RETURN(rc);
1963 }
1964
1965 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1966                              struct lov_mds_md **lmmp, int *lmm_size,
1967                              struct ptlrpc_request **request)
1968 {
1969         struct ll_sb_info *sbi = ll_i2sbi(inode);
1970         struct mdt_body  *body;
1971         struct lov_mds_md *lmm = NULL;
1972         struct ptlrpc_request *req = NULL;
1973         struct md_op_data *op_data;
1974         int rc, lmmsize;
1975
1976         rc = ll_get_default_mdsize(sbi, &lmmsize);
1977         if (rc)
1978                 RETURN(rc);
1979
1980         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1981                                      strlen(filename), lmmsize,
1982                                      LUSTRE_OPC_ANY, NULL);
1983         if (IS_ERR(op_data))
1984                 RETURN(PTR_ERR(op_data));
1985
1986         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1987         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1988         ll_finish_md_op_data(op_data);
1989         if (rc < 0) {
1990                 CDEBUG(D_INFO, "md_getattr_name failed "
1991                        "on %s: rc %d\n", filename, rc);
1992                 GOTO(out, rc);
1993         }
1994
1995         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1996         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1997
1998         lmmsize = body->mbo_eadatasize;
1999
2000         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2001                         lmmsize == 0) {
2002                 GOTO(out, rc = -ENODATA);
2003         }
2004
2005         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2006         LASSERT(lmm != NULL);
2007
2008         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2009             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2010             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2011                 GOTO(out, rc = -EPROTO);
2012
2013         /*
2014          * This is coming from the MDS, so is probably in
2015          * little endian.  We convert it to host endian before
2016          * passing it to userspace.
2017          */
2018         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2019                 int stripe_count;
2020
2021                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2022                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2023                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2024                         if (le32_to_cpu(lmm->lmm_pattern) &
2025                             LOV_PATTERN_F_RELEASED)
2026                                 stripe_count = 0;
2027                 }
2028
2029                 /* if function called for directory - we should
2030                  * avoid swab not existent lsm objects */
2031                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2032                         lustre_swab_lov_user_md_v1(
2033                                         (struct lov_user_md_v1 *)lmm);
2034                         if (S_ISREG(body->mbo_mode))
2035                                 lustre_swab_lov_user_md_objects(
2036                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2037                                     stripe_count);
2038                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2039                         lustre_swab_lov_user_md_v3(
2040                                         (struct lov_user_md_v3 *)lmm);
2041                         if (S_ISREG(body->mbo_mode))
2042                                 lustre_swab_lov_user_md_objects(
2043                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2044                                     stripe_count);
2045                 } else if (lmm->lmm_magic ==
2046                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2047                         lustre_swab_lov_comp_md_v1(
2048                                         (struct lov_comp_md_v1 *)lmm);
2049                 }
2050         }
2051
2052 out:
2053         *lmmp = lmm;
2054         *lmm_size = lmmsize;
2055         *request = req;
2056         return rc;
2057 }
2058
2059 static int ll_lov_setea(struct inode *inode, struct file *file,
2060                         void __user *arg)
2061 {
2062         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2063         struct lov_user_md      *lump;
2064         int                      lum_size = sizeof(struct lov_user_md) +
2065                                             sizeof(struct lov_user_ost_data);
2066         int                      rc;
2067         ENTRY;
2068
2069         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2070                 RETURN(-EPERM);
2071
2072         OBD_ALLOC_LARGE(lump, lum_size);
2073         if (lump == NULL)
2074                 RETURN(-ENOMEM);
2075
2076         if (copy_from_user(lump, arg, lum_size))
2077                 GOTO(out_lump, rc = -EFAULT);
2078
2079         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2080                                       lum_size);
2081         cl_lov_delay_create_clear(&file->f_flags);
2082
2083 out_lump:
2084         OBD_FREE_LARGE(lump, lum_size);
2085         RETURN(rc);
2086 }
2087
2088 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2089 {
2090         struct lu_env   *env;
2091         __u16           refcheck;
2092         int             rc;
2093         ENTRY;
2094
2095         env = cl_env_get(&refcheck);
2096         if (IS_ERR(env))
2097                 RETURN(PTR_ERR(env));
2098
2099         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2100         cl_env_put(env, &refcheck);
2101         RETURN(rc);
2102 }
2103
2104 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2105                             void __user *arg)
2106 {
2107         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2108         struct lov_user_md        *klum;
2109         int                        lum_size, rc;
2110         __u64                      flags = FMODE_WRITE;
2111         ENTRY;
2112
2113         rc = ll_copy_user_md(lum, &klum);
2114         if (rc < 0)
2115                 RETURN(rc);
2116
2117         lum_size = rc;
2118         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2119                                       lum_size);
2120         if (!rc) {
2121                 __u32 gen;
2122
2123                 rc = put_user(0, &lum->lmm_stripe_count);
2124                 if (rc)
2125                         GOTO(out, rc);
2126
2127                 rc = ll_layout_refresh(inode, &gen);
2128                 if (rc)
2129                         GOTO(out, rc);
2130
2131                 rc = ll_file_getstripe(inode, arg, lum_size);
2132         }
2133         cl_lov_delay_create_clear(&file->f_flags);
2134
2135 out:
2136         OBD_FREE(klum, lum_size);
2137         RETURN(rc);
2138 }
2139
2140 static int
2141 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2142 {
2143         struct ll_inode_info *lli = ll_i2info(inode);
2144         struct cl_object *obj = lli->lli_clob;
2145         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2146         struct ll_grouplock grouplock;
2147         int rc;
2148         ENTRY;
2149
2150         if (arg == 0) {
2151                 CWARN("group id for group lock must not be 0\n");
2152                 RETURN(-EINVAL);
2153         }
2154
2155         if (ll_file_nolock(file))
2156                 RETURN(-EOPNOTSUPP);
2157
2158         spin_lock(&lli->lli_lock);
2159         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2160                 CWARN("group lock already existed with gid %lu\n",
2161                       fd->fd_grouplock.lg_gid);
2162                 spin_unlock(&lli->lli_lock);
2163                 RETURN(-EINVAL);
2164         }
2165         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2166         spin_unlock(&lli->lli_lock);
2167
2168         /**
2169          * XXX: group lock needs to protect all OST objects while PFL
2170          * can add new OST objects during the IO, so we'd instantiate
2171          * all OST objects before getting its group lock.
2172          */
2173         if (obj) {
2174                 struct lu_env *env;
2175                 __u16 refcheck;
2176                 struct cl_layout cl = {
2177                         .cl_is_composite = false,
2178                 };
2179                 struct lu_extent ext = {
2180                         .e_start = 0,
2181                         .e_end = OBD_OBJECT_EOF,
2182                 };
2183
2184                 env = cl_env_get(&refcheck);
2185                 if (IS_ERR(env))
2186                         RETURN(PTR_ERR(env));
2187
2188                 rc = cl_object_layout_get(env, obj, &cl);
2189                 if (!rc && cl.cl_is_composite)
2190                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2191                                                     &ext);
2192
2193                 cl_env_put(env, &refcheck);
2194                 if (rc)
2195                         RETURN(rc);
2196         }
2197
2198         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2199                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2200         if (rc)
2201                 RETURN(rc);
2202
2203         spin_lock(&lli->lli_lock);
2204         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2205                 spin_unlock(&lli->lli_lock);
2206                 CERROR("another thread just won the race\n");
2207                 cl_put_grouplock(&grouplock);
2208                 RETURN(-EINVAL);
2209         }
2210
2211         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2212         fd->fd_grouplock = grouplock;
2213         spin_unlock(&lli->lli_lock);
2214
2215         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2216         RETURN(0);
2217 }
2218
2219 static int ll_put_grouplock(struct inode *inode, struct file *file,
2220                             unsigned long arg)
2221 {
2222         struct ll_inode_info   *lli = ll_i2info(inode);
2223         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2224         struct ll_grouplock     grouplock;
2225         ENTRY;
2226
2227         spin_lock(&lli->lli_lock);
2228         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2229                 spin_unlock(&lli->lli_lock);
2230                 CWARN("no group lock held\n");
2231                 RETURN(-EINVAL);
2232         }
2233
2234         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2235
2236         if (fd->fd_grouplock.lg_gid != arg) {
2237                 CWARN("group lock %lu doesn't match current id %lu\n",
2238                       arg, fd->fd_grouplock.lg_gid);
2239                 spin_unlock(&lli->lli_lock);
2240                 RETURN(-EINVAL);
2241         }
2242
2243         grouplock = fd->fd_grouplock;
2244         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2245         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2246         spin_unlock(&lli->lli_lock);
2247
2248         cl_put_grouplock(&grouplock);
2249         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2250         RETURN(0);
2251 }
2252
2253 /**
2254  * Close inode open handle
2255  *
2256  * \param dentry [in]     dentry which contains the inode
2257  * \param it     [in,out] intent which contains open info and result
2258  *
2259  * \retval 0     success
2260  * \retval <0    failure
2261  */
2262 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2263 {
2264         struct inode *inode = dentry->d_inode;
2265         struct obd_client_handle *och;
2266         int rc;
2267         ENTRY;
2268
2269         LASSERT(inode);
2270
2271         /* Root ? Do nothing. */
2272         if (dentry->d_inode->i_sb->s_root == dentry)
2273                 RETURN(0);
2274
2275         /* No open handle to close? Move away */
2276         if (!it_disposition(it, DISP_OPEN_OPEN))
2277                 RETURN(0);
2278
2279         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2280
2281         OBD_ALLOC(och, sizeof(*och));
2282         if (!och)
2283                 GOTO(out, rc = -ENOMEM);
2284
2285         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2286
2287         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2288 out:
2289         /* this one is in place of ll_file_open */
2290         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2291                 ptlrpc_req_finished(it->it_request);
2292                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2293         }
2294         RETURN(rc);
2295 }
2296
2297 /**
2298  * Get size for inode for which FIEMAP mapping is requested.
2299  * Make the FIEMAP get_info call and returns the result.
2300  * \param fiemap        kernel buffer to hold extens
2301  * \param num_bytes     kernel buffer size
2302  */
2303 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2304                         size_t num_bytes)
2305 {
2306         struct lu_env                   *env;
2307         __u16                           refcheck;
2308         int                             rc = 0;
2309         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2310         ENTRY;
2311
2312         /* Checks for fiemap flags */
2313         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2314                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2315                 return -EBADR;
2316         }
2317
2318         /* Check for FIEMAP_FLAG_SYNC */
2319         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2320                 rc = filemap_fdatawrite(inode->i_mapping);
2321                 if (rc)
2322                         return rc;
2323         }
2324
2325         env = cl_env_get(&refcheck);
2326         if (IS_ERR(env))
2327                 RETURN(PTR_ERR(env));
2328
2329         if (i_size_read(inode) == 0) {
2330                 rc = ll_glimpse_size(inode);
2331                 if (rc)
2332                         GOTO(out, rc);
2333         }
2334
2335         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2336         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2337         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2338
2339         /* If filesize is 0, then there would be no objects for mapping */
2340         if (fmkey.lfik_oa.o_size == 0) {
2341                 fiemap->fm_mapped_extents = 0;
2342                 GOTO(out, rc = 0);
2343         }
2344
2345         fmkey.lfik_fiemap = *fiemap;
2346
2347         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2348                               &fmkey, fiemap, &num_bytes);
2349 out:
2350         cl_env_put(env, &refcheck);
2351         RETURN(rc);
2352 }
2353
2354 int ll_fid2path(struct inode *inode, void __user *arg)
2355 {
2356         struct obd_export       *exp = ll_i2mdexp(inode);
2357         const struct getinfo_fid2path __user *gfin = arg;
2358         __u32                    pathlen;
2359         struct getinfo_fid2path *gfout;
2360         size_t                   outsize;
2361         int                      rc;
2362
2363         ENTRY;
2364
2365         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2366             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2367                 RETURN(-EPERM);
2368
2369         /* Only need to get the buflen */
2370         if (get_user(pathlen, &gfin->gf_pathlen))
2371                 RETURN(-EFAULT);
2372
2373         if (pathlen > PATH_MAX)
2374                 RETURN(-EINVAL);
2375
2376         outsize = sizeof(*gfout) + pathlen;
2377         OBD_ALLOC(gfout, outsize);
2378         if (gfout == NULL)
2379                 RETURN(-ENOMEM);
2380
2381         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2382                 GOTO(gf_free, rc = -EFAULT);
2383         /* append root FID after gfout to let MDT know the root FID so that it
2384          * can lookup the correct path, this is mainly for fileset.
2385          * old server without fileset mount support will ignore this. */
2386         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2387
2388         /* Call mdc_iocontrol */
2389         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2390         if (rc != 0)
2391                 GOTO(gf_free, rc);
2392
2393         if (copy_to_user(arg, gfout, outsize))
2394                 rc = -EFAULT;
2395
2396 gf_free:
2397         OBD_FREE(gfout, outsize);
2398         RETURN(rc);
2399 }
2400
2401 static int
2402 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2403 {
2404         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2405         struct lu_env *env;
2406         struct cl_io *io;
2407         __u16  refcheck;
2408         int result;
2409
2410         ENTRY;
2411
2412         ioc->idv_version = 0;
2413         ioc->idv_layout_version = UINT_MAX;
2414
2415         /* If no file object initialized, we consider its version is 0. */
2416         if (obj == NULL)
2417                 RETURN(0);
2418
2419         env = cl_env_get(&refcheck);
2420         if (IS_ERR(env))
2421                 RETURN(PTR_ERR(env));
2422
2423         io = vvp_env_thread_io(env);
2424         io->ci_obj = obj;
2425         io->u.ci_data_version.dv_data_version = 0;
2426         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2427         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2428
2429 restart:
2430         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2431                 result = cl_io_loop(env, io);
2432         else
2433                 result = io->ci_result;
2434
2435         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2436         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2437
2438         cl_io_fini(env, io);
2439
2440         if (unlikely(io->ci_need_restart))
2441                 goto restart;
2442
2443         cl_env_put(env, &refcheck);
2444
2445         RETURN(result);
2446 }
2447
2448 /*
2449  * Read the data_version for inode.
2450  *
2451  * This value is computed using stripe object version on OST.
2452  * Version is computed using server side locking.
2453  *
2454  * @param flags if do sync on the OST side;
2455  *              0: no sync
2456  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2457  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2458  */
2459 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2460 {
2461         struct ioc_data_version ioc = { .idv_flags = flags };
2462         int rc;
2463
2464         rc = ll_ioc_data_version(inode, &ioc);
2465         if (!rc)
2466                 *data_version = ioc.idv_version;
2467
2468         return rc;
2469 }
2470
2471 /*
2472  * Trigger a HSM release request for the provided inode.
2473  */
2474 int ll_hsm_release(struct inode *inode)
2475 {
2476         struct lu_env *env;
2477         struct obd_client_handle *och = NULL;
2478         __u64 data_version = 0;
2479         int rc;
2480         __u16 refcheck;
2481         ENTRY;
2482
2483         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2484                ll_get_fsname(inode->i_sb, NULL, 0),
2485                PFID(&ll_i2info(inode)->lli_fid));
2486
2487         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2488         if (IS_ERR(och))
2489                 GOTO(out, rc = PTR_ERR(och));
2490
2491         /* Grab latest data_version and [am]time values */
2492         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2493         if (rc != 0)
2494                 GOTO(out, rc);
2495
2496         env = cl_env_get(&refcheck);
2497         if (IS_ERR(env))
2498                 GOTO(out, rc = PTR_ERR(env));
2499
2500         rc = ll_merge_attr(env, inode);
2501         cl_env_put(env, &refcheck);
2502
2503         /* If error happen, we have the wrong size for a file.
2504          * Don't release it.
2505          */
2506         if (rc != 0)
2507                 GOTO(out, rc);
2508
2509         /* Release the file.
2510          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2511          * we still need it to pack l_remote_handle to MDT. */
2512         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2513                                        &data_version);
2514         och = NULL;
2515
2516         EXIT;
2517 out:
2518         if (och != NULL && !IS_ERR(och)) /* close the file */
2519                 ll_lease_close(och, inode, NULL);
2520
2521         return rc;
2522 }
2523
2524 struct ll_swap_stack {
2525         __u64                    dv1;
2526         __u64                    dv2;
2527         struct inode            *inode1;
2528         struct inode            *inode2;
2529         bool                     check_dv1;
2530         bool                     check_dv2;
2531 };
2532
2533 static int ll_swap_layouts(struct file *file1, struct file *file2,
2534                            struct lustre_swap_layouts *lsl)
2535 {
2536         struct mdc_swap_layouts  msl;
2537         struct md_op_data       *op_data;
2538         __u32                    gid;
2539         __u64                    dv;
2540         struct ll_swap_stack    *llss = NULL;
2541         int                      rc;
2542
2543         OBD_ALLOC_PTR(llss);
2544         if (llss == NULL)
2545                 RETURN(-ENOMEM);
2546
2547         llss->inode1 = file_inode(file1);
2548         llss->inode2 = file_inode(file2);
2549
2550         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2551         if (rc < 0)
2552                 GOTO(free, rc);
2553
2554         /* we use 2 bool because it is easier to swap than 2 bits */
2555         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2556                 llss->check_dv1 = true;
2557
2558         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2559                 llss->check_dv2 = true;
2560
2561         /* we cannot use lsl->sl_dvX directly because we may swap them */
2562         llss->dv1 = lsl->sl_dv1;
2563         llss->dv2 = lsl->sl_dv2;
2564
2565         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2566         if (rc == 0) /* same file, done! */
2567                 GOTO(free, rc);
2568
2569         if (rc < 0) { /* sequentialize it */
2570                 swap(llss->inode1, llss->inode2);
2571                 swap(file1, file2);
2572                 swap(llss->dv1, llss->dv2);
2573                 swap(llss->check_dv1, llss->check_dv2);
2574         }
2575
2576         gid = lsl->sl_gid;
2577         if (gid != 0) { /* application asks to flush dirty cache */
2578                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2579                 if (rc < 0)
2580                         GOTO(free, rc);
2581
2582                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2583                 if (rc < 0) {
2584                         ll_put_grouplock(llss->inode1, file1, gid);
2585                         GOTO(free, rc);
2586                 }
2587         }
2588
2589         /* ultimate check, before swaping the layouts we check if
2590          * dataversion has changed (if requested) */
2591         if (llss->check_dv1) {
2592                 rc = ll_data_version(llss->inode1, &dv, 0);
2593                 if (rc)
2594                         GOTO(putgl, rc);
2595                 if (dv != llss->dv1)
2596                         GOTO(putgl, rc = -EAGAIN);
2597         }
2598
2599         if (llss->check_dv2) {
2600                 rc = ll_data_version(llss->inode2, &dv, 0);
2601                 if (rc)
2602                         GOTO(putgl, rc);
2603                 if (dv != llss->dv2)
2604                         GOTO(putgl, rc = -EAGAIN);
2605         }
2606
2607         /* struct md_op_data is used to send the swap args to the mdt
2608          * only flags is missing, so we use struct mdc_swap_layouts
2609          * through the md_op_data->op_data */
2610         /* flags from user space have to be converted before they are send to
2611          * server, no flag is sent today, they are only used on the client */
2612         msl.msl_flags = 0;
2613         rc = -ENOMEM;
2614         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2615                                      0, LUSTRE_OPC_ANY, &msl);
2616         if (IS_ERR(op_data))
2617                 GOTO(free, rc = PTR_ERR(op_data));
2618
2619         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2620                            sizeof(*op_data), op_data, NULL);
2621         ll_finish_md_op_data(op_data);
2622
2623         if (rc < 0)
2624                 GOTO(putgl, rc);
2625
2626 putgl:
2627         if (gid != 0) {
2628                 ll_put_grouplock(llss->inode2, file2, gid);
2629                 ll_put_grouplock(llss->inode1, file1, gid);
2630         }
2631
2632 free:
2633         if (llss != NULL)
2634                 OBD_FREE_PTR(llss);
2635
2636         RETURN(rc);
2637 }
2638
2639 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2640 {
2641         struct md_op_data       *op_data;
2642         int                      rc;
2643         ENTRY;
2644
2645         /* Detect out-of range masks */
2646         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2647                 RETURN(-EINVAL);
2648
2649         /* Non-root users are forbidden to set or clear flags which are
2650          * NOT defined in HSM_USER_MASK. */
2651         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2652             !cfs_capable(CFS_CAP_SYS_ADMIN))
2653                 RETURN(-EPERM);
2654
2655         /* Detect out-of range archive id */
2656         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2657             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2658                 RETURN(-EINVAL);
2659
2660         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2661                                      LUSTRE_OPC_ANY, hss);
2662         if (IS_ERR(op_data))
2663                 RETURN(PTR_ERR(op_data));
2664
2665         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2666                            sizeof(*op_data), op_data, NULL);
2667
2668         ll_finish_md_op_data(op_data);
2669
2670         RETURN(rc);
2671 }
2672
2673 static int ll_hsm_import(struct inode *inode, struct file *file,
2674                          struct hsm_user_import *hui)
2675 {
2676         struct hsm_state_set    *hss = NULL;
2677         struct iattr            *attr = NULL;
2678         int                      rc;
2679         ENTRY;
2680
2681         if (!S_ISREG(inode->i_mode))
2682                 RETURN(-EINVAL);
2683
2684         /* set HSM flags */
2685         OBD_ALLOC_PTR(hss);
2686         if (hss == NULL)
2687                 GOTO(out, rc = -ENOMEM);
2688
2689         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2690         hss->hss_archive_id = hui->hui_archive_id;
2691         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2692         rc = ll_hsm_state_set(inode, hss);
2693         if (rc != 0)
2694                 GOTO(out, rc);
2695
2696         OBD_ALLOC_PTR(attr);
2697         if (attr == NULL)
2698                 GOTO(out, rc = -ENOMEM);
2699
2700         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2701         attr->ia_mode |= S_IFREG;
2702         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2703         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2704         attr->ia_size = hui->hui_size;
2705         attr->ia_mtime.tv_sec = hui->hui_mtime;
2706         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2707         attr->ia_atime.tv_sec = hui->hui_atime;
2708         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2709
2710         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2711                          ATTR_UID | ATTR_GID |
2712                          ATTR_MTIME | ATTR_MTIME_SET |
2713                          ATTR_ATIME | ATTR_ATIME_SET;
2714
2715         inode_lock(inode);
2716
2717         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2718         if (rc == -ENODATA)
2719                 rc = 0;
2720
2721         inode_unlock(inode);
2722
2723 out:
2724         if (hss != NULL)
2725                 OBD_FREE_PTR(hss);
2726
2727         if (attr != NULL)
2728                 OBD_FREE_PTR(attr);
2729
2730         RETURN(rc);
2731 }
2732
2733 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2734 {
2735         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2736                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2737 }
2738
2739 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2740 {
2741         struct inode *inode = file_inode(file);
2742         struct iattr ia = {
2743                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2744                             ATTR_MTIME | ATTR_MTIME_SET |
2745                             ATTR_CTIME,
2746                 .ia_atime = {
2747                         .tv_sec = lfu->lfu_atime_sec,
2748                         .tv_nsec = lfu->lfu_atime_nsec,
2749                 },
2750                 .ia_mtime = {
2751                         .tv_sec = lfu->lfu_mtime_sec,
2752                         .tv_nsec = lfu->lfu_mtime_nsec,
2753                 },
2754                 .ia_ctime = {
2755                         .tv_sec = lfu->lfu_ctime_sec,
2756                         .tv_nsec = lfu->lfu_ctime_nsec,
2757                 },
2758         };
2759         int rc;
2760         ENTRY;
2761
2762         if (!capable(CAP_SYS_ADMIN))
2763                 RETURN(-EPERM);
2764
2765         if (!S_ISREG(inode->i_mode))
2766                 RETURN(-EINVAL);
2767
2768         inode_lock(inode);
2769         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2770                             false);
2771         inode_unlock(inode);
2772
2773         RETURN(rc);
2774 }
2775
2776 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2777 {
2778         switch (mode) {
2779         case MODE_READ_USER:
2780                 return CLM_READ;
2781         case MODE_WRITE_USER:
2782                 return CLM_WRITE;
2783         default:
2784                 return -EINVAL;
2785         }
2786 }
2787
2788 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2789
2790 /* Used to allow the upper layers of the client to request an LDLM lock
2791  * without doing an actual read or write.
2792  *
2793  * Used for ladvise lockahead to manually request specific locks.
2794  *
2795  * \param[in] file      file this ladvise lock request is on
2796  * \param[in] ladvise   ladvise struct describing this lock request
2797  *
2798  * \retval 0            success, no detailed result available (sync requests
2799  *                      and requests sent to the server [not handled locally]
2800  *                      cannot return detailed results)
2801  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2802  *                                       see definitions for details.
2803  * \retval negative     negative errno on error
2804  */
2805 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2806 {
2807         struct lu_env *env = NULL;
2808         struct cl_io *io  = NULL;
2809         struct cl_lock *lock = NULL;
2810         struct cl_lock_descr *descr = NULL;
2811         struct dentry *dentry = file->f_path.dentry;
2812         struct inode *inode = dentry->d_inode;
2813         enum cl_lock_mode cl_mode;
2814         off_t start = ladvise->lla_start;
2815         off_t end = ladvise->lla_end;
2816         int result;
2817         __u16 refcheck;
2818
2819         ENTRY;
2820
2821         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2822                "start=%llu, end=%llu\n", dentry->d_name.len,
2823                dentry->d_name.name, dentry->d_inode,
2824                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2825                (__u64) end);
2826
2827         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2828         if (cl_mode < 0)
2829                 GOTO(out, result = cl_mode);
2830
2831         /* Get IO environment */
2832         result = cl_io_get(inode, &env, &io, &refcheck);
2833         if (result <= 0)
2834                 GOTO(out, result);
2835
2836         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2837         if (result > 0) {
2838                 /*
2839                  * nothing to do for this io. This currently happens when
2840                  * stripe sub-object's are not yet created.
2841                  */
2842                 result = io->ci_result;
2843         } else if (result == 0) {
2844                 lock = vvp_env_lock(env);
2845                 descr = &lock->cll_descr;
2846
2847                 descr->cld_obj   = io->ci_obj;
2848                 /* Convert byte offsets to pages */
2849                 descr->cld_start = cl_index(io->ci_obj, start);
2850                 descr->cld_end   = cl_index(io->ci_obj, end);
2851                 descr->cld_mode  = cl_mode;
2852                 /* CEF_MUST is used because we do not want to convert a
2853                  * lockahead request to a lockless lock */
2854                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2855                                        CEF_NONBLOCK;
2856
2857                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2858                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2859
2860                 result = cl_lock_request(env, io, lock);
2861
2862                 /* On success, we need to release the lock */
2863                 if (result >= 0)
2864                         cl_lock_release(env, lock);
2865         }
2866         cl_io_fini(env, io);
2867         cl_env_put(env, &refcheck);
2868
2869         /* -ECANCELED indicates a matching lock with a different extent
2870          * was already present, and -EEXIST indicates a matching lock
2871          * on exactly the same extent was already present.
2872          * We convert them to positive values for userspace to make
2873          * recognizing true errors easier.
2874          * Note we can only return these detailed results on async requests,
2875          * as sync requests look the same as i/o requests for locking. */
2876         if (result == -ECANCELED)
2877                 result = LLA_RESULT_DIFFERENT;
2878         else if (result == -EEXIST)
2879                 result = LLA_RESULT_SAME;
2880
2881 out:
2882         RETURN(result);
2883 }
2884 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2885
2886 static int ll_ladvise_sanity(struct inode *inode,
2887                              struct llapi_lu_ladvise *ladvise)
2888 {
2889         enum lu_ladvise_type advice = ladvise->lla_advice;
2890         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2891          * be in the first 32 bits of enum ladvise_flags */
2892         __u32 flags = ladvise->lla_peradvice_flags;
2893         /* 3 lines at 80 characters per line, should be plenty */
2894         int rc = 0;
2895
2896         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2897                 rc = -EINVAL;
2898                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2899                        "last supported advice is %s (value '%d'): rc = %d\n",
2900                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2901                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2902                 GOTO(out, rc);
2903         }
2904
2905         /* Per-advice checks */
2906         switch (advice) {
2907         case LU_LADVISE_LOCKNOEXPAND:
2908                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2909                         rc = -EINVAL;
2910                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2911                                "rc = %d\n",
2912                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2913                                ladvise_names[advice], rc);
2914                         GOTO(out, rc);
2915                 }
2916                 break;
2917         case LU_LADVISE_LOCKAHEAD:
2918                 /* Currently only READ and WRITE modes can be requested */
2919                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2920                     ladvise->lla_lockahead_mode == 0) {
2921                         rc = -EINVAL;
2922                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2923                                "rc = %d\n",
2924                                ll_get_fsname(inode->i_sb, NULL, 0),
2925                                ladvise->lla_lockahead_mode,
2926                                ladvise_names[advice], rc);
2927                         GOTO(out, rc);
2928                 }
2929         case LU_LADVISE_WILLREAD:
2930         case LU_LADVISE_DONTNEED:
2931         default:
2932                 /* Note fall through above - These checks apply to all advices
2933                  * except LOCKNOEXPAND */
2934                 if (flags & ~LF_DEFAULT_MASK) {
2935                         rc = -EINVAL;
2936                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2937                                "rc = %d\n",
2938                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2939                                ladvise_names[advice], rc);
2940                         GOTO(out, rc);
2941                 }
2942                 if (ladvise->lla_start >= ladvise->lla_end) {
2943                         rc = -EINVAL;
2944                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2945                                "for %s: rc = %d\n",
2946                                ll_get_fsname(inode->i_sb, NULL, 0),
2947                                ladvise->lla_start, ladvise->lla_end,
2948                                ladvise_names[advice], rc);
2949                         GOTO(out, rc);
2950                 }
2951                 break;
2952         }
2953
2954 out:
2955         return rc;
2956 }
2957 #undef ERRSIZE
2958
2959 /*
2960  * Give file access advices
2961  *
2962  * The ladvise interface is similar to Linux fadvise() system call, except it
2963  * forwards the advices directly from Lustre client to server. The server side
2964  * codes will apply appropriate read-ahead and caching techniques for the
2965  * corresponding files.
2966  *
2967  * A typical workload for ladvise is e.g. a bunch of different clients are
2968  * doing small random reads of a file, so prefetching pages into OSS cache
2969  * with big linear reads before the random IO is a net benefit. Fetching
2970  * all that data into each client cache with fadvise() may not be, due to
2971  * much more data being sent to the client.
2972  */
2973 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2974                       struct llapi_lu_ladvise *ladvise)
2975 {
2976         struct lu_env *env;
2977         struct cl_io *io;
2978         struct cl_ladvise_io *lio;
2979         int rc;
2980         __u16 refcheck;
2981         ENTRY;
2982
2983         env = cl_env_get(&refcheck);
2984         if (IS_ERR(env))
2985                 RETURN(PTR_ERR(env));
2986
2987         io = vvp_env_thread_io(env);
2988         io->ci_obj = ll_i2info(inode)->lli_clob;
2989
2990         /* initialize parameters for ladvise */
2991         lio = &io->u.ci_ladvise;
2992         lio->li_start = ladvise->lla_start;
2993         lio->li_end = ladvise->lla_end;
2994         lio->li_fid = ll_inode2fid(inode);
2995         lio->li_advice = ladvise->lla_advice;
2996         lio->li_flags = flags;
2997
2998         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2999                 rc = cl_io_loop(env, io);
3000         else
3001                 rc = io->ci_result;
3002
3003         cl_io_fini(env, io);
3004         cl_env_put(env, &refcheck);
3005         RETURN(rc);
3006 }
3007
3008 static int ll_lock_noexpand(struct file *file, int flags)
3009 {
3010         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3011
3012         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3013
3014         return 0;
3015 }
3016
3017 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3018                         unsigned long arg)
3019 {
3020         struct fsxattr fsxattr;
3021
3022         if (copy_from_user(&fsxattr,
3023                            (const struct fsxattr __user *)arg,
3024                            sizeof(fsxattr)))
3025                 RETURN(-EFAULT);
3026
3027         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3028         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3029                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3030         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3031         if (copy_to_user((struct fsxattr __user *)arg,
3032                          &fsxattr, sizeof(fsxattr)))
3033                 RETURN(-EFAULT);
3034
3035         RETURN(0);
3036 }
3037
3038 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3039 {
3040         /*
3041          * Project Quota ID state is only allowed to change from within the init
3042          * namespace. Enforce that restriction only if we are trying to change
3043          * the quota ID state. Everything else is allowed in user namespaces.
3044          */
3045         if (current_user_ns() == &init_user_ns)
3046                 return 0;
3047
3048         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3049                 return -EINVAL;
3050
3051         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3052                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3053                         return -EINVAL;
3054         } else {
3055                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3056                         return -EINVAL;
3057         }
3058
3059         return 0;
3060 }
3061
3062 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3063                         unsigned long arg)
3064 {
3065
3066         struct md_op_data *op_data;
3067         struct ptlrpc_request *req = NULL;
3068         int rc = 0;
3069         struct fsxattr fsxattr;
3070         struct cl_object *obj;
3071         struct iattr *attr;
3072         int flags;
3073
3074         if (copy_from_user(&fsxattr,
3075                            (const struct fsxattr __user *)arg,
3076                            sizeof(fsxattr)))
3077                 RETURN(-EFAULT);
3078
3079         rc = ll_ioctl_check_project(inode, &fsxattr);
3080         if (rc)
3081                 RETURN(rc);
3082
3083         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3084                                      LUSTRE_OPC_ANY, NULL);
3085         if (IS_ERR(op_data))
3086                 RETURN(PTR_ERR(op_data));
3087
3088         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3089         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3090         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3091                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3092         op_data->op_projid = fsxattr.fsx_projid;
3093         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3094         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3095                         0, &req);
3096         ptlrpc_req_finished(req);
3097         if (rc)
3098                 GOTO(out_fsxattr, rc);
3099         ll_update_inode_flags(inode, op_data->op_attr_flags);
3100         obj = ll_i2info(inode)->lli_clob;
3101         if (obj == NULL)
3102                 GOTO(out_fsxattr, rc);
3103
3104         OBD_ALLOC_PTR(attr);
3105         if (attr == NULL)
3106                 GOTO(out_fsxattr, rc = -ENOMEM);
3107
3108         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3109                             fsxattr.fsx_xflags);
3110         OBD_FREE_PTR(attr);
3111 out_fsxattr:
3112         ll_finish_md_op_data(op_data);
3113         RETURN(rc);
3114 }
3115
3116 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3117                                  unsigned long arg)
3118 {
3119         struct inode            *inode = file_inode(file);
3120         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3121         struct ll_inode_info    *lli = ll_i2info(inode);
3122         struct obd_client_handle *och = NULL;
3123         struct split_param sp;
3124         bool lease_broken;
3125         fmode_t fmode = 0;
3126         enum mds_op_bias bias = 0;
3127         struct file *layout_file = NULL;
3128         void *data = NULL;
3129         size_t data_size = 0;
3130         long rc;
3131         ENTRY;
3132
3133         mutex_lock(&lli->lli_och_mutex);
3134         if (fd->fd_lease_och != NULL) {
3135                 och = fd->fd_lease_och;
3136                 fd->fd_lease_och = NULL;
3137         }
3138         mutex_unlock(&lli->lli_och_mutex);
3139
3140         if (och == NULL)
3141                 GOTO(out, rc = -ENOLCK);
3142
3143         fmode = och->och_flags;
3144
3145         switch (ioc->lil_flags) {
3146         case LL_LEASE_RESYNC_DONE:
3147                 if (ioc->lil_count > IOC_IDS_MAX)
3148                         GOTO(out, rc = -EINVAL);
3149
3150                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3151                 OBD_ALLOC(data, data_size);
3152                 if (!data)
3153                         GOTO(out, rc = -ENOMEM);
3154
3155                 if (copy_from_user(data, (void __user *)arg, data_size))
3156                         GOTO(out, rc = -EFAULT);
3157
3158                 bias = MDS_CLOSE_RESYNC_DONE;
3159                 break;
3160         case LL_LEASE_LAYOUT_MERGE: {
3161                 int fd;
3162
3163                 if (ioc->lil_count != 1)
3164                         GOTO(out, rc = -EINVAL);
3165
3166                 arg += sizeof(*ioc);
3167                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3168                         GOTO(out, rc = -EFAULT);
3169
3170                 layout_file = fget(fd);
3171                 if (!layout_file)
3172                         GOTO(out, rc = -EBADF);
3173
3174                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3175                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3176                         GOTO(out, rc = -EPERM);
3177
3178                 data = file_inode(layout_file);
3179                 bias = MDS_CLOSE_LAYOUT_MERGE;
3180                 break;
3181         }
3182         case LL_LEASE_LAYOUT_SPLIT: {
3183                 int fdv;
3184                 int mirror_id;
3185
3186                 if (ioc->lil_count != 2)
3187                         GOTO(out, rc = -EINVAL);
3188
3189                 arg += sizeof(*ioc);
3190                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3191                         GOTO(out, rc = -EFAULT);
3192
3193                 arg += sizeof(__u32);
3194                 if (copy_from_user(&mirror_id, (void __user *)arg,
3195                                    sizeof(__u32)))
3196                         GOTO(out, rc = -EFAULT);
3197
3198                 layout_file = fget(fdv);
3199                 if (!layout_file)
3200                         GOTO(out, rc = -EBADF);
3201
3202                 sp.sp_inode = file_inode(layout_file);
3203                 sp.sp_mirror_id = (__u16)mirror_id;
3204                 data = &sp;
3205                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3206                 break;
3207         }
3208         default:
3209                 /* without close intent */
3210                 break;
3211         }
3212
3213         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3214         if (rc < 0)
3215                 GOTO(out, rc);
3216
3217         rc = ll_lease_och_release(inode, file);
3218         if (rc < 0)
3219                 GOTO(out, rc);
3220
3221         if (lease_broken)
3222                 fmode = 0;
3223         EXIT;
3224
3225 out:
3226         switch (ioc->lil_flags) {
3227         case LL_LEASE_RESYNC_DONE:
3228                 if (data)
3229                         OBD_FREE(data, data_size);
3230                 break;
3231         case LL_LEASE_LAYOUT_MERGE:
3232         case LL_LEASE_LAYOUT_SPLIT:
3233                 if (layout_file)
3234                         fput(layout_file);
3235                 break;
3236         }
3237
3238         if (!rc)
3239                 rc = ll_lease_type_from_fmode(fmode);
3240         RETURN(rc);
3241 }
3242
3243 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3244                               unsigned long arg)
3245 {
3246         struct inode *inode = file_inode(file);
3247         struct ll_inode_info *lli = ll_i2info(inode);
3248         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3249         struct obd_client_handle *och = NULL;
3250         __u64 open_flags = 0;
3251         bool lease_broken;
3252         fmode_t fmode;
3253         long rc;
3254         ENTRY;
3255
3256         switch (ioc->lil_mode) {
3257         case LL_LEASE_WRLCK:
3258                 if (!(file->f_mode & FMODE_WRITE))
3259                         RETURN(-EPERM);
3260                 fmode = FMODE_WRITE;
3261                 break;
3262         case LL_LEASE_RDLCK:
3263                 if (!(file->f_mode & FMODE_READ))
3264                         RETURN(-EPERM);
3265                 fmode = FMODE_READ;
3266                 break;
3267         case LL_LEASE_UNLCK:
3268                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3269         default:
3270                 RETURN(-EINVAL);
3271         }
3272
3273         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3274
3275         /* apply for lease */
3276         if (ioc->lil_flags & LL_LEASE_RESYNC)
3277                 open_flags = MDS_OPEN_RESYNC;
3278         och = ll_lease_open(inode, file, fmode, open_flags);
3279         if (IS_ERR(och))
3280                 RETURN(PTR_ERR(och));
3281
3282         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3283                 rc = ll_lease_file_resync(och, inode, arg);
3284                 if (rc) {
3285                         ll_lease_close(och, inode, NULL);
3286                         RETURN(rc);
3287                 }
3288                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3289                 if (rc) {
3290                         ll_lease_close(och, inode, NULL);
3291                         RETURN(rc);
3292                 }
3293         }
3294
3295         rc = 0;
3296         mutex_lock(&lli->lli_och_mutex);
3297         if (fd->fd_lease_och == NULL) {
3298                 fd->fd_lease_och = och;
3299                 och = NULL;
3300         }
3301         mutex_unlock(&lli->lli_och_mutex);
3302         if (och != NULL) {
3303                 /* impossible now that only excl is supported for now */
3304                 ll_lease_close(och, inode, &lease_broken);
3305                 rc = -EBUSY;
3306         }
3307         RETURN(rc);
3308 }
3309
3310 static long
3311 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3312 {
3313         struct inode            *inode = file_inode(file);
3314         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3315         int                      flags, rc;
3316         ENTRY;
3317
3318         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3319                PFID(ll_inode2fid(inode)), inode, cmd);
3320         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3321
3322         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3323         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3324                 RETURN(-ENOTTY);
3325
3326         switch (cmd) {
3327         case LL_IOC_GETFLAGS:
3328                 /* Get the current value of the file flags */
3329                 return put_user(fd->fd_flags, (int __user *)arg);
3330         case LL_IOC_SETFLAGS:
3331         case LL_IOC_CLRFLAGS:
3332                 /* Set or clear specific file flags */
3333                 /* XXX This probably needs checks to ensure the flags are
3334                  *     not abused, and to handle any flag side effects.
3335                  */
3336                 if (get_user(flags, (int __user *) arg))
3337                         RETURN(-EFAULT);
3338
3339                 if (cmd == LL_IOC_SETFLAGS) {
3340                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3341                             !(file->f_flags & O_DIRECT)) {
3342                                 CERROR("%s: unable to disable locking on "
3343                                        "non-O_DIRECT file\n", current->comm);
3344                                 RETURN(-EINVAL);
3345                         }
3346
3347                         fd->fd_flags |= flags;
3348                 } else {
3349                         fd->fd_flags &= ~flags;
3350                 }
3351                 RETURN(0);
3352         case LL_IOC_LOV_SETSTRIPE:
3353         case LL_IOC_LOV_SETSTRIPE_NEW:
3354                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3355         case LL_IOC_LOV_SETEA:
3356                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3357         case LL_IOC_LOV_SWAP_LAYOUTS: {
3358                 struct file *file2;
3359                 struct lustre_swap_layouts lsl;
3360
3361                 if (copy_from_user(&lsl, (char __user *)arg,
3362                                    sizeof(struct lustre_swap_layouts)))
3363                         RETURN(-EFAULT);
3364
3365                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3366                         RETURN(-EPERM);
3367
3368                 file2 = fget(lsl.sl_fd);
3369                 if (file2 == NULL)
3370                         RETURN(-EBADF);
3371
3372                 /* O_WRONLY or O_RDWR */
3373                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3374                         GOTO(out, rc = -EPERM);
3375
3376                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3377                         struct inode                    *inode2;
3378                         struct ll_inode_info            *lli;
3379                         struct obd_client_handle        *och = NULL;
3380
3381                         lli = ll_i2info(inode);
3382                         mutex_lock(&lli->lli_och_mutex);
3383                         if (fd->fd_lease_och != NULL) {
3384                                 och = fd->fd_lease_och;
3385                                 fd->fd_lease_och = NULL;
3386                         }
3387                         mutex_unlock(&lli->lli_och_mutex);
3388                         if (och == NULL)
3389                                 GOTO(out, rc = -ENOLCK);
3390                         inode2 = file_inode(file2);
3391                         rc = ll_swap_layouts_close(och, inode, inode2);
3392                 } else {
3393                         rc = ll_swap_layouts(file, file2, &lsl);
3394                 }
3395 out:
3396                 fput(file2);
3397                 RETURN(rc);
3398         }
3399         case LL_IOC_LOV_GETSTRIPE:
3400         case LL_IOC_LOV_GETSTRIPE_NEW:
3401                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3402         case FS_IOC_GETFLAGS:
3403         case FS_IOC_SETFLAGS:
3404                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3405         case FSFILT_IOC_GETVERSION:
3406         case FS_IOC_GETVERSION:
3407                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3408         /* We need to special case any other ioctls we want to handle,
3409          * to send them to the MDS/OST as appropriate and to properly
3410          * network encode the arg field. */
3411         case FS_IOC_SETVERSION:
3412                 RETURN(-ENOTSUPP);
3413
3414         case LL_IOC_GROUP_LOCK:
3415                 RETURN(ll_get_grouplock(inode, file, arg));
3416         case LL_IOC_GROUP_UNLOCK:
3417                 RETURN(ll_put_grouplock(inode, file, arg));
3418         case IOC_OBD_STATFS:
3419                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3420
3421         case LL_IOC_FLUSHCTX:
3422                 RETURN(ll_flush_ctx(inode));
3423         case LL_IOC_PATH2FID: {
3424                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3425                                  sizeof(struct lu_fid)))
3426                         RETURN(-EFAULT);
3427
3428                 RETURN(0);
3429         }
3430         case LL_IOC_GETPARENT:
3431                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3432
3433         case OBD_IOC_FID2PATH:
3434                 RETURN(ll_fid2path(inode, (void __user *)arg));
3435         case LL_IOC_DATA_VERSION: {
3436                 struct ioc_data_version idv;
3437                 int rc;
3438
3439                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3440                         RETURN(-EFAULT);
3441
3442                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3443                 rc = ll_ioc_data_version(inode, &idv);
3444
3445                 if (rc == 0 &&
3446                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3447                         RETURN(-EFAULT);
3448
3449                 RETURN(rc);
3450         }
3451
3452         case LL_IOC_GET_MDTIDX: {
3453                 int mdtidx;
3454
3455                 mdtidx = ll_get_mdt_idx(inode);
3456                 if (mdtidx < 0)
3457                         RETURN(mdtidx);
3458
3459                 if (put_user((int)mdtidx, (int __user *)arg))
3460                         RETURN(-EFAULT);
3461
3462                 RETURN(0);
3463         }
3464         case OBD_IOC_GETDTNAME:
3465         case OBD_IOC_GETMDNAME:
3466                 RETURN(ll_get_obd_name(inode, cmd, arg));
3467         case LL_IOC_HSM_STATE_GET: {
3468                 struct md_op_data       *op_data;
3469                 struct hsm_user_state   *hus;
3470                 int                      rc;
3471
3472                 OBD_ALLOC_PTR(hus);
3473                 if (hus == NULL)
3474                         RETURN(-ENOMEM);
3475
3476                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3477                                              LUSTRE_OPC_ANY, hus);
3478                 if (IS_ERR(op_data)) {
3479                         OBD_FREE_PTR(hus);
3480                         RETURN(PTR_ERR(op_data));
3481                 }
3482
3483                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3484                                    op_data, NULL);
3485
3486                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3487                         rc = -EFAULT;
3488
3489                 ll_finish_md_op_data(op_data);
3490                 OBD_FREE_PTR(hus);
3491                 RETURN(rc);
3492         }
3493         case LL_IOC_HSM_STATE_SET: {
3494                 struct hsm_state_set    *hss;
3495                 int                      rc;
3496
3497                 OBD_ALLOC_PTR(hss);
3498                 if (hss == NULL)
3499                         RETURN(-ENOMEM);
3500
3501                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3502                         OBD_FREE_PTR(hss);
3503                         RETURN(-EFAULT);
3504                 }
3505
3506                 rc = ll_hsm_state_set(inode, hss);
3507
3508                 OBD_FREE_PTR(hss);
3509                 RETURN(rc);
3510         }
3511         case LL_IOC_HSM_ACTION: {
3512                 struct md_op_data               *op_data;
3513                 struct hsm_current_action       *hca;
3514                 int                              rc;
3515
3516                 OBD_ALLOC_PTR(hca);
3517                 if (hca == NULL)
3518                         RETURN(-ENOMEM);
3519
3520                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3521                                              LUSTRE_OPC_ANY, hca);
3522                 if (IS_ERR(op_data)) {
3523                         OBD_FREE_PTR(hca);
3524                         RETURN(PTR_ERR(op_data));
3525                 }
3526
3527                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3528                                    op_data, NULL);
3529
3530                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3531                         rc = -EFAULT;
3532
3533                 ll_finish_md_op_data(op_data);
3534                 OBD_FREE_PTR(hca);
3535                 RETURN(rc);
3536         }
3537         case LL_IOC_SET_LEASE_OLD: {
3538                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3539
3540                 RETURN(ll_file_set_lease(file, &ioc, 0));
3541         }
3542         case LL_IOC_SET_LEASE: {
3543                 struct ll_ioc_lease ioc;
3544
3545                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3546                         RETURN(-EFAULT);
3547
3548                 RETURN(ll_file_set_lease(file, &ioc, arg));
3549         }
3550         case LL_IOC_GET_LEASE: {
3551                 struct ll_inode_info *lli = ll_i2info(inode);
3552                 struct ldlm_lock *lock = NULL;
3553                 fmode_t fmode = 0;
3554
3555                 mutex_lock(&lli->lli_och_mutex);
3556                 if (fd->fd_lease_och != NULL) {
3557                         struct obd_client_handle *och = fd->fd_lease_och;
3558
3559                         lock = ldlm_handle2lock(&och->och_lease_handle);
3560                         if (lock != NULL) {
3561                                 lock_res_and_lock(lock);
3562                                 if (!ldlm_is_cancel(lock))
3563                                         fmode = och->och_flags;
3564
3565                                 unlock_res_and_lock(lock);
3566                                 LDLM_LOCK_PUT(lock);
3567                         }
3568                 }
3569                 mutex_unlock(&lli->lli_och_mutex);
3570
3571                 RETURN(ll_lease_type_from_fmode(fmode));
3572         }
3573         case LL_IOC_HSM_IMPORT: {
3574                 struct hsm_user_import *hui;
3575
3576                 OBD_ALLOC_PTR(hui);
3577                 if (hui == NULL)
3578                         RETURN(-ENOMEM);
3579
3580                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3581                         OBD_FREE_PTR(hui);
3582                         RETURN(-EFAULT);
3583                 }
3584
3585                 rc = ll_hsm_import(inode, file, hui);
3586
3587                 OBD_FREE_PTR(hui);
3588                 RETURN(rc);
3589         }
3590         case LL_IOC_FUTIMES_3: {
3591                 struct ll_futimes_3 lfu;
3592
3593                 if (copy_from_user(&lfu,
3594                                    (const struct ll_futimes_3 __user *)arg,
3595                                    sizeof(lfu)))
3596                         RETURN(-EFAULT);
3597
3598                 RETURN(ll_file_futimes_3(file, &lfu));
3599         }
3600         case LL_IOC_LADVISE: {
3601                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3602                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3603                 int i;
3604                 int num_advise;
3605                 int alloc_size = sizeof(*k_ladvise_hdr);
3606
3607                 rc = 0;
3608                 u_ladvise_hdr = (void __user *)arg;
3609                 OBD_ALLOC_PTR(k_ladvise_hdr);
3610                 if (k_ladvise_hdr == NULL)
3611                         RETURN(-ENOMEM);
3612
3613                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3614                         GOTO(out_ladvise, rc = -EFAULT);
3615
3616                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3617                     k_ladvise_hdr->lah_count < 1)
3618                         GOTO(out_ladvise, rc = -EINVAL);
3619
3620                 num_advise = k_ladvise_hdr->lah_count;
3621                 if (num_advise >= LAH_COUNT_MAX)
3622                         GOTO(out_ladvise, rc = -EFBIG);
3623
3624                 OBD_FREE_PTR(k_ladvise_hdr);
3625                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3626                                       lah_advise[num_advise]);
3627                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3628                 if (k_ladvise_hdr == NULL)
3629                         RETURN(-ENOMEM);
3630
3631                 /*
3632                  * TODO: submit multiple advices to one server in a single RPC
3633                  */
3634                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3635                         GOTO(out_ladvise, rc = -EFAULT);
3636
3637                 for (i = 0; i < num_advise; i++) {
3638                         struct llapi_lu_ladvise *k_ladvise =
3639                                         &k_ladvise_hdr->lah_advise[i];
3640                         struct llapi_lu_ladvise __user *u_ladvise =
3641                                         &u_ladvise_hdr->lah_advise[i];
3642
3643                         rc = ll_ladvise_sanity(inode, k_ladvise);
3644                         if (rc)
3645                                 GOTO(out_ladvise, rc);
3646
3647                         switch (k_ladvise->lla_advice) {
3648                         case LU_LADVISE_LOCKNOEXPAND:
3649                                 rc = ll_lock_noexpand(file,
3650                                                k_ladvise->lla_peradvice_flags);
3651                                 GOTO(out_ladvise, rc);
3652                         case LU_LADVISE_LOCKAHEAD:
3653
3654                                 rc = ll_file_lock_ahead(file, k_ladvise);
3655
3656                                 if (rc < 0)
3657                                         GOTO(out_ladvise, rc);
3658
3659                                 if (put_user(rc,
3660                                              &u_ladvise->lla_lockahead_result))
3661                                         GOTO(out_ladvise, rc = -EFAULT);
3662                                 break;
3663                         default:
3664                                 rc = ll_ladvise(inode, file,
3665                                                 k_ladvise_hdr->lah_flags,
3666                                                 k_ladvise);
3667                                 if (rc)
3668                                         GOTO(out_ladvise, rc);
3669                                 break;
3670                         }
3671
3672                 }
3673
3674 out_ladvise:
3675                 OBD_FREE(k_ladvise_hdr, alloc_size);
3676                 RETURN(rc);
3677         }
3678         case LL_IOC_FLR_SET_MIRROR: {
3679                 /* mirror I/O must be direct to avoid polluting page cache
3680                  * by stale data. */
3681                 if (!(file->f_flags & O_DIRECT))
3682                         RETURN(-EINVAL);
3683
3684                 fd->fd_designated_mirror = (__u32)arg;
3685                 RETURN(0);
3686         }
3687         case LL_IOC_FSGETXATTR:
3688                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3689         case LL_IOC_FSSETXATTR:
3690                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3691         case BLKSSZGET:
3692                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3693         default:
3694                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3695                                      (void __user *)arg));
3696         }
3697 }
3698
3699 #ifndef HAVE_FILE_LLSEEK_SIZE
3700 static inline loff_t
3701 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3702 {
3703         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3704                 return -EINVAL;
3705         if (offset > maxsize)
3706                 return -EINVAL;
3707
3708         if (offset != file->f_pos) {
3709                 file->f_pos = offset;
3710                 file->f_version = 0;
3711         }
3712         return offset;
3713 }
3714
3715 static loff_t
3716 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3717                 loff_t maxsize, loff_t eof)
3718 {
3719         struct inode *inode = file_inode(file);
3720
3721         switch (origin) {
3722         case SEEK_END:
3723                 offset += eof;
3724                 break;
3725         case SEEK_CUR:
3726                 /*
3727                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3728                  * position-querying operation.  Avoid rewriting the "same"
3729                  * f_pos value back to the file because a concurrent read(),
3730                  * write() or lseek() might have altered it
3731                  */
3732                 if (offset == 0)
3733                         return file->f_pos;
3734                 /*
3735                  * f_lock protects against read/modify/write race with other
3736                  * SEEK_CURs. Note that parallel writes and reads behave
3737                  * like SEEK_SET.
3738                  */
3739                 inode_lock(inode);
3740                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3741                 inode_unlock(inode);
3742                 return offset;
3743         case SEEK_DATA:
3744                 /*
3745                  * In the generic case the entire file is data, so as long as
3746                  * offset isn't at the end of the file then the offset is data.
3747                  */
3748                 if (offset >= eof)
3749                         return -ENXIO;
3750                 break;
3751         case SEEK_HOLE:
3752                 /*
3753                  * There is a virtual hole at the end of the file, so as long as
3754                  * offset isn't i_size or larger, return i_size.
3755                  */
3756                 if (offset >= eof)
3757                         return -ENXIO;
3758                 offset = eof;
3759                 break;
3760         }
3761
3762         return llseek_execute(file, offset, maxsize);
3763 }
3764 #endif
3765
3766 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3767 {
3768         struct inode *inode = file_inode(file);
3769         loff_t retval, eof = 0;
3770
3771         ENTRY;
3772         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3773                            (origin == SEEK_CUR) ? file->f_pos : 0);
3774         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3775                PFID(ll_inode2fid(inode)), inode, retval, retval,
3776                origin);
3777         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3778
3779         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3780                 retval = ll_glimpse_size(inode);
3781                 if (retval != 0)
3782                         RETURN(retval);
3783                 eof = i_size_read(inode);
3784         }
3785
3786         retval = ll_generic_file_llseek_size(file, offset, origin,
3787                                           ll_file_maxbytes(inode), eof);
3788         RETURN(retval);
3789 }
3790
3791 static int ll_flush(struct file *file, fl_owner_t id)
3792 {
3793         struct inode *inode = file_inode(file);
3794         struct ll_inode_info *lli = ll_i2info(inode);
3795         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3796         int rc, err;
3797
3798         LASSERT(!S_ISDIR(inode->i_mode));
3799
3800         /* catch async errors that were recorded back when async writeback
3801          * failed for pages in this mapping. */
3802         rc = lli->lli_async_rc;
3803         lli->lli_async_rc = 0;
3804         if (lli->lli_clob != NULL) {
3805                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3806                 if (rc == 0)
3807                         rc = err;
3808         }
3809
3810         /* The application has been told write failure already.
3811          * Do not report failure again. */
3812         if (fd->fd_write_failed)
3813                 return 0;
3814         return rc ? -EIO : 0;
3815 }
3816
3817 /**
3818  * Called to make sure a portion of file has been written out.
3819  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3820  *
3821  * Return how many pages have been written.
3822  */
3823 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3824                        enum cl_fsync_mode mode, int ignore_layout)
3825 {
3826         struct lu_env *env;
3827         struct cl_io *io;
3828         struct cl_fsync_io *fio;
3829         int result;
3830         __u16 refcheck;
3831         ENTRY;
3832
3833         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3834             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3835                 RETURN(-EINVAL);
3836
3837         env = cl_env_get(&refcheck);
3838         if (IS_ERR(env))
3839                 RETURN(PTR_ERR(env));
3840
3841         io = vvp_env_thread_io(env);
3842         io->ci_obj = ll_i2info(inode)->lli_clob;
3843         io->ci_ignore_layout = ignore_layout;
3844
3845         /* initialize parameters for sync */
3846         fio = &io->u.ci_fsync;
3847         fio->fi_start = start;
3848         fio->fi_end = end;
3849         fio->fi_fid = ll_inode2fid(inode);
3850         fio->fi_mode = mode;
3851         fio->fi_nr_written = 0;
3852
3853         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3854                 result = cl_io_loop(env, io);
3855         else
3856                 result = io->ci_result;
3857         if (result == 0)
3858                 result = fio->fi_nr_written;
3859         cl_io_fini(env, io);
3860         cl_env_put(env, &refcheck);
3861
3862         RETURN(result);
3863 }
3864
3865 /*
3866  * When dentry is provided (the 'else' case), file_dentry() may be
3867  * null and dentry must be used directly rather than pulled from
3868  * file_dentry() as is done otherwise.
3869  */
3870
3871 #ifdef HAVE_FILE_FSYNC_4ARGS
3872 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3873 {
3874         struct dentry *dentry = file_dentry(file);
3875         bool lock_inode;
3876 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3877 int ll_fsync(struct file *file, int datasync)
3878 {
3879         struct dentry *dentry = file_dentry(file);
3880         loff_t start = 0;
3881         loff_t end = LLONG_MAX;
3882 #else
3883 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3884 {
3885         loff_t start = 0;
3886         loff_t end = LLONG_MAX;
3887 #endif
3888         struct inode *inode = dentry->d_inode;
3889         struct ll_inode_info *lli = ll_i2info(inode);
3890         struct ptlrpc_request *req;
3891         int rc, err;
3892         ENTRY;
3893
3894         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3895                PFID(ll_inode2fid(inode)), inode);
3896         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3897
3898 #ifdef HAVE_FILE_FSYNC_4ARGS
3899         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3900         lock_inode = !lli->lli_inode_locked;
3901         if (lock_inode)
3902                 inode_lock(inode);
3903 #else
3904         /* fsync's caller has already called _fdata{sync,write}, we want
3905          * that IO to finish before calling the osc and mdc sync methods */
3906         rc = filemap_fdatawait(inode->i_mapping);
3907 #endif
3908
3909         /* catch async errors that were recorded back when async writeback
3910          * failed for pages in this mapping. */
3911         if (!S_ISDIR(inode->i_mode)) {
3912                 err = lli->lli_async_rc;
3913                 lli->lli_async_rc = 0;
3914                 if (rc == 0)
3915                         rc = err;
3916                 if (lli->lli_clob != NULL) {
3917                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3918                         if (rc == 0)
3919                                 rc = err;
3920                 }
3921         }
3922
3923         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3924         if (!rc)
3925                 rc = err;
3926         if (!err)
3927                 ptlrpc_req_finished(req);
3928
3929         if (S_ISREG(inode->i_mode)) {
3930                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3931
3932                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3933                 if (rc == 0 && err < 0)
3934                         rc = err;
3935                 if (rc < 0)
3936                         fd->fd_write_failed = true;
3937                 else
3938                         fd->fd_write_failed = false;
3939         }
3940
3941 #ifdef HAVE_FILE_FSYNC_4ARGS
3942         if (lock_inode)
3943                 inode_unlock(inode);
3944 #endif
3945         RETURN(rc);
3946 }
3947
3948 static int
3949 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3950 {
3951         struct inode *inode = file_inode(file);
3952         struct ll_sb_info *sbi = ll_i2sbi(inode);
3953         struct ldlm_enqueue_info einfo = {
3954                 .ei_type        = LDLM_FLOCK,
3955                 .ei_cb_cp       = ldlm_flock_completion_ast,
3956                 .ei_cbdata      = file_lock,
3957         };
3958         struct md_op_data *op_data;
3959         struct lustre_handle lockh = { 0 };
3960         union ldlm_policy_data flock = { { 0 } };
3961         int fl_type = file_lock->fl_type;
3962         __u64 flags = 0;
3963         int rc;
3964         int rc2 = 0;
3965         ENTRY;
3966
3967         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3968                PFID(ll_inode2fid(inode)), file_lock);
3969
3970         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3971
3972         if (file_lock->fl_flags & FL_FLOCK) {
3973                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3974                 /* flocks are whole-file locks */
3975                 flock.l_flock.end = OFFSET_MAX;
3976                 /* For flocks owner is determined by the local file desctiptor*/
3977                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3978         } else if (file_lock->fl_flags & FL_POSIX) {
3979                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3980                 flock.l_flock.start = file_lock->fl_start;
3981                 flock.l_flock.end = file_lock->fl_end;
3982         } else {
3983                 RETURN(-EINVAL);
3984         }
3985         flock.l_flock.pid = file_lock->fl_pid;
3986
3987         /* Somewhat ugly workaround for svc lockd.
3988          * lockd installs custom fl_lmops->lm_compare_owner that checks
3989          * for the fl_owner to be the same (which it always is on local node
3990          * I guess between lockd processes) and then compares pid.
3991          * As such we assign pid to the owner field to make it all work,
3992          * conflict with normal locks is unlikely since pid space and
3993          * pointer space for current->files are not intersecting */
3994         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3995                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3996
3997         switch (fl_type) {
3998         case F_RDLCK:
3999                 einfo.ei_mode = LCK_PR;
4000                 break;
4001         case F_UNLCK:
4002                 /* An unlock request may or may not have any relation to
4003                  * existing locks so we may not be able to pass a lock handle
4004                  * via a normal ldlm_lock_cancel() request. The request may even
4005                  * unlock a byte range in the middle of an existing lock. In
4006                  * order to process an unlock request we need all of the same
4007                  * information that is given with a normal read or write record
4008                  * lock request. To avoid creating another ldlm unlock (cancel)
4009                  * message we'll treat a LCK_NL flock request as an unlock. */
4010                 einfo.ei_mode = LCK_NL;
4011                 break;
4012         case F_WRLCK:
4013                 einfo.ei_mode = LCK_PW;
4014                 break;
4015         default:
4016                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4017                 RETURN (-ENOTSUPP);
4018         }
4019
4020         switch (cmd) {
4021         case F_SETLKW:
4022 #ifdef F_SETLKW64
4023         case F_SETLKW64:
4024 #endif
4025                 flags = 0;
4026                 break;
4027         case F_SETLK:
4028 #ifdef F_SETLK64
4029         case F_SETLK64:
4030 #endif
4031                 flags = LDLM_FL_BLOCK_NOWAIT;
4032                 break;
4033         case F_GETLK:
4034 #ifdef F_GETLK64
4035         case F_GETLK64:
4036 #endif
4037                 flags = LDLM_FL_TEST_LOCK;
4038                 break;
4039         default:
4040                 CERROR("unknown fcntl lock command: %d\n", cmd);
4041                 RETURN (-EINVAL);
4042         }
4043
4044         /* Save the old mode so that if the mode in the lock changes we
4045          * can decrement the appropriate reader or writer refcount. */
4046         file_lock->fl_type = einfo.ei_mode;
4047
4048         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4049                                      LUSTRE_OPC_ANY, NULL);
4050         if (IS_ERR(op_data))
4051                 RETURN(PTR_ERR(op_data));
4052
4053         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4054                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4055                flock.l_flock.pid, flags, einfo.ei_mode,
4056                flock.l_flock.start, flock.l_flock.end);
4057
4058         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4059                         flags);
4060
4061         /* Restore the file lock type if not TEST lock. */
4062         if (!(flags & LDLM_FL_TEST_LOCK))
4063                 file_lock->fl_type = fl_type;
4064
4065 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4066         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4067             !(flags & LDLM_FL_TEST_LOCK))
4068                 rc2  = locks_lock_file_wait(file, file_lock);
4069 #else
4070         if ((file_lock->fl_flags & FL_FLOCK) &&
4071             (rc == 0 || file_lock->fl_type == F_UNLCK))
4072                 rc2  = flock_lock_file_wait(file, file_lock);
4073         if ((file_lock->fl_flags & FL_POSIX) &&
4074             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4075             !(flags & LDLM_FL_TEST_LOCK))
4076                 rc2  = posix_lock_file_wait(file, file_lock);
4077 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4078
4079         if (rc2 && file_lock->fl_type != F_UNLCK) {
4080                 einfo.ei_mode = LCK_NL;
4081                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4082                            &lockh, flags);
4083                 rc = rc2;
4084         }
4085
4086         ll_finish_md_op_data(op_data);
4087
4088         RETURN(rc);
4089 }
4090
4091 int ll_get_fid_by_name(struct inode *parent, const char *name,
4092                        int namelen, struct lu_fid *fid,
4093                        struct inode **inode)
4094 {
4095         struct md_op_data       *op_data = NULL;
4096         struct mdt_body         *body;
4097         struct ptlrpc_request   *req;
4098         int                     rc;
4099         ENTRY;
4100
4101         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4102                                      LUSTRE_OPC_ANY, NULL);
4103         if (IS_ERR(op_data))
4104                 RETURN(PTR_ERR(op_data));
4105
4106         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4107         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4108         ll_finish_md_op_data(op_data);
4109         if (rc < 0)
4110                 RETURN(rc);
4111
4112         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4113         if (body == NULL)
4114                 GOTO(out_req, rc = -EFAULT);
4115         if (fid != NULL)
4116                 *fid = body->mbo_fid1;
4117
4118         if (inode != NULL)
4119                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4120 out_req:
4121         ptlrpc_req_finished(req);
4122         RETURN(rc);
4123 }
4124
/**
 * Migrate the entry \a name under \a parent as described by \a lum.
 *
 * The child inode is located (dcache first, then by a getattr-by-name
 * request), and the migration is sent as an md_rename() of \a name onto
 * itself with CLI_MIGRATE | CLI_SET_MEA set and \a lum attached as op_data.
 * For a regular file a write lease is opened first and its handles and the
 * file's data version are passed along with MDS_CLOSE_MIGRATE.
 *
 * \param[in]     parent  parent directory inode
 * \param[in]     file    open file whose dentry is used for the dcache lookup
 * \param[in,out] lum     migration layout; swabbed in place when its magic
 *                        does not match the little-endian constants
 * \param[in]     name    name of the entry to migrate
 *
 * \retval 0 on success (the child's nlink is cleared)
 * \retval -ENOENT if no child inode could be obtained
 * \retval -EOPNOTSUPP if the export lacks OBD_CONNECT2_DIR_MIGRATE and a
 *         striped source or target is involved
 * \retval -EINVAL if the child is the filesystem root or its FID is insane
 * \retval other negative errno from the helpers called
 */
int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
               const char *name)
{
        struct dentry *dchild = NULL;
        struct inode *child_inode = NULL;
        struct md_op_data *op_data;
        struct ptlrpc_request *request = NULL;
        struct obd_client_handle *och = NULL;
        struct qstr qstr;
        struct mdt_body *body;
        __u64 data_version = 0;
        size_t namelen = strlen(name);
        /* Sized from lum as passed in, i.e. before the swab below. */
        int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
        int rc;
        ENTRY;

        CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
               PFID(ll_inode2fid(parent)), name,
               lum->lum_stripe_offset, lum->lum_stripe_count);

        /* Swab lum unless its magic already equals a little-endian magic. */
        if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
            lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
                lustre_swab_lmv_user_md(lum);

        /* Get child FID first */
        qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
        qstr.name = name;
        qstr.len = namelen;
        dchild = d_lookup(file_dentry(file), &qstr);
        if (dchild) {
                /* igrab() result is dropped by iput() at out_iput. */
                if (dchild->d_inode)
                        child_inode = igrab(dchild->d_inode);
                dput(dchild);
        }

        /* Not found in the dcache: fetch the inode by name instead. */
        if (!child_inode) {
                rc = ll_get_fid_by_name(parent, name, namelen, NULL,
                                        &child_inode);
                if (rc)
                        RETURN(rc);
        }

        if (!child_inode)
                RETURN(-ENOENT);

        /* Without OBD_CONNECT2_DIR_MIGRATE, refuse a multi-stripe target
         * and a source that already has striped-directory metadata. */
        if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
              OBD_CONNECT2_DIR_MIGRATE)) {
                if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
                    ll_i2info(child_inode)->lli_lsm_md) {
                        CERROR("%s: MDT doesn't support stripe directory "
                               "migration!\n",
                               ll_get_fsname(parent->i_sb, NULL, 0));
                        GOTO(out_iput, rc = -EOPNOTSUPP);
                }
        }

        /*
         * lfs migrate command needs to be blocked on the client
         * by checking the migrate FID against the FID of the
         * filesystem root.
         */
        if (child_inode == parent->i_sb->s_root->d_inode)
                GOTO(out_iput, rc = -EINVAL);

        op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
                                     child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out_iput, rc = PTR_ERR(op_data));

        /* Held until out_unlock. */
        inode_lock(child_inode);
        op_data->op_fid3 = *ll_inode2fid(child_inode);
        if (!fid_is_sane(&op_data->op_fid3)) {
                CERROR("%s: migrate %s, but FID "DFID" is insane\n",
                       ll_get_fsname(parent->i_sb, NULL, 0), name,
                       PFID(&op_data->op_fid3));
                GOTO(out_unlock, rc = -EINVAL);
        }

        op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
        op_data->op_data = lum;
        op_data->op_data_size = lumlen;

again:
        /* NOTE(review): when this label is reached through the -EAGAIN
         * retry below, och may still hold the handle from the previous
         * pass and is overwritten by ll_lease_open() - confirm that the
         * earlier lease is released elsewhere and does not leak. */
        if (S_ISREG(child_inode->i_mode)) {
                och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
                if (IS_ERR(och)) {
                        rc = PTR_ERR(och);
                        /* Keep out_close from seeing an error pointer. */
                        och = NULL;
                        GOTO(out_unlock, rc);
                }

                rc = ll_data_version(child_inode, &data_version,
                                     LL_DV_WR_FLUSH);
                if (rc != 0)
                        GOTO(out_close, rc);

                /* Hand the open/lease handles and data version to the
                 * request so they travel with the migrate. */
                op_data->op_open_handle = och->och_open_handle;
                op_data->op_data_version = data_version;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_bias |= MDS_CLOSE_MIGRATE;

                /* Clear the replay flag of the open request. */
                spin_lock(&och->och_mod->mod_open_req->rq_lock);
                och->och_mod->mod_open_req->rq_replay = 0;
                spin_unlock(&och->och_mod->mod_open_req->rq_lock);
        }

        /* Source and target names are identical: only the flags set in
         * op_cli_flags distinguish this from an ordinary rename. */
        rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
                       name, namelen, &request);
        if (rc == 0) {
                LASSERT(request != NULL);
                ll_update_times(request, parent);

                body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
                LASSERT(body != NULL);

                /* If the server does release layout lock, then we cleanup
                 * the client och here, otherwise release it in out_close: */
                if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
                        obd_mod_put(och->och_mod);
                        md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
                                                  och);
                        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
                        OBD_FREE_PTR(och);
                        och = NULL;
                }
        }

        /* Reset to NULL so a retry starts with a clean request pointer. */
        if (request != NULL) {
                ptlrpc_req_finished(request);
                request = NULL;
        }

        /* Try again if the file layout has changed. */
        if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
                goto again;

out_close:
        if (och)
                ll_lease_close(och, child_inode, NULL);
        /* Success: zero the link count of the local child inode. */
        if (!rc)
                clear_nlink(child_inode);
out_unlock:
        inode_unlock(child_inode);
        ll_finish_md_op_data(op_data);
out_iput:
        iput(child_inode);
        RETURN(rc);
}
4273
4274 static int
4275 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4276 {
4277         ENTRY;
4278
4279         RETURN(-ENOSYS);
4280 }
4281
4282 /**
4283  * test if some locks matching bits and l_req_mode are acquired
4284  * - bits can be in different locks
4285  * - if found clear the common lock bits in *bits
4286  * - the bits not found, are kept in *bits
4287  * \param inode [IN]
4288  * \param bits [IN] searched lock bits [IN]
4289  * \param l_req_mode [IN] searched lock mode
4290  * \retval boolean, true iff all bits are found
4291  */
4292 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4293 {
4294         struct lustre_handle lockh;
4295         union ldlm_policy_data policy;
4296         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4297                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4298         struct lu_fid *fid;
4299         __u64 flags;
4300         int i;
4301         ENTRY;
4302
4303         if (!inode)
4304                RETURN(0);
4305
4306         fid = &ll_i2info(inode)->lli_fid;
4307         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4308                ldlm_lockname[mode]);
4309
4310         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4311         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4312                 policy.l_inodebits.bits = *bits & (1 << i);
4313                 if (policy.l_inodebits.bits == 0)
4314                         continue;
4315
4316                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4317                                   &policy, mode, &lockh)) {
4318                         struct ldlm_lock *lock;
4319
4320                         lock = ldlm_handle2lock(&lockh);
4321                         if (lock) {
4322                                 *bits &=
4323                                       ~(lock->l_policy_data.l_inodebits.bits);
4324                                 LDLM_LOCK_PUT(lock);
4325                         } else {
4326                                 *bits &= ~policy.l_inodebits.bits;
4327                         }
4328                 }
4329         }
4330         RETURN(*bits == 0);
4331 }
4332
4333 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4334                                struct lustre_handle *lockh, __u64 flags,
4335                                enum ldlm_mode mode)
4336 {
4337         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4338         struct lu_fid *fid;
4339         enum ldlm_mode rc;
4340         ENTRY;
4341
4342         fid = &ll_i2info(inode)->lli_fid;
4343         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4344
4345         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4346                            fid, LDLM_IBITS, &policy, mode, lockh);
4347
4348         RETURN(rc);
4349 }
4350
4351 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4352 {
4353         /* Already unlinked. Just update nlink and return success */
4354         if (rc == -ENOENT) {
4355                 clear_nlink(inode);
4356                 /* If it is striped directory, and there is bad stripe
4357                  * Let's revalidate the dentry again, instead of returning
4358                  * error */
4359                 if (S_ISDIR(inode->i_mode) &&
4360                     ll_i2info(inode)->lli_lsm_md != NULL)
4361                         return 0;
4362
4363                 /* This path cannot be hit for regular files unless in
4364                  * case of obscure races, so no need to to validate
4365                  * size. */
4366                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4367                         return 0;
4368         } else if (rc != 0) {
4369                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4370                              "%s: revalidate FID "DFID" error: rc = %d\n",
4371                              ll_get_fsname(inode->i_sb, NULL, 0),
4372                              PFID(ll_inode2fid(inode)), rc);
4373         }
4374
4375         return rc;
4376 }
4377
4378 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4379 {
4380         struct inode *inode = dentry->d_inode;
4381         struct obd_export *exp = ll_i2mdexp(inode);
4382         struct lookup_intent oit = {
4383                 .it_op = op,
4384         };
4385         struct ptlrpc_request *req = NULL;
4386         struct md_op_data *op_data;
4387         int rc = 0;
4388         ENTRY;
4389
4390         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4391                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4392
4393         /* Call getattr by fid, so do not provide name at all. */
4394         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4395                                      LUSTRE_OPC_ANY, NULL);
4396         if (IS_ERR(op_data))
4397                 RETURN(PTR_ERR(op_data));
4398
4399         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4400         ll_finish_md_op_data(op_data);
4401         if (rc < 0) {
4402                 rc = ll_inode_revalidate_fini(inode, rc);
4403                 GOTO(out, rc);
4404         }
4405
4406         rc = ll_revalidate_it_finish(req, &oit, dentry);
4407         if (rc != 0) {
4408                 ll_intent_release(&oit);
4409                 GOTO(out, rc);
4410         }
4411
4412         /* Unlinked? Unhash dentry, so it is not picked up later by
4413          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4414          * here to preserve get_cwd functionality on 2.6.
4415          * Bug 10503 */
4416         if (!dentry->d_inode->i_nlink) {
4417                 ll_lock_dcache(inode);
4418                 d_lustre_invalidate(dentry, 0);
4419                 ll_unlock_dcache(inode);
4420         }
4421
4422         ll_lookup_finish_locks(&oit, dentry);
4423 out:
4424         ptlrpc_req_finished(req);
4425
4426         return rc;
4427 }
4428
4429 static int ll_merge_md_attr(struct inode *inode)
4430 {
4431         struct cl_attr attr = { 0 };
4432         int rc;
4433
4434         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4435         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4436                            &attr, ll_md_blocking_ast);
4437         if (rc != 0)
4438                 RETURN(rc);
4439
4440         set_nlink(inode, attr.cat_nlink);
4441         inode->i_blocks = attr.cat_blocks;
4442         i_size_write(inode, attr.cat_size);
4443
4444         ll_i2info(inode)->lli_atime = attr.cat_atime;
4445         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4446         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4447
4448         RETURN(0);
4449 }
4450
4451 static inline dev_t ll_compat_encode_dev(dev_t dev)
4452 {
4453         /* The compat_sys_*stat*() syscalls will fail unless the
4454          * device majors and minors are both less than 256. Note that
4455          * the value returned here will be passed through
4456          * old_encode_dev() in cp_compat_stat(). And so we are not
4457          * trying to return a valid compat (u16) device number, just
4458          * one that will pass the old_valid_dev() check. */
4459
4460         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4461 }
4462
4463 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4464 int ll_getattr(const struct path *path, struct kstat *stat,
4465                u32 request_mask, unsigned int flags)
4466 {
4467         struct dentry *de = path->dentry;
4468 #else
4469 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4470 {
4471 #endif
4472         struct inode *inode = de->d_inode;
4473         struct ll_sb_info *sbi = ll_i2sbi(inode);
4474         struct ll_inode_info *lli = ll_i2info(inode);
4475         int rc;
4476
4477         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4478
4479         rc = ll_inode_revalidate(de, IT_GETATTR);
4480         if (rc < 0)
4481                 RETURN(rc);
4482
4483         if (S_ISREG(inode->i_mode)) {
4484                 /* In case of restore, the MDT has the right size and has
4485                  * already send it back without granting the layout lock,
4486                  * inode is up-to-date so glimpse is useless.
4487                  * Also to glimpse we need the layout, in case of a running
4488                  * restore the MDT holds the layout lock so the glimpse will
4489                  * block up to the end of restore (getattr will block)
4490                  */
4491                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4492                         rc = ll_glimpse_size(inode);
4493                         if (rc < 0)
4494                                 RETURN(rc);
4495                 }
4496         } else {
4497                 /* If object isn't regular a file then don't validate size. */
4498                 if (S_ISDIR(inode->i_mode) &&
4499                     lli->lli_lsm_md != NULL) {
4500                         rc = ll_merge_md_attr(inode);
4501                         if (rc < 0)
4502                                 RETURN(rc);
4503                 }
4504
4505                 LTIME_S(inode->i_atime) = lli->lli_atime;
4506                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4507                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4508         }
4509
4510         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4511
4512         if (ll_need_32bit_api(sbi)) {
4513                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4514                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4515                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4516         } else {
4517                 stat->ino = inode->i_ino;
4518                 stat->dev = inode->i_sb->s_dev;
4519                 stat->rdev = inode->i_rdev;
4520         }
4521
4522         stat->mode = inode->i_mode;
4523         stat->uid = inode->i_uid;
4524         stat->gid = inode->i_gid;
4525         stat->atime = inode->i_atime;
4526         stat->mtime = inode->i_mtime;
4527         stat->ctime = inode->i_ctime;
4528         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4529
4530         stat->nlink = inode->i_nlink;
4531         stat->size = i_size_read(inode);
4532         stat->blocks = inode->i_blocks;
4533
4534         return 0;
4535 }
4536
4537 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4538                      __u64 start, __u64 len)
4539 {
4540         int             rc;
4541         size_t          num_bytes;
4542         struct fiemap   *fiemap;
4543         unsigned int    extent_count = fieinfo->fi_extents_max;
4544
4545         num_bytes = sizeof(*fiemap) + (extent_count *
4546                                        sizeof(struct fiemap_extent));
4547         OBD_ALLOC_LARGE(fiemap, num_bytes);
4548
4549         if (fiemap == NULL)
4550                 RETURN(-ENOMEM);
4551
4552         fiemap->fm_flags = fieinfo->fi_flags;
4553         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4554         fiemap->fm_start = start;
4555         fiemap->fm_length = len;
4556         if (extent_count > 0 &&
4557             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4558                            sizeof(struct fiemap_extent)) != 0)
4559                 GOTO(out, rc = -EFAULT);
4560
4561         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4562
4563         fieinfo->fi_flags = fiemap->fm_flags;
4564         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4565         if (extent_count > 0 &&
4566             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4567                          fiemap->fm_mapped_extents *
4568                          sizeof(struct fiemap_extent)) != 0)
4569                 GOTO(out, rc = -EFAULT);
4570 out:
4571         OBD_FREE_LARGE(fiemap, num_bytes);
4572         return rc;
4573 }
4574
4575 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4576 {
4577         struct ll_inode_info *lli = ll_i2info(inode);
4578         struct posix_acl *acl = NULL;
4579         ENTRY;
4580
4581         spin_lock(&lli->lli_lock);
4582         /* VFS' acl_permission_check->check_acl will release the refcount */
4583         acl = posix_acl_dup(lli->lli_posix_acl);
4584         spin_unlock(&lli->lli_lock);
4585
4586         RETURN(acl);
4587 }
4588
4589 #ifdef HAVE_IOP_SET_ACL
4590 #ifdef CONFIG_FS_POSIX_ACL
4591 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4592 {
4593         struct ll_sb_info *sbi = ll_i2sbi(inode);
4594         struct ptlrpc_request *req = NULL;
4595         const char *name = NULL;
4596         char *value = NULL;
4597         size_t value_size = 0;
4598         int rc = 0;
4599         ENTRY;
4600
4601         switch (type) {
4602         case ACL_TYPE_ACCESS:
4603                 name = XATTR_NAME_POSIX_ACL_ACCESS;
4604                 if (acl)
4605                         rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4606                 break;
4607
4608         case ACL_TYPE_DEFAULT:
4609                 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4610                 if (!S_ISDIR(inode->i_mode))
4611                         rc = acl ? -EACCES : 0;
4612                 break;
4613
4614         default:
4615                 rc = -EINVAL;
4616                 break;
4617         }
4618         if (rc)
4619                 return rc;
4620
4621         if (acl) {
4622                 value_size = posix_acl_xattr_size(acl->a_count);
4623                 value = kmalloc(value_size, GFP_NOFS);
4624                 if (value == NULL)
4625                         GOTO(out, rc = -ENOMEM);
4626
4627                 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4628                 if (rc < 0)
4629                         GOTO(out_value, rc);
4630         }
4631
4632         rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4633                          value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4634                          name, value, value_size, 0, 0, &req);
4635
4636         ptlrpc_req_finished(req);
4637 out_value:
4638         kfree(value);
4639 out:
4640         if (rc)
4641                 forget_cached_acl(inode, type);
4642         else
4643                 set_cached_acl(inode, type, acl);
4644         RETURN(rc);
4645 }
4646 #endif /* CONFIG_FS_POSIX_ACL */
4647 #endif /* HAVE_IOP_SET_ACL */
4648
4649 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4650 static int
4651 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4652 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4653 # else
4654 ll_check_acl(struct inode *inode, int mask)
4655 # endif
4656 {
4657 # ifdef CONFIG_FS_POSIX_ACL
4658         struct posix_acl *acl;
4659         int rc;
4660         ENTRY;
4661
4662 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4663         if (flags & IPERM_FLAG_RCU)
4664                 return -ECHILD;
4665 #  endif
4666         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4667
4668         if (!acl)
4669                 RETURN(-EAGAIN);
4670
4671         rc = posix_acl_permission(inode, acl, mask);
4672         posix_acl_release(acl);
4673
4674         RETURN(rc);
4675 # else /* !CONFIG_FS_POSIX_ACL */
4676         return -EAGAIN;
4677 # endif /* CONFIG_FS_POSIX_ACL */
4678 }
4679 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4680
4681 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4682 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4683 #else
4684 # ifdef HAVE_INODE_PERMISION_2ARGS
4685 int ll_inode_permission(struct inode *inode, int mask)
4686 # else
4687 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4688 # endif
4689 #endif
4690 {
4691         int rc = 0;
4692         struct ll_sb_info *sbi;
4693         struct root_squash_info *squash;
4694         struct cred *cred = NULL;
4695         const struct cred *old_cred = NULL;
4696         cfs_cap_t cap;
4697         bool squash_id = false;
4698         ENTRY;
4699
4700 #ifdef MAY_NOT_BLOCK
4701         if (mask & MAY_NOT_BLOCK)
4702                 return -ECHILD;
4703 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4704         if (flags & IPERM_FLAG_RCU)
4705                 return -ECHILD;
4706 #endif
4707
4708        /* as root inode are NOT getting validated in lookup operation,
4709         * need to do it before permission check. */
4710
4711         if (inode == inode->i_sb->s_root->d_inode) {
4712                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4713                 if (rc)
4714                         RETURN(rc);
4715         }
4716
4717         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4718                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4719
4720         /* squash fsuid/fsgid if needed */
4721         sbi = ll_i2sbi(inode);
4722         squash = &sbi->ll_squash;
4723         if (unlikely(squash->rsi_uid != 0 &&
4724                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4725                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4726                         squash_id = true;
4727         }
4728         if (squash_id) {
4729                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4730                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4731                        squash->rsi_uid, squash->rsi_gid);
4732
4733                 /* update current process's credentials
4734                  * and FS capability */
4735                 cred = prepare_creds();
4736                 if (cred == NULL)
4737                         RETURN(-ENOMEM);
4738
4739                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4740                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4741                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4742                         if ((1 << cap) & CFS_CAP_FS_MASK)
4743                                 cap_lower(cred->cap_effective, cap);
4744                 }
4745                 old_cred = override_creds(cred);
4746         }
4747
4748         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4749         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4750         /* restore current process's credentials and FS capability */
4751         if (squash_id) {
4752                 revert_creds(old_cred);
4753                 put_cred(cred);
4754         }
4755
4756         RETURN(rc);
4757 }
4758
/*
 * -o localflock - only provides locally consistent flock locks.
 *
 * This table sets neither .flock nor .lock.  The read/write entry points
 * depend on HAVE_FILE_OPERATIONS_READ_WRITE_ITER: the *_iter methods
 * (plus new_sync_read/new_sync_write under HAVE_SYNC_READ_WRITE) or the
 * read/aio_read and write/aio_write methods.
 */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush
};
4783
/*
 * File operations with lock support: same entries as ll_file_operations,
 * plus .flock and .lock, which are both served by ll_file_flock().
 */
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        /* flock() and fcntl()-style locks share one handler. */
        .flock          = ll_file_flock,
        .lock           = ll_file_flock
};
4809
/*
 * These are for -o noflock - to return ENOSYS on flock calls.
 *
 * Same entries as ll_file_operations, plus .flock and .lock, which both
 * point at ll_file_noflock().
 */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        /* Both lock entry points refuse the request. */
        .flock          = ll_file_noflock,
        .lock           = ll_file_noflock
};
4836
4837 struct inode_operations ll_file_inode_operations = {
4838         .setattr        = ll_setattr,
4839         .getattr        = ll_getattr,
4840         .permission     = ll_inode_permission,
4841 #ifdef HAVE_IOP_XATTR
4842         .setxattr       = ll_setxattr,
4843         .getxattr       = ll_getxattr,
4844         .removexattr    = ll_removexattr,
4845 #endif
4846         .listxattr      = ll_listxattr,
4847         .fiemap         = ll_fiemap,
4848 #ifdef HAVE_IOP_GET_ACL
4849         .get_acl        = ll_get_acl,
4850 #endif
4851 #ifdef HAVE_IOP_SET_ACL
4852         .set_acl        = ll_set_acl,
4853 #endif
4854 };
4855
4856 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4857 {
4858         struct ll_inode_info *lli = ll_i2info(inode);
4859         struct cl_object *obj = lli->lli_clob;
4860         struct lu_env *env;
4861         int rc;
4862         __u16 refcheck;
4863         ENTRY;
4864
4865         if (obj == NULL)
4866                 RETURN(0);
4867
4868         env = cl_env_get(&refcheck);
4869         if (IS_ERR(env))
4870                 RETURN(PTR_ERR(env));
4871
4872         rc = cl_conf_set(env, lli->lli_clob, conf);
4873         if (rc < 0)
4874                 GOTO(out, rc);
4875
4876         if (conf->coc_opc == OBJECT_CONF_SET) {
4877                 struct ldlm_lock *lock = conf->coc_lock;
4878                 struct cl_layout cl = {
4879                         .cl_layout_gen = 0,
4880                 };
4881
4882                 LASSERT(lock != NULL);
4883                 LASSERT(ldlm_has_layout(lock));
4884
4885                 /* it can only be allowed to match after layout is
4886                  * applied to inode otherwise false layout would be
4887                  * seen. Applying layout shoud happen before dropping
4888                  * the intent lock. */
4889                 ldlm_lock_allow_match(lock);
4890
4891                 rc = cl_object_layout_get(env, obj, &cl);
4892                 if (rc < 0)
4893                         GOTO(out, rc);
4894
4895                 CDEBUG(D_VFSTRACE,
4896                        DFID": layout version change: %u -> %u\n",
4897                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4898                        cl.cl_layout_gen);
4899                 ll_layout_version_set(lli, cl.cl_layout_gen);
4900         }
4901
4902 out:
4903         cl_env_put(env, &refcheck);
4904
4905         RETURN(rc);
4906 }
4907
4908 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4909 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4910
4911 {
4912         struct ll_sb_info *sbi = ll_i2sbi(inode);
4913         struct ptlrpc_request *req;
4914         struct mdt_body *body;
4915         void *lvbdata;
4916         void *lmm;
4917         int lmmsize;
4918         int rc;
4919         ENTRY;
4920
4921         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4922                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4923                lock->l_lvb_data, lock->l_lvb_len);
4924
4925         if (lock->l_lvb_data != NULL)
4926                 RETURN(0);
4927
4928         /* if layout lock was granted right away, the layout is returned
4929          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4930          * blocked and then granted via completion ast, we have to fetch
4931          * layout here. Please note that we can't use the LVB buffer in
4932          * completion AST because it doesn't have a large enough buffer */
4933         rc = ll_get_default_mdsize(sbi, &lmmsize);
4934         if (rc == 0)
4935                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4936                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4937         if (rc < 0)
4938                 RETURN(rc);
4939
4940         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4941         if (body == NULL)
4942                 GOTO(out, rc = -EPROTO);
4943
4944         lmmsize = body->mbo_eadatasize;
4945         if (lmmsize == 0) /* empty layout */
4946                 GOTO(out, rc = 0);
4947
4948         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4949         if (lmm == NULL)
4950                 GOTO(out, rc = -EFAULT);
4951
4952         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4953         if (lvbdata == NULL)
4954                 GOTO(out, rc = -ENOMEM);
4955
4956         memcpy(lvbdata, lmm, lmmsize);
4957         lock_res_and_lock(lock);
4958         if (unlikely(lock->l_lvb_data == NULL)) {
4959                 lock->l_lvb_type = LVB_T_LAYOUT;
4960                 lock->l_lvb_data = lvbdata;
4961                 lock->l_lvb_len = lmmsize;
4962                 lvbdata = NULL;
4963         }
4964         unlock_res_and_lock(lock);
4965
4966         if (lvbdata)
4967                 OBD_FREE_LARGE(lvbdata, lmmsize);
4968
4969         EXIT;
4970
4971 out:
4972         ptlrpc_req_finished(req);
4973         return rc;
4974 }
4975
4976 /**
4977  * Apply the layout to the inode. Layout lock is held and will be released
4978  * in this function.
4979  */
4980 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4981                               struct inode *inode)
4982 {
4983         struct ll_inode_info *lli = ll_i2info(inode);
4984         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4985         struct ldlm_lock *lock;
4986         struct cl_object_conf conf;
4987         int rc = 0;
4988         bool lvb_ready;
4989         bool wait_layout = false;
4990         ENTRY;
4991
4992         LASSERT(lustre_handle_is_used(lockh));
4993
4994         lock = ldlm_handle2lock(lockh);
4995         LASSERT(lock != NULL);
4996         LASSERT(ldlm_has_layout(lock));
4997
4998         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4999                    PFID(&lli->lli_fid), inode);
5000
5001         /* in case this is a caching lock and reinstate with new inode */
5002         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5003
5004         lock_res_and_lock(lock);
5005         lvb_ready = ldlm_is_lvb_ready(lock);
5006         unlock_res_and_lock(lock);
5007
5008         /* checking lvb_ready is racy but this is okay. The worst case is
5009          * that multi processes may configure the file on the same time. */
5010         if (lvb_ready)
5011                 GOTO(out, rc = 0);
5012
5013         rc = ll_layout_fetch(inode, lock);
5014         if (rc < 0)
5015                 GOTO(out, rc);
5016
5017         /* for layout lock, lmm is stored in lock's lvb.
5018          * lvb_data is immutable if the lock is held so it's safe to access it
5019          * without res lock.
5020          *
5021          * set layout to file. Unlikely this will fail as old layout was
5022          * surely eliminated */
5023         memset(&conf, 0, sizeof conf);
5024         conf.coc_opc = OBJECT_CONF_SET;
5025         conf.coc_inode = inode;
5026         conf.coc_lock = lock;
5027         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5028         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5029         rc = ll_layout_conf(inode, &conf);
5030
5031         /* refresh layout failed, need to wait */
5032         wait_layout = rc == -EBUSY;
5033         EXIT;
5034 out:
5035         LDLM_LOCK_PUT(lock);
5036         ldlm_lock_decref(lockh, mode);
5037
5038         /* wait for IO to complete if it's still being used. */
5039         if (wait_layout) {
5040                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5041                        ll_get_fsname(inode->i_sb, NULL, 0),
5042                        PFID(&lli->lli_fid), inode);
5043
5044                 memset(&conf, 0, sizeof conf);
5045                 conf.coc_opc = OBJECT_CONF_WAIT;
5046                 conf.coc_inode = inode;
5047                 rc = ll_layout_conf(inode, &conf);
5048                 if (rc == 0)
5049                         rc = -EAGAIN;
5050
5051                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5052                        ll_get_fsname(inode->i_sb, NULL, 0),
5053                        PFID(&lli->lli_fid), rc);
5054         }
5055         RETURN(rc);
5056 }
5057
5058 /**
5059  * Issue layout intent RPC to MDS.
5060  * \param inode [in]    file inode
5061  * \param intent [in]   layout intent
5062  *
5063  * \retval 0    on success
5064  * \retval < 0  error code
5065  */
5066 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5067 {
5068         struct ll_inode_info  *lli = ll_i2info(inode);
5069         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5070         struct md_op_data     *op_data;
5071         struct lookup_intent it;
5072         struct ptlrpc_request *req;
5073         int rc;
5074         ENTRY;
5075
5076         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5077                                      0, 0, LUSTRE_OPC_ANY, NULL);
5078         if (IS_ERR(op_data))
5079                 RETURN(PTR_ERR(op_data));
5080
5081         op_data->op_data = intent;
5082         op_data->op_data_size = sizeof(*intent);
5083
5084         memset(&it, 0, sizeof(it));
5085         it.it_op = IT_LAYOUT;
5086         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5087             intent->li_opc == LAYOUT_INTENT_TRUNC)
5088                 it.it_flags = FMODE_WRITE;
5089
5090         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5091                           ll_get_fsname(inode->i_sb, NULL, 0),
5092                           PFID(&lli->lli_fid), inode);
5093
5094         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5095                             &ll_md_blocking_ast, 0);
5096         if (it.it_request != NULL)
5097                 ptlrpc_req_finished(it.it_request);
5098         it.it_request = NULL;
5099
5100         ll_finish_md_op_data(op_data);
5101
5102         /* set lock data in case this is a new lock */
5103         if (!rc)
5104                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5105
5106         ll_intent_drop_lock(&it);
5107
5108         RETURN(rc);
5109 }
5110
5111 /**
5112  * This function checks if there exists a LAYOUT lock on the client side,
5113  * or enqueues it if it doesn't have one in cache.
5114  *
5115  * This function will not hold layout lock so it may be revoked any time after
5116  * this function returns. Any operations depend on layout should be redone
5117  * in that case.
5118  *
5119  * This function should be called before lov_io_init() to get an uptodate
5120  * layout version, the caller should save the version number and after IO
5121  * is finished, this function should be called again to verify that layout
5122  * is not changed during IO time.
5123  */
5124 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5125 {
5126         struct ll_inode_info    *lli = ll_i2info(inode);
5127         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5128         struct lustre_handle lockh;
5129         struct layout_intent intent = {
5130                 .li_opc = LAYOUT_INTENT_ACCESS,
5131         };
5132         enum ldlm_mode mode;
5133         int rc;
5134         ENTRY;
5135
5136         *gen = ll_layout_version_get(lli);
5137         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5138                 RETURN(0);
5139
5140         /* sanity checks */
5141         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5142         LASSERT(S_ISREG(inode->i_mode));
5143
5144         /* take layout lock mutex to enqueue layout lock exclusively. */
5145         mutex_lock(&lli->lli_layout_mutex);
5146
5147         while (1) {
5148                 /* mostly layout lock is caching on the local side, so try to
5149                  * match it before grabbing layout lock mutex. */
5150                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5151                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5152                 if (mode != 0) { /* hit cached lock */
5153                         rc = ll_layout_lock_set(&lockh, mode, inode);
5154                         if (rc == -EAGAIN)
5155                                 continue;
5156                         break;
5157                 }
5158
5159                 rc = ll_layout_intent(inode, &intent);
5160                 if (rc != 0)
5161                         break;
5162         }
5163
5164         if (rc == 0)
5165                 *gen = ll_layout_version_get(lli);
5166         mutex_unlock(&lli->lli_layout_mutex);
5167
5168         RETURN(rc);
5169 }
5170
5171 /**
5172  * Issue layout intent RPC indicating where in a file an IO is about to write.
5173  *
5174  * \param[in] inode     file inode.
5175  * \param[in] ext       write range with start offset of fille in bytes where
5176  *                      an IO is about to write, and exclusive end offset in
5177  *                      bytes.
5178  *
5179  * \retval 0    on success
5180  * \retval < 0  error code
5181  */
5182 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5183                            struct lu_extent *ext)
5184 {
5185         struct layout_intent intent = {
5186                 .li_opc = opc,
5187                 .li_extent.e_start = ext->e_start,
5188                 .li_extent.e_end = ext->e_end,
5189         };
5190         int rc;
5191         ENTRY;
5192
5193         rc = ll_layout_intent(inode, &intent);
5194
5195         RETURN(rc);
5196 }
5197
5198 /**
5199  *  This function send a restore request to the MDT
5200  */
5201 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5202 {
5203         struct hsm_user_request *hur;
5204         int                      len, rc;
5205         ENTRY;
5206
5207         len = sizeof(struct hsm_user_request) +
5208               sizeof(struct hsm_user_item);
5209         OBD_ALLOC(hur, len);
5210         if (hur == NULL)
5211                 RETURN(-ENOMEM);
5212
5213         hur->hur_request.hr_action = HUA_RESTORE;
5214         hur->hur_request.hr_archive_id = 0;
5215         hur->hur_request.hr_flags = 0;
5216         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5217                sizeof(hur->hur_user_item[0].hui_fid));
5218         hur->hur_user_item[0].hui_extent.offset = offset;
5219         hur->hur_user_item[0].hui_extent.length = length;
5220         hur->hur_request.hr_itemcount = 1;
5221         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5222                            len, hur, NULL);
5223         OBD_FREE(hur, len);
5224         RETURN(rc);
5225 }