lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_get_fsname(inode->i_sb, NULL, 0),
 143                        PFID(&lli->lli_fid));
 144                 GOTO(out, rc = 0);
 145         }
 146
 147         OBD_ALLOC_PTR(op_data);
 148         /* We leak openhandle and request here on error, but not much to be
 149          * done in OOM case since app won't retry close on error either. */
 150         if (op_data == NULL)
 151                 GOTO(out, rc = -ENOMEM);
 152
 153         ll_prepare_close(inode, op_data, och);
 154         switch (bias) {
 155         case MDS_CLOSE_LAYOUT_MERGE:
 156                 /* merge blocks from the victim inode */
 157                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 158                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 159                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 160         case MDS_CLOSE_LAYOUT_SPLIT:
 161         case MDS_CLOSE_LAYOUT_SWAP: {
 162                 struct split_param *sp = data;
 163
 164                 LASSERT(data != NULL);
 165                 op_data->op_bias |= bias;
 166                 op_data->op_data_version = 0;
 167                 op_data->op_lease_handle = och->och_lease_handle;
 168                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 169                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 170                         op_data->op_mirror_id = sp->sp_mirror_id;
 171                 } else {
 172                         op_data->op_fid2 = *ll_inode2fid(data);
 173                 }
 174                 break;
 175         }
 176
 177         case MDS_CLOSE_RESYNC_DONE: {
 178                 struct ll_ioc_lease *ioc = data;
 179
 180                 LASSERT(data != NULL);
 181                 op_data->op_attr_blocks +=
 182                         ioc->lil_count * op_data->op_attr_blocks;
 183                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 184                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 185                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 186
 187                 op_data->op_lease_handle = och->och_lease_handle;
 188                 op_data->op_data = &ioc->lil_ids[0];
 189                 op_data->op_data_size =
 190                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 191                 break;
 192         }
 193
 194         case MDS_HSM_RELEASE:
 195                 LASSERT(data != NULL);
 196                 op_data->op_bias |= MDS_HSM_RELEASE;
 197                 op_data->op_data_version = *(__u64 *)data;
 198                 op_data->op_lease_handle = och->och_lease_handle;
 199                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 200                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 201                 break;
 202
 203         default:
 204                 LASSERT(data == NULL);
 205                 break;
 206         }
 207
 208         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 209                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 210         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 211                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 212
 213         rc = md_close(md_exp, op_data, och->och_mod, &req);
 214         if (rc != 0 && rc != -EINTR)
 215                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 216                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 217
 218         if (rc == 0 && op_data->op_bias & bias) {
 219                 struct mdt_body *body;
 220
 221                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 222                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 223                         rc = -EBUSY;
 224         }
 225
 226         ll_finish_md_op_data(op_data);
 227         EXIT;
 228 out:
 229
 230         md_clear_open_replay_data(md_exp, och);
 231         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 232         OBD_FREE_PTR(och);
 233
 234         ptlrpc_req_finished(req);       /* This is close request */
 235         return rc;
 236 }
 237
 238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 239 {
 240         struct ll_inode_info *lli = ll_i2info(inode);
 241         struct obd_client_handle **och_p;
 242         struct obd_client_handle *och;
 243         __u64 *och_usecount;
 244         int rc = 0;
 245         ENTRY;
 246
 247         if (fmode & FMODE_WRITE) {
 248                 och_p = &lli->lli_mds_write_och;
 249                 och_usecount = &lli->lli_open_fd_write_count;
 250         } else if (fmode & FMODE_EXEC) {
 251                 och_p = &lli->lli_mds_exec_och;
 252                 och_usecount = &lli->lli_open_fd_exec_count;
 253         } else {
 254                 LASSERT(fmode & FMODE_READ);
 255                 och_p = &lli->lli_mds_read_och;
 256                 och_usecount = &lli->lli_open_fd_read_count;
 257         }
 258
 259         mutex_lock(&lli->lli_och_mutex);
 260         if (*och_usecount > 0) {
 261                 /* There are still users of this handle, so skip
 262                  * freeing it. */
 263                 mutex_unlock(&lli->lli_och_mutex);
 264                 RETURN(0);
 265         }
 266
 267         och = *och_p;
 268         *och_p = NULL;
 269         mutex_unlock(&lli->lli_och_mutex);
 270
 271         if (och != NULL) {
 272                 /* There might be a race and this handle may already
 273                  * be closed. */
 274                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 275         }
 276
 277         RETURN(rc);
 278 }
 279
 280 static int ll_md_close(struct inode *inode, struct file *file)
 281 {
 282         union ldlm_policy_data policy = {
 283                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 284         };
 285         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 286         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 287         struct ll_inode_info *lli = ll_i2info(inode);
 288         struct lustre_handle lockh;
 289         enum ldlm_mode lockmode;
 290         int rc = 0;
 291         ENTRY;
 292
 293         /* clear group lock, if present */
 294         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 295                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 296
 297         if (fd->fd_lease_och != NULL) {
 298                 bool lease_broken;
 299
 300                 /* Usually the lease is not released when the
 301                  * application crashed, we need to release here. */
 302                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 303                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 304                         PFID(&lli->lli_fid), rc, lease_broken);
 305
 306                 fd->fd_lease_och = NULL;
 307         }
 308
 309         if (fd->fd_och != NULL) {
 310                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 311                 fd->fd_och = NULL;
 312                 GOTO(out, rc);
 313         }
 314
 315         /* Let's see if we have good enough OPEN lock on the file and if
 316            we can skip talking to MDS */
 317         mutex_lock(&lli->lli_och_mutex);
 318         if (fd->fd_omode & FMODE_WRITE) {
 319                 lockmode = LCK_CW;
 320                 LASSERT(lli->lli_open_fd_write_count);
 321                 lli->lli_open_fd_write_count--;
 322         } else if (fd->fd_omode & FMODE_EXEC) {
 323                 lockmode = LCK_PR;
 324                 LASSERT(lli->lli_open_fd_exec_count);
 325                 lli->lli_open_fd_exec_count--;
 326         } else {
 327                 lockmode = LCK_CR;
 328                 LASSERT(lli->lli_open_fd_read_count);
 329                 lli->lli_open_fd_read_count--;
 330         }
 331         mutex_unlock(&lli->lli_och_mutex);
 332
 333         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 334                            LDLM_IBITS, &policy, lockmode, &lockh))
 335                 rc = ll_md_real_close(inode, fd->fd_omode);
 336
 337 out:
 338         LUSTRE_FPRIVATE(file) = NULL;
 339         ll_file_data_put(fd);
 340
 341         RETURN(rc);
 342 }
 343
 344 /* While this returns an error code, fput() the caller does not, so we need
 345  * to make every effort to clean up all of our state here.  Also, applications
 346  * rarely check close errors and even if an error is returned they will not
 347  * re-try the close call.
 348  */
 349 int ll_file_release(struct inode *inode, struct file *file)
 350 {
 351         struct ll_file_data *fd;
 352         struct ll_sb_info *sbi = ll_i2sbi(inode);
 353         struct ll_inode_info *lli = ll_i2info(inode);
 354         int rc;
 355         ENTRY;
 356
 357         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 358                PFID(ll_inode2fid(inode)), inode);
 359
 360         if (inode->i_sb->s_root != file_dentry(file))
 361                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 362         fd = LUSTRE_FPRIVATE(file);
 363         LASSERT(fd != NULL);
 364
 365         /* The last ref on @file, maybe not the the owner pid of statahead,
 366          * because parent and child process can share the same file handle. */
 367         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 368                 ll_deauthorize_statahead(inode, fd);
 369
 370         if (inode->i_sb->s_root == file_dentry(file)) {
 371                 LUSTRE_FPRIVATE(file) = NULL;
 372                 ll_file_data_put(fd);
 373                 RETURN(0);
 374         }
 375
 376         if (!S_ISDIR(inode->i_mode)) {
 377                 if (lli->lli_clob != NULL)
 378                         lov_read_and_clear_async_rc(lli->lli_clob);
 379                 lli->lli_async_rc = 0;
 380         }
 381
 382         rc = ll_md_close(inode, file);
 383
 384         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 385                 libcfs_debug_dumplog();
 386
 387         RETURN(rc);
 388 }
 389
 390 static inline int ll_dom_readpage(void *data, struct page *page)
 391 {
 392         struct niobuf_local *lnb = data;
 393         void *kaddr;
 394
 395         kaddr = ll_kmap_atomic(page, KM_USER0);
 396         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 397         if (lnb->lnb_len < PAGE_SIZE)
 398                 memset(kaddr + lnb->lnb_len, 0,
 399                        PAGE_SIZE - lnb->lnb_len);
 400         flush_dcache_page(page);
 401         SetPageUptodate(page);
 402         ll_kunmap_atomic(kaddr, KM_USER0);
 403         unlock_page(page);
 404
 405         return 0;
 406 }
 407
 408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 409                         struct lookup_intent *it)
 410 {
 411         struct ll_inode_info *lli = ll_i2info(inode);
 412         struct cl_object *obj = lli->lli_clob;
 413         struct address_space *mapping = inode->i_mapping;
 414         struct page *vmpage;
 415         struct niobuf_remote *rnb;
 416         char *data;
 417         struct lustre_handle lockh;
 418         struct ldlm_lock *lock;
 419         unsigned long index, start;
 420         struct niobuf_local lnb;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
 431                 if (lock != NULL)
 432                         dom_lock = ldlm_has_dom(lock);
 433                 LDLM_LOCK_PUT(lock);
 434         }
 435         if (!dom_lock)
 436                 RETURN_EXIT;
 437
 438         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 439                                    RCL_SERVER))
 440                 RETURN_EXIT;
 441
 442         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 443         if (rnb == NULL || rnb->rnb_len == 0)
 444                 RETURN_EXIT;
 445
 446         CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
 447                rnb->rnb_len, i_size_read(inode));
 448
 449         data = (char *)rnb + sizeof(*rnb);
 450
 451         lnb.lnb_file_offset = rnb->rnb_offset;
 452         start = lnb.lnb_file_offset / PAGE_SIZE;
 453         index = 0;
 454         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 455         lnb.lnb_page_offset = 0;
 456         do {
 457                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 458                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 459                 if (lnb.lnb_len > PAGE_SIZE)
 460                         lnb.lnb_len = PAGE_SIZE;
 461
 462                 vmpage = read_cache_page(mapping, index + start,
 463                                          ll_dom_readpage, &lnb);
 464                 if (IS_ERR(vmpage)) {
 465                         CWARN("%s: cannot fill page %lu for "DFID
 466                               " with data: rc = %li\n",
 467                               ll_get_fsname(inode->i_sb, NULL, 0),
 468                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 469                               PTR_ERR(vmpage));
 470                         break;
 471                 }
 472                 put_page(vmpage);
 473                 index++;
 474         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 475         EXIT;
 476 }
 477
 478 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 479                                 struct lookup_intent *itp)
 480 {
 481         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 482         struct dentry *parent = de->d_parent;
 483         const char *name = NULL;
 484         int len = 0;
 485         struct md_op_data *op_data;
 486         struct ptlrpc_request *req = NULL;
 487         int rc;
 488         ENTRY;
 489
 490         LASSERT(parent != NULL);
 491         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 492
 493         /* if server supports open-by-fid, or file name is invalid, don't pack
 494          * name in open request */
 495         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 496             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 497                 name = de->d_name.name;
 498                 len = de->d_name.len;
 499         }
 500
 501         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 502                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 503         if (IS_ERR(op_data))
 504                 RETURN(PTR_ERR(op_data));
 505         op_data->op_data = lmm;
 506         op_data->op_data_size = lmmsize;
 507
 508         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 509                             &ll_md_blocking_ast, 0);
 510         ll_finish_md_op_data(op_data);
 511         if (rc == -ESTALE) {
 512                 /* reason for keep own exit path - don`t flood log
 513                  * with messages with -ESTALE errors.
 514                  */
 515                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 516                      it_open_error(DISP_OPEN_OPEN, itp))
 517                         GOTO(out, rc);
 518                 ll_release_openhandle(de, itp);
 519                 GOTO(out, rc);
 520         }
 521
 522         if (it_disposition(itp, DISP_LOOKUP_NEG))
 523                 GOTO(out, rc = -ENOENT);
 524
 525         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 526                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 527                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 528                 GOTO(out, rc);
 529         }
 530
 531         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 532
 533         if (!rc && itp->it_lock_mode) {
 534                 ll_dom_finish_open(de->d_inode, req, itp);
 535                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 536         }
 537
 538 out:
 539         ptlrpc_req_finished(req);
 540         ll_intent_drop_lock(itp);
 541
 542         /* We did open by fid, but by the time we got to the server,
 543          * the object disappeared. If this is a create, we cannot really
 544          * tell the userspace that the file it was trying to create
 545          * does not exist. Instead let's return -ESTALE, and the VFS will
 546          * retry the create with LOOKUP_REVAL that we are going to catch
 547          * in ll_revalidate_dentry() and use lookup then.
 548          */
 549         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 550                 rc = -ESTALE;
 551
 552         RETURN(rc);
 553 }
 554
 555 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 556                        struct obd_client_handle *och)
 557 {
 558         struct mdt_body *body;
 559
 560         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 561         och->och_open_handle = body->mbo_open_handle;
 562         och->och_fid = body->mbo_fid1;
 563         och->och_lease_handle.cookie = it->it_lock_handle;
 564         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 565         och->och_flags = it->it_flags;
 566
 567         return md_set_open_replay_data(md_exp, och, it);
 568 }
 569
 570 static int ll_local_open(struct file *file, struct lookup_intent *it,
 571                          struct ll_file_data *fd, struct obd_client_handle *och)
 572 {
 573         struct inode *inode = file_inode(file);
 574         ENTRY;
 575
 576         LASSERT(!LUSTRE_FPRIVATE(file));
 577
 578         LASSERT(fd != NULL);
 579
 580         if (och) {
 581                 int rc;
 582
 583                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 584                 if (rc != 0)
 585                         RETURN(rc);
 586         }
 587
 588         LUSTRE_FPRIVATE(file) = fd;
 589         ll_readahead_init(inode, &fd->fd_ras);
 590         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 591
 592         /* ll_cl_context initialize */
 593         rwlock_init(&fd->fd_lock);
 594         INIT_LIST_HEAD(&fd->fd_lccs);
 595
 596         RETURN(0);
 597 }
 598
 599 /* Open a file, and (for the very first open) create objects on the OSTs at
 600  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 601  * creation or open until ll_lov_setstripe() ioctl is called.
 602  *
 603  * If we already have the stripe MD locally then we don't request it in
 604  * md_open(), by passing a lmm_size = 0.
 605  *
 606  * It is up to the application to ensure no other processes open this file
 607  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 608  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 609  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 610  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 611  */
 612 int ll_file_open(struct inode *inode, struct file *file)
 613 {
 614         struct ll_inode_info *lli = ll_i2info(inode);
 615         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 616                                           .it_flags = file->f_flags };
 617         struct obd_client_handle **och_p = NULL;
 618         __u64 *och_usecount = NULL;
 619         struct ll_file_data *fd;
 620         int rc = 0;
 621         ENTRY;
 622
 623         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 624                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 625
 626         it = file->private_data; /* XXX: compat macro */
 627         file->private_data = NULL; /* prevent ll_local_open assertion */
 628
 629         fd = ll_file_data_get();
 630         if (fd == NULL)
 631                 GOTO(out_nofiledata, rc = -ENOMEM);
 632
 633         fd->fd_file = file;
 634         if (S_ISDIR(inode->i_mode))
 635                 ll_authorize_statahead(inode, fd);
 636
 637         if (inode->i_sb->s_root == file_dentry(file)) {
 638                 LUSTRE_FPRIVATE(file) = fd;
 639                 RETURN(0);
 640         }
 641
 642         if (!it || !it->it_disposition) {
 643                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 644                  * because everything but O_ACCMODE mask was stripped from
 645                  * there */
 646                 if ((oit.it_flags + 1) & O_ACCMODE)
 647                         oit.it_flags++;
 648                 if (file->f_flags & O_TRUNC)
 649                         oit.it_flags |= FMODE_WRITE;
 650
 651                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 652                  * dentry_open after call to open_namei that checks permissions.
 653                  * Only nfsd_open call dentry_open directly without checking
 654                  * permissions and because of that this code below is safe.
 655                  */
 656                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 657                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 658
 659                 /* We do not want O_EXCL here, presumably we opened the file
 660                  * already? XXX - NFS implications? */
 661                 oit.it_flags &= ~O_EXCL;
 662
 663                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 664                  * created if necessary, then "IT_CREAT" should be set to keep
 665                  * consistent with it */
 666                 if (oit.it_flags & O_CREAT)
 667                         oit.it_op |= IT_CREAT;
 668
 669                 it = &oit;
 670         }
 671
 672 restart:
 673         /* Let's see if we have file open on MDS already. */
 674         if (it->it_flags & FMODE_WRITE) {
 675                 och_p = &lli->lli_mds_write_och;
 676                 och_usecount = &lli->lli_open_fd_write_count;
 677         } else if (it->it_flags & FMODE_EXEC) {
 678                 och_p = &lli->lli_mds_exec_och;
 679                 och_usecount = &lli->lli_open_fd_exec_count;
 680          } else {
 681                 och_p = &lli->lli_mds_read_och;
 682                 och_usecount = &lli->lli_open_fd_read_count;
 683         }
 684
 685         mutex_lock(&lli->lli_och_mutex);
 686         if (*och_p) { /* Open handle is present */
 687                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 688                         /* Well, there's extra open request that we do not need,
 689                            let's close it somehow. This will decref request. */
 690                         rc = it_open_error(DISP_OPEN_OPEN, it);
 691                         if (rc) {
 692                                 mutex_unlock(&lli->lli_och_mutex);
 693                                 GOTO(out_openerr, rc);
 694                         }
 695
 696                         ll_release_openhandle(file_dentry(file), it);
 697                 }
 698                 (*och_usecount)++;
 699
 700                 rc = ll_local_open(file, it, fd, NULL);
 701                 if (rc) {
 702                         (*och_usecount)--;
 703                         mutex_unlock(&lli->lli_och_mutex);
 704                         GOTO(out_openerr, rc);
 705                 }
 706         } else {
 707                 LASSERT(*och_usecount == 0);
 708                 if (!it->it_disposition) {
 709                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 710                         /* We cannot just request lock handle now, new ELC code
 711                            means that one of other OPEN locks for this file
 712                            could be cancelled, and since blocking ast handler
 713                            would attempt to grab och_mutex as well, that would
 714                            result in a deadlock */
 715                         mutex_unlock(&lli->lli_och_mutex);
 716                         /*
 717                          * Normally called under two situations:
 718                          * 1. NFS export.
 719                          * 2. A race/condition on MDS resulting in no open
 720                          *    handle to be returned from LOOKUP|OPEN request,
 721                          *    for example if the target entry was a symlink.
 722                          *
 723                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 724                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 725                          *  bit so that it's not confusing later callers.
 726                          *
 727                          *  NB; when ldd is NULL, it must have come via normal
 728                          *  lookup path only, since ll_iget_for_nfs always calls
 729                          *  ll_d_init().
 730                          */
 731                         if (ldd && ldd->lld_nfs_dentry) {
 732                                 ldd->lld_nfs_dentry = 0;
 733                                 it->it_flags |= MDS_OPEN_LOCK;
 734                         }
 735
 736                          /*
 737                          * Always specify MDS_OPEN_BY_FID because we don't want
 738                          * to get file with different fid.
 739                          */
 740                         it->it_flags |= MDS_OPEN_BY_FID;
 741                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 742                                                  it);
 743                         if (rc)
 744                                 GOTO(out_openerr, rc);
 745
 746                         goto restart;
 747                 }
 748                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 749                 if (!*och_p)
 750                         GOTO(out_och_free, rc = -ENOMEM);
 751
 752                 (*och_usecount)++;
 753
 754                 /* md_intent_lock() didn't get a request ref if there was an
 755                  * open error, so don't do cleanup on the request here
 756                  * (bug 3430) */
 757                 /* XXX (green): Should not we bail out on any error here, not
 758                  * just open error? */
 759                 rc = it_open_error(DISP_OPEN_OPEN, it);
 760                 if (rc != 0)
 761                         GOTO(out_och_free, rc);
 762
 763                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 764                          "inode %p: disposition %x, status %d\n", inode,
 765                          it_disposition(it, ~0), it->it_status);
 766
 767                 rc = ll_local_open(file, it, fd, *och_p);
 768                 if (rc)
 769                         GOTO(out_och_free, rc);
 770         }
 771         mutex_unlock(&lli->lli_och_mutex);
 772         fd = NULL;
 773
 774         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 775            different kind of OPEN lock for this same inode gets cancelled
 776            by ldlm_cancel_lru */
 777         if (!S_ISREG(inode->i_mode))
 778                 GOTO(out_och_free, rc);
 779
 780         cl_lov_delay_create_clear(&file->f_flags);
 781         GOTO(out_och_free, rc);
 782
 783 out_och_free:
 784         if (rc) {
 785                 if (och_p && *och_p) {
 786                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 787                         *och_p = NULL; /* OBD_FREE writes some magic there */
 788                         (*och_usecount)--;
 789                 }
 790                 mutex_unlock(&lli->lli_och_mutex);
 791
 792 out_openerr:
 793                 if (lli->lli_opendir_key == fd)
 794                         ll_deauthorize_statahead(inode, fd);
 795                 if (fd != NULL)
 796                         ll_file_data_put(fd);
 797         } else {
 798                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 799         }
 800
 801 out_nofiledata:
 802         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 803                 ptlrpc_req_finished(it->it_request);
 804                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 805         }
 806
 807         return rc;
 808 }
 809
 810 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 811                         struct ldlm_lock_desc *desc, void *data, int flag)
 812 {
 813         int rc;
 814         struct lustre_handle lockh;
 815         ENTRY;
 816
 817         switch (flag) {
 818         case LDLM_CB_BLOCKING:
 819                 ldlm_lock2handle(lock, &lockh);
 820                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 821                 if (rc < 0) {
 822                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 823                         RETURN(rc);
 824                 }
 825                 break;
 826         case LDLM_CB_CANCELING:
 827                 /* do nothing */
 828                 break;
 829         }
 830         RETURN(0);
 831 }
 832
 833 /**
 834  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 835  * and save it as fd->fd_och so as to force client to reopen the file even
 836  * if it has an open lock in cache already.
 837  */
 838 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 839                                 struct lustre_handle *old_open_handle)
 840 {
 841         struct ll_inode_info *lli = ll_i2info(inode);
 842         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 843         struct obd_client_handle **och_p;
 844         __u64 *och_usecount;
 845         int rc = 0;
 846         ENTRY;
 847
 848         /* Get the openhandle of the file */
 849         mutex_lock(&lli->lli_och_mutex);
 850         if (fd->fd_lease_och != NULL)
 851                 GOTO(out_unlock, rc = -EBUSY);
 852
 853         if (fd->fd_och == NULL) {
 854                 if (file->f_mode & FMODE_WRITE) {
 855                         LASSERT(lli->lli_mds_write_och != NULL);
 856                         och_p = &lli->lli_mds_write_och;
 857                         och_usecount = &lli->lli_open_fd_write_count;
 858                 } else {
 859                         LASSERT(lli->lli_mds_read_och != NULL);
 860                         och_p = &lli->lli_mds_read_och;
 861                         och_usecount = &lli->lli_open_fd_read_count;
 862                 }
 863
 864                 if (*och_usecount > 1)
 865                         GOTO(out_unlock, rc = -EBUSY);
 866
 867                 fd->fd_och = *och_p;
 868                 *och_usecount = 0;
 869                 *och_p = NULL;
 870         }
 871
 872         *old_open_handle = fd->fd_och->och_open_handle;
 873
 874         EXIT;
 875 out_unlock:
 876         mutex_unlock(&lli->lli_och_mutex);
 877         return rc;
 878 }
 879
 880 /**
 881  * Release ownership on lli_mds_*_och when putting back a file lease.
 882  */
 883 static int ll_lease_och_release(struct inode *inode, struct file *file)
 884 {
 885         struct ll_inode_info *lli = ll_i2info(inode);
 886         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 887         struct obd_client_handle **och_p;
 888         struct obd_client_handle *old_och = NULL;
 889         __u64 *och_usecount;
 890         int rc = 0;
 891         ENTRY;
 892
 893         mutex_lock(&lli->lli_och_mutex);
 894         if (file->f_mode & FMODE_WRITE) {
 895                 och_p = &lli->lli_mds_write_och;
 896                 och_usecount = &lli->lli_open_fd_write_count;
 897         } else {
 898                 och_p = &lli->lli_mds_read_och;
 899                 och_usecount = &lli->lli_open_fd_read_count;
 900         }
 901
 902         /* The file may have been open by another process (broken lease) so
 903          * *och_p is not NULL. In this case we should simply increase usecount
 904          * and close fd_och.
 905          */
 906         if (*och_p != NULL) {
 907                 old_och = fd->fd_och;
 908                 (*och_usecount)++;
 909         } else {
 910                 *och_p = fd->fd_och;
 911                 *och_usecount = 1;
 912         }
 913         fd->fd_och = NULL;
 914         mutex_unlock(&lli->lli_och_mutex);
 915
 916         if (old_och != NULL)
 917                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 918
 919         RETURN(rc);
 920 }
 921
 922 /**
 923  * Acquire a lease and open the file.
 924  */
 925 static struct obd_client_handle *
 926 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 927               __u64 open_flags)
 928 {
 929         struct lookup_intent it = { .it_op = IT_OPEN };
 930         struct ll_sb_info *sbi = ll_i2sbi(inode);
 931         struct md_op_data *op_data;
 932         struct ptlrpc_request *req = NULL;
 933         struct lustre_handle old_open_handle = { 0 };
 934         struct obd_client_handle *och = NULL;
 935         int rc;
 936         int rc2;
 937         ENTRY;
 938
 939         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 940                 RETURN(ERR_PTR(-EINVAL));
 941
 942         if (file != NULL) {
 943                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 944                         RETURN(ERR_PTR(-EPERM));
 945
 946                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
 947                 if (rc)
 948                         RETURN(ERR_PTR(rc));
 949         }
 950
 951         OBD_ALLOC_PTR(och);
 952         if (och == NULL)
 953                 RETURN(ERR_PTR(-ENOMEM));
 954
 955         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 956                                         LUSTRE_OPC_ANY, NULL);
 957         if (IS_ERR(op_data))
 958                 GOTO(out, rc = PTR_ERR(op_data));
 959
 960         /* To tell the MDT this openhandle is from the same owner */
 961         op_data->op_open_handle = old_open_handle;
 962
 963         it.it_flags = fmode | open_flags;
 964         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 965         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 966                             &ll_md_blocking_lease_ast,
 967         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 968          * it can be cancelled which may mislead applications that the lease is
 969          * broken;
 970          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
 971          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
 972          * doesn't deal with openhandle, so normal openhandle will be leaked. */
 973                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
 974         ll_finish_md_op_data(op_data);
 975         ptlrpc_req_finished(req);
 976         if (rc < 0)
 977                 GOTO(out_release_it, rc);
 978
 979         if (it_disposition(&it, DISP_LOOKUP_NEG))
 980                 GOTO(out_release_it, rc = -ENOENT);
 981
 982         rc = it_open_error(DISP_OPEN_OPEN, &it);
 983         if (rc)
 984                 GOTO(out_release_it, rc);
 985
 986         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
 987         ll_och_fill(sbi->ll_md_exp, &it, och);
 988
 989         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
 990                 GOTO(out_close, rc = -EOPNOTSUPP);
 991
 992         /* already get lease, handle lease lock */
 993         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
 994         if (it.it_lock_mode == 0 ||
 995             it.it_lock_bits != MDS_INODELOCK_OPEN) {
 996                 /* open lock must return for lease */
 997                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
 998                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
 999                         it.it_lock_bits);
1000                 GOTO(out_close, rc = -EPROTO);
1001         }
1002
1003         ll_intent_release(&it);
1004         RETURN(och);
1005
1006 out_close:
1007         /* Cancel open lock */
1008         if (it.it_lock_mode != 0) {
1009                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1010                                             it.it_lock_mode);
1011                 it.it_lock_mode = 0;
1012                 och->och_lease_handle.cookie = 0ULL;
1013         }
1014         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1015         if (rc2 < 0)
1016                 CERROR("%s: error closing file "DFID": %d\n",
1017                        ll_get_fsname(inode->i_sb, NULL, 0),
1018                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1019         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1020 out_release_it:
1021         ll_intent_release(&it);
1022 out:
1023         if (och != NULL)
1024                 OBD_FREE_PTR(och);
1025         RETURN(ERR_PTR(rc));
1026 }
1027
1028 /**
1029  * Check whether a layout swap can be done between two inodes.
1030  *
1031  * \param[in] inode1  First inode to check
1032  * \param[in] inode2  Second inode to check
1033  *
1034  * \retval 0 on success, layout swap can be performed between both inodes
1035  * \retval negative error code if requirements are not met
1036  */
1037 static int ll_check_swap_layouts_validity(struct inode *inode1,
1038                                           struct inode *inode2)
1039 {
1040         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1041                 return -EINVAL;
1042
1043         if (inode_permission(inode1, MAY_WRITE) ||
1044             inode_permission(inode2, MAY_WRITE))
1045                 return -EPERM;
1046
1047         if (inode1->i_sb != inode2->i_sb)
1048                 return -EXDEV;
1049
1050         return 0;
1051 }
1052
1053 static int ll_swap_layouts_close(struct obd_client_handle *och,
1054                                  struct inode *inode, struct inode *inode2)
1055 {
1056         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1057         const struct lu_fid     *fid2;
1058         int                      rc;
1059         ENTRY;
1060
1061         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1062                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1063
1064         rc = ll_check_swap_layouts_validity(inode, inode2);
1065         if (rc < 0)
1066                 GOTO(out_free_och, rc);
1067
1068         /* We now know that inode2 is a lustre inode */
1069         fid2 = ll_inode2fid(inode2);
1070
1071         rc = lu_fid_cmp(fid1, fid2);
1072         if (rc == 0)
1073                 GOTO(out_free_och, rc = -EINVAL);
1074
1075         /* Close the file and {swap,merge} layouts between inode & inode2.
1076          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1077          * because we still need it to pack l_remote_handle to MDT. */
1078         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1079                                        inode2);
1080
1081         och = NULL; /* freed in ll_close_inode_openhandle() */
1082
1083 out_free_och:
1084         if (och != NULL)
1085                 OBD_FREE_PTR(och);
1086
1087         RETURN(rc);
1088 }
1089
1090 /**
1091  * Release lease and close the file.
1092  * It will check if the lease has ever broken.
1093  */
1094 static int ll_lease_close_intent(struct obd_client_handle *och,
1095                                  struct inode *inode,
1096                                  bool *lease_broken, enum mds_op_bias bias,
1097                                  void *data)
1098 {
1099         struct ldlm_lock *lock;
1100         bool cancelled = true;
1101         int rc;
1102         ENTRY;
1103
1104         lock = ldlm_handle2lock(&och->och_lease_handle);
1105         if (lock != NULL) {
1106                 lock_res_and_lock(lock);
1107                 cancelled = ldlm_is_cancel(lock);
1108                 unlock_res_and_lock(lock);
1109                 LDLM_LOCK_PUT(lock);
1110         }
1111
1112         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1113                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1114
1115         if (lease_broken != NULL)
1116                 *lease_broken = cancelled;
1117
1118         if (!cancelled && !bias)
1119                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1120
1121         if (cancelled) { /* no need to excute intent */
1122                 bias = 0;
1123                 data = NULL;
1124         }
1125
1126         rc = ll_close_inode_openhandle(inode, och, bias, data);
1127         RETURN(rc);
1128 }
1129
1130 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1131                           bool *lease_broken)
1132 {
1133         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1134 }
1135
1136 /**
1137  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1138  */
1139 static int ll_lease_file_resync(struct obd_client_handle *och,
1140                                 struct inode *inode, unsigned long arg)
1141 {
1142         struct ll_sb_info *sbi = ll_i2sbi(inode);
1143         struct md_op_data *op_data;
1144         struct ll_ioc_lease_id ioc;
1145         __u64 data_version_unused;
1146         int rc;
1147         ENTRY;
1148
1149         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1150                                      LUSTRE_OPC_ANY, NULL);
1151         if (IS_ERR(op_data))
1152                 RETURN(PTR_ERR(op_data));
1153
1154         if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1155                            sizeof(ioc)))
1156                 RETURN(-EFAULT);
1157
1158         /* before starting file resync, it's necessary to clean up page cache
1159          * in client memory, otherwise once the layout version is increased,
1160          * writing back cached data will be denied the OSTs. */
1161         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1162         if (rc)
1163                 GOTO(out, rc);
1164
1165         op_data->op_lease_handle = och->och_lease_handle;
1166         op_data->op_mirror_id = ioc.lil_mirror_id;
1167         rc = md_file_resync(sbi->ll_md_exp, op_data);
1168         if (rc)
1169                 GOTO(out, rc);
1170
1171         EXIT;
1172 out:
1173         ll_finish_md_op_data(op_data);
1174         return rc;
1175 }
1176
1177 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1178 {
1179         struct ll_inode_info *lli = ll_i2info(inode);
1180         struct cl_object *obj = lli->lli_clob;
1181         struct cl_attr *attr = vvp_env_thread_attr(env);
1182         s64 atime;
1183         s64 mtime;
1184         s64 ctime;
1185         int rc = 0;
1186
1187         ENTRY;
1188
1189         ll_inode_size_lock(inode);
1190
1191         /* Merge timestamps the most recently obtained from MDS with
1192          * timestamps obtained from OSTs.
1193          *
1194          * Do not overwrite atime of inode because it may be refreshed
1195          * by file_accessed() function. If the read was served by cache
1196          * data, there is no RPC to be sent so that atime may not be
1197          * transferred to OSTs at all. MDT only updates atime at close time
1198          * if it's at least 'mdd.*.atime_diff' older.
1199          * All in all, the atime in Lustre does not strictly comply with
1200          * POSIX. Solving this problem needs to send an RPC to MDT for each
1201          * read, this will hurt performance. */
1202         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1203                 LTIME_S(inode->i_atime) = lli->lli_atime;
1204                 lli->lli_update_atime = 0;
1205         }
1206         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1207         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1208
1209         atime = LTIME_S(inode->i_atime);
1210         mtime = LTIME_S(inode->i_mtime);
1211         ctime = LTIME_S(inode->i_ctime);
1212
1213         cl_object_attr_lock(obj);
1214         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1215                 rc = -EINVAL;
1216         else
1217                 rc = cl_object_attr_get(env, obj, attr);
1218         cl_object_attr_unlock(obj);
1219
1220         if (rc != 0)
1221                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1222
1223         if (atime < attr->cat_atime)
1224                 atime = attr->cat_atime;
1225
1226         if (ctime < attr->cat_ctime)
1227                 ctime = attr->cat_ctime;
1228
1229         if (mtime < attr->cat_mtime)
1230                 mtime = attr->cat_mtime;
1231
1232         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1233                PFID(&lli->lli_fid), attr->cat_size);
1234
1235         i_size_write(inode, attr->cat_size);
1236         inode->i_blocks = attr->cat_blocks;
1237
1238         LTIME_S(inode->i_atime) = atime;
1239         LTIME_S(inode->i_mtime) = mtime;
1240         LTIME_S(inode->i_ctime) = ctime;
1241
1242 out_size_unlock:
1243         ll_inode_size_unlock(inode);
1244
1245         RETURN(rc);
1246 }
1247
1248 /**
1249  * Set designated mirror for I/O.
1250  *
1251  * So far only read, write, and truncated can support to issue I/O to
1252  * designated mirror.
1253  */
1254 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1255 {
1256         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1257
1258         /* clear layout version for generic(non-resync) I/O in case it carries
1259          * stale layout version due to I/O restart */
1260         io->ci_layout_version = 0;
1261
1262         /* FLR: disable non-delay for designated mirror I/O because obviously
1263          * only one mirror is available */
1264         if (fd->fd_designated_mirror > 0) {
1265                 io->ci_ndelay = 0;
1266                 io->ci_designated_mirror = fd->fd_designated_mirror;
1267                 io->ci_layout_version = fd->fd_layout_version;
1268                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1269                                  * io to ptasks */
1270         }
1271
1272         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1273                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1274 }
1275
1276 static bool file_is_noatime(const struct file *file)
1277 {
1278         const struct vfsmount *mnt = file->f_path.mnt;
1279         const struct inode *inode = file_inode((struct file *)file);
1280
1281         /* Adapted from file_accessed() and touch_atime().*/
1282         if (file->f_flags & O_NOATIME)
1283                 return true;
1284
1285         if (inode->i_flags & S_NOATIME)
1286                 return true;
1287
1288         if (IS_NOATIME(inode))
1289                 return true;
1290
1291         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1292                 return true;
1293
1294         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1295                 return true;
1296
1297         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1298                 return true;
1299
1300         return false;
1301 }
1302
1303 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1304
1305 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1306 {
1307         struct inode *inode = file_inode(file);
1308         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1309
1310         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1311         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1312         io->u.ci_rw.rw_file = file;
1313         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1314         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1315         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1316
1317         if (iot == CIT_WRITE) {
1318                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1319                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1320                                            file->f_flags & O_DIRECT ||
1321                                            IS_SYNC(inode));
1322         }
1323         io->ci_obj = ll_i2info(inode)->lli_clob;
1324         io->ci_lockreq = CILR_MAYBE;
1325         if (ll_file_nolock(file)) {
1326                 io->ci_lockreq = CILR_NEVER;
1327                 io->ci_no_srvlock = 1;
1328         } else if (file->f_flags & O_APPEND) {
1329                 io->ci_lockreq = CILR_MANDATORY;
1330         }
1331         io->ci_noatime = file_is_noatime(file);
1332         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1333                 io->ci_pio = !io->u.ci_rw.rw_append;
1334         else
1335                 io->ci_pio = 0;
1336
1337         /* FLR: only use non-delay I/O for read as there is only one
1338          * avaliable mirror for write. */
1339         io->ci_ndelay = !(iot == CIT_WRITE);
1340
1341         ll_io_set_mirror(io, file);
1342 }
1343
1344 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1345 {
1346         struct cl_io_pt *pt = ptask->pt_cbdata;
1347         struct file *file = pt->cip_file;
1348         struct lu_env *env;
1349         struct cl_io *io;
1350         loff_t pos = pt->cip_pos;
1351         int rc;
1352         __u16 refcheck;
1353         ENTRY;
1354
1355         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1356                 file_dentry(file)->d_name.name,
1357                 pt->cip_iot == CIT_READ ? "read" : "write",
1358                 pos, pos + pt->cip_count);
1359
1360         env = cl_env_get(&refcheck);
1361         if (IS_ERR(env))
1362                 RETURN(PTR_ERR(env));
1363
1364         io = vvp_env_thread_io(env);
1365         ll_io_init(io, file, pt->cip_iot);
1366         io->u.ci_rw.rw_iter = pt->cip_iter;
1367         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1368         io->ci_pio = 0; /* It's already in parallel task */
1369
1370         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1371                            pt->cip_count - pt->cip_result);
1372         if (!rc) {
1373                 struct vvp_io *vio = vvp_env_io(env);
1374
1375                 vio->vui_io_subtype = IO_NORMAL;
1376                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1377
1378                 ll_cl_add(file, env, io, LCC_RW);
1379                 rc = cl_io_loop(env, io);
1380                 ll_cl_remove(file, env);
1381         } else {
1382                 /* cl_io_rw_init() handled IO */
1383                 rc = io->ci_result;
1384         }
1385
1386         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1387                 if (io->ci_nob > 0)
1388                         io->ci_nob /= 2;
1389                 rc = -EIO;
1390         }
1391
1392         if (io->ci_nob > 0) {
1393                 pt->cip_result += io->ci_nob;
1394                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1395                 pos += io->ci_nob;
1396                 pt->cip_iocb.ki_pos = pos;
1397 #ifdef HAVE_KIOCB_KI_LEFT
1398                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1399 #elif defined(HAVE_KI_NBYTES)
1400                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1401 #endif
1402         }
1403
1404         cl_io_fini(env, io);
1405         cl_env_put(env, &refcheck);
1406
1407         pt->cip_need_restart = io->ci_need_restart;
1408
1409         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1410                 file_dentry(file)->d_name.name,
1411                 pt->cip_iot == CIT_READ ? "read" : "write",
1412                 pt->cip_result, rc);
1413
1414         RETURN(pt->cip_result > 0 ? 0 : rc);
1415 }
1416
1417 static ssize_t
1418 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1419                    struct file *file, enum cl_io_type iot,
1420                    loff_t *ppos, size_t count)
1421 {
1422         struct range_lock       range;
1423         struct vvp_io           *vio = vvp_env_io(env);
1424         struct inode            *inode = file_inode(file);
1425         struct ll_inode_info    *lli = ll_i2info(inode);
1426         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1427         struct cl_io            *io;
1428         loff_t                  pos = *ppos;
1429         ssize_t                 result = 0;
1430         int                     rc = 0;
1431         unsigned                retried = 0;
1432         bool                    restarted = false;
1433
1434         ENTRY;
1435
1436         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1437                 file_dentry(file)->d_name.name,
1438                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1439
1440 restart:
1441         io = vvp_env_thread_io(env);
1442         ll_io_init(io, file, iot);
1443         if (args->via_io_subtype == IO_NORMAL) {
1444                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1445                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1446         }
1447         if (args->via_io_subtype != IO_NORMAL || restarted)
1448                 io->ci_pio = 0;
1449         io->ci_ndelay_tried = retried;
1450
1451         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1452                 bool range_locked = false;
1453
1454                 if (file->f_flags & O_APPEND)
1455                         range_lock_init(&range, 0, LUSTRE_EOF);
1456                 else
1457                         range_lock_init(&range, pos, pos + count - 1);
1458
1459                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1460                 vio->vui_io_subtype = args->via_io_subtype;
1461
1462                 switch (vio->vui_io_subtype) {
1463                 case IO_NORMAL:
1464                         /* Direct IO reads must also take range lock,
1465                          * or multiple reads will try to work on the same pages
1466                          * See LU-6227 for details. */
1467                         if (((iot == CIT_WRITE) ||
1468                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1469                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1470                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1471                                        RL_PARA(&range));
1472                                 rc = range_lock(&lli->lli_write_tree, &range);
1473                                 if (rc < 0)
1474                                         GOTO(out, rc);
1475
1476                                 range_locked = true;
1477                         }
1478                         break;
1479                 case IO_SPLICE:
1480                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1481                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1482                         break;
1483                 default:
1484                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1485                         LBUG();
1486                 }
1487
1488                 ll_cl_add(file, env, io, LCC_RW);
1489                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1490                     !lli->lli_inode_locked) {
1491                         inode_lock(inode);
1492                         lli->lli_inode_locked = 1;
1493                 }
1494                 rc = cl_io_loop(env, io);
1495                 if (lli->lli_inode_locked) {
1496                         lli->lli_inode_locked = 0;
1497                         inode_unlock(inode);
1498                 }
1499                 ll_cl_remove(file, env);
1500
1501                 if (range_locked) {
1502                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1503                                RL_PARA(&range));
1504                         range_unlock(&lli->lli_write_tree, &range);
1505                 }
1506         } else {
1507                 /* cl_io_rw_init() handled IO */
1508                 rc = io->ci_result;
1509         }
1510
1511         if (io->ci_nob > 0) {
1512                 result += io->ci_nob;
1513                 count  -= io->ci_nob;
1514
1515                 if (args->via_io_subtype == IO_NORMAL) {
1516                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1517
1518                         /* CLIO is too complicated. See LU-11069. */
1519                         if (cl_io_is_append(io))
1520                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1521                         else
1522                                 pos += io->ci_nob;
1523
1524                         args->u.normal.via_iocb->ki_pos = pos;
1525 #ifdef HAVE_KIOCB_KI_LEFT
1526                         args->u.normal.via_iocb->ki_left = count;
1527 #elif defined(HAVE_KI_NBYTES)
1528                         args->u.normal.via_iocb->ki_nbytes = count;
1529 #endif
1530                 } else {
1531                         /* for splice */
1532                         pos = io->u.ci_rw.rw_range.cir_pos;
1533                 }
1534         }
1535 out:
1536         cl_io_fini(env, io);
1537
1538         CDEBUG(D_VFSTRACE,
1539                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1540                file->f_path.dentry->d_name.name,
1541                iot, rc, result, io->ci_need_restart);
1542
1543         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1544                 CDEBUG(D_VFSTRACE,
1545                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1546                         file_dentry(file)->d_name.name,
1547                         iot == CIT_READ ? "read" : "write",
1548                         pos, pos + count, result, rc);
1549                 /* preserve the tried count for FLR */
1550                 retried = io->ci_ndelay_tried;
1551                 restarted = true;
1552                 goto restart;
1553         }
1554
1555         if (iot == CIT_READ) {
1556                 if (result > 0)
1557                         ll_stats_ops_tally(ll_i2sbi(inode),
1558                                            LPROC_LL_READ_BYTES, result);
1559         } else if (iot == CIT_WRITE) {
1560                 if (result > 0) {
1561                         ll_stats_ops_tally(ll_i2sbi(inode),
1562                                            LPROC_LL_WRITE_BYTES, result);
1563                         fd->fd_write_failed = false;
1564                 } else if (result == 0 && rc == 0) {
1565                         rc = io->ci_result;
1566                         if (rc < 0)
1567                                 fd->fd_write_failed = true;
1568                         else
1569                                 fd->fd_write_failed = false;
1570                 } else if (rc != -ERESTARTSYS) {
1571                         fd->fd_write_failed = true;
1572                 }
1573         }
1574
1575         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1576                 file_dentry(file)->d_name.name,
1577                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1578
1579         *ppos = pos;
1580
1581         RETURN(result > 0 ? result : rc);
1582 }
1583
1584 /**
1585  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1586  * especially for small I/O.
1587  *
1588  * To serve a read request, CLIO has to create and initialize a cl_io and
1589  * then request DLM lock. This has turned out to have siginificant overhead
1590  * and affects the performance of small I/O dramatically.
1591  *
1592  * It's not necessary to create a cl_io for each I/O. Under the help of read
1593  * ahead, most of the pages being read are already in memory cache and we can
1594  * read those pages directly because if the pages exist, the corresponding DLM
1595  * lock must exist so that page content must be valid.
1596  *
1597  * In fast read implementation, the llite speculatively finds and reads pages
1598  * in memory cache. There are three scenarios for fast read:
1599  *   - If the page exists and is uptodate, kernel VM will provide the data and
1600  *     CLIO won't be intervened;
1601  *   - If the page was brought into memory by read ahead, it will be exported
1602  *     and read ahead parameters will be updated;
1603  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1604  *     it will go back and invoke normal read, i.e., a cl_io will be created
1605  *     and DLM lock will be requested.
1606  *
1607  * POSIX compliance: posix standard states that read is intended to be atomic.
1608  * Lustre read implementation is in line with Linux kernel read implementation
1609  * and neither of them complies with POSIX standard in this matter. Fast read
1610  * doesn't make the situation worse on single node but it may interleave write
1611  * results from multiple nodes due to short read handling in ll_file_aio_read().
1612  *
1613  * \param env - lu_env
1614  * \param iocb - kiocb from kernel
1615  * \param iter - user space buffers where the data will be copied
1616  *
1617  * \retval - number of bytes have been read, or error code if error occurred.
1618  */
1619 static ssize_t
1620 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1621 {
1622         ssize_t result;
1623
1624         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1625                 return 0;
1626
1627         /* NB: we can't do direct IO for fast read because it will need a lock
1628          * to make IO engine happy. */
1629         if (iocb->ki_filp->f_flags & O_DIRECT)
1630                 return 0;
1631
1632         result = generic_file_read_iter(iocb, iter);
1633
1634         /* If the first page is not in cache, generic_file_aio_read() will be
1635          * returned with -ENODATA.
1636          * See corresponding code in ll_readpage(). */
1637         if (result == -ENODATA)
1638                 result = 0;
1639
1640         if (result > 0)
1641                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1642                                 LPROC_LL_READ_BYTES, result);
1643
1644         return result;
1645 }
1646
1647 /*
1648  * Read from a file (through the page cache).
1649  */
1650 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1651 {
1652         struct lu_env *env;
1653         struct vvp_io_args *args;
1654         ssize_t result;
1655         ssize_t rc2;
1656         __u16 refcheck;
1657
1658         result = ll_do_fast_read(iocb, to);
1659         if (result < 0 || iov_iter_count(to) == 0)
1660                 GOTO(out, result);
1661
1662         env = cl_env_get(&refcheck);
1663         if (IS_ERR(env))
1664                 return PTR_ERR(env);
1665
1666         args = ll_env_args(env, IO_NORMAL);
1667         args->u.normal.via_iter = to;
1668         args->u.normal.via_iocb = iocb;
1669
1670         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1671                                  &iocb->ki_pos, iov_iter_count(to));
1672         if (rc2 > 0)
1673                 result += rc2;
1674         else if (result == 0)
1675                 result = rc2;
1676
1677         cl_env_put(env, &refcheck);
1678 out:
1679         return result;
1680 }
1681
1682 /**
1683  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1684  * If a page is already in the page cache and dirty (and some other things -
1685  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1686  * write to it without doing a full I/O, because Lustre already knows about it
1687  * and will write it out.  This saves a lot of processing time.
1688  *
1689  * All writes here are within one page, so exclusion is handled by the page
1690  * lock on the vm page.  We do not do tiny writes for writes which touch
1691  * multiple pages because it's very unlikely multiple sequential pages are
1692  * are already dirty.
1693  *
1694  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1695  * and are unlikely to be to already dirty pages.
1696  *
1697  * Attribute updates are important here, we do them in ll_tiny_write_end.
1698  */
1699 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1700 {
1701         ssize_t count = iov_iter_count(iter);
1702         struct file *file = iocb->ki_filp;
1703         struct inode *inode = file_inode(file);
1704         ssize_t result = 0;
1705
1706         ENTRY;
1707
1708         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1709          * of function for why.
1710          */
1711         if (count >= PAGE_SIZE ||
1712             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1713                 RETURN(0);
1714
1715         result = __generic_file_write_iter(iocb, iter);
1716
1717         /* If the page is not already dirty, ll_tiny_write_begin returns
1718          * -ENODATA.  We continue on to normal write.
1719          */
1720         if (result == -ENODATA)
1721                 result = 0;
1722
1723         if (result > 0) {
1724                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1725                                    result);
1726                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1727         }
1728
1729         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1730
1731         RETURN(result);
1732 }
1733
1734 /*
1735  * Write to a file (through the page cache).
1736  */
1737 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1738 {
1739         struct vvp_io_args *args;
1740         struct lu_env *env;
1741         ssize_t rc_tiny = 0, rc_normal;
1742         __u16 refcheck;
1743
1744         ENTRY;
1745
1746         /* NB: we can't do direct IO for tiny writes because they use the page
1747          * cache, we can't do sync writes because tiny writes can't flush
1748          * pages, and we can't do append writes because we can't guarantee the
1749          * required DLM locks are held to protect file size.
1750          */
1751         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1752             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1753                 rc_tiny = ll_do_tiny_write(iocb, from);
1754
1755         /* In case of error, go on and try normal write - Only stop if tiny
1756          * write completed I/O.
1757          */
1758         if (iov_iter_count(from) == 0)
1759                 GOTO(out, rc_normal = rc_tiny);
1760
1761         env = cl_env_get(&refcheck);
1762         if (IS_ERR(env))
1763                 return PTR_ERR(env);
1764
1765         args = ll_env_args(env, IO_NORMAL);
1766         args->u.normal.via_iter = from;
1767         args->u.normal.via_iocb = iocb;
1768
1769         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1770                                     &iocb->ki_pos, iov_iter_count(from));
1771
1772         /* On success, combine bytes written. */
1773         if (rc_tiny >= 0 && rc_normal > 0)
1774                 rc_normal += rc_tiny;
1775         /* On error, only return error from normal write if tiny write did not
1776          * write any bytes.  Otherwise return bytes written by tiny write.
1777          */
1778         else if (rc_tiny > 0)
1779                 rc_normal = rc_tiny;
1780
1781         cl_env_put(env, &refcheck);
1782 out:
1783         RETURN(rc_normal);
1784 }
1785
1786 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1787 /*
1788  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1789  */
1790 static int ll_file_get_iov_count(const struct iovec *iov,
1791                                  unsigned long *nr_segs, size_t *count)
1792 {
1793         size_t cnt = 0;
1794         unsigned long seg;
1795
1796         for (seg = 0; seg < *nr_segs; seg++) {
1797                 const struct iovec *iv = &iov[seg];
1798
1799                 /*
1800                  * If any segment has a negative length, or the cumulative
1801                  * length ever wraps negative then return -EINVAL.
1802                  */
1803                 cnt += iv->iov_len;
1804                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1805                         return -EINVAL;
1806                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1807                         continue;
1808                 if (seg == 0)
1809                         return -EFAULT;
1810                 *nr_segs = seg;
1811                 cnt -= iv->iov_len;     /* This segment is no good */
1812                 break;
1813         }
1814         *count = cnt;
1815         return 0;
1816 }
1817
1818 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1819                                 unsigned long nr_segs, loff_t pos)
1820 {
1821         struct iov_iter to;
1822         size_t iov_count;
1823         ssize_t result;
1824         ENTRY;
1825
1826         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1827         if (result)
1828                 RETURN(result);
1829
1830 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1831         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1832 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1833         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1834 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1835
1836         result = ll_file_read_iter(iocb, &to);
1837
1838         RETURN(result);
1839 }
1840
1841 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1842                             loff_t *ppos)
1843 {
1844         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1845         struct kiocb   kiocb;
1846         ssize_t        result;
1847         ENTRY;
1848
1849         init_sync_kiocb(&kiocb, file);
1850         kiocb.ki_pos = *ppos;
1851 #ifdef HAVE_KIOCB_KI_LEFT
1852         kiocb.ki_left = count;
1853 #elif defined(HAVE_KI_NBYTES)
1854         kiocb.i_nbytes = count;
1855 #endif
1856
1857         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1858         *ppos = kiocb.ki_pos;
1859
1860         RETURN(result);
1861 }
1862
1863 /*
1864  * Write to a file (through the page cache).
1865  * AIO stuff
1866  */
1867 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1868                                  unsigned long nr_segs, loff_t pos)
1869 {
1870         struct iov_iter from;
1871         size_t iov_count;
1872         ssize_t result;
1873         ENTRY;
1874
1875         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1876         if (result)
1877                 RETURN(result);
1878
1879 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1880         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1881 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1882         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1883 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1884
1885         result = ll_file_write_iter(iocb, &from);
1886
1887         RETURN(result);
1888 }
1889
1890 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1891                              size_t count, loff_t *ppos)
1892 {
1893         struct iovec   iov = { .iov_base = (void __user *)buf,
1894                                .iov_len = count };
1895         struct kiocb   kiocb;
1896         ssize_t        result;
1897
1898         ENTRY;
1899
1900         init_sync_kiocb(&kiocb, file);
1901         kiocb.ki_pos = *ppos;
1902 #ifdef HAVE_KIOCB_KI_LEFT
1903         kiocb.ki_left = count;
1904 #elif defined(HAVE_KI_NBYTES)
1905         kiocb.ki_nbytes = count;
1906 #endif
1907
1908         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1909         *ppos = kiocb.ki_pos;
1910
1911         RETURN(result);
1912 }
1913 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1914
1915 /*
1916  * Send file content (through pagecache) somewhere with helper
1917  */
1918 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1919                                    struct pipe_inode_info *pipe, size_t count,
1920                                    unsigned int flags)
1921 {
1922         struct lu_env      *env;
1923         struct vvp_io_args *args;
1924         ssize_t             result;
1925         __u16               refcheck;
1926         ENTRY;
1927
1928         env = cl_env_get(&refcheck);
1929         if (IS_ERR(env))
1930                 RETURN(PTR_ERR(env));
1931
1932         args = ll_env_args(env, IO_SPLICE);
1933         args->u.splice.via_pipe = pipe;
1934         args->u.splice.via_flags = flags;
1935
1936         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1937         cl_env_put(env, &refcheck);
1938         RETURN(result);
1939 }
1940
1941 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1942                              __u64 flags, struct lov_user_md *lum, int lum_size)
1943 {
1944         struct lookup_intent oit = {
1945                 .it_op = IT_OPEN,
1946                 .it_flags = flags | MDS_OPEN_BY_FID,
1947         };
1948         int rc;
1949         ENTRY;
1950
1951         ll_inode_size_lock(inode);
1952         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1953         if (rc < 0)
1954                 GOTO(out_unlock, rc);
1955
1956         ll_release_openhandle(dentry, &oit);
1957
1958 out_unlock:
1959         ll_inode_size_unlock(inode);
1960         ll_intent_release(&oit);
1961
1962         RETURN(rc);
1963 }
1964
1965 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1966                              struct lov_mds_md **lmmp, int *lmm_size,
1967                              struct ptlrpc_request **request)
1968 {
1969         struct ll_sb_info *sbi = ll_i2sbi(inode);
1970         struct mdt_body  *body;
1971         struct lov_mds_md *lmm = NULL;
1972         struct ptlrpc_request *req = NULL;
1973         struct md_op_data *op_data;
1974         int rc, lmmsize;
1975
1976         rc = ll_get_default_mdsize(sbi, &lmmsize);
1977         if (rc)
1978                 RETURN(rc);
1979
1980         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1981                                      strlen(filename), lmmsize,
1982                                      LUSTRE_OPC_ANY, NULL);
1983         if (IS_ERR(op_data))
1984                 RETURN(PTR_ERR(op_data));
1985
1986         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1987         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1988         ll_finish_md_op_data(op_data);
1989         if (rc < 0) {
1990                 CDEBUG(D_INFO, "md_getattr_name failed "
1991                        "on %s: rc %d\n", filename, rc);
1992                 GOTO(out, rc);
1993         }
1994
1995         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1996         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1997
1998         lmmsize = body->mbo_eadatasize;
1999
2000         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2001                         lmmsize == 0) {
2002                 GOTO(out, rc = -ENODATA);
2003         }
2004
2005         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2006         LASSERT(lmm != NULL);
2007
2008         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2009             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2010             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2011                 GOTO(out, rc = -EPROTO);
2012
2013         /*
2014          * This is coming from the MDS, so is probably in
2015          * little endian.  We convert it to host endian before
2016          * passing it to userspace.
2017          */
2018         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2019                 int stripe_count;
2020
2021                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2022                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2023                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2024                         if (le32_to_cpu(lmm->lmm_pattern) &
2025                             LOV_PATTERN_F_RELEASED)
2026                                 stripe_count = 0;
2027                 }
2028
2029                 /* if function called for directory - we should
2030                  * avoid swab not existent lsm objects */
2031                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2032                         lustre_swab_lov_user_md_v1(
2033                                         (struct lov_user_md_v1 *)lmm);
2034                         if (S_ISREG(body->mbo_mode))
2035                                 lustre_swab_lov_user_md_objects(
2036                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2037                                     stripe_count);
2038                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2039                         lustre_swab_lov_user_md_v3(
2040                                         (struct lov_user_md_v3 *)lmm);
2041                         if (S_ISREG(body->mbo_mode))
2042                                 lustre_swab_lov_user_md_objects(
2043                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2044                                     stripe_count);
2045                 } else if (lmm->lmm_magic ==
2046                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2047                         lustre_swab_lov_comp_md_v1(
2048                                         (struct lov_comp_md_v1 *)lmm);
2049                 }
2050         }
2051
2052 out:
2053         *lmmp = lmm;
2054         *lmm_size = lmmsize;
2055         *request = req;
2056         return rc;
2057 }
2058
2059 static int ll_lov_setea(struct inode *inode, struct file *file,
2060                         void __user *arg)
2061 {
2062         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2063         struct lov_user_md      *lump;
2064         int                      lum_size = sizeof(struct lov_user_md) +
2065                                             sizeof(struct lov_user_ost_data);
2066         int                      rc;
2067         ENTRY;
2068
2069         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2070                 RETURN(-EPERM);
2071
2072         OBD_ALLOC_LARGE(lump, lum_size);
2073         if (lump == NULL)
2074                 RETURN(-ENOMEM);
2075
2076         if (copy_from_user(lump, arg, lum_size))
2077                 GOTO(out_lump, rc = -EFAULT);
2078
2079         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2080                                       lum_size);
2081         cl_lov_delay_create_clear(&file->f_flags);
2082
2083 out_lump:
2084         OBD_FREE_LARGE(lump, lum_size);
2085         RETURN(rc);
2086 }
2087
2088 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2089 {
2090         struct lu_env   *env;
2091         __u16           refcheck;
2092         int             rc;
2093         ENTRY;
2094
2095         env = cl_env_get(&refcheck);
2096         if (IS_ERR(env))
2097                 RETURN(PTR_ERR(env));
2098
2099         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2100         cl_env_put(env, &refcheck);
2101         RETURN(rc);
2102 }
2103
2104 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2105                             void __user *arg)
2106 {
2107         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2108         struct lov_user_md        *klum;
2109         int                        lum_size, rc;
2110         __u64                      flags = FMODE_WRITE;
2111         ENTRY;
2112
2113         rc = ll_copy_user_md(lum, &klum);
2114         if (rc < 0)
2115                 RETURN(rc);
2116
2117         lum_size = rc;
2118         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2119                                       lum_size);
2120         if (!rc) {
2121                 __u32 gen;
2122
2123                 rc = put_user(0, &lum->lmm_stripe_count);
2124                 if (rc)
2125                         GOTO(out, rc);
2126
2127                 rc = ll_layout_refresh(inode, &gen);
2128                 if (rc)
2129                         GOTO(out, rc);
2130
2131                 rc = ll_file_getstripe(inode, arg, lum_size);
2132         }
2133         cl_lov_delay_create_clear(&file->f_flags);
2134
2135 out:
2136         OBD_FREE(klum, lum_size);
2137         RETURN(rc);
2138 }
2139
2140 static int
2141 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2142 {
2143         struct ll_inode_info *lli = ll_i2info(inode);
2144         struct cl_object *obj = lli->lli_clob;
2145         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2146         struct ll_grouplock grouplock;
2147         int rc;
2148         ENTRY;
2149
2150         if (arg == 0) {
2151                 CWARN("group id for group lock must not be 0\n");
2152                 RETURN(-EINVAL);
2153         }
2154
2155         if (ll_file_nolock(file))
2156                 RETURN(-EOPNOTSUPP);
2157
2158         spin_lock(&lli->lli_lock);
2159         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2160                 CWARN("group lock already existed with gid %lu\n",
2161                       fd->fd_grouplock.lg_gid);
2162                 spin_unlock(&lli->lli_lock);
2163                 RETURN(-EINVAL);
2164         }
2165         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2166         spin_unlock(&lli->lli_lock);
2167
2168         /**
2169          * XXX: group lock needs to protect all OST objects while PFL
2170          * can add new OST objects during the IO, so we'd instantiate
2171          * all OST objects before getting its group lock.
2172          */
2173         if (obj) {
2174                 struct lu_env *env;
2175                 __u16 refcheck;
2176                 struct cl_layout cl = {
2177                         .cl_is_composite = false,
2178                 };
2179                 struct lu_extent ext = {
2180                         .e_start = 0,
2181                         .e_end = OBD_OBJECT_EOF,
2182                 };
2183
2184                 env = cl_env_get(&refcheck);
2185                 if (IS_ERR(env))
2186                         RETURN(PTR_ERR(env));
2187
2188                 rc = cl_object_layout_get(env, obj, &cl);
2189                 if (!rc && cl.cl_is_composite)
2190                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2191                                                     &ext);
2192
2193                 cl_env_put(env, &refcheck);
2194                 if (rc)
2195                         RETURN(rc);
2196         }
2197
2198         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2199                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2200         if (rc)
2201                 RETURN(rc);
2202
2203         spin_lock(&lli->lli_lock);
2204         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2205                 spin_unlock(&lli->lli_lock);
2206                 CERROR("another thread just won the race\n");
2207                 cl_put_grouplock(&grouplock);
2208                 RETURN(-EINVAL);
2209         }
2210
2211         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2212         fd->fd_grouplock = grouplock;
2213         spin_unlock(&lli->lli_lock);
2214
2215         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2216         RETURN(0);
2217 }
2218
2219 static int ll_put_grouplock(struct inode *inode, struct file *file,
2220                             unsigned long arg)
2221 {
2222         struct ll_inode_info   *lli = ll_i2info(inode);
2223         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2224         struct ll_grouplock     grouplock;
2225         ENTRY;
2226
2227         spin_lock(&lli->lli_lock);
2228         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2229                 spin_unlock(&lli->lli_lock);
2230                 CWARN("no group lock held\n");
2231                 RETURN(-EINVAL);
2232         }
2233
2234         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2235
2236         if (fd->fd_grouplock.lg_gid != arg) {
2237                 CWARN("group lock %lu doesn't match current id %lu\n",
2238                       arg, fd->fd_grouplock.lg_gid);
2239                 spin_unlock(&lli->lli_lock);
2240                 RETURN(-EINVAL);
2241         }
2242
2243         grouplock = fd->fd_grouplock;
2244         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2245         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2246         spin_unlock(&lli->lli_lock);
2247
2248         cl_put_grouplock(&grouplock);
2249         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2250         RETURN(0);
2251 }
2252
2253 /**
2254  * Close inode open handle
2255  *
2256  * \param dentry [in]     dentry which contains the inode
2257  * \param it     [in,out] intent which contains open info and result
2258  *
2259  * \retval 0     success
2260  * \retval <0    failure
2261  */
2262 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2263 {
2264         struct inode *inode = dentry->d_inode;
2265         struct obd_client_handle *och;
2266         int rc;
2267         ENTRY;
2268
2269         LASSERT(inode);
2270
2271         /* Root ? Do nothing. */
2272         if (dentry->d_inode->i_sb->s_root == dentry)
2273                 RETURN(0);
2274
2275         /* No open handle to close? Move away */
2276         if (!it_disposition(it, DISP_OPEN_OPEN))
2277                 RETURN(0);
2278
2279         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2280
2281         OBD_ALLOC(och, sizeof(*och));
2282         if (!och)
2283                 GOTO(out, rc = -ENOMEM);
2284
2285         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2286
2287         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2288 out:
2289         /* this one is in place of ll_file_open */
2290         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2291                 ptlrpc_req_finished(it->it_request);
2292                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2293         }
2294         RETURN(rc);
2295 }
2296
2297 /**
2298  * Get size for inode for which FIEMAP mapping is requested.
2299  * Make the FIEMAP get_info call and returns the result.
2300  * \param fiemap        kernel buffer to hold extens
2301  * \param num_bytes     kernel buffer size
2302  */
2303 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2304                         size_t num_bytes)
2305 {
2306         struct lu_env                   *env;
2307         __u16                           refcheck;
2308         int                             rc = 0;
2309         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2310         ENTRY;
2311
2312         /* Checks for fiemap flags */
2313         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2314                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2315                 return -EBADR;
2316         }
2317
2318         /* Check for FIEMAP_FLAG_SYNC */
2319         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2320                 rc = filemap_fdatawrite(inode->i_mapping);
2321                 if (rc)
2322                         return rc;
2323         }
2324
2325         env = cl_env_get(&refcheck);
2326         if (IS_ERR(env))
2327                 RETURN(PTR_ERR(env));
2328
2329         if (i_size_read(inode) == 0) {
2330                 rc = ll_glimpse_size(inode);
2331                 if (rc)
2332                         GOTO(out, rc);
2333         }
2334
2335         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2336         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2337         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2338
2339         /* If filesize is 0, then there would be no objects for mapping */
2340         if (fmkey.lfik_oa.o_size == 0) {
2341                 fiemap->fm_mapped_extents = 0;
2342                 GOTO(out, rc = 0);
2343         }
2344
2345         fmkey.lfik_fiemap = *fiemap;
2346
2347         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2348                               &fmkey, fiemap, &num_bytes);
2349 out:
2350         cl_env_put(env, &refcheck);
2351         RETURN(rc);
2352 }
2353
2354 int ll_fid2path(struct inode *inode, void __user *arg)
2355 {
2356         struct obd_export       *exp = ll_i2mdexp(inode);
2357         const struct getinfo_fid2path __user *gfin = arg;
2358         __u32                    pathlen;
2359         struct getinfo_fid2path *gfout;
2360         size_t                   outsize;
2361         int                      rc;
2362
2363         ENTRY;
2364
2365         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2366             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2367                 RETURN(-EPERM);
2368
2369         /* Only need to get the buflen */
2370         if (get_user(pathlen, &gfin->gf_pathlen))
2371                 RETURN(-EFAULT);
2372
2373         if (pathlen > PATH_MAX)
2374                 RETURN(-EINVAL);
2375
2376         outsize = sizeof(*gfout) + pathlen;
2377         OBD_ALLOC(gfout, outsize);
2378         if (gfout == NULL)
2379                 RETURN(-ENOMEM);
2380
2381         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2382                 GOTO(gf_free, rc = -EFAULT);
2383         /* append root FID after gfout to let MDT know the root FID so that it
2384          * can lookup the correct path, this is mainly for fileset.
2385          * old server without fileset mount support will ignore this. */
2386         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2387
2388         /* Call mdc_iocontrol */
2389         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2390         if (rc != 0)
2391                 GOTO(gf_free, rc);
2392
2393         if (copy_to_user(arg, gfout, outsize))
2394                 rc = -EFAULT;
2395
2396 gf_free:
2397         OBD_FREE(gfout, outsize);
2398         RETURN(rc);
2399 }
2400
2401 static int
2402 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2403 {
2404         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2405         struct lu_env *env;
2406         struct cl_io *io;
2407         __u16  refcheck;
2408         int result;
2409
2410         ENTRY;
2411
2412         ioc->idv_version = 0;
2413         ioc->idv_layout_version = UINT_MAX;
2414
2415         /* If no file object initialized, we consider its version is 0. */
2416         if (obj == NULL)
2417                 RETURN(0);
2418
2419         env = cl_env_get(&refcheck);
2420         if (IS_ERR(env))
2421                 RETURN(PTR_ERR(env));
2422
2423         io = vvp_env_thread_io(env);
2424         io->ci_obj = obj;
2425         io->u.ci_data_version.dv_data_version = 0;
2426         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2427         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2428
2429 restart:
2430         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2431                 result = cl_io_loop(env, io);
2432         else
2433                 result = io->ci_result;
2434
2435         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2436         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2437
2438         cl_io_fini(env, io);
2439
2440         if (unlikely(io->ci_need_restart))
2441                 goto restart;
2442
2443         cl_env_put(env, &refcheck);
2444
2445         RETURN(result);
2446 }
2447
2448 /*
2449  * Read the data_version for inode.
2450  *
2451  * This value is computed using stripe object version on OST.
2452  * Version is computed using server side locking.
2453  *
2454  * @param flags if do sync on the OST side;
2455  *              0: no sync
2456  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2457  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2458  */
2459 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2460 {
2461         struct ioc_data_version ioc = { .idv_flags = flags };
2462         int rc;
2463
2464         rc = ll_ioc_data_version(inode, &ioc);
2465         if (!rc)
2466                 *data_version = ioc.idv_version;
2467
2468         return rc;
2469 }
2470
2471 /*
2472  * Trigger a HSM release request for the provided inode.
2473  */
2474 int ll_hsm_release(struct inode *inode)
2475 {
2476         struct lu_env *env;
2477         struct obd_client_handle *och = NULL;
2478         __u64 data_version = 0;
2479         int rc;
2480         __u16 refcheck;
2481         ENTRY;
2482
2483         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2484                ll_get_fsname(inode->i_sb, NULL, 0),
2485                PFID(&ll_i2info(inode)->lli_fid));
2486
2487         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2488         if (IS_ERR(och))
2489                 GOTO(out, rc = PTR_ERR(och));
2490
2491         /* Grab latest data_version and [am]time values */
2492         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2493         if (rc != 0)
2494                 GOTO(out, rc);
2495
2496         env = cl_env_get(&refcheck);
2497         if (IS_ERR(env))
2498                 GOTO(out, rc = PTR_ERR(env));
2499
2500         rc = ll_merge_attr(env, inode);
2501         cl_env_put(env, &refcheck);
2502
2503         /* If error happen, we have the wrong size for a file.
2504          * Don't release it.
2505          */
2506         if (rc != 0)
2507                 GOTO(out, rc);
2508
2509         /* Release the file.
2510          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2511          * we still need it to pack l_remote_handle to MDT. */
2512         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2513                                        &data_version);
2514         och = NULL;
2515
2516         EXIT;
2517 out:
2518         if (och != NULL && !IS_ERR(och)) /* close the file */
2519                 ll_lease_close(och, inode, NULL);
2520
2521         return rc;
2522 }
2523
2524 struct ll_swap_stack {
2525         __u64                    dv1;
2526         __u64                    dv2;
2527         struct inode            *inode1;
2528         struct inode            *inode2;
2529         bool                     check_dv1;
2530         bool                     check_dv2;
2531 };
2532
2533 static int ll_swap_layouts(struct file *file1, struct file *file2,
2534                            struct lustre_swap_layouts *lsl)
2535 {
2536         struct mdc_swap_layouts  msl;
2537         struct md_op_data       *op_data;
2538         __u32                    gid;
2539         __u64                    dv;
2540         struct ll_swap_stack    *llss = NULL;
2541         int                      rc;
2542
2543         OBD_ALLOC_PTR(llss);
2544         if (llss == NULL)
2545                 RETURN(-ENOMEM);
2546
2547         llss->inode1 = file_inode(file1);
2548         llss->inode2 = file_inode(file2);
2549
2550         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2551         if (rc < 0)
2552                 GOTO(free, rc);
2553
2554         /* we use 2 bool because it is easier to swap than 2 bits */
2555         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2556                 llss->check_dv1 = true;
2557
2558         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2559                 llss->check_dv2 = true;
2560
2561         /* we cannot use lsl->sl_dvX directly because we may swap them */
2562         llss->dv1 = lsl->sl_dv1;
2563         llss->dv2 = lsl->sl_dv2;
2564
2565         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2566         if (rc == 0) /* same file, done! */
2567                 GOTO(free, rc);
2568
2569         if (rc < 0) { /* sequentialize it */
2570                 swap(llss->inode1, llss->inode2);
2571                 swap(file1, file2);
2572                 swap(llss->dv1, llss->dv2);
2573                 swap(llss->check_dv1, llss->check_dv2);
2574         }
2575
2576         gid = lsl->sl_gid;
2577         if (gid != 0) { /* application asks to flush dirty cache */
2578                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2579                 if (rc < 0)
2580                         GOTO(free, rc);
2581
2582                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2583                 if (rc < 0) {
2584                         ll_put_grouplock(llss->inode1, file1, gid);
2585                         GOTO(free, rc);
2586                 }
2587         }
2588
2589         /* ultimate check, before swaping the layouts we check if
2590          * dataversion has changed (if requested) */
2591         if (llss->check_dv1) {
2592                 rc = ll_data_version(llss->inode1, &dv, 0);
2593                 if (rc)
2594                         GOTO(putgl, rc);
2595                 if (dv != llss->dv1)
2596                         GOTO(putgl, rc = -EAGAIN);
2597         }
2598
2599         if (llss->check_dv2) {
2600                 rc = ll_data_version(llss->inode2, &dv, 0);
2601                 if (rc)
2602                         GOTO(putgl, rc);
2603                 if (dv != llss->dv2)
2604                         GOTO(putgl, rc = -EAGAIN);
2605         }
2606
2607         /* struct md_op_data is used to send the swap args to the mdt
2608          * only flags is missing, so we use struct mdc_swap_layouts
2609          * through the md_op_data->op_data */
2610         /* flags from user space have to be converted before they are send to
2611          * server, no flag is sent today, they are only used on the client */
2612         msl.msl_flags = 0;
2613         rc = -ENOMEM;
2614         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2615                                      0, LUSTRE_OPC_ANY, &msl);
2616         if (IS_ERR(op_data))
2617                 GOTO(free, rc = PTR_ERR(op_data));
2618
2619         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2620                            sizeof(*op_data), op_data, NULL);
2621         ll_finish_md_op_data(op_data);
2622
2623         if (rc < 0)
2624                 GOTO(putgl, rc);
2625
2626 putgl:
2627         if (gid != 0) {
2628                 ll_put_grouplock(llss->inode2, file2, gid);
2629                 ll_put_grouplock(llss->inode1, file1, gid);
2630         }
2631
2632 free:
2633         if (llss != NULL)
2634                 OBD_FREE_PTR(llss);
2635
2636         RETURN(rc);
2637 }
2638
2639 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2640 {
2641         struct md_op_data       *op_data;
2642         int                      rc;
2643         ENTRY;
2644
2645         /* Detect out-of range masks */
2646         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2647                 RETURN(-EINVAL);
2648
2649         /* Non-root users are forbidden to set or clear flags which are
2650          * NOT defined in HSM_USER_MASK. */
2651         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2652             !cfs_capable(CFS_CAP_SYS_ADMIN))
2653                 RETURN(-EPERM);
2654
2655         /* Detect out-of range archive id */
2656         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2657             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2658                 RETURN(-EINVAL);
2659
2660         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2661                                      LUSTRE_OPC_ANY, hss);
2662         if (IS_ERR(op_data))
2663                 RETURN(PTR_ERR(op_data));
2664
2665         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2666                            sizeof(*op_data), op_data, NULL);
2667
2668         ll_finish_md_op_data(op_data);
2669
2670         RETURN(rc);
2671 }
2672
2673 static int ll_hsm_import(struct inode *inode, struct file *file,
2674                          struct hsm_user_import *hui)
2675 {
2676         struct hsm_state_set    *hss = NULL;
2677         struct iattr            *attr = NULL;
2678         int                      rc;
2679         ENTRY;
2680
2681         if (!S_ISREG(inode->i_mode))
2682                 RETURN(-EINVAL);
2683
2684         /* set HSM flags */
2685         OBD_ALLOC_PTR(hss);
2686         if (hss == NULL)
2687                 GOTO(out, rc = -ENOMEM);
2688
2689         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2690         hss->hss_archive_id = hui->hui_archive_id;
2691         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2692         rc = ll_hsm_state_set(inode, hss);
2693         if (rc != 0)
2694                 GOTO(out, rc);
2695
2696         OBD_ALLOC_PTR(attr);
2697         if (attr == NULL)
2698                 GOTO(out, rc = -ENOMEM);
2699
2700         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2701         attr->ia_mode |= S_IFREG;
2702         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2703         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2704         attr->ia_size = hui->hui_size;
2705         attr->ia_mtime.tv_sec = hui->hui_mtime;
2706         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2707         attr->ia_atime.tv_sec = hui->hui_atime;
2708         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2709
2710         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2711                          ATTR_UID | ATTR_GID |
2712                          ATTR_MTIME | ATTR_MTIME_SET |
2713                          ATTR_ATIME | ATTR_ATIME_SET;
2714
2715         inode_lock(inode);
2716
2717         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2718         if (rc == -ENODATA)
2719                 rc = 0;
2720
2721         inode_unlock(inode);
2722
2723 out:
2724         if (hss != NULL)
2725                 OBD_FREE_PTR(hss);
2726
2727         if (attr != NULL)
2728                 OBD_FREE_PTR(attr);
2729
2730         RETURN(rc);
2731 }
2732
2733 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2734 {
2735         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2736                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2737 }
2738
2739 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2740 {
2741         struct inode *inode = file_inode(file);
2742         struct iattr ia = {
2743                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2744                             ATTR_MTIME | ATTR_MTIME_SET |
2745                             ATTR_CTIME,
2746                 .ia_atime = {
2747                         .tv_sec = lfu->lfu_atime_sec,
2748                         .tv_nsec = lfu->lfu_atime_nsec,
2749                 },
2750                 .ia_mtime = {
2751                         .tv_sec = lfu->lfu_mtime_sec,
2752                         .tv_nsec = lfu->lfu_mtime_nsec,
2753                 },
2754                 .ia_ctime = {
2755                         .tv_sec = lfu->lfu_ctime_sec,
2756                         .tv_nsec = lfu->lfu_ctime_nsec,
2757                 },
2758         };
2759         int rc;
2760         ENTRY;
2761
2762         if (!capable(CAP_SYS_ADMIN))
2763                 RETURN(-EPERM);
2764
2765         if (!S_ISREG(inode->i_mode))
2766                 RETURN(-EINVAL);
2767
2768         inode_lock(inode);
2769         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2770                             false);
2771         inode_unlock(inode);
2772
2773         RETURN(rc);
2774 }
2775
2776 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2777 {
2778         switch (mode) {
2779         case MODE_READ_USER:
2780                 return CLM_READ;
2781         case MODE_WRITE_USER:
2782                 return CLM_WRITE;
2783         default:
2784                 return -EINVAL;
2785         }
2786 }
2787
2788 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2789
2790 /* Used to allow the upper layers of the client to request an LDLM lock
2791  * without doing an actual read or write.
2792  *
2793  * Used for ladvise lockahead to manually request specific locks.
2794  *
2795  * \param[in] file      file this ladvise lock request is on
2796  * \param[in] ladvise   ladvise struct describing this lock request
2797  *
2798  * \retval 0            success, no detailed result available (sync requests
2799  *                      and requests sent to the server [not handled locally]
2800  *                      cannot return detailed results)
2801  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2802  *                                       see definitions for details.
2803  * \retval negative     negative errno on error
2804  */
2805 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2806 {
2807         struct lu_env *env = NULL;
2808         struct cl_io *io  = NULL;
2809         struct cl_lock *lock = NULL;
2810         struct cl_lock_descr *descr = NULL;
2811         struct dentry *dentry = file->f_path.dentry;
2812         struct inode *inode = dentry->d_inode;
2813         enum cl_lock_mode cl_mode;
2814         off_t start = ladvise->lla_start;
2815         off_t end = ladvise->lla_end;
2816         int result;
2817         __u16 refcheck;
2818
2819         ENTRY;
2820
2821         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2822                "start=%llu, end=%llu\n", dentry->d_name.len,
2823                dentry->d_name.name, dentry->d_inode,
2824                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2825                (__u64) end);
2826
2827         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2828         if (cl_mode < 0)
2829                 GOTO(out, result = cl_mode);
2830
2831         /* Get IO environment */
2832         result = cl_io_get(inode, &env, &io, &refcheck);
2833         if (result <= 0)
2834                 GOTO(out, result);
2835
2836         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2837         if (result > 0) {
2838                 /*
2839                  * nothing to do for this io. This currently happens when
2840                  * stripe sub-object's are not yet created.
2841                  */
2842                 result = io->ci_result;
2843         } else if (result == 0) {
2844                 lock = vvp_env_lock(env);
2845                 descr = &lock->cll_descr;
2846
2847                 descr->cld_obj   = io->ci_obj;
2848                 /* Convert byte offsets to pages */
2849                 descr->cld_start = cl_index(io->ci_obj, start);
2850                 descr->cld_end   = cl_index(io->ci_obj, end);
2851                 descr->cld_mode  = cl_mode;
2852                 /* CEF_MUST is used because we do not want to convert a
2853                  * lockahead request to a lockless lock */
2854                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2855                                        CEF_NONBLOCK;
2856
2857                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2858                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2859
2860                 result = cl_lock_request(env, io, lock);
2861
2862                 /* On success, we need to release the lock */
2863                 if (result >= 0)
2864                         cl_lock_release(env, lock);
2865         }
2866         cl_io_fini(env, io);
2867         cl_env_put(env, &refcheck);
2868
2869         /* -ECANCELED indicates a matching lock with a different extent
2870          * was already present, and -EEXIST indicates a matching lock
2871          * on exactly the same extent was already present.
2872          * We convert them to positive values for userspace to make
2873          * recognizing true errors easier.
2874          * Note we can only return these detailed results on async requests,
2875          * as sync requests look the same as i/o requests for locking. */
2876         if (result == -ECANCELED)
2877                 result = LLA_RESULT_DIFFERENT;
2878         else if (result == -EEXIST)
2879                 result = LLA_RESULT_SAME;
2880
2881 out:
2882         RETURN(result);
2883 }
2884 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2885
2886 static int ll_ladvise_sanity(struct inode *inode,
2887                              struct llapi_lu_ladvise *ladvise)
2888 {
2889         enum lu_ladvise_type advice = ladvise->lla_advice;
2890         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2891          * be in the first 32 bits of enum ladvise_flags */
2892         __u32 flags = ladvise->lla_peradvice_flags;
2893         /* 3 lines at 80 characters per line, should be plenty */
2894         int rc = 0;
2895
2896         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2897                 rc = -EINVAL;
2898                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2899                        "last supported advice is %s (value '%d'): rc = %d\n",
2900                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2901                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2902                 GOTO(out, rc);
2903         }
2904
2905         /* Per-advice checks */
2906         switch (advice) {
2907         case LU_LADVISE_LOCKNOEXPAND:
2908                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2909                         rc = -EINVAL;
2910                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2911                                "rc = %d\n",
2912                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2913                                ladvise_names[advice], rc);
2914                         GOTO(out, rc);
2915                 }
2916                 break;
2917         case LU_LADVISE_LOCKAHEAD:
2918                 /* Currently only READ and WRITE modes can be requested */
2919                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2920                     ladvise->lla_lockahead_mode == 0) {
2921                         rc = -EINVAL;
2922                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2923                                "rc = %d\n",
2924                                ll_get_fsname(inode->i_sb, NULL, 0),
2925                                ladvise->lla_lockahead_mode,
2926                                ladvise_names[advice], rc);
2927                         GOTO(out, rc);
2928                 }
2929         case LU_LADVISE_WILLREAD:
2930         case LU_LADVISE_DONTNEED:
2931         default:
2932                 /* Note fall through above - These checks apply to all advices
2933                  * except LOCKNOEXPAND */
2934                 if (flags & ~LF_DEFAULT_MASK) {
2935                         rc = -EINVAL;
2936                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2937                                "rc = %d\n",
2938                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2939                                ladvise_names[advice], rc);
2940                         GOTO(out, rc);
2941                 }
2942                 if (ladvise->lla_start >= ladvise->lla_end) {
2943                         rc = -EINVAL;
2944                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2945                                "for %s: rc = %d\n",
2946                                ll_get_fsname(inode->i_sb, NULL, 0),
2947                                ladvise->lla_start, ladvise->lla_end,
2948                                ladvise_names[advice], rc);
2949                         GOTO(out, rc);
2950                 }
2951                 break;
2952         }
2953
2954 out:
2955         return rc;
2956 }
2957 #undef ERRSIZE
2958
2959 /*
2960  * Give file access advices
2961  *
2962  * The ladvise interface is similar to Linux fadvise() system call, except it
2963  * forwards the advices directly from Lustre client to server. The server side
2964  * codes will apply appropriate read-ahead and caching techniques for the
2965  * corresponding files.
2966  *
2967  * A typical workload for ladvise is e.g. a bunch of different clients are
2968  * doing small random reads of a file, so prefetching pages into OSS cache
2969  * with big linear reads before the random IO is a net benefit. Fetching
2970  * all that data into each client cache with fadvise() may not be, due to
2971  * much more data being sent to the client.
2972  */
2973 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2974                       struct llapi_lu_ladvise *ladvise)
2975 {
2976         struct lu_env *env;
2977         struct cl_io *io;
2978         struct cl_ladvise_io *lio;
2979         int rc;
2980         __u16 refcheck;
2981         ENTRY;
2982
2983         env = cl_env_get(&refcheck);
2984         if (IS_ERR(env))
2985                 RETURN(PTR_ERR(env));
2986
2987         io = vvp_env_thread_io(env);
2988         io->ci_obj = ll_i2info(inode)->lli_clob;
2989
2990         /* initialize parameters for ladvise */
2991         lio = &io->u.ci_ladvise;
2992         lio->li_start = ladvise->lla_start;
2993         lio->li_end = ladvise->lla_end;
2994         lio->li_fid = ll_inode2fid(inode);
2995         lio->li_advice = ladvise->lla_advice;
2996         lio->li_flags = flags;
2997
2998         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2999                 rc = cl_io_loop(env, io);
3000         else
3001                 rc = io->ci_result;
3002
3003         cl_io_fini(env, io);
3004         cl_env_put(env, &refcheck);
3005         RETURN(rc);
3006 }
3007
3008 static int ll_lock_noexpand(struct file *file, int flags)
3009 {
3010         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3011
3012         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3013
3014         return 0;
3015 }
3016
3017 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3018                         unsigned long arg)
3019 {
3020         struct fsxattr fsxattr;
3021
3022         if (copy_from_user(&fsxattr,
3023                            (const struct fsxattr __user *)arg,
3024                            sizeof(fsxattr)))
3025                 RETURN(-EFAULT);
3026
3027         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3028         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3029                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3030         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3031         if (copy_to_user((struct fsxattr __user *)arg,
3032                          &fsxattr, sizeof(fsxattr)))
3033                 RETURN(-EFAULT);
3034
3035         RETURN(0);
3036 }
3037
3038 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3039 {
3040         /*
3041          * Project Quota ID state is only allowed to change from within the init
3042          * namespace. Enforce that restriction only if we are trying to change
3043          * the quota ID state. Everything else is allowed in user namespaces.
3044          */
3045         if (current_user_ns() == &init_user_ns)
3046                 return 0;
3047
3048         if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3049                 return -EINVAL;
3050
3051         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3052                 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3053                         return -EINVAL;
3054         } else {
3055                 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3056                         return -EINVAL;
3057         }
3058
3059         return 0;
3060 }
3061
3062 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3063                         unsigned long arg)
3064 {
3065
3066         struct md_op_data *op_data;
3067         struct ptlrpc_request *req = NULL;
3068         int rc = 0;
3069         struct fsxattr fsxattr;
3070         struct cl_object *obj;
3071         struct iattr *attr;
3072         int flags;
3073
3074         if (copy_from_user(&fsxattr,
3075                            (const struct fsxattr __user *)arg,
3076                            sizeof(fsxattr)))
3077                 RETURN(-EFAULT);
3078
3079         rc = ll_ioctl_check_project(inode, &fsxattr);
3080         if (rc)
3081                 RETURN(rc);
3082
3083         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3084                                      LUSTRE_OPC_ANY, NULL);
3085         if (IS_ERR(op_data))
3086                 RETURN(PTR_ERR(op_data));
3087
3088         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3089         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3090         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3091                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3092         op_data->op_projid = fsxattr.fsx_projid;
3093         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3094         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3095                         0, &req);
3096         ptlrpc_req_finished(req);
3097         if (rc)
3098                 GOTO(out_fsxattr, rc);
3099         ll_update_inode_flags(inode, op_data->op_attr_flags);
3100         obj = ll_i2info(inode)->lli_clob;
3101         if (obj == NULL)
3102                 GOTO(out_fsxattr, rc);
3103
3104         OBD_ALLOC_PTR(attr);
3105         if (attr == NULL)
3106                 GOTO(out_fsxattr, rc = -ENOMEM);
3107
3108         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3109                             fsxattr.fsx_xflags);
3110         OBD_FREE_PTR(attr);
3111 out_fsxattr:
3112         ll_finish_md_op_data(op_data);
3113         RETURN(rc);
3114 }
3115
3116 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3117                                  unsigned long arg)
3118 {
3119         struct inode            *inode = file_inode(file);
3120         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3121         struct ll_inode_info    *lli = ll_i2info(inode);
3122         struct obd_client_handle *och = NULL;
3123         struct split_param sp;
3124         bool lease_broken;
3125         fmode_t fmode = 0;
3126         enum mds_op_bias bias = 0;
3127         struct file *layout_file = NULL;
3128         void *data = NULL;
3129         size_t data_size = 0;
3130         long rc;
3131         ENTRY;
3132
3133         mutex_lock(&lli->lli_och_mutex);
3134         if (fd->fd_lease_och != NULL) {
3135                 och = fd->fd_lease_och;
3136                 fd->fd_lease_och = NULL;
3137         }
3138         mutex_unlock(&lli->lli_och_mutex);
3139
3140         if (och == NULL)
3141                 GOTO(out, rc = -ENOLCK);
3142
3143         fmode = och->och_flags;
3144
3145         switch (ioc->lil_flags) {
3146         case LL_LEASE_RESYNC_DONE:
3147                 if (ioc->lil_count > IOC_IDS_MAX)
3148                         GOTO(out, rc = -EINVAL);
3149
3150                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3151                 OBD_ALLOC(data, data_size);
3152                 if (!data)
3153                         GOTO(out, rc = -ENOMEM);
3154
3155                 if (copy_from_user(data, (void __user *)arg, data_size))
3156                         GOTO(out, rc = -EFAULT);
3157
3158                 bias = MDS_CLOSE_RESYNC_DONE;
3159                 break;
3160         case LL_LEASE_LAYOUT_MERGE: {
3161                 int fd;
3162
3163                 if (ioc->lil_count != 1)
3164                         GOTO(out, rc = -EINVAL);
3165
3166                 arg += sizeof(*ioc);
3167                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3168                         GOTO(out, rc = -EFAULT);
3169
3170                 layout_file = fget(fd);
3171                 if (!layout_file)
3172                         GOTO(out, rc = -EBADF);
3173
3174                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3175                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3176                         GOTO(out, rc = -EPERM);
3177
3178                 data = file_inode(layout_file);
3179                 bias = MDS_CLOSE_LAYOUT_MERGE;
3180                 break;
3181         }
3182         case LL_LEASE_LAYOUT_SPLIT: {
3183                 int fdv;
3184                 int mirror_id;
3185
3186                 if (ioc->lil_count != 2)
3187                         GOTO(out, rc = -EINVAL);
3188
3189                 arg += sizeof(*ioc);
3190                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3191                         GOTO(out, rc = -EFAULT);
3192
3193                 arg += sizeof(__u32);
3194                 if (copy_from_user(&mirror_id, (void __user *)arg,
3195                                    sizeof(__u32)))
3196                         GOTO(out, rc = -EFAULT);
3197
3198                 layout_file = fget(fdv);
3199                 if (!layout_file)
3200                         GOTO(out, rc = -EBADF);
3201
3202                 sp.sp_inode = file_inode(layout_file);
3203                 sp.sp_mirror_id = (__u16)mirror_id;
3204                 data = &sp;
3205                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3206                 break;
3207         }
3208         default:
3209                 /* without close intent */
3210                 break;
3211         }
3212
3213         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3214         if (rc < 0)
3215                 GOTO(out, rc);
3216
3217         rc = ll_lease_och_release(inode, file);
3218         if (rc < 0)
3219                 GOTO(out, rc);
3220
3221         if (lease_broken)
3222                 fmode = 0;
3223         EXIT;
3224
3225 out:
3226         switch (ioc->lil_flags) {
3227         case LL_LEASE_RESYNC_DONE:
3228                 if (data)
3229                         OBD_FREE(data, data_size);
3230                 break;
3231         case LL_LEASE_LAYOUT_MERGE:
3232         case LL_LEASE_LAYOUT_SPLIT:
3233                 if (layout_file)
3234                         fput(layout_file);
3235                 break;
3236         }
3237
3238         if (!rc)
3239                 rc = ll_lease_type_from_fmode(fmode);
3240         RETURN(rc);
3241 }
3242
3243 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3244                               unsigned long arg)
3245 {
3246         struct inode *inode = file_inode(file);
3247         struct ll_inode_info *lli = ll_i2info(inode);
3248         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3249         struct obd_client_handle *och = NULL;
3250         __u64 open_flags = 0;
3251         bool lease_broken;
3252         fmode_t fmode;
3253         long rc;
3254         ENTRY;
3255
3256         switch (ioc->lil_mode) {
3257         case LL_LEASE_WRLCK:
3258                 if (!(file->f_mode & FMODE_WRITE))
3259                         RETURN(-EPERM);
3260                 fmode = FMODE_WRITE;
3261                 break;
3262         case LL_LEASE_RDLCK:
3263                 if (!(file->f_mode & FMODE_READ))
3264                         RETURN(-EPERM);
3265                 fmode = FMODE_READ;
3266                 break;
3267         case LL_LEASE_UNLCK:
3268                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3269         default:
3270                 RETURN(-EINVAL);
3271         }
3272
3273         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3274
3275         /* apply for lease */
3276         if (ioc->lil_flags & LL_LEASE_RESYNC)
3277                 open_flags = MDS_OPEN_RESYNC;
3278         och = ll_lease_open(inode, file, fmode, open_flags);
3279         if (IS_ERR(och))
3280                 RETURN(PTR_ERR(och));
3281
3282         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3283                 rc = ll_lease_file_resync(och, inode, arg);
3284                 if (rc) {
3285                         ll_lease_close(och, inode, NULL);
3286                         RETURN(rc);
3287                 }
3288                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3289                 if (rc) {
3290                         ll_lease_close(och, inode, NULL);
3291                         RETURN(rc);
3292                 }
3293         }
3294
3295         rc = 0;
3296         mutex_lock(&lli->lli_och_mutex);
3297         if (fd->fd_lease_och == NULL) {
3298                 fd->fd_lease_och = och;
3299                 och = NULL;
3300         }
3301         mutex_unlock(&lli->lli_och_mutex);
3302         if (och != NULL) {
3303                 /* impossible now that only excl is supported for now */
3304                 ll_lease_close(och, inode, &lease_broken);
3305                 rc = -EBUSY;
3306         }
3307         RETURN(rc);
3308 }
3309
3310 static long
3311 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3312 {
3313         struct inode            *inode = file_inode(file);
3314         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3315         int                      flags, rc;
3316         ENTRY;
3317
3318         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3319                PFID(ll_inode2fid(inode)), inode, cmd);
3320         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3321
3322         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3323         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3324                 RETURN(-ENOTTY);
3325
3326         switch (cmd) {
3327         case LL_IOC_GETFLAGS:
3328                 /* Get the current value of the file flags */
3329                 return put_user(fd->fd_flags, (int __user *)arg);
3330         case LL_IOC_SETFLAGS:
3331         case LL_IOC_CLRFLAGS:
3332                 /* Set or clear specific file flags */
3333                 /* XXX This probably needs checks to ensure the flags are
3334                  *     not abused, and to handle any flag side effects.
3335                  */
3336                 if (get_user(flags, (int __user *) arg))
3337                         RETURN(-EFAULT);
3338
3339                 if (cmd == LL_IOC_SETFLAGS) {
3340                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3341                             !(file->f_flags & O_DIRECT)) {
3342                                 CERROR("%s: unable to disable locking on "
3343                                        "non-O_DIRECT file\n", current->comm);
3344                                 RETURN(-EINVAL);
3345                         }
3346
3347                         fd->fd_flags |= flags;
3348                 } else {
3349                         fd->fd_flags &= ~flags;
3350                 }
3351                 RETURN(0);
3352         case LL_IOC_LOV_SETSTRIPE:
3353         case LL_IOC_LOV_SETSTRIPE_NEW:
3354                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3355         case LL_IOC_LOV_SETEA:
3356                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3357         case LL_IOC_LOV_SWAP_LAYOUTS: {
3358                 struct file *file2;
3359                 struct lustre_swap_layouts lsl;
3360
3361                 if (copy_from_user(&lsl, (char __user *)arg,
3362                                    sizeof(struct lustre_swap_layouts)))
3363                         RETURN(-EFAULT);
3364
3365                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3366                         RETURN(-EPERM);
3367
3368                 file2 = fget(lsl.sl_fd);
3369                 if (file2 == NULL)
3370                         RETURN(-EBADF);
3371
3372                 /* O_WRONLY or O_RDWR */
3373                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3374                         GOTO(out, rc = -EPERM);
3375
3376                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3377                         struct inode                    *inode2;
3378                         struct ll_inode_info            *lli;
3379                         struct obd_client_handle        *och = NULL;
3380
3381                         lli = ll_i2info(inode);
3382                         mutex_lock(&lli->lli_och_mutex);
3383                         if (fd->fd_lease_och != NULL) {
3384                                 och = fd->fd_lease_och;
3385                                 fd->fd_lease_och = NULL;
3386                         }
3387                         mutex_unlock(&lli->lli_och_mutex);
3388                         if (och == NULL)
3389                                 GOTO(out, rc = -ENOLCK);
3390                         inode2 = file_inode(file2);
3391                         rc = ll_swap_layouts_close(och, inode, inode2);
3392                 } else {
3393                         rc = ll_swap_layouts(file, file2, &lsl);
3394                 }
3395 out:
3396                 fput(file2);
3397                 RETURN(rc);
3398         }
3399         case LL_IOC_LOV_GETSTRIPE:
3400         case LL_IOC_LOV_GETSTRIPE_NEW:
3401                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3402         case FS_IOC_GETFLAGS:
3403         case FS_IOC_SETFLAGS:
3404                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3405         case FSFILT_IOC_GETVERSION:
3406         case FS_IOC_GETVERSION:
3407                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3408         /* We need to special case any other ioctls we want to handle,
3409          * to send them to the MDS/OST as appropriate and to properly
3410          * network encode the arg field. */
3411         case FS_IOC_SETVERSION:
3412                 RETURN(-ENOTSUPP);
3413
3414         case LL_IOC_GROUP_LOCK:
3415                 RETURN(ll_get_grouplock(inode, file, arg));
3416         case LL_IOC_GROUP_UNLOCK:
3417                 RETURN(ll_put_grouplock(inode, file, arg));
3418         case IOC_OBD_STATFS:
3419                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3420
3421         case LL_IOC_FLUSHCTX:
3422                 RETURN(ll_flush_ctx(inode));
3423         case LL_IOC_PATH2FID: {
3424                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3425                                  sizeof(struct lu_fid)))
3426                         RETURN(-EFAULT);
3427
3428                 RETURN(0);
3429         }
3430         case LL_IOC_GETPARENT:
3431                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3432
3433         case OBD_IOC_FID2PATH:
3434                 RETURN(ll_fid2path(inode, (void __user *)arg));
3435         case LL_IOC_DATA_VERSION: {
3436                 struct ioc_data_version idv;
3437                 int rc;
3438
3439                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3440                         RETURN(-EFAULT);
3441
3442                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3443                 rc = ll_ioc_data_version(inode, &idv);
3444
3445                 if (rc == 0 &&
3446                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3447                         RETURN(-EFAULT);
3448
3449                 RETURN(rc);
3450         }
3451
3452         case LL_IOC_GET_MDTIDX: {
3453                 int mdtidx;
3454
3455                 mdtidx = ll_get_mdt_idx(inode);
3456                 if (mdtidx < 0)
3457                         RETURN(mdtidx);
3458
3459                 if (put_user((int)mdtidx, (int __user *)arg))
3460                         RETURN(-EFAULT);
3461
3462                 RETURN(0);
3463         }
3464         case OBD_IOC_GETDTNAME:
3465         case OBD_IOC_GETMDNAME:
3466                 RETURN(ll_get_obd_name(inode, cmd, arg));
3467         case LL_IOC_HSM_STATE_GET: {
3468                 struct md_op_data       *op_data;
3469                 struct hsm_user_state   *hus;
3470                 int                      rc;
3471
3472                 OBD_ALLOC_PTR(hus);
3473                 if (hus == NULL)
3474                         RETURN(-ENOMEM);
3475
3476                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3477                                              LUSTRE_OPC_ANY, hus);
3478                 if (IS_ERR(op_data)) {
3479                         OBD_FREE_PTR(hus);
3480                         RETURN(PTR_ERR(op_data));
3481                 }
3482
3483                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3484                                    op_data, NULL);
3485
3486                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3487                         rc = -EFAULT;
3488
3489                 ll_finish_md_op_data(op_data);
3490                 OBD_FREE_PTR(hus);
3491                 RETURN(rc);
3492         }
3493         case LL_IOC_HSM_STATE_SET: {
3494                 struct hsm_state_set    *hss;
3495                 int                      rc;
3496
3497                 OBD_ALLOC_PTR(hss);
3498                 if (hss == NULL)
3499                         RETURN(-ENOMEM);
3500
3501                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3502                         OBD_FREE_PTR(hss);
3503                         RETURN(-EFAULT);
3504                 }
3505
3506                 rc = ll_hsm_state_set(inode, hss);
3507
3508                 OBD_FREE_PTR(hss);
3509                 RETURN(rc);
3510         }
3511         case LL_IOC_HSM_ACTION: {
3512                 struct md_op_data               *op_data;
3513                 struct hsm_current_action       *hca;
3514                 int                              rc;
3515
3516                 OBD_ALLOC_PTR(hca);
3517                 if (hca == NULL)
3518                         RETURN(-ENOMEM);
3519
3520                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3521                                              LUSTRE_OPC_ANY, hca);
3522                 if (IS_ERR(op_data)) {
3523                         OBD_FREE_PTR(hca);
3524                         RETURN(PTR_ERR(op_data));
3525                 }
3526
3527                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3528                                    op_data, NULL);
3529
3530                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3531                         rc = -EFAULT;
3532
3533                 ll_finish_md_op_data(op_data);
3534                 OBD_FREE_PTR(hca);
3535                 RETURN(rc);
3536         }
3537         case LL_IOC_SET_LEASE_OLD: {
3538                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3539
3540                 RETURN(ll_file_set_lease(file, &ioc, 0));
3541         }
3542         case LL_IOC_SET_LEASE: {
3543                 struct ll_ioc_lease ioc;
3544
3545                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3546                         RETURN(-EFAULT);
3547
3548                 RETURN(ll_file_set_lease(file, &ioc, arg));
3549         }
3550         case LL_IOC_GET_LEASE: {
3551                 struct ll_inode_info *lli = ll_i2info(inode);
3552                 struct ldlm_lock *lock = NULL;
3553                 fmode_t fmode = 0;
3554
3555                 mutex_lock(&lli->lli_och_mutex);
3556                 if (fd->fd_lease_och != NULL) {
3557                         struct obd_client_handle *och = fd->fd_lease_och;
3558
3559                         lock = ldlm_handle2lock(&och->och_lease_handle);
3560                         if (lock != NULL) {
3561                                 lock_res_and_lock(lock);
3562                                 if (!ldlm_is_cancel(lock))
3563                                         fmode = och->och_flags;
3564
3565                                 unlock_res_and_lock(lock);
3566                                 LDLM_LOCK_PUT(lock);
3567                         }
3568                 }
3569                 mutex_unlock(&lli->lli_och_mutex);
3570
3571                 RETURN(ll_lease_type_from_fmode(fmode));
3572         }
3573         case LL_IOC_HSM_IMPORT: {
3574                 struct hsm_user_import *hui;
3575
3576                 OBD_ALLOC_PTR(hui);
3577                 if (hui == NULL)
3578                         RETURN(-ENOMEM);
3579
3580                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3581                         OBD_FREE_PTR(hui);
3582                         RETURN(-EFAULT);
3583                 }
3584
3585                 rc = ll_hsm_import(inode, file, hui);
3586
3587                 OBD_FREE_PTR(hui);
3588                 RETURN(rc);
3589         }
3590         case LL_IOC_FUTIMES_3: {
3591                 struct ll_futimes_3 lfu;
3592
3593                 if (copy_from_user(&lfu,
3594                                    (const struct ll_futimes_3 __user *)arg,
3595                                    sizeof(lfu)))
3596                         RETURN(-EFAULT);
3597
3598                 RETURN(ll_file_futimes_3(file, &lfu));
3599         }
3600         case LL_IOC_LADVISE: {
3601                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3602                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3603                 int i;
3604                 int num_advise;
3605                 int alloc_size = sizeof(*k_ladvise_hdr);
3606
3607                 rc = 0;
3608                 u_ladvise_hdr = (void __user *)arg;
3609                 OBD_ALLOC_PTR(k_ladvise_hdr);
3610                 if (k_ladvise_hdr == NULL)
3611                         RETURN(-ENOMEM);
3612
3613                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3614                         GOTO(out_ladvise, rc = -EFAULT);
3615
3616                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3617                     k_ladvise_hdr->lah_count < 1)
3618                         GOTO(out_ladvise, rc = -EINVAL);
3619
3620                 num_advise = k_ladvise_hdr->lah_count;
3621                 if (num_advise >= LAH_COUNT_MAX)
3622                         GOTO(out_ladvise, rc = -EFBIG);
3623
3624                 OBD_FREE_PTR(k_ladvise_hdr);
3625                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3626                                       lah_advise[num_advise]);
3627                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3628                 if (k_ladvise_hdr == NULL)
3629                         RETURN(-ENOMEM);
3630
3631                 /*
3632                  * TODO: submit multiple advices to one server in a single RPC
3633                  */
3634                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3635                         GOTO(out_ladvise, rc = -EFAULT);
3636
3637                 for (i = 0; i < num_advise; i++) {
3638                         struct llapi_lu_ladvise *k_ladvise =
3639                                         &k_ladvise_hdr->lah_advise[i];
3640                         struct llapi_lu_ladvise __user *u_ladvise =
3641                                         &u_ladvise_hdr->lah_advise[i];
3642
3643                         rc = ll_ladvise_sanity(inode, k_ladvise);
3644                         if (rc)
3645                                 GOTO(out_ladvise, rc);
3646
3647                         switch (k_ladvise->lla_advice) {
3648                         case LU_LADVISE_LOCKNOEXPAND:
3649                                 rc = ll_lock_noexpand(file,
3650                                                k_ladvise->lla_peradvice_flags);
3651                                 GOTO(out_ladvise, rc);
3652                         case LU_LADVISE_LOCKAHEAD:
3653
3654                                 rc = ll_file_lock_ahead(file, k_ladvise);
3655
3656                                 if (rc < 0)
3657                                         GOTO(out_ladvise, rc);
3658
3659                                 if (put_user(rc,
3660                                              &u_ladvise->lla_lockahead_result))
3661                                         GOTO(out_ladvise, rc = -EFAULT);
3662                                 break;
3663                         default:
3664                                 rc = ll_ladvise(inode, file,
3665                                                 k_ladvise_hdr->lah_flags,
3666                                                 k_ladvise);
3667                                 if (rc)
3668                                         GOTO(out_ladvise, rc);
3669                                 break;
3670                         }
3671
3672                 }
3673
3674 out_ladvise:
3675                 OBD_FREE(k_ladvise_hdr, alloc_size);
3676                 RETURN(rc);
3677         }
3678         case LL_IOC_FLR_SET_MIRROR: {
3679                 /* mirror I/O must be direct to avoid polluting page cache
3680                  * by stale data. */
3681                 if (!(file->f_flags & O_DIRECT))
3682                         RETURN(-EINVAL);
3683
3684                 fd->fd_designated_mirror = (__u32)arg;
3685                 RETURN(0);
3686         }
3687         case LL_IOC_FSGETXATTR:
3688                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3689         case LL_IOC_FSSETXATTR:
3690                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3691         case BLKSSZGET:
3692                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3693         default:
3694                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3695                                      (void __user *)arg));
3696         }
3697 }
3698
3699 #ifndef HAVE_FILE_LLSEEK_SIZE
3700 static inline loff_t
3701 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3702 {
3703         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3704                 return -EINVAL;
3705         if (offset > maxsize)
3706                 return -EINVAL;
3707
3708         if (offset != file->f_pos) {
3709                 file->f_pos = offset;
3710                 file->f_version = 0;
3711         }
3712         return offset;
3713 }
3714
3715 static loff_t
3716 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3717                 loff_t maxsize, loff_t eof)
3718 {
3719         struct inode *inode = file_inode(file);
3720
3721         switch (origin) {
3722         case SEEK_END:
3723                 offset += eof;
3724                 break;
3725         case SEEK_CUR:
3726                 /*
3727                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3728                  * position-querying operation.  Avoid rewriting the "same"
3729                  * f_pos value back to the file because a concurrent read(),
3730                  * write() or lseek() might have altered it
3731                  */
3732                 if (offset == 0)
3733                         return file->f_pos;
3734                 /*
3735                  * f_lock protects against read/modify/write race with other
3736                  * SEEK_CURs. Note that parallel writes and reads behave
3737                  * like SEEK_SET.
3738                  */
3739                 inode_lock(inode);
3740                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3741                 inode_unlock(inode);
3742                 return offset;
3743         case SEEK_DATA:
3744                 /*
3745                  * In the generic case the entire file is data, so as long as
3746                  * offset isn't at the end of the file then the offset is data.
3747                  */
3748                 if (offset >= eof)
3749                         return -ENXIO;
3750                 break;
3751         case SEEK_HOLE:
3752                 /*
3753                  * There is a virtual hole at the end of the file, so as long as
3754                  * offset isn't i_size or larger, return i_size.
3755                  */
3756                 if (offset >= eof)
3757                         return -ENXIO;
3758                 offset = eof;
3759                 break;
3760         }
3761
3762         return llseek_execute(file, offset, maxsize);
3763 }
3764 #endif
3765
3766 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3767 {
3768         struct inode *inode = file_inode(file);
3769         loff_t retval, eof = 0;
3770
3771         ENTRY;
3772         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3773                            (origin == SEEK_CUR) ? file->f_pos : 0);
3774         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3775                PFID(ll_inode2fid(inode)), inode, retval, retval,
3776                origin);
3777         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3778
3779         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3780                 retval = ll_glimpse_size(inode);
3781                 if (retval != 0)
3782                         RETURN(retval);
3783                 eof = i_size_read(inode);
3784         }
3785
3786         retval = ll_generic_file_llseek_size(file, offset, origin,
3787                                           ll_file_maxbytes(inode), eof);
3788         RETURN(retval);
3789 }
3790
3791 static int ll_flush(struct file *file, fl_owner_t id)
3792 {
3793         struct inode *inode = file_inode(file);
3794         struct ll_inode_info *lli = ll_i2info(inode);
3795         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3796         int rc, err;
3797
3798         LASSERT(!S_ISDIR(inode->i_mode));
3799
3800         /* catch async errors that were recorded back when async writeback
3801          * failed for pages in this mapping. */
3802         rc = lli->lli_async_rc;
3803         lli->lli_async_rc = 0;
3804         if (lli->lli_clob != NULL) {
3805                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3806                 if (rc == 0)
3807                         rc = err;
3808         }
3809
3810         /* The application has been told write failure already.
3811          * Do not report failure again. */
3812         if (fd->fd_write_failed)
3813                 return 0;
3814         return rc ? -EIO : 0;
3815 }
3816
3817 /**
3818  * Called to make sure a portion of file has been written out.
3819  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3820  *
3821  * Return how many pages have been written.
3822  */
3823 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3824                        enum cl_fsync_mode mode, int ignore_layout)
3825 {
3826         struct lu_env *env;
3827         struct cl_io *io;
3828         struct cl_fsync_io *fio;
3829         int result;
3830         __u16 refcheck;
3831         ENTRY;
3832
3833         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3834             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3835                 RETURN(-EINVAL);
3836
3837         env = cl_env_get(&refcheck);
3838         if (IS_ERR(env))
3839                 RETURN(PTR_ERR(env));
3840
3841         io = vvp_env_thread_io(env);
3842         io->ci_obj = ll_i2info(inode)->lli_clob;
3843         io->ci_ignore_layout = ignore_layout;
3844
3845         /* initialize parameters for sync */
3846         fio = &io->u.ci_fsync;
3847         fio->fi_start = start;
3848         fio->fi_end = end;
3849         fio->fi_fid = ll_inode2fid(inode);
3850         fio->fi_mode = mode;
3851         fio->fi_nr_written = 0;
3852
3853         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3854                 result = cl_io_loop(env, io);
3855         else
3856                 result = io->ci_result;
3857         if (result == 0)
3858                 result = fio->fi_nr_written;
3859         cl_io_fini(env, io);
3860         cl_env_put(env, &refcheck);
3861
3862         RETURN(result);
3863 }
3864
3865 /*
3866  * When dentry is provided (the 'else' case), file_dentry() may be
3867  * null and dentry must be used directly rather than pulled from
3868  * file_dentry() as is done otherwise.
3869  */
3870
3871 #ifdef HAVE_FILE_FSYNC_4ARGS
3872 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3873 {
3874         struct dentry *dentry = file_dentry(file);
3875         bool lock_inode;
3876 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3877 int ll_fsync(struct file *file, int datasync)
3878 {
3879         struct dentry *dentry = file_dentry(file);
3880         loff_t start = 0;
3881         loff_t end = LLONG_MAX;
3882 #else
3883 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3884 {
3885         loff_t start = 0;
3886         loff_t end = LLONG_MAX;
3887 #endif
3888         struct inode *inode = dentry->d_inode;
3889         struct ll_inode_info *lli = ll_i2info(inode);
3890         struct ptlrpc_request *req;
3891         int rc, err;
3892         ENTRY;
3893
3894         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3895                PFID(ll_inode2fid(inode)), inode);
3896         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3897
3898 #ifdef HAVE_FILE_FSYNC_4ARGS
3899         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3900         lock_inode = !lli->lli_inode_locked;
3901         if (lock_inode)
3902                 inode_lock(inode);
3903 #else
3904         /* fsync's caller has already called _fdata{sync,write}, we want
3905          * that IO to finish before calling the osc and mdc sync methods */
3906         rc = filemap_fdatawait(inode->i_mapping);
3907 #endif
3908
3909         /* catch async errors that were recorded back when async writeback
3910          * failed for pages in this mapping. */
3911         if (!S_ISDIR(inode->i_mode)) {
3912                 err = lli->lli_async_rc;
3913                 lli->lli_async_rc = 0;
3914                 if (rc == 0)
3915                         rc = err;
3916                 if (lli->lli_clob != NULL) {
3917                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3918                         if (rc == 0)
3919                                 rc = err;
3920                 }
3921         }
3922
3923         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3924         if (!rc)
3925                 rc = err;
3926         if (!err)
3927                 ptlrpc_req_finished(req);
3928
3929         if (S_ISREG(inode->i_mode)) {
3930                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3931
3932                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3933                 if (rc == 0 && err < 0)
3934                         rc = err;
3935                 if (rc < 0)
3936                         fd->fd_write_failed = true;
3937                 else
3938                         fd->fd_write_failed = false;
3939         }
3940
3941 #ifdef HAVE_FILE_FSYNC_4ARGS
3942         if (lock_inode)
3943                 inode_unlock(inode);
3944 #endif
3945         RETURN(rc);
3946 }
3947
3948 static int
3949 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3950 {
3951         struct inode *inode = file_inode(file);
3952         struct ll_sb_info *sbi = ll_i2sbi(inode);
3953         struct ldlm_enqueue_info einfo = {
3954                 .ei_type        = LDLM_FLOCK,
3955                 .ei_cb_cp       = ldlm_flock_completion_ast,
3956                 .ei_cbdata      = file_lock,
3957         };
3958         struct md_op_data *op_data;
3959         struct lustre_handle lockh = { 0 };
3960         union ldlm_policy_data flock = { { 0 } };
3961         int fl_type = file_lock->fl_type;
3962         __u64 flags = 0;
3963         int rc;
3964         int rc2 = 0;
3965         ENTRY;
3966
3967         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3968                PFID(ll_inode2fid(inode)), file_lock);
3969
3970         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3971
3972         if (file_lock->fl_flags & FL_FLOCK) {
3973                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3974                 /* flocks are whole-file locks */
3975                 flock.l_flock.end = OFFSET_MAX;
3976                 /* For flocks owner is determined by the local file desctiptor*/
3977                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3978         } else if (file_lock->fl_flags & FL_POSIX) {
3979                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3980                 flock.l_flock.start = file_lock->fl_start;
3981                 flock.l_flock.end = file_lock->fl_end;
3982         } else {
3983                 RETURN(-EINVAL);
3984         }
3985         flock.l_flock.pid = file_lock->fl_pid;
3986
3987         /* Somewhat ugly workaround for svc lockd.
3988          * lockd installs custom fl_lmops->lm_compare_owner that checks
3989          * for the fl_owner to be the same (which it always is on local node
3990          * I guess between lockd processes) and then compares pid.
3991          * As such we assign pid to the owner field to make it all work,
3992          * conflict with normal locks is unlikely since pid space and
3993          * pointer space for current->files are not intersecting */
3994         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3995                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3996
3997         switch (fl_type) {
3998         case F_RDLCK:
3999                 einfo.ei_mode = LCK_PR;
4000                 break;
4001         case F_UNLCK:
4002                 /* An unlock request may or may not have any relation to
4003                  * existing locks so we may not be able to pass a lock handle
4004                  * via a normal ldlm_lock_cancel() request. The request may even
4005                  * unlock a byte range in the middle of an existing lock. In
4006                  * order to process an unlock request we need all of the same
4007                  * information that is given with a normal read or write record
4008                  * lock request. To avoid creating another ldlm unlock (cancel)
4009                  * message we'll treat a LCK_NL flock request as an unlock. */
4010                 einfo.ei_mode = LCK_NL;
4011                 break;
4012         case F_WRLCK:
4013                 einfo.ei_mode = LCK_PW;
4014                 break;
4015         default:
4016                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4017                 RETURN (-ENOTSUPP);
4018         }
4019
4020         switch (cmd) {
4021         case F_SETLKW:
4022 #ifdef F_SETLKW64
4023         case F_SETLKW64:
4024 #endif
4025                 flags = 0;
4026                 break;
4027         case F_SETLK:
4028 #ifdef F_SETLK64
4029         case F_SETLK64:
4030 #endif
4031                 flags = LDLM_FL_BLOCK_NOWAIT;
4032                 break;
4033         case F_GETLK:
4034 #ifdef F_GETLK64
4035         case F_GETLK64:
4036 #endif
4037                 flags = LDLM_FL_TEST_LOCK;
4038                 break;
4039         default:
4040                 CERROR("unknown fcntl lock command: %d\n", cmd);
4041                 RETURN (-EINVAL);
4042         }
4043
4044         /* Save the old mode so that if the mode in the lock changes we
4045          * can decrement the appropriate reader or writer refcount. */
4046         file_lock->fl_type = einfo.ei_mode;
4047
4048         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4049                                      LUSTRE_OPC_ANY, NULL);
4050         if (IS_ERR(op_data))
4051                 RETURN(PTR_ERR(op_data));
4052
4053         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4054                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4055                flock.l_flock.pid, flags, einfo.ei_mode,
4056                flock.l_flock.start, flock.l_flock.end);
4057
4058         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4059                         flags);
4060
4061         /* Restore the file lock type if not TEST lock. */
4062         if (!(flags & LDLM_FL_TEST_LOCK))
4063                 file_lock->fl_type = fl_type;
4064
4065 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4066         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4067             !(flags & LDLM_FL_TEST_LOCK))
4068                 rc2  = locks_lock_file_wait(file, file_lock);
4069 #else
4070         if ((file_lock->fl_flags & FL_FLOCK) &&
4071             (rc == 0 || file_lock->fl_type == F_UNLCK))
4072                 rc2  = flock_lock_file_wait(file, file_lock);
4073         if ((file_lock->fl_flags & FL_POSIX) &&
4074             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4075             !(flags & LDLM_FL_TEST_LOCK))
4076                 rc2  = posix_lock_file_wait(file, file_lock);
4077 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4078
4079         if (rc2 && file_lock->fl_type != F_UNLCK) {
4080                 einfo.ei_mode = LCK_NL;
4081                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4082                            &lockh, flags);
4083                 rc = rc2;
4084         }
4085
4086         ll_finish_md_op_data(op_data);
4087
4088         RETURN(rc);
4089 }
4090
4091 int ll_get_fid_by_name(struct inode *parent, const char *name,
4092                        int namelen, struct lu_fid *fid,
4093                        struct inode **inode)
4094 {
4095         struct md_op_data       *op_data = NULL;
4096         struct mdt_body         *body;
4097         struct ptlrpc_request   *req;
4098         int                     rc;
4099         ENTRY;
4100
4101         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4102                                      LUSTRE_OPC_ANY, NULL);
4103         if (IS_ERR(op_data))
4104                 RETURN(PTR_ERR(op_data));
4105
4106         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4107         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4108         ll_finish_md_op_data(op_data);
4109         if (rc < 0)
4110                 RETURN(rc);
4111
4112         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4113         if (body == NULL)
4114                 GOTO(out_req, rc = -EFAULT);
4115         if (fid != NULL)
4116                 *fid = body->mbo_fid1;
4117
4118         if (inode != NULL)
4119                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4120 out_req:
4121         ptlrpc_req_finished(req);
4122         RETURN(rc);
4123 }
4124
4125 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4126                const char *name)
4127 {
4128         struct dentry *dchild = NULL;
4129         struct inode *child_inode = NULL;
4130         struct md_op_data *op_data;
4131         struct ptlrpc_request *request = NULL;
4132         struct obd_client_handle *och = NULL;
4133         struct qstr qstr;
4134         struct mdt_body *body;
4135         __u64 data_version = 0;
4136         size_t namelen = strlen(name);
4137         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4138         int rc;
4139         ENTRY;
4140
4141         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4142                PFID(ll_inode2fid(parent)), name,
4143                lum->lum_stripe_offset, lum->lum_stripe_count);
4144
4145         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4146             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4147                 lustre_swab_lmv_user_md(lum);
4148
4149         /* Get child FID first */
4150         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4151         qstr.name = name;
4152         qstr.len = namelen;
4153         dchild = d_lookup(file_dentry(file), &qstr);
4154         if (dchild) {
4155                 if (dchild->d_inode)
4156                         child_inode = igrab(dchild->d_inode);
4157                 dput(dchild);
4158         }
4159
4160         if (!child_inode) {
4161                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4162                                         &child_inode);
4163                 if (rc)
4164                         RETURN(rc);
4165         }
4166
4167         if (!child_inode)
4168                 RETURN(-ENOENT);
4169
4170         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4171               OBD_CONNECT2_DIR_MIGRATE)) {
4172                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4173                     ll_i2info(child_inode)->lli_lsm_md) {
4174                         CERROR("%s: MDT doesn't support stripe directory "
4175                                "migration!\n",
4176                                ll_get_fsname(parent->i_sb, NULL, 0));
4177                         GOTO(out_iput, rc = -EOPNOTSUPP);
4178                 }
4179         }
4180
4181         /*
4182          * lfs migrate command needs to be blocked on the client
4183          * by checking the migrate FID against the FID of the
4184          * filesystem root.
4185          */
4186         if (child_inode == parent->i_sb->s_root->d_inode)
4187                 GOTO(out_iput, rc = -EINVAL);
4188
4189         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4190                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4191         if (IS_ERR(op_data))
4192                 GOTO(out_iput, rc = PTR_ERR(op_data));
4193
4194         inode_lock(child_inode);
4195         op_data->op_fid3 = *ll_inode2fid(child_inode);
4196         if (!fid_is_sane(&op_data->op_fid3)) {
4197                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4198                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4199                        PFID(&op_data->op_fid3));
4200                 GOTO(out_unlock, rc = -EINVAL);
4201         }
4202
4203         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4204         op_data->op_data = lum;
4205         op_data->op_data_size = lumlen;
4206
4207 again:
4208         if (S_ISREG(child_inode->i_mode)) {
4209                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4210                 if (IS_ERR(och)) {
4211                         rc = PTR_ERR(och);
4212                         och = NULL;
4213                         GOTO(out_unlock, rc);
4214                 }
4215
4216                 rc = ll_data_version(child_inode, &data_version,
4217                                      LL_DV_WR_FLUSH);
4218                 if (rc != 0)
4219                         GOTO(out_close, rc);
4220
4221                 op_data->op_open_handle = och->och_open_handle;
4222                 op_data->op_data_version = data_version;
4223                 op_data->op_lease_handle = och->och_lease_handle;
4224                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4225
4226                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4227                 och->och_mod->mod_open_req->rq_replay = 0;
4228                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4229         }
4230
4231         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4232                        name, namelen, &request);
4233         if (rc == 0) {
4234                 LASSERT(request != NULL);
4235                 ll_update_times(request, parent);
4236
4237                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4238                 LASSERT(body != NULL);
4239
4240                 /* If the server does release layout lock, then we cleanup
4241                  * the client och here, otherwise release it in out_close: */
4242                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4243                         obd_mod_put(och->och_mod);
4244                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4245                                                   och);
4246                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4247                         OBD_FREE_PTR(och);
4248                         och = NULL;
4249                 }
4250         }
4251
4252         if (request != NULL) {
4253                 ptlrpc_req_finished(request);
4254                 request = NULL;
4255         }
4256
4257         /* Try again if the file layout has changed. */
4258         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4259                 goto again;
4260
4261 out_close:
4262         if (och)
4263                 ll_lease_close(och, child_inode, NULL);
4264         if (!rc)
4265                 clear_nlink(child_inode);
4266 out_unlock:
4267         inode_unlock(child_inode);
4268         ll_finish_md_op_data(op_data);
4269 out_iput:
4270         iput(child_inode);
4271         RETURN(rc);
4272 }
4273
4274 static int
4275 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4276 {
4277         ENTRY;
4278
4279         RETURN(-ENOSYS);
4280 }
4281
4282 /**
4283  * test if some locks matching bits and l_req_mode are acquired
4284  * - bits can be in different locks
4285  * - if found clear the common lock bits in *bits
4286  * - the bits not found, are kept in *bits
4287  * \param inode [IN]
4288  * \param bits [IN] searched lock bits [IN]
4289  * \param l_req_mode [IN] searched lock mode
4290  * \retval boolean, true iff all bits are found
4291  */
4292 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4293 {
4294         struct lustre_handle lockh;
4295         union ldlm_policy_data policy;
4296         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4297                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4298         struct lu_fid *fid;
4299         __u64 flags;
4300         int i;
4301         ENTRY;
4302
4303         if (!inode)
4304                RETURN(0);
4305
4306         fid = &ll_i2info(inode)->lli_fid;
4307         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4308                ldlm_lockname[mode]);
4309
4310         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4311         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4312                 policy.l_inodebits.bits = *bits & (1 << i);
4313                 if (policy.l_inodebits.bits == 0)
4314                         continue;
4315
4316                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4317                                   &policy, mode, &lockh)) {
4318                         struct ldlm_lock *lock;
4319
4320                         lock = ldlm_handle2lock(&lockh);
4321                         if (lock) {
4322                                 *bits &=
4323                                       ~(lock->l_policy_data.l_inodebits.bits);
4324                                 LDLM_LOCK_PUT(lock);
4325                         } else {
4326                                 *bits &= ~policy.l_inodebits.bits;
4327                         }
4328                 }
4329         }
4330         RETURN(*bits == 0);
4331 }
4332
4333 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4334                                struct lustre_handle *lockh, __u64 flags,
4335                                enum ldlm_mode mode)
4336 {
4337         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4338         struct lu_fid *fid;
4339         enum ldlm_mode rc;
4340         ENTRY;
4341
4342         fid = &ll_i2info(inode)->lli_fid;
4343         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4344
4345         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4346                            fid, LDLM_IBITS, &policy, mode, lockh);
4347
4348         RETURN(rc);
4349 }
4350
4351 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4352 {
4353         /* Already unlinked. Just update nlink and return success */
4354         if (rc == -ENOENT) {
4355                 clear_nlink(inode);
4356                 /* If it is striped directory, and there is bad stripe
4357                  * Let's revalidate the dentry again, instead of returning
4358                  * error */
4359                 if (S_ISDIR(inode->i_mode) &&
4360                     ll_i2info(inode)->lli_lsm_md != NULL)
4361                         return 0;
4362
4363                 /* This path cannot be hit for regular files unless in
4364                  * case of obscure races, so no need to to validate
4365                  * size. */
4366                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4367                         return 0;
4368         } else if (rc != 0) {
4369                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4370                              "%s: revalidate FID "DFID" error: rc = %d\n",
4371                              ll_get_fsname(inode->i_sb, NULL, 0),
4372                              PFID(ll_inode2fid(inode)), rc);
4373         }
4374
4375         return rc;
4376 }
4377
4378 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4379 {
4380         struct inode *inode = dentry->d_inode;
4381         struct obd_export *exp = ll_i2mdexp(inode);
4382         struct lookup_intent oit = {
4383                 .it_op = op,
4384         };
4385         struct ptlrpc_request *req = NULL;
4386         struct md_op_data *op_data;
4387         int rc = 0;
4388         ENTRY;
4389
4390         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4391                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4392
4393         /* Call getattr by fid, so do not provide name at all. */
4394         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4395                                      LUSTRE_OPC_ANY, NULL);
4396         if (IS_ERR(op_data))
4397                 RETURN(PTR_ERR(op_data));
4398
4399         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4400         ll_finish_md_op_data(op_data);
4401         if (rc < 0) {
4402                 rc = ll_inode_revalidate_fini(inode, rc);
4403                 GOTO(out, rc);
4404         }
4405
4406         rc = ll_revalidate_it_finish(req, &oit, dentry);
4407         if (rc != 0) {
4408                 ll_intent_release(&oit);
4409                 GOTO(out, rc);
4410         }
4411
4412         /* Unlinked? Unhash dentry, so it is not picked up later by
4413          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4414          * here to preserve get_cwd functionality on 2.6.
4415          * Bug 10503 */
4416         if (!dentry->d_inode->i_nlink) {
4417                 ll_lock_dcache(inode);
4418                 d_lustre_invalidate(dentry, 0);
4419                 ll_unlock_dcache(inode);
4420         }
4421
4422         ll_lookup_finish_locks(&oit, dentry);
4423 out:
4424         ptlrpc_req_finished(req);
4425
4426         return rc;
4427 }
4428
4429 static int ll_merge_md_attr(struct inode *inode)
4430 {
4431         struct ll_inode_info *lli = ll_i2info(inode);
4432         struct cl_attr attr = { 0 };
4433         int rc;
4434
4435         LASSERT(lli->lli_lsm_md != NULL);
4436         down_read(&lli->lli_lsm_sem);
4437         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4438                            &attr, ll_md_blocking_ast);
4439         up_read(&lli->lli_lsm_sem);
4440         if (rc != 0)
4441                 RETURN(rc);
4442
4443         set_nlink(inode, attr.cat_nlink);
4444         inode->i_blocks = attr.cat_blocks;
4445         i_size_write(inode, attr.cat_size);
4446
4447         ll_i2info(inode)->lli_atime = attr.cat_atime;
4448         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4449         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4450
4451         RETURN(0);
4452 }
4453
4454 static inline dev_t ll_compat_encode_dev(dev_t dev)
4455 {
4456         /* The compat_sys_*stat*() syscalls will fail unless the
4457          * device majors and minors are both less than 256. Note that
4458          * the value returned here will be passed through
4459          * old_encode_dev() in cp_compat_stat(). And so we are not
4460          * trying to return a valid compat (u16) device number, just
4461          * one that will pass the old_valid_dev() check. */
4462
4463         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4464 }
4465
4466 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4467 int ll_getattr(const struct path *path, struct kstat *stat,
4468                u32 request_mask, unsigned int flags)
4469 {
4470         struct dentry *de = path->dentry;
4471 #else
4472 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4473 {
4474 #endif
4475         struct inode *inode = de->d_inode;
4476         struct ll_sb_info *sbi = ll_i2sbi(inode);
4477         struct ll_inode_info *lli = ll_i2info(inode);
4478         int rc;
4479
4480         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4481
4482         rc = ll_inode_revalidate(de, IT_GETATTR);
4483         if (rc < 0)
4484                 RETURN(rc);
4485
4486         if (S_ISREG(inode->i_mode)) {
4487                 /* In case of restore, the MDT has the right size and has
4488                  * already send it back without granting the layout lock,
4489                  * inode is up-to-date so glimpse is useless.
4490                  * Also to glimpse we need the layout, in case of a running
4491                  * restore the MDT holds the layout lock so the glimpse will
4492                  * block up to the end of restore (getattr will block)
4493                  */
4494                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4495                         rc = ll_glimpse_size(inode);
4496                         if (rc < 0)
4497                                 RETURN(rc);
4498                 }
4499         } else {
4500                 /* If object isn't regular a file then don't validate size. */
4501                 if (S_ISDIR(inode->i_mode) &&
4502                     lli->lli_lsm_md != NULL) {
4503                         rc = ll_merge_md_attr(inode);
4504                         if (rc < 0)
4505                                 RETURN(rc);
4506                 }
4507
4508                 LTIME_S(inode->i_atime) = lli->lli_atime;
4509                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4510                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4511         }
4512
4513         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4514
4515         if (ll_need_32bit_api(sbi)) {
4516                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4517                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4518                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4519         } else {
4520                 stat->ino = inode->i_ino;
4521                 stat->dev = inode->i_sb->s_dev;
4522                 stat->rdev = inode->i_rdev;
4523         }
4524
4525         stat->mode = inode->i_mode;
4526         stat->uid = inode->i_uid;
4527         stat->gid = inode->i_gid;
4528         stat->atime = inode->i_atime;
4529         stat->mtime = inode->i_mtime;
4530         stat->ctime = inode->i_ctime;
4531         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4532
4533         stat->nlink = inode->i_nlink;
4534         stat->size = i_size_read(inode);
4535         stat->blocks = inode->i_blocks;
4536
4537         return 0;
4538 }
4539
4540 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4541                      __u64 start, __u64 len)
4542 {
4543         int             rc;
4544         size_t          num_bytes;
4545         struct fiemap   *fiemap;
4546         unsigned int    extent_count = fieinfo->fi_extents_max;
4547
4548         num_bytes = sizeof(*fiemap) + (extent_count *
4549                                        sizeof(struct fiemap_extent));
4550         OBD_ALLOC_LARGE(fiemap, num_bytes);
4551
4552         if (fiemap == NULL)
4553                 RETURN(-ENOMEM);
4554
4555         fiemap->fm_flags = fieinfo->fi_flags;
4556         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4557         fiemap->fm_start = start;
4558         fiemap->fm_length = len;
4559         if (extent_count > 0 &&
4560             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4561                            sizeof(struct fiemap_extent)) != 0)
4562                 GOTO(out, rc = -EFAULT);
4563
4564         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4565
4566         fieinfo->fi_flags = fiemap->fm_flags;
4567         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4568         if (extent_count > 0 &&
4569             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4570                          fiemap->fm_mapped_extents *
4571                          sizeof(struct fiemap_extent)) != 0)
4572                 GOTO(out, rc = -EFAULT);
4573 out:
4574         OBD_FREE_LARGE(fiemap, num_bytes);
4575         return rc;
4576 }
4577
4578 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4579 {
4580         struct ll_inode_info *lli = ll_i2info(inode);
4581         struct posix_acl *acl = NULL;
4582         ENTRY;
4583
4584         spin_lock(&lli->lli_lock);
4585         /* VFS' acl_permission_check->check_acl will release the refcount */
4586         acl = posix_acl_dup(lli->lli_posix_acl);
4587         spin_unlock(&lli->lli_lock);
4588
4589         RETURN(acl);
4590 }
4591
4592 #ifdef HAVE_IOP_SET_ACL
4593 #ifdef CONFIG_FS_POSIX_ACL
4594 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4595 {
4596         struct ll_sb_info *sbi = ll_i2sbi(inode);
4597         struct ptlrpc_request *req = NULL;
4598         const char *name = NULL;
4599         char *value = NULL;
4600         size_t value_size = 0;
4601         int rc = 0;
4602         ENTRY;
4603
4604         switch (type) {
4605         case ACL_TYPE_ACCESS:
4606                 name = XATTR_NAME_POSIX_ACL_ACCESS;
4607                 if (acl)
4608                         rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4609                 break;
4610
4611         case ACL_TYPE_DEFAULT:
4612                 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4613                 if (!S_ISDIR(inode->i_mode))
4614                         rc = acl ? -EACCES : 0;
4615                 break;
4616
4617         default:
4618                 rc = -EINVAL;
4619                 break;
4620         }
4621         if (rc)
4622                 return rc;
4623
4624         if (acl) {
4625                 value_size = posix_acl_xattr_size(acl->a_count);
4626                 value = kmalloc(value_size, GFP_NOFS);
4627                 if (value == NULL)
4628                         GOTO(out, rc = -ENOMEM);
4629
4630                 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4631                 if (rc < 0)
4632                         GOTO(out_value, rc);
4633         }
4634
4635         rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4636                          value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4637                          name, value, value_size, 0, 0, &req);
4638
4639         ptlrpc_req_finished(req);
4640 out_value:
4641         kfree(value);
4642 out:
4643         if (rc)
4644                 forget_cached_acl(inode, type);
4645         else
4646                 set_cached_acl(inode, type, acl);
4647         RETURN(rc);
4648 }
4649 #endif /* CONFIG_FS_POSIX_ACL */
4650 #endif /* HAVE_IOP_SET_ACL */
4651
4652 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4653 static int
4654 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4655 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4656 # else
4657 ll_check_acl(struct inode *inode, int mask)
4658 # endif
4659 {
4660 # ifdef CONFIG_FS_POSIX_ACL
4661         struct posix_acl *acl;
4662         int rc;
4663         ENTRY;
4664
4665 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4666         if (flags & IPERM_FLAG_RCU)
4667                 return -ECHILD;
4668 #  endif
4669         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4670
4671         if (!acl)
4672                 RETURN(-EAGAIN);
4673
4674         rc = posix_acl_permission(inode, acl, mask);
4675         posix_acl_release(acl);
4676
4677         RETURN(rc);
4678 # else /* !CONFIG_FS_POSIX_ACL */
4679         return -EAGAIN;
4680 # endif /* CONFIG_FS_POSIX_ACL */
4681 }
4682 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4683
4684 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4685 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4686 #else
4687 # ifdef HAVE_INODE_PERMISION_2ARGS
4688 int ll_inode_permission(struct inode *inode, int mask)
4689 # else
4690 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4691 # endif
4692 #endif
4693 {
4694         int rc = 0;
4695         struct ll_sb_info *sbi;
4696         struct root_squash_info *squash;
4697         struct cred *cred = NULL;
4698         const struct cred *old_cred = NULL;
4699         cfs_cap_t cap;
4700         bool squash_id = false;
4701         ENTRY;
4702
4703 #ifdef MAY_NOT_BLOCK
4704         if (mask & MAY_NOT_BLOCK)
4705                 return -ECHILD;
4706 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4707         if (flags & IPERM_FLAG_RCU)
4708                 return -ECHILD;
4709 #endif
4710
4711        /* as root inode are NOT getting validated in lookup operation,
4712         * need to do it before permission check. */
4713
4714         if (inode == inode->i_sb->s_root->d_inode) {
4715                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4716                 if (rc)
4717                         RETURN(rc);
4718         }
4719
4720         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4721                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4722
4723         /* squash fsuid/fsgid if needed */
4724         sbi = ll_i2sbi(inode);
4725         squash = &sbi->ll_squash;
4726         if (unlikely(squash->rsi_uid != 0 &&
4727                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4728                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4729                         squash_id = true;
4730         }
4731         if (squash_id) {
4732                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4733                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4734                        squash->rsi_uid, squash->rsi_gid);
4735
4736                 /* update current process's credentials
4737                  * and FS capability */
4738                 cred = prepare_creds();
4739                 if (cred == NULL)
4740                         RETURN(-ENOMEM);
4741
4742                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4743                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4744                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4745                         if ((1 << cap) & CFS_CAP_FS_MASK)
4746                                 cap_lower(cred->cap_effective, cap);
4747                 }
4748                 old_cred = override_creds(cred);
4749         }
4750
4751         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4752         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4753         /* restore current process's credentials and FS capability */
4754         if (squash_id) {
4755                 revert_creds(old_cred);
4756                 put_cred(cred);
4757         }
4758
4759         RETURN(rc);
4760 }
4761
4762 /* -o localflock - only provides locally consistent flock locks */
4763 struct file_operations ll_file_operations = {
4764 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4765 # ifdef HAVE_SYNC_READ_WRITE
4766         .read           = new_sync_read,
4767         .write          = new_sync_write,
4768 # endif
4769         .read_iter      = ll_file_read_iter,
4770         .write_iter     = ll_file_write_iter,
4771 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4772         .read           = ll_file_read,
4773         .aio_read       = ll_file_aio_read,
4774         .write          = ll_file_write,
4775         .aio_write      = ll_file_aio_write,
4776 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4777         .unlocked_ioctl = ll_file_ioctl,
4778         .open           = ll_file_open,
4779         .release        = ll_file_release,
4780         .mmap           = ll_file_mmap,
4781         .llseek         = ll_file_seek,
4782         .splice_read    = ll_file_splice_read,
4783         .fsync          = ll_fsync,
4784         .flush          = ll_flush
4785 };
4786
4787 struct file_operations ll_file_operations_flock = {
4788 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4789 # ifdef HAVE_SYNC_READ_WRITE
4790         .read           = new_sync_read,
4791         .write          = new_sync_write,
4792 # endif /* HAVE_SYNC_READ_WRITE */
4793         .read_iter      = ll_file_read_iter,
4794         .write_iter     = ll_file_write_iter,
4795 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4796         .read           = ll_file_read,
4797         .aio_read       = ll_file_aio_read,
4798         .write          = ll_file_write,
4799         .aio_write      = ll_file_aio_write,
4800 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4801         .unlocked_ioctl = ll_file_ioctl,
4802         .open           = ll_file_open,
4803         .release        = ll_file_release,
4804         .mmap           = ll_file_mmap,
4805         .llseek         = ll_file_seek,
4806         .splice_read    = ll_file_splice_read,
4807         .fsync          = ll_fsync,
4808         .flush          = ll_flush,
4809         .flock          = ll_file_flock,
4810         .lock           = ll_file_flock
4811 };
4812
4813 /* These are for -o noflock - to return ENOSYS on flock calls */
4814 struct file_operations ll_file_operations_noflock = {
4815 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4816 # ifdef HAVE_SYNC_READ_WRITE
4817         .read           = new_sync_read,
4818         .write          = new_sync_write,
4819 # endif /* HAVE_SYNC_READ_WRITE */
4820         .read_iter      = ll_file_read_iter,
4821         .write_iter     = ll_file_write_iter,
4822 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4823         .read           = ll_file_read,
4824         .aio_read       = ll_file_aio_read,
4825         .write          = ll_file_write,
4826         .aio_write      = ll_file_aio_write,
4827 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4828         .unlocked_ioctl = ll_file_ioctl,
4829         .open           = ll_file_open,
4830         .release        = ll_file_release,
4831         .mmap           = ll_file_mmap,
4832         .llseek         = ll_file_seek,
4833         .splice_read    = ll_file_splice_read,
4834         .fsync          = ll_fsync,
4835         .flush          = ll_flush,
4836         .flock          = ll_file_noflock,
4837         .lock           = ll_file_noflock
4838 };
4839
4840 struct inode_operations ll_file_inode_operations = {
4841         .setattr        = ll_setattr,
4842         .getattr        = ll_getattr,
4843         .permission     = ll_inode_permission,
4844 #ifdef HAVE_IOP_XATTR
4845         .setxattr       = ll_setxattr,
4846         .getxattr       = ll_getxattr,
4847         .removexattr    = ll_removexattr,
4848 #endif
4849         .listxattr      = ll_listxattr,
4850         .fiemap         = ll_fiemap,
4851 #ifdef HAVE_IOP_GET_ACL
4852         .get_acl        = ll_get_acl,
4853 #endif
4854 #ifdef HAVE_IOP_SET_ACL
4855         .set_acl        = ll_set_acl,
4856 #endif
4857 };
4858
4859 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4860 {
4861         struct ll_inode_info *lli = ll_i2info(inode);
4862         struct cl_object *obj = lli->lli_clob;
4863         struct lu_env *env;
4864         int rc;
4865         __u16 refcheck;
4866         ENTRY;
4867
4868         if (obj == NULL)
4869                 RETURN(0);
4870
4871         env = cl_env_get(&refcheck);
4872         if (IS_ERR(env))
4873                 RETURN(PTR_ERR(env));
4874
4875         rc = cl_conf_set(env, lli->lli_clob, conf);
4876         if (rc < 0)
4877                 GOTO(out, rc);
4878
4879         if (conf->coc_opc == OBJECT_CONF_SET) {
4880                 struct ldlm_lock *lock = conf->coc_lock;
4881                 struct cl_layout cl = {
4882                         .cl_layout_gen = 0,
4883                 };
4884
4885                 LASSERT(lock != NULL);
4886                 LASSERT(ldlm_has_layout(lock));
4887
4888                 /* it can only be allowed to match after layout is
4889                  * applied to inode otherwise false layout would be
4890                  * seen. Applying layout shoud happen before dropping
4891                  * the intent lock. */
4892                 ldlm_lock_allow_match(lock);
4893
4894                 rc = cl_object_layout_get(env, obj, &cl);
4895                 if (rc < 0)
4896                         GOTO(out, rc);
4897
4898                 CDEBUG(D_VFSTRACE,
4899                        DFID": layout version change: %u -> %u\n",
4900                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4901                        cl.cl_layout_gen);
4902                 ll_layout_version_set(lli, cl.cl_layout_gen);
4903         }
4904
4905 out:
4906         cl_env_put(env, &refcheck);
4907
4908         RETURN(rc);
4909 }
4910
4911 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4912 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4913
4914 {
4915         struct ll_sb_info *sbi = ll_i2sbi(inode);
4916         struct ptlrpc_request *req;
4917         struct mdt_body *body;
4918         void *lvbdata;
4919         void *lmm;
4920         int lmmsize;
4921         int rc;
4922         ENTRY;
4923
4924         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4925                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4926                lock->l_lvb_data, lock->l_lvb_len);
4927
4928         if (lock->l_lvb_data != NULL)
4929                 RETURN(0);
4930
4931         /* if layout lock was granted right away, the layout is returned
4932          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4933          * blocked and then granted via completion ast, we have to fetch
4934          * layout here. Please note that we can't use the LVB buffer in
4935          * completion AST because it doesn't have a large enough buffer */
4936         rc = ll_get_default_mdsize(sbi, &lmmsize);
4937         if (rc == 0)
4938                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4939                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4940         if (rc < 0)
4941                 RETURN(rc);
4942
4943         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4944         if (body == NULL)
4945                 GOTO(out, rc = -EPROTO);
4946
4947         lmmsize = body->mbo_eadatasize;
4948         if (lmmsize == 0) /* empty layout */
4949                 GOTO(out, rc = 0);
4950
4951         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4952         if (lmm == NULL)
4953                 GOTO(out, rc = -EFAULT);
4954
4955         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4956         if (lvbdata == NULL)
4957                 GOTO(out, rc = -ENOMEM);
4958
4959         memcpy(lvbdata, lmm, lmmsize);
4960         lock_res_and_lock(lock);
4961         if (unlikely(lock->l_lvb_data == NULL)) {
4962                 lock->l_lvb_type = LVB_T_LAYOUT;
4963                 lock->l_lvb_data = lvbdata;
4964                 lock->l_lvb_len = lmmsize;
4965                 lvbdata = NULL;
4966         }
4967         unlock_res_and_lock(lock);
4968
4969         if (lvbdata)
4970                 OBD_FREE_LARGE(lvbdata, lmmsize);
4971
4972         EXIT;
4973
4974 out:
4975         ptlrpc_req_finished(req);
4976         return rc;
4977 }
4978
4979 /**
4980  * Apply the layout to the inode. Layout lock is held and will be released
4981  * in this function.
4982  */
4983 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4984                               struct inode *inode)
4985 {
4986         struct ll_inode_info *lli = ll_i2info(inode);
4987         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4988         struct ldlm_lock *lock;
4989         struct cl_object_conf conf;
4990         int rc = 0;
4991         bool lvb_ready;
4992         bool wait_layout = false;
4993         ENTRY;
4994
4995         LASSERT(lustre_handle_is_used(lockh));
4996
4997         lock = ldlm_handle2lock(lockh);
4998         LASSERT(lock != NULL);
4999         LASSERT(ldlm_has_layout(lock));
5000
5001         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5002                    PFID(&lli->lli_fid), inode);
5003
5004         /* in case this is a caching lock and reinstate with new inode */
5005         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5006
5007         lock_res_and_lock(lock);
5008         lvb_ready = ldlm_is_lvb_ready(lock);
5009         unlock_res_and_lock(lock);
5010
5011         /* checking lvb_ready is racy but this is okay. The worst case is
5012          * that multi processes may configure the file on the same time. */
5013         if (lvb_ready)
5014                 GOTO(out, rc = 0);
5015
5016         rc = ll_layout_fetch(inode, lock);
5017         if (rc < 0)
5018                 GOTO(out, rc);
5019
5020         /* for layout lock, lmm is stored in lock's lvb.
5021          * lvb_data is immutable if the lock is held so it's safe to access it
5022          * without res lock.
5023          *
5024          * set layout to file. Unlikely this will fail as old layout was
5025          * surely eliminated */
5026         memset(&conf, 0, sizeof conf);
5027         conf.coc_opc = OBJECT_CONF_SET;
5028         conf.coc_inode = inode;
5029         conf.coc_lock = lock;
5030         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5031         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5032         rc = ll_layout_conf(inode, &conf);
5033
5034         /* refresh layout failed, need to wait */
5035         wait_layout = rc == -EBUSY;
5036         EXIT;
5037 out:
5038         LDLM_LOCK_PUT(lock);
5039         ldlm_lock_decref(lockh, mode);
5040
5041         /* wait for IO to complete if it's still being used. */
5042         if (wait_layout) {
5043                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5044                        ll_get_fsname(inode->i_sb, NULL, 0),
5045                        PFID(&lli->lli_fid), inode);
5046
5047                 memset(&conf, 0, sizeof conf);
5048                 conf.coc_opc = OBJECT_CONF_WAIT;
5049                 conf.coc_inode = inode;
5050                 rc = ll_layout_conf(inode, &conf);
5051                 if (rc == 0)
5052                         rc = -EAGAIN;
5053
5054                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5055                        ll_get_fsname(inode->i_sb, NULL, 0),
5056                        PFID(&lli->lli_fid), rc);
5057         }
5058         RETURN(rc);
5059 }
5060
5061 /**
5062  * Issue layout intent RPC to MDS.
5063  * \param inode [in]    file inode
5064  * \param intent [in]   layout intent
5065  *
5066  * \retval 0    on success
5067  * \retval < 0  error code
5068  */
5069 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5070 {
5071         struct ll_inode_info  *lli = ll_i2info(inode);
5072         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5073         struct md_op_data     *op_data;
5074         struct lookup_intent it;
5075         struct ptlrpc_request *req;
5076         int rc;
5077         ENTRY;
5078
5079         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5080                                      0, 0, LUSTRE_OPC_ANY, NULL);
5081         if (IS_ERR(op_data))
5082                 RETURN(PTR_ERR(op_data));
5083
5084         op_data->op_data = intent;
5085         op_data->op_data_size = sizeof(*intent);
5086
5087         memset(&it, 0, sizeof(it));
5088         it.it_op = IT_LAYOUT;
5089         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5090             intent->li_opc == LAYOUT_INTENT_TRUNC)
5091                 it.it_flags = FMODE_WRITE;
5092
5093         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5094                           ll_get_fsname(inode->i_sb, NULL, 0),
5095                           PFID(&lli->lli_fid), inode);
5096
5097         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5098                             &ll_md_blocking_ast, 0);
5099         if (it.it_request != NULL)
5100                 ptlrpc_req_finished(it.it_request);
5101         it.it_request = NULL;
5102
5103         ll_finish_md_op_data(op_data);
5104
5105         /* set lock data in case this is a new lock */
5106         if (!rc)
5107                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5108
5109         ll_intent_drop_lock(&it);
5110
5111         RETURN(rc);
5112 }
5113
5114 /**
5115  * This function checks if there exists a LAYOUT lock on the client side,
5116  * or enqueues it if it doesn't have one in cache.
5117  *
5118  * This function will not hold layout lock so it may be revoked any time after
5119  * this function returns. Any operations depend on layout should be redone
5120  * in that case.
5121  *
5122  * This function should be called before lov_io_init() to get an uptodate
5123  * layout version, the caller should save the version number and after IO
5124  * is finished, this function should be called again to verify that layout
5125  * is not changed during IO time.
5126  */
5127 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5128 {
5129         struct ll_inode_info    *lli = ll_i2info(inode);
5130         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5131         struct lustre_handle lockh;
5132         struct layout_intent intent = {
5133                 .li_opc = LAYOUT_INTENT_ACCESS,
5134         };
5135         enum ldlm_mode mode;
5136         int rc;
5137         ENTRY;
5138
5139         *gen = ll_layout_version_get(lli);
5140         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5141                 RETURN(0);
5142
5143         /* sanity checks */
5144         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5145         LASSERT(S_ISREG(inode->i_mode));
5146
5147         /* take layout lock mutex to enqueue layout lock exclusively. */
5148         mutex_lock(&lli->lli_layout_mutex);
5149
5150         while (1) {
5151                 /* mostly layout lock is caching on the local side, so try to
5152                  * match it before grabbing layout lock mutex. */
5153                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5154                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5155                 if (mode != 0) { /* hit cached lock */
5156                         rc = ll_layout_lock_set(&lockh, mode, inode);
5157                         if (rc == -EAGAIN)
5158                                 continue;
5159                         break;
5160                 }
5161
5162                 rc = ll_layout_intent(inode, &intent);
5163                 if (rc != 0)
5164                         break;
5165         }
5166
5167         if (rc == 0)
5168                 *gen = ll_layout_version_get(lli);
5169         mutex_unlock(&lli->lli_layout_mutex);
5170
5171         RETURN(rc);
5172 }
5173
5174 /**
5175  * Issue layout intent RPC indicating where in a file an IO is about to write.
5176  *
5177  * \param[in] inode     file inode.
5178  * \param[in] ext       write range with start offset of fille in bytes where
5179  *                      an IO is about to write, and exclusive end offset in
5180  *                      bytes.
5181  *
5182  * \retval 0    on success
5183  * \retval < 0  error code
5184  */
5185 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5186                            struct lu_extent *ext)
5187 {
5188         struct layout_intent intent = {
5189                 .li_opc = opc,
5190                 .li_extent.e_start = ext->e_start,
5191                 .li_extent.e_end = ext->e_end,
5192         };
5193         int rc;
5194         ENTRY;
5195
5196         rc = ll_layout_intent(inode, &intent);
5197
5198         RETURN(rc);
5199 }
5200
5201 /**
5202  *  This function send a restore request to the MDT
5203  */
5204 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5205 {
5206         struct hsm_user_request *hur;
5207         int                      len, rc;
5208         ENTRY;
5209
5210         len = sizeof(struct hsm_user_request) +
5211               sizeof(struct hsm_user_item);
5212         OBD_ALLOC(hur, len);
5213         if (hur == NULL)
5214                 RETURN(-ENOMEM);
5215
5216         hur->hur_request.hr_action = HUA_RESTORE;
5217         hur->hur_request.hr_archive_id = 0;
5218         hur->hur_request.hr_flags = 0;
5219         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5220                sizeof(hur->hur_user_item[0].hui_fid));
5221         hur->hur_user_item[0].hui_extent.offset = offset;
5222         hur->hur_user_item[0].hui_extent.length = length;
5223         hur->hur_request.hr_itemcount = 1;
5224         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5225                            len, hur, NULL);
5226         OBD_FREE(hur, len);
5227         RETURN(rc);
5228 }