lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_get_fsname(inode->i_sb, NULL, 0),
 143                        PFID(&lli->lli_fid));
 144                 GOTO(out, rc = 0);
 145         }
 146
 147         OBD_ALLOC_PTR(op_data);
 148         /* We leak openhandle and request here on error, but not much to be
 149          * done in OOM case since app won't retry close on error either. */
 150         if (op_data == NULL)
 151                 GOTO(out, rc = -ENOMEM);
 152
 153         ll_prepare_close(inode, op_data, och);
 154         switch (bias) {
 155         case MDS_CLOSE_LAYOUT_MERGE:
 156                 /* merge blocks from the victim inode */
 157                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 158                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 159                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 160         case MDS_CLOSE_LAYOUT_SPLIT:
 161         case MDS_CLOSE_LAYOUT_SWAP: {
 162                 struct split_param *sp = data;
 163
 164                 LASSERT(data != NULL);
 165                 op_data->op_bias |= bias;
 166                 op_data->op_data_version = 0;
 167                 op_data->op_lease_handle = och->och_lease_handle;
 168                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 169                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 170                         op_data->op_mirror_id = sp->sp_mirror_id;
 171                 } else {
 172                         op_data->op_fid2 = *ll_inode2fid(data);
 173                 }
 174                 break;
 175         }
 176
 177         case MDS_CLOSE_RESYNC_DONE: {
 178                 struct ll_ioc_lease *ioc = data;
 179
 180                 LASSERT(data != NULL);
 181                 op_data->op_attr_blocks +=
 182                         ioc->lil_count * op_data->op_attr_blocks;
 183                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 184                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 185                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 186
 187                 op_data->op_lease_handle = och->och_lease_handle;
 188                 op_data->op_data = &ioc->lil_ids[0];
 189                 op_data->op_data_size =
 190                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 191                 break;
 192         }
 193
 194         case MDS_HSM_RELEASE:
 195                 LASSERT(data != NULL);
 196                 op_data->op_bias |= MDS_HSM_RELEASE;
 197                 op_data->op_data_version = *(__u64 *)data;
 198                 op_data->op_lease_handle = och->och_lease_handle;
 199                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 200                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 201                 break;
 202
 203         default:
 204                 LASSERT(data == NULL);
 205                 break;
 206         }
 207
 208         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 209                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 210         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 211                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 212
 213         rc = md_close(md_exp, op_data, och->och_mod, &req);
 214         if (rc != 0 && rc != -EINTR)
 215                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 216                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 217
 218         if (rc == 0 && op_data->op_bias & bias) {
 219                 struct mdt_body *body;
 220
 221                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 222                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 223                         rc = -EBUSY;
 224         }
 225
 226         ll_finish_md_op_data(op_data);
 227         EXIT;
 228 out:
 229
 230         md_clear_open_replay_data(md_exp, och);
 231         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 232         OBD_FREE_PTR(och);
 233
 234         ptlrpc_req_finished(req);       /* This is close request */
 235         return rc;
 236 }
 237
 238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 239 {
 240         struct ll_inode_info *lli = ll_i2info(inode);
 241         struct obd_client_handle **och_p;
 242         struct obd_client_handle *och;
 243         __u64 *och_usecount;
 244         int rc = 0;
 245         ENTRY;
 246
 247         if (fmode & FMODE_WRITE) {
 248                 och_p = &lli->lli_mds_write_och;
 249                 och_usecount = &lli->lli_open_fd_write_count;
 250         } else if (fmode & FMODE_EXEC) {
 251                 och_p = &lli->lli_mds_exec_och;
 252                 och_usecount = &lli->lli_open_fd_exec_count;
 253         } else {
 254                 LASSERT(fmode & FMODE_READ);
 255                 och_p = &lli->lli_mds_read_och;
 256                 och_usecount = &lli->lli_open_fd_read_count;
 257         }
 258
 259         mutex_lock(&lli->lli_och_mutex);
 260         if (*och_usecount > 0) {
 261                 /* There are still users of this handle, so skip
 262                  * freeing it. */
 263                 mutex_unlock(&lli->lli_och_mutex);
 264                 RETURN(0);
 265         }
 266
 267         och = *och_p;
 268         *och_p = NULL;
 269         mutex_unlock(&lli->lli_och_mutex);
 270
 271         if (och != NULL) {
 272                 /* There might be a race and this handle may already
 273                  * be closed. */
 274                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 275         }
 276
 277         RETURN(rc);
 278 }
 279
 280 static int ll_md_close(struct inode *inode, struct file *file)
 281 {
 282         union ldlm_policy_data policy = {
 283                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 284         };
 285         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 286         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 287         struct ll_inode_info *lli = ll_i2info(inode);
 288         struct lustre_handle lockh;
 289         enum ldlm_mode lockmode;
 290         int rc = 0;
 291         ENTRY;
 292
 293         /* clear group lock, if present */
 294         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 295                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 296
 297         if (fd->fd_lease_och != NULL) {
 298                 bool lease_broken;
 299
 300                 /* Usually the lease is not released when the
 301                  * application crashed, we need to release here. */
 302                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 303                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 304                         PFID(&lli->lli_fid), rc, lease_broken);
 305
 306                 fd->fd_lease_och = NULL;
 307         }
 308
 309         if (fd->fd_och != NULL) {
 310                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 311                 fd->fd_och = NULL;
 312                 GOTO(out, rc);
 313         }
 314
 315         /* Let's see if we have good enough OPEN lock on the file and if
 316            we can skip talking to MDS */
 317         mutex_lock(&lli->lli_och_mutex);
 318         if (fd->fd_omode & FMODE_WRITE) {
 319                 lockmode = LCK_CW;
 320                 LASSERT(lli->lli_open_fd_write_count);
 321                 lli->lli_open_fd_write_count--;
 322         } else if (fd->fd_omode & FMODE_EXEC) {
 323                 lockmode = LCK_PR;
 324                 LASSERT(lli->lli_open_fd_exec_count);
 325                 lli->lli_open_fd_exec_count--;
 326         } else {
 327                 lockmode = LCK_CR;
 328                 LASSERT(lli->lli_open_fd_read_count);
 329                 lli->lli_open_fd_read_count--;
 330         }
 331         mutex_unlock(&lli->lli_och_mutex);
 332
 333         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 334                            LDLM_IBITS, &policy, lockmode, &lockh))
 335                 rc = ll_md_real_close(inode, fd->fd_omode);
 336
 337 out:
 338         LUSTRE_FPRIVATE(file) = NULL;
 339         ll_file_data_put(fd);
 340
 341         RETURN(rc);
 342 }
 343
 344 /* While this returns an error code, fput() the caller does not, so we need
 345  * to make every effort to clean up all of our state here.  Also, applications
 346  * rarely check close errors and even if an error is returned they will not
 347  * re-try the close call.
 348  */
 349 int ll_file_release(struct inode *inode, struct file *file)
 350 {
 351         struct ll_file_data *fd;
 352         struct ll_sb_info *sbi = ll_i2sbi(inode);
 353         struct ll_inode_info *lli = ll_i2info(inode);
 354         int rc;
 355         ENTRY;
 356
 357         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 358                PFID(ll_inode2fid(inode)), inode);
 359
 360         if (inode->i_sb->s_root != file_dentry(file))
 361                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 362         fd = LUSTRE_FPRIVATE(file);
 363         LASSERT(fd != NULL);
 364
 365         /* The last ref on @file, maybe not the the owner pid of statahead,
 366          * because parent and child process can share the same file handle. */
 367         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 368                 ll_deauthorize_statahead(inode, fd);
 369
 370         if (inode->i_sb->s_root == file_dentry(file)) {
 371                 LUSTRE_FPRIVATE(file) = NULL;
 372                 ll_file_data_put(fd);
 373                 RETURN(0);
 374         }
 375
 376         if (!S_ISDIR(inode->i_mode)) {
 377                 if (lli->lli_clob != NULL)
 378                         lov_read_and_clear_async_rc(lli->lli_clob);
 379                 lli->lli_async_rc = 0;
 380         }
 381
 382         rc = ll_md_close(inode, file);
 383
 384         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 385                 libcfs_debug_dumplog();
 386
 387         RETURN(rc);
 388 }
 389
 390 static inline int ll_dom_readpage(void *data, struct page *page)
 391 {
 392         struct niobuf_local *lnb = data;
 393         void *kaddr;
 394
 395         kaddr = ll_kmap_atomic(page, KM_USER0);
 396         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 397         if (lnb->lnb_len < PAGE_SIZE)
 398                 memset(kaddr + lnb->lnb_len, 0,
 399                        PAGE_SIZE - lnb->lnb_len);
 400         flush_dcache_page(page);
 401         SetPageUptodate(page);
 402         ll_kunmap_atomic(kaddr, KM_USER0);
 403         unlock_page(page);
 404
 405         return 0;
 406 }
 407
 408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 409                         struct lookup_intent *it)
 410 {
 411         struct ll_inode_info *lli = ll_i2info(inode);
 412         struct cl_object *obj = lli->lli_clob;
 413         struct address_space *mapping = inode->i_mapping;
 414         struct page *vmpage;
 415         struct niobuf_remote *rnb;
 416         char *data;
 417         struct lustre_handle lockh;
 418         struct ldlm_lock *lock;
 419         unsigned long index, start;
 420         struct niobuf_local lnb;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
 431                 if (lock != NULL)
 432                         dom_lock = ldlm_has_dom(lock);
 433                 LDLM_LOCK_PUT(lock);
 434         }
 435         if (!dom_lock)
 436                 RETURN_EXIT;
 437
 438         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 439                                    RCL_SERVER))
 440                 RETURN_EXIT;
 441
 442         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 443         if (rnb == NULL || rnb->rnb_len == 0)
 444                 RETURN_EXIT;
 445
 446         CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
 447                rnb->rnb_len, i_size_read(inode));
 448
 449         data = (char *)rnb + sizeof(*rnb);
 450
 451         lnb.lnb_file_offset = rnb->rnb_offset;
 452         start = lnb.lnb_file_offset / PAGE_SIZE;
 453         index = 0;
 454         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 455         lnb.lnb_page_offset = 0;
 456         do {
 457                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 458                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 459                 if (lnb.lnb_len > PAGE_SIZE)
 460                         lnb.lnb_len = PAGE_SIZE;
 461
 462                 vmpage = read_cache_page(mapping, index + start,
 463                                          ll_dom_readpage, &lnb);
 464                 if (IS_ERR(vmpage)) {
 465                         CWARN("%s: cannot fill page %lu for "DFID
 466                               " with data: rc = %li\n",
 467                               ll_get_fsname(inode->i_sb, NULL, 0),
 468                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 469                               PTR_ERR(vmpage));
 470                         break;
 471                 }
 472                 put_page(vmpage);
 473                 index++;
 474         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 475         EXIT;
 476 }
 477
 478 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 479                                 struct lookup_intent *itp)
 480 {
 481         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 482         struct dentry *parent = de->d_parent;
 483         const char *name = NULL;
 484         int len = 0;
 485         struct md_op_data *op_data;
 486         struct ptlrpc_request *req = NULL;
 487         int rc;
 488         ENTRY;
 489
 490         LASSERT(parent != NULL);
 491         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 492
 493         /* if server supports open-by-fid, or file name is invalid, don't pack
 494          * name in open request */
 495         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 496             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 497                 name = de->d_name.name;
 498                 len = de->d_name.len;
 499         }
 500
 501         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 502                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 503         if (IS_ERR(op_data))
 504                 RETURN(PTR_ERR(op_data));
 505         op_data->op_data = lmm;
 506         op_data->op_data_size = lmmsize;
 507
 508         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 509                             &ll_md_blocking_ast, 0);
 510         ll_finish_md_op_data(op_data);
 511         if (rc == -ESTALE) {
 512                 /* reason for keep own exit path - don`t flood log
 513                  * with messages with -ESTALE errors.
 514                  */
 515                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 516                      it_open_error(DISP_OPEN_OPEN, itp))
 517                         GOTO(out, rc);
 518                 ll_release_openhandle(de, itp);
 519                 GOTO(out, rc);
 520         }
 521
 522         if (it_disposition(itp, DISP_LOOKUP_NEG))
 523                 GOTO(out, rc = -ENOENT);
 524
 525         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 526                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 527                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 528                 GOTO(out, rc);
 529         }
 530
 531         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 532
 533         if (!rc && itp->it_lock_mode) {
 534                 ll_dom_finish_open(de->d_inode, req, itp);
 535                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 536         }
 537
 538 out:
 539         ptlrpc_req_finished(req);
 540         ll_intent_drop_lock(itp);
 541
 542         /* We did open by fid, but by the time we got to the server,
 543          * the object disappeared. If this is a create, we cannot really
 544          * tell the userspace that the file it was trying to create
 545          * does not exist. Instead let's return -ESTALE, and the VFS will
 546          * retry the create with LOOKUP_REVAL that we are going to catch
 547          * in ll_revalidate_dentry() and use lookup then.
 548          */
 549         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 550                 rc = -ESTALE;
 551
 552         RETURN(rc);
 553 }
 554
 555 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 556                        struct obd_client_handle *och)
 557 {
 558         struct mdt_body *body;
 559
 560         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 561         och->och_open_handle = body->mbo_open_handle;
 562         och->och_fid = body->mbo_fid1;
 563         och->och_lease_handle.cookie = it->it_lock_handle;
 564         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 565         och->och_flags = it->it_flags;
 566
 567         return md_set_open_replay_data(md_exp, och, it);
 568 }
 569
 570 static int ll_local_open(struct file *file, struct lookup_intent *it,
 571                          struct ll_file_data *fd, struct obd_client_handle *och)
 572 {
 573         struct inode *inode = file_inode(file);
 574         ENTRY;
 575
 576         LASSERT(!LUSTRE_FPRIVATE(file));
 577
 578         LASSERT(fd != NULL);
 579
 580         if (och) {
 581                 int rc;
 582
 583                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 584                 if (rc != 0)
 585                         RETURN(rc);
 586         }
 587
 588         LUSTRE_FPRIVATE(file) = fd;
 589         ll_readahead_init(inode, &fd->fd_ras);
 590         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 591
 592         /* ll_cl_context initialize */
 593         rwlock_init(&fd->fd_lock);
 594         INIT_LIST_HEAD(&fd->fd_lccs);
 595
 596         RETURN(0);
 597 }
 598
 599 /* Open a file, and (for the very first open) create objects on the OSTs at
 600  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 601  * creation or open until ll_lov_setstripe() ioctl is called.
 602  *
 603  * If we already have the stripe MD locally then we don't request it in
 604  * md_open(), by passing a lmm_size = 0.
 605  *
 606  * It is up to the application to ensure no other processes open this file
 607  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 608  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 609  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 610  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 611  */
 612 int ll_file_open(struct inode *inode, struct file *file)
 613 {
 614         struct ll_inode_info *lli = ll_i2info(inode);
 615         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 616                                           .it_flags = file->f_flags };
 617         struct obd_client_handle **och_p = NULL;
 618         __u64 *och_usecount = NULL;
 619         struct ll_file_data *fd;
 620         int rc = 0;
 621         ENTRY;
 622
 623         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 624                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 625
 626         it = file->private_data; /* XXX: compat macro */
 627         file->private_data = NULL; /* prevent ll_local_open assertion */
 628
 629         fd = ll_file_data_get();
 630         if (fd == NULL)
 631                 GOTO(out_nofiledata, rc = -ENOMEM);
 632
 633         fd->fd_file = file;
 634         if (S_ISDIR(inode->i_mode))
 635                 ll_authorize_statahead(inode, fd);
 636
 637         if (inode->i_sb->s_root == file_dentry(file)) {
 638                 LUSTRE_FPRIVATE(file) = fd;
 639                 RETURN(0);
 640         }
 641
 642         if (!it || !it->it_disposition) {
 643                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 644                  * because everything but O_ACCMODE mask was stripped from
 645                  * there */
 646                 if ((oit.it_flags + 1) & O_ACCMODE)
 647                         oit.it_flags++;
 648                 if (file->f_flags & O_TRUNC)
 649                         oit.it_flags |= FMODE_WRITE;
 650
 651                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 652                  * dentry_open after call to open_namei that checks permissions.
 653                  * Only nfsd_open call dentry_open directly without checking
 654                  * permissions and because of that this code below is safe.
 655                  */
 656                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 657                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 658
 659                 /* We do not want O_EXCL here, presumably we opened the file
 660                  * already? XXX - NFS implications? */
 661                 oit.it_flags &= ~O_EXCL;
 662
 663                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 664                  * created if necessary, then "IT_CREAT" should be set to keep
 665                  * consistent with it */
 666                 if (oit.it_flags & O_CREAT)
 667                         oit.it_op |= IT_CREAT;
 668
 669                 it = &oit;
 670         }
 671
 672 restart:
 673         /* Let's see if we have file open on MDS already. */
 674         if (it->it_flags & FMODE_WRITE) {
 675                 och_p = &lli->lli_mds_write_och;
 676                 och_usecount = &lli->lli_open_fd_write_count;
 677         } else if (it->it_flags & FMODE_EXEC) {
 678                 och_p = &lli->lli_mds_exec_och;
 679                 och_usecount = &lli->lli_open_fd_exec_count;
 680          } else {
 681                 och_p = &lli->lli_mds_read_och;
 682                 och_usecount = &lli->lli_open_fd_read_count;
 683         }
 684
 685         mutex_lock(&lli->lli_och_mutex);
 686         if (*och_p) { /* Open handle is present */
 687                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 688                         /* Well, there's extra open request that we do not need,
 689                            let's close it somehow. This will decref request. */
 690                         rc = it_open_error(DISP_OPEN_OPEN, it);
 691                         if (rc) {
 692                                 mutex_unlock(&lli->lli_och_mutex);
 693                                 GOTO(out_openerr, rc);
 694                         }
 695
 696                         ll_release_openhandle(file_dentry(file), it);
 697                 }
 698                 (*och_usecount)++;
 699
 700                 rc = ll_local_open(file, it, fd, NULL);
 701                 if (rc) {
 702                         (*och_usecount)--;
 703                         mutex_unlock(&lli->lli_och_mutex);
 704                         GOTO(out_openerr, rc);
 705                 }
 706         } else {
 707                 LASSERT(*och_usecount == 0);
 708                 if (!it->it_disposition) {
 709                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 710                         /* We cannot just request lock handle now, new ELC code
 711                            means that one of other OPEN locks for this file
 712                            could be cancelled, and since blocking ast handler
 713                            would attempt to grab och_mutex as well, that would
 714                            result in a deadlock */
 715                         mutex_unlock(&lli->lli_och_mutex);
 716                         /*
 717                          * Normally called under two situations:
 718                          * 1. NFS export.
 719                          * 2. A race/condition on MDS resulting in no open
 720                          *    handle to be returned from LOOKUP|OPEN request,
 721                          *    for example if the target entry was a symlink.
 722                          *
 723                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 724                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 725                          *  bit so that it's not confusing later callers.
 726                          *
 727                          *  NB; when ldd is NULL, it must have come via normal
 728                          *  lookup path only, since ll_iget_for_nfs always calls
 729                          *  ll_d_init().
 730                          */
 731                         if (ldd && ldd->lld_nfs_dentry) {
 732                                 ldd->lld_nfs_dentry = 0;
 733                                 it->it_flags |= MDS_OPEN_LOCK;
 734                         }
 735
 736                          /*
 737                          * Always specify MDS_OPEN_BY_FID because we don't want
 738                          * to get file with different fid.
 739                          */
 740                         it->it_flags |= MDS_OPEN_BY_FID;
 741                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 742                                                  it);
 743                         if (rc)
 744                                 GOTO(out_openerr, rc);
 745
 746                         goto restart;
 747                 }
 748                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 749                 if (!*och_p)
 750                         GOTO(out_och_free, rc = -ENOMEM);
 751
 752                 (*och_usecount)++;
 753
 754                 /* md_intent_lock() didn't get a request ref if there was an
 755                  * open error, so don't do cleanup on the request here
 756                  * (bug 3430) */
 757                 /* XXX (green): Should not we bail out on any error here, not
 758                  * just open error? */
 759                 rc = it_open_error(DISP_OPEN_OPEN, it);
 760                 if (rc != 0)
 761                         GOTO(out_och_free, rc);
 762
 763                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 764                          "inode %p: disposition %x, status %d\n", inode,
 765                          it_disposition(it, ~0), it->it_status);
 766
 767                 rc = ll_local_open(file, it, fd, *och_p);
 768                 if (rc)
 769                         GOTO(out_och_free, rc);
 770         }
 771         mutex_unlock(&lli->lli_och_mutex);
 772         fd = NULL;
 773
 774         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 775            different kind of OPEN lock for this same inode gets cancelled
 776            by ldlm_cancel_lru */
 777         if (!S_ISREG(inode->i_mode))
 778                 GOTO(out_och_free, rc);
 779
 780         cl_lov_delay_create_clear(&file->f_flags);
 781         GOTO(out_och_free, rc);
 782
 783 out_och_free:
 784         if (rc) {
 785                 if (och_p && *och_p) {
 786                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 787                         *och_p = NULL; /* OBD_FREE writes some magic there */
 788                         (*och_usecount)--;
 789                 }
 790                 mutex_unlock(&lli->lli_och_mutex);
 791
 792 out_openerr:
 793                 if (lli->lli_opendir_key == fd)
 794                         ll_deauthorize_statahead(inode, fd);
 795                 if (fd != NULL)
 796                         ll_file_data_put(fd);
 797         } else {
 798                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 799         }
 800
 801 out_nofiledata:
 802         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 803                 ptlrpc_req_finished(it->it_request);
 804                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 805         }
 806
 807         return rc;
 808 }
 809
 810 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 811                         struct ldlm_lock_desc *desc, void *data, int flag)
 812 {
 813         int rc;
 814         struct lustre_handle lockh;
 815         ENTRY;
 816
 817         switch (flag) {
 818         case LDLM_CB_BLOCKING:
 819                 ldlm_lock2handle(lock, &lockh);
 820                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 821                 if (rc < 0) {
 822                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 823                         RETURN(rc);
 824                 }
 825                 break;
 826         case LDLM_CB_CANCELING:
 827                 /* do nothing */
 828                 break;
 829         }
 830         RETURN(0);
 831 }
 832
 833 /**
 834  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 835  * and save it as fd->fd_och so as to force client to reopen the file even
 836  * if it has an open lock in cache already.
 837  */
 838 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 839                                 struct lustre_handle *old_open_handle)
 840 {
 841         struct ll_inode_info *lli = ll_i2info(inode);
 842         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 843         struct obd_client_handle **och_p;
 844         __u64 *och_usecount;
 845         int rc = 0;
 846         ENTRY;
 847
 848         /* Get the openhandle of the file */
 849         mutex_lock(&lli->lli_och_mutex);
 850         if (fd->fd_lease_och != NULL)
 851                 GOTO(out_unlock, rc = -EBUSY);
 852
 853         if (fd->fd_och == NULL) {
 854                 if (file->f_mode & FMODE_WRITE) {
 855                         LASSERT(lli->lli_mds_write_och != NULL);
 856                         och_p = &lli->lli_mds_write_och;
 857                         och_usecount = &lli->lli_open_fd_write_count;
 858                 } else {
 859                         LASSERT(lli->lli_mds_read_och != NULL);
 860                         och_p = &lli->lli_mds_read_och;
 861                         och_usecount = &lli->lli_open_fd_read_count;
 862                 }
 863
 864                 if (*och_usecount > 1)
 865                         GOTO(out_unlock, rc = -EBUSY);
 866
 867                 fd->fd_och = *och_p;
 868                 *och_usecount = 0;
 869                 *och_p = NULL;
 870         }
 871
 872         *old_open_handle = fd->fd_och->och_open_handle;
 873
 874         EXIT;
 875 out_unlock:
 876         mutex_unlock(&lli->lli_och_mutex);
 877         return rc;
 878 }
 879
 880 /**
 881  * Release ownership on lli_mds_*_och when putting back a file lease.
 882  */
 883 static int ll_lease_och_release(struct inode *inode, struct file *file)
 884 {
 885         struct ll_inode_info *lli = ll_i2info(inode);
 886         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 887         struct obd_client_handle **och_p;
 888         struct obd_client_handle *old_och = NULL;
 889         __u64 *och_usecount;
 890         int rc = 0;
 891         ENTRY;
 892
 893         mutex_lock(&lli->lli_och_mutex);
 894         if (file->f_mode & FMODE_WRITE) {
 895                 och_p = &lli->lli_mds_write_och;
 896                 och_usecount = &lli->lli_open_fd_write_count;
 897         } else {
 898                 och_p = &lli->lli_mds_read_och;
 899                 och_usecount = &lli->lli_open_fd_read_count;
 900         }
 901
 902         /* The file may have been open by another process (broken lease) so
 903          * *och_p is not NULL. In this case we should simply increase usecount
 904          * and close fd_och.
 905          */
 906         if (*och_p != NULL) {
 907                 old_och = fd->fd_och;
 908                 (*och_usecount)++;
 909         } else {
 910                 *och_p = fd->fd_och;
 911                 *och_usecount = 1;
 912         }
 913         fd->fd_och = NULL;
 914         mutex_unlock(&lli->lli_och_mutex);
 915
 916         if (old_och != NULL)
 917                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 918
 919         RETURN(rc);
 920 }
 921
/**
 * Acquire a lease and open the file.
 *
 * \param[in] inode       inode to take the lease on
 * \param[in] file        open file whose existing open handle is claimed via
 *                        ll_lease_och_acquire(); may be NULL, in which case
 *                        that step is skipped
 * \param[in] fmode       lease mode, must be exactly FMODE_READ or FMODE_WRITE
 * \param[in] open_flags  extra flags OR-ed into the intent's it_flags
 *
 * \retval newly allocated open handle on success
 * \retval ERR_PTR(-EINVAL)     \a fmode is neither FMODE_READ nor FMODE_WRITE
 * \retval ERR_PTR(-EPERM)      \a fmode is not part of file->f_mode, or the
 *                              file was opened with FMODE_EXEC
 * \retval ERR_PTR(-ENOMEM)     the open handle could not be allocated
 * \retval ERR_PTR(-ENOENT)     the intent reports a negative lookup
 * \retval ERR_PTR(-EOPNOTSUPP) the reply carries no DISP_OPEN_LEASE
 * \retval ERR_PTR(-EPROTO)     lease granted without the expected open lock
 * \retval ERR_PTR(other)       error from ll_lease_och_acquire(),
 *                              ll_prep_md_op_data(), md_intent_lock() or
 *                              it_open_error()
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
              __u64 open_flags)
{
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lustre_handle old_open_handle = { 0 };
        struct obd_client_handle *och = NULL;
        int rc;
        int rc2;
        ENTRY;

        /* exactly one of read or write must be requested */
        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
                RETURN(ERR_PTR(-EINVAL));

        if (file != NULL) {
                /* the lease mode must be covered by the mode the file was
                 * opened with, and files opened for exec are refused */
                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                        RETURN(ERR_PTR(-EPERM));

                rc = ll_lease_och_acquire(inode, file, &old_open_handle);
                if (rc)
                        RETURN(ERR_PTR(rc));
        }

        OBD_ALLOC_PTR(och);
        if (och == NULL)
                RETURN(ERR_PTR(-ENOMEM));

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
                                        LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out, rc = PTR_ERR(op_data));

        /* To tell the MDT this openhandle is from the same owner */
        op_data->op_open_handle = old_open_handle;

        it.it_flags = fmode | open_flags;
        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_lease_ast,
        /* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
         * otherwise it can be cancelled, which may mislead applications into
         * believing that the lease is broken;
         * LDLM_FL_EXCL: set this flag so that the lock won't be matched by a
         * normal open in ll_md_blocking_ast(). Otherwise, since
         * ll_md_blocking_lease_ast() doesn't deal with the openhandle, the
         * normal openhandle would be leaked. */
                            LDLM_FL_NO_LRU | LDLM_FL_EXCL);
        /* op_data and the request are released regardless of the outcome */
        ll_finish_md_op_data(op_data);
        ptlrpc_req_finished(req);
        if (rc < 0)
                GOTO(out_release_it, rc);

        if (it_disposition(&it, DISP_LOOKUP_NEG))
                GOTO(out_release_it, rc = -ENOENT);

        rc = it_open_error(DISP_OPEN_OPEN, &it);
        if (rc)
                GOTO(out_release_it, rc);

        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
        ll_och_fill(sbi->ll_md_exp, &it, och);

        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
                GOTO(out_close, rc = -EOPNOTSUPP);

        /* the lease has been granted; now handle the lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
        if (it.it_lock_mode == 0 ||
            it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* an open lock must be returned together with the lease */
                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
                        PFID(ll_inode2fid(inode)), it.it_lock_mode,
                        it.it_lock_bits);
                GOTO(out_close, rc = -EPROTO);
        }

        ll_intent_release(&it);
        RETURN(och);

out_close:
        /* Cancel open lock */
        if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
                                            it.it_lock_mode);
                /* clear the mode and the cookie now that the lock reference
                 * has been dropped here */
                it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;
        }
        rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&ll_i2info(inode)->lli_fid), rc2);
        och = NULL; /* och has been freed in ll_close_inode_openhandle() */
out_release_it:
        ll_intent_release(&it);
out:
        /* och is still owned here unless out_close already handed it over */
        if (och != NULL)
                OBD_FREE_PTR(och);
        RETURN(ERR_PTR(rc));
}
1027
1028 /**
1029  * Check whether a layout swap can be done between two inodes.
1030  *
1031  * \param[in] inode1  First inode to check
1032  * \param[in] inode2  Second inode to check
1033  *
1034  * \retval 0 on success, layout swap can be performed between both inodes
1035  * \retval negative error code if requirements are not met
1036  */
1037 static int ll_check_swap_layouts_validity(struct inode *inode1,
1038                                           struct inode *inode2)
1039 {
1040         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1041                 return -EINVAL;
1042
1043         if (inode_permission(inode1, MAY_WRITE) ||
1044             inode_permission(inode2, MAY_WRITE))
1045                 return -EPERM;
1046
1047         if (inode1->i_sb != inode2->i_sb)
1048                 return -EXDEV;
1049
1050         return 0;
1051 }
1052
1053 static int ll_swap_layouts_close(struct obd_client_handle *och,
1054                                  struct inode *inode, struct inode *inode2)
1055 {
1056         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1057         const struct lu_fid     *fid2;
1058         int                      rc;
1059         ENTRY;
1060
1061         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1062                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1063
1064         rc = ll_check_swap_layouts_validity(inode, inode2);
1065         if (rc < 0)
1066                 GOTO(out_free_och, rc);
1067
1068         /* We now know that inode2 is a lustre inode */
1069         fid2 = ll_inode2fid(inode2);
1070
1071         rc = lu_fid_cmp(fid1, fid2);
1072         if (rc == 0)
1073                 GOTO(out_free_och, rc = -EINVAL);
1074
1075         /* Close the file and {swap,merge} layouts between inode & inode2.
1076          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1077          * because we still need it to pack l_remote_handle to MDT. */
1078         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1079                                        inode2);
1080
1081         och = NULL; /* freed in ll_close_inode_openhandle() */
1082
1083 out_free_och:
1084         if (och != NULL)
1085                 OBD_FREE_PTR(och);
1086
1087         RETURN(rc);
1088 }
1089
1090 /**
1091  * Release lease and close the file.
1092  * It will check if the lease has ever broken.
1093  */
1094 static int ll_lease_close_intent(struct obd_client_handle *och,
1095                                  struct inode *inode,
1096                                  bool *lease_broken, enum mds_op_bias bias,
1097                                  void *data)
1098 {
1099         struct ldlm_lock *lock;
1100         bool cancelled = true;
1101         int rc;
1102         ENTRY;
1103
1104         lock = ldlm_handle2lock(&och->och_lease_handle);
1105         if (lock != NULL) {
1106                 lock_res_and_lock(lock);
1107                 cancelled = ldlm_is_cancel(lock);
1108                 unlock_res_and_lock(lock);
1109                 LDLM_LOCK_PUT(lock);
1110         }
1111
1112         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1113                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1114
1115         if (lease_broken != NULL)
1116                 *lease_broken = cancelled;
1117
1118         if (!cancelled && !bias)
1119                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1120
1121         if (cancelled) { /* no need to excute intent */
1122                 bias = 0;
1123                 data = NULL;
1124         }
1125
1126         rc = ll_close_inode_openhandle(inode, och, bias, data);
1127         RETURN(rc);
1128 }
1129
1130 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1131                           bool *lease_broken)
1132 {
1133         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1134 }
1135
1136 /**
1137  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1138  */
1139 static int ll_lease_file_resync(struct obd_client_handle *och,
1140                                 struct inode *inode)
1141 {
1142         struct ll_sb_info *sbi = ll_i2sbi(inode);
1143         struct md_op_data *op_data;
1144         __u64 data_version_unused;
1145         int rc;
1146         ENTRY;
1147
1148         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1149                                      LUSTRE_OPC_ANY, NULL);
1150         if (IS_ERR(op_data))
1151                 RETURN(PTR_ERR(op_data));
1152
1153         /* before starting file resync, it's necessary to clean up page cache
1154          * in client memory, otherwise once the layout version is increased,
1155          * writing back cached data will be denied the OSTs. */
1156         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1157         if (rc)
1158                 GOTO(out, rc);
1159
1160         op_data->op_lease_handle = och->och_lease_handle;
1161         rc = md_file_resync(sbi->ll_md_exp, op_data);
1162         if (rc)
1163                 GOTO(out, rc);
1164
1165         EXIT;
1166 out:
1167         ll_finish_md_op_data(op_data);
1168         return rc;
1169 }
1170
1171 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1172 {
1173         struct ll_inode_info *lli = ll_i2info(inode);
1174         struct cl_object *obj = lli->lli_clob;
1175         struct cl_attr *attr = vvp_env_thread_attr(env);
1176         s64 atime;
1177         s64 mtime;
1178         s64 ctime;
1179         int rc = 0;
1180
1181         ENTRY;
1182
1183         ll_inode_size_lock(inode);
1184
1185         /* Merge timestamps the most recently obtained from MDS with
1186          * timestamps obtained from OSTs.
1187          *
1188          * Do not overwrite atime of inode because it may be refreshed
1189          * by file_accessed() function. If the read was served by cache
1190          * data, there is no RPC to be sent so that atime may not be
1191          * transferred to OSTs at all. MDT only updates atime at close time
1192          * if it's at least 'mdd.*.atime_diff' older.
1193          * All in all, the atime in Lustre does not strictly comply with
1194          * POSIX. Solving this problem needs to send an RPC to MDT for each
1195          * read, this will hurt performance. */
1196         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1197                 LTIME_S(inode->i_atime) = lli->lli_atime;
1198                 lli->lli_update_atime = 0;
1199         }
1200         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1201         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1202
1203         atime = LTIME_S(inode->i_atime);
1204         mtime = LTIME_S(inode->i_mtime);
1205         ctime = LTIME_S(inode->i_ctime);
1206
1207         cl_object_attr_lock(obj);
1208         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1209                 rc = -EINVAL;
1210         else
1211                 rc = cl_object_attr_get(env, obj, attr);
1212         cl_object_attr_unlock(obj);
1213
1214         if (rc != 0)
1215                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1216
1217         if (atime < attr->cat_atime)
1218                 atime = attr->cat_atime;
1219
1220         if (ctime < attr->cat_ctime)
1221                 ctime = attr->cat_ctime;
1222
1223         if (mtime < attr->cat_mtime)
1224                 mtime = attr->cat_mtime;
1225
1226         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1227                PFID(&lli->lli_fid), attr->cat_size);
1228
1229         i_size_write(inode, attr->cat_size);
1230         inode->i_blocks = attr->cat_blocks;
1231
1232         LTIME_S(inode->i_atime) = atime;
1233         LTIME_S(inode->i_mtime) = mtime;
1234         LTIME_S(inode->i_ctime) = ctime;
1235
1236 out_size_unlock:
1237         ll_inode_size_unlock(inode);
1238
1239         RETURN(rc);
1240 }
1241
1242 /**
1243  * Set designated mirror for I/O.
1244  *
1245  * So far only read, write, and truncated can support to issue I/O to
1246  * designated mirror.
1247  */
1248 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1249 {
1250         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1251
1252         /* clear layout version for generic(non-resync) I/O in case it carries
1253          * stale layout version due to I/O restart */
1254         io->ci_layout_version = 0;
1255
1256         /* FLR: disable non-delay for designated mirror I/O because obviously
1257          * only one mirror is available */
1258         if (fd->fd_designated_mirror > 0) {
1259                 io->ci_ndelay = 0;
1260                 io->ci_designated_mirror = fd->fd_designated_mirror;
1261                 io->ci_layout_version = fd->fd_layout_version;
1262                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1263                                  * io to ptasks */
1264         }
1265
1266         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1267                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1268 }
1269
1270 static bool file_is_noatime(const struct file *file)
1271 {
1272         const struct vfsmount *mnt = file->f_path.mnt;
1273         const struct inode *inode = file_inode((struct file *)file);
1274
1275         /* Adapted from file_accessed() and touch_atime().*/
1276         if (file->f_flags & O_NOATIME)
1277                 return true;
1278
1279         if (inode->i_flags & S_NOATIME)
1280                 return true;
1281
1282         if (IS_NOATIME(inode))
1283                 return true;
1284
1285         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1286                 return true;
1287
1288         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1289                 return true;
1290
1291         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1292                 return true;
1293
1294         return false;
1295 }
1296
1297 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1298
1299 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1300 {
1301         struct inode *inode = file_inode(file);
1302         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1303
1304         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1305         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1306         io->u.ci_rw.rw_file = file;
1307         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1308         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1309         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1310
1311         if (iot == CIT_WRITE) {
1312                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1313                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1314                                            file->f_flags & O_DIRECT ||
1315                                            IS_SYNC(inode));
1316         }
1317         io->ci_obj = ll_i2info(inode)->lli_clob;
1318         io->ci_lockreq = CILR_MAYBE;
1319         if (ll_file_nolock(file)) {
1320                 io->ci_lockreq = CILR_NEVER;
1321                 io->ci_no_srvlock = 1;
1322         } else if (file->f_flags & O_APPEND) {
1323                 io->ci_lockreq = CILR_MANDATORY;
1324         }
1325         io->ci_noatime = file_is_noatime(file);
1326         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1327                 io->ci_pio = !io->u.ci_rw.rw_append;
1328         else
1329                 io->ci_pio = 0;
1330
1331         /* FLR: only use non-delay I/O for read as there is only one
1332          * avaliable mirror for write. */
1333         io->ci_ndelay = !(iot == CIT_WRITE);
1334
1335         ll_io_set_mirror(io, file);
1336 }
1337
1338 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1339 {
1340         struct cl_io_pt *pt = ptask->pt_cbdata;
1341         struct file *file = pt->cip_file;
1342         struct lu_env *env;
1343         struct cl_io *io;
1344         loff_t pos = pt->cip_pos;
1345         int rc;
1346         __u16 refcheck;
1347         ENTRY;
1348
1349         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1350                 file_dentry(file)->d_name.name,
1351                 pt->cip_iot == CIT_READ ? "read" : "write",
1352                 pos, pos + pt->cip_count);
1353
1354         env = cl_env_get(&refcheck);
1355         if (IS_ERR(env))
1356                 RETURN(PTR_ERR(env));
1357
1358         io = vvp_env_thread_io(env);
1359         ll_io_init(io, file, pt->cip_iot);
1360         io->u.ci_rw.rw_iter = pt->cip_iter;
1361         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1362         io->ci_pio = 0; /* It's already in parallel task */
1363
1364         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1365                            pt->cip_count - pt->cip_result);
1366         if (!rc) {
1367                 struct vvp_io *vio = vvp_env_io(env);
1368
1369                 vio->vui_io_subtype = IO_NORMAL;
1370                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1371
1372                 ll_cl_add(file, env, io, LCC_RW);
1373                 rc = cl_io_loop(env, io);
1374                 ll_cl_remove(file, env);
1375         } else {
1376                 /* cl_io_rw_init() handled IO */
1377                 rc = io->ci_result;
1378         }
1379
1380         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1381                 if (io->ci_nob > 0)
1382                         io->ci_nob /= 2;
1383                 rc = -EIO;
1384         }
1385
1386         if (io->ci_nob > 0) {
1387                 pt->cip_result += io->ci_nob;
1388                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1389                 pos += io->ci_nob;
1390                 pt->cip_iocb.ki_pos = pos;
1391 #ifdef HAVE_KIOCB_KI_LEFT
1392                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1393 #elif defined(HAVE_KI_NBYTES)
1394                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1395 #endif
1396         }
1397
1398         cl_io_fini(env, io);
1399         cl_env_put(env, &refcheck);
1400
1401         pt->cip_need_restart = io->ci_need_restart;
1402
1403         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1404                 file_dentry(file)->d_name.name,
1405                 pt->cip_iot == CIT_READ ? "read" : "write",
1406                 pt->cip_result, rc);
1407
1408         RETURN(pt->cip_result > 0 ? 0 : rc);
1409 }
1410
/**
 * Common read/write path: set up a cl_io for \a file, run it, and restart it
 * for the remaining range while the I/O asks for a restart.
 *
 * \param[in]     env    execution environment
 * \param[in]     args   I/O arguments; via_io_subtype selects between the
 *                       normal (iterator + iocb) and the splice variant
 * \param[in]     file   file the I/O is issued on
 * \param[in]     iot    CIT_READ or CIT_WRITE
 * \param[in,out] ppos   file position; updated to the position reached
 * \param[in]     count  number of bytes requested
 *
 * \retval number of bytes transferred if any data was transferred
 * \retval result of the I/O (0 or a negative error) otherwise
 */
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                   struct file *file, enum cl_io_type iot,
                   loff_t *ppos, size_t count)
{
        struct range_lock       range;
        struct vvp_io           *vio = vvp_env_io(env);
        struct inode            *inode = file_inode(file);
        struct ll_inode_info    *lli = ll_i2info(inode);
        struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
        struct cl_io            *io;
        loff_t                  pos = *ppos;
        ssize_t                 result = 0;
        int                     rc = 0;
        unsigned                retried = 0;
        bool                    restarted = false;

        ENTRY;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
                file_dentry(file)->d_name.name,
                iot == CIT_READ ? "read" : "write", pos, pos + count);

restart:
        /* each pass re-initializes the io for the remaining [pos, pos+count) */
        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot);
        if (args->via_io_subtype == IO_NORMAL) {
                io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
                io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
        }
        /* parallel I/O is only attempted on the first pass of normal I/O */
        if (args->via_io_subtype != IO_NORMAL || restarted)
                io->ci_pio = 0;
        io->ci_ndelay_tried = retried;

        if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
                bool range_locked = false;

                /* an append may land anywhere, so it covers the whole file */
                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                else
                        range_lock_init(&range, pos, pos + count - 1);

                vio->vui_fd  = LUSTRE_FPRIVATE(file);
                vio->vui_io_subtype = args->via_io_subtype;

                switch (vio->vui_io_subtype) {
                case IO_NORMAL:
                        /* Direct IO reads must also take range lock,
                         * or multiple reads will try to work on the same pages
                         * See LU-6227 for details. */
                        if (((iot == CIT_WRITE) ||
                            (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
                            !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                                CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
                                       RL_PARA(&range));
                                rc = range_lock(&lli->lli_write_tree, &range);
                                if (rc < 0)
                                        GOTO(out, rc);

                                range_locked = true;
                        }
                        break;
                case IO_SPLICE:
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        vio->u.splice.vui_flags = args->u.splice.via_flags;
                        break;
                default:
                        CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
                        LBUG();
                }

                ll_cl_add(file, env, io, LCC_RW);
                /* for a parallel write on an inode without S_NOSEC, take the
                 * inode lock here unless lli_inode_locked says it is already
                 * held */
                if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
                    !lli->lli_inode_locked) {
                        inode_lock(inode);
                        lli->lli_inode_locked = 1;
                }
                rc = cl_io_loop(env, io);
                if (lli->lli_inode_locked) {
                        lli->lli_inode_locked = 0;
                        inode_unlock(inode);
                }
                ll_cl_remove(file, env);

                if (range_locked) {
                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
                        range_unlock(&lli->lli_write_tree, &range);
                }
        } else {
                /* cl_io_rw_init() handled IO */
                rc = io->ci_result;
        }

        /* account for what this pass transferred and advance the caller's
         * iterator/iocb so that a restart continues where this pass ended */
        if (io->ci_nob > 0) {
                result += io->ci_nob;
                count  -= io->ci_nob;

                if (args->via_io_subtype == IO_NORMAL) {
                        iov_iter_advance(args->u.normal.via_iter, io->ci_nob);

                        /* CLIO is too complicated. See LU-11069. */
                        if (cl_io_is_append(io))
                                pos = io->u.ci_rw.rw_iocb.ki_pos;
                        else
                                pos += io->ci_nob;

                        args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
                        args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
                        args->u.normal.via_iocb->ki_nbytes = count;
#endif
                } else {
                        /* for splice */
                        pos = io->u.ci_rw.rw_range.cir_pos;
                }
        }
out:
        cl_io_fini(env, io);

        CDEBUG(D_VFSTRACE,
               "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);

        /* restart only when there was no real error, data is still
         * outstanding and the I/O asked for it */
        if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                        "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
                        file_dentry(file)->d_name.name,
                        iot == CIT_READ ? "read" : "write",
                        pos, pos + count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;
                restarted = true;
                goto restart;
        }

        /* statistics, plus tracking of write failures in the file data */
        if (iot == CIT_READ) {
                if (result > 0)
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_READ_BYTES, result);
        } else if (iot == CIT_WRITE) {
                if (result > 0) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_WRITE_BYTES, result);
                        fd->fd_write_failed = false;
                } else if (result == 0 && rc == 0) {
                        /* nothing written and no error so far: fall back to
                         * the result recorded in the io */
                        rc = io->ci_result;
                        if (rc < 0)
                                fd->fd_write_failed = true;
                        else
                                fd->fd_write_failed = false;
                } else if (rc != -ERESTARTSYS) {
                        /* -ERESTARTSYS leaves fd_write_failed as it was */
                        fd->fd_write_failed = true;
                }
        }

        CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
                file_dentry(file)->d_name.name,
                iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

        *ppos = pos;

        /* bytes transferred take precedence over the status code */
        RETURN(result > 0 ? result : rc);
}
1577
1578 /**
1579  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1580  * especially for small I/O.
1581  *
1582  * To serve a read request, CLIO has to create and initialize a cl_io and
1583  * then request DLM lock. This has turned out to have siginificant overhead
1584  * and affects the performance of small I/O dramatically.
1585  *
1586  * It's not necessary to create a cl_io for each I/O. Under the help of read
1587  * ahead, most of the pages being read are already in memory cache and we can
1588  * read those pages directly because if the pages exist, the corresponding DLM
1589  * lock must exist so that page content must be valid.
1590  *
1591  * In fast read implementation, the llite speculatively finds and reads pages
1592  * in memory cache. There are three scenarios for fast read:
1593  *   - If the page exists and is uptodate, kernel VM will provide the data and
1594  *     CLIO won't be intervened;
1595  *   - If the page was brought into memory by read ahead, it will be exported
1596  *     and read ahead parameters will be updated;
1597  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1598  *     it will go back and invoke normal read, i.e., a cl_io will be created
1599  *     and DLM lock will be requested.
1600  *
1601  * POSIX compliance: posix standard states that read is intended to be atomic.
1602  * Lustre read implementation is in line with Linux kernel read implementation
1603  * and neither of them complies with POSIX standard in this matter. Fast read
1604  * doesn't make the situation worse on single node but it may interleave write
1605  * results from multiple nodes due to short read handling in ll_file_aio_read().
1606  *
1607  * \param env - lu_env
1608  * \param iocb - kiocb from kernel
1609  * \param iter - user space buffers where the data will be copied
1610  *
1611  * \retval - number of bytes have been read, or error code if error occurred.
1612  */
1613 static ssize_t
1614 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1615 {
1616         ssize_t result;
1617
1618         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1619                 return 0;
1620
1621         /* NB: we can't do direct IO for fast read because it will need a lock
1622          * to make IO engine happy. */
1623         if (iocb->ki_filp->f_flags & O_DIRECT)
1624                 return 0;
1625
1626         result = generic_file_read_iter(iocb, iter);
1627
1628         /* If the first page is not in cache, generic_file_aio_read() will be
1629          * returned with -ENODATA.
1630          * See corresponding code in ll_readpage(). */
1631         if (result == -ENODATA)
1632                 result = 0;
1633
1634         if (result > 0)
1635                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1636                                 LPROC_LL_READ_BYTES, result);
1637
1638         return result;
1639 }
1640
1641 /*
1642  * Read from a file (through the page cache).
1643  */
1644 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1645 {
1646         struct lu_env *env;
1647         struct vvp_io_args *args;
1648         ssize_t result;
1649         ssize_t rc2;
1650         __u16 refcheck;
1651
1652         result = ll_do_fast_read(iocb, to);
1653         if (result < 0 || iov_iter_count(to) == 0)
1654                 GOTO(out, result);
1655
1656         env = cl_env_get(&refcheck);
1657         if (IS_ERR(env))
1658                 return PTR_ERR(env);
1659
1660         args = ll_env_args(env, IO_NORMAL);
1661         args->u.normal.via_iter = to;
1662         args->u.normal.via_iocb = iocb;
1663
1664         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1665                                  &iocb->ki_pos, iov_iter_count(to));
1666         if (rc2 > 0)
1667                 result += rc2;
1668         else if (result == 0)
1669                 result = rc2;
1670
1671         cl_env_put(env, &refcheck);
1672 out:
1673         return result;
1674 }
1675
1676 /**
1677  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1678  * If a page is already in the page cache and dirty (and some other things -
1679  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1680  * write to it without doing a full I/O, because Lustre already knows about it
1681  * and will write it out.  This saves a lot of processing time.
1682  *
1683  * All writes here are within one page, so exclusion is handled by the page
1684  * lock on the vm page.  We do not do tiny writes for writes which touch
1685  * multiple pages because it's very unlikely multiple sequential pages are
1686  * are already dirty.
1687  *
1688  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1689  * and are unlikely to be to already dirty pages.
1690  *
1691  * Attribute updates are important here, we do them in ll_tiny_write_end.
1692  */
1693 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1694 {
1695         ssize_t count = iov_iter_count(iter);
1696         struct file *file = iocb->ki_filp;
1697         struct inode *inode = file_inode(file);
1698         ssize_t result = 0;
1699
1700         ENTRY;
1701
1702         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1703          * of function for why.
1704          */
1705         if (count >= PAGE_SIZE ||
1706             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1707                 RETURN(0);
1708
1709         result = __generic_file_write_iter(iocb, iter);
1710
1711         /* If the page is not already dirty, ll_tiny_write_begin returns
1712          * -ENODATA.  We continue on to normal write.
1713          */
1714         if (result == -ENODATA)
1715                 result = 0;
1716
1717         if (result > 0) {
1718                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1719                                    result);
1720                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1721         }
1722
1723         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1724
1725         RETURN(result);
1726 }
1727
1728 /*
1729  * Write to a file (through the page cache).
1730  */
1731 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1732 {
1733         struct vvp_io_args *args;
1734         struct lu_env *env;
1735         ssize_t rc_tiny = 0, rc_normal;
1736         __u16 refcheck;
1737
1738         ENTRY;
1739
1740         /* NB: we can't do direct IO for tiny writes because they use the page
1741          * cache, we can't do sync writes because tiny writes can't flush
1742          * pages, and we can't do append writes because we can't guarantee the
1743          * required DLM locks are held to protect file size.
1744          */
1745         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1746             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1747                 rc_tiny = ll_do_tiny_write(iocb, from);
1748
1749         /* In case of error, go on and try normal write - Only stop if tiny
1750          * write completed I/O.
1751          */
1752         if (iov_iter_count(from) == 0)
1753                 GOTO(out, rc_normal = rc_tiny);
1754
1755         env = cl_env_get(&refcheck);
1756         if (IS_ERR(env))
1757                 return PTR_ERR(env);
1758
1759         args = ll_env_args(env, IO_NORMAL);
1760         args->u.normal.via_iter = from;
1761         args->u.normal.via_iocb = iocb;
1762
1763         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1764                                     &iocb->ki_pos, iov_iter_count(from));
1765
1766         /* On success, combine bytes written. */
1767         if (rc_tiny >= 0 && rc_normal > 0)
1768                 rc_normal += rc_tiny;
1769         /* On error, only return error from normal write if tiny write did not
1770          * write any bytes.  Otherwise return bytes written by tiny write.
1771          */
1772         else if (rc_tiny > 0)
1773                 rc_normal = rc_tiny;
1774
1775         cl_env_put(env, &refcheck);
1776 out:
1777         RETURN(rc_normal);
1778 }
1779
1780 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1781 /*
1782  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1783  */
1784 static int ll_file_get_iov_count(const struct iovec *iov,
1785                                  unsigned long *nr_segs, size_t *count)
1786 {
1787         size_t cnt = 0;
1788         unsigned long seg;
1789
1790         for (seg = 0; seg < *nr_segs; seg++) {
1791                 const struct iovec *iv = &iov[seg];
1792
1793                 /*
1794                  * If any segment has a negative length, or the cumulative
1795                  * length ever wraps negative then return -EINVAL.
1796                  */
1797                 cnt += iv->iov_len;
1798                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1799                         return -EINVAL;
1800                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1801                         continue;
1802                 if (seg == 0)
1803                         return -EFAULT;
1804                 *nr_segs = seg;
1805                 cnt -= iv->iov_len;     /* This segment is no good */
1806                 break;
1807         }
1808         *count = cnt;
1809         return 0;
1810 }
1811
1812 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1813                                 unsigned long nr_segs, loff_t pos)
1814 {
1815         struct iov_iter to;
1816         size_t iov_count;
1817         ssize_t result;
1818         ENTRY;
1819
1820         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1821         if (result)
1822                 RETURN(result);
1823
1824 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1825         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1826 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1827         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1828 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1829
1830         result = ll_file_read_iter(iocb, &to);
1831
1832         RETURN(result);
1833 }
1834
1835 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1836                             loff_t *ppos)
1837 {
1838         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1839         struct kiocb   kiocb;
1840         ssize_t        result;
1841         ENTRY;
1842
1843         init_sync_kiocb(&kiocb, file);
1844         kiocb.ki_pos = *ppos;
1845 #ifdef HAVE_KIOCB_KI_LEFT
1846         kiocb.ki_left = count;
1847 #elif defined(HAVE_KI_NBYTES)
1848         kiocb.i_nbytes = count;
1849 #endif
1850
1851         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1852         *ppos = kiocb.ki_pos;
1853
1854         RETURN(result);
1855 }
1856
1857 /*
1858  * Write to a file (through the page cache).
1859  * AIO stuff
1860  */
1861 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1862                                  unsigned long nr_segs, loff_t pos)
1863 {
1864         struct iov_iter from;
1865         size_t iov_count;
1866         ssize_t result;
1867         ENTRY;
1868
1869         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1870         if (result)
1871                 RETURN(result);
1872
1873 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1874         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1875 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1876         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1877 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1878
1879         result = ll_file_write_iter(iocb, &from);
1880
1881         RETURN(result);
1882 }
1883
1884 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1885                              size_t count, loff_t *ppos)
1886 {
1887         struct iovec   iov = { .iov_base = (void __user *)buf,
1888                                .iov_len = count };
1889         struct kiocb   kiocb;
1890         ssize_t        result;
1891
1892         ENTRY;
1893
1894         init_sync_kiocb(&kiocb, file);
1895         kiocb.ki_pos = *ppos;
1896 #ifdef HAVE_KIOCB_KI_LEFT
1897         kiocb.ki_left = count;
1898 #elif defined(HAVE_KI_NBYTES)
1899         kiocb.ki_nbytes = count;
1900 #endif
1901
1902         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1903         *ppos = kiocb.ki_pos;
1904
1905         RETURN(result);
1906 }
1907 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1908
1909 /*
1910  * Send file content (through pagecache) somewhere with helper
1911  */
1912 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1913                                    struct pipe_inode_info *pipe, size_t count,
1914                                    unsigned int flags)
1915 {
1916         struct lu_env      *env;
1917         struct vvp_io_args *args;
1918         ssize_t             result;
1919         __u16               refcheck;
1920         ENTRY;
1921
1922         env = cl_env_get(&refcheck);
1923         if (IS_ERR(env))
1924                 RETURN(PTR_ERR(env));
1925
1926         args = ll_env_args(env, IO_SPLICE);
1927         args->u.splice.via_pipe = pipe;
1928         args->u.splice.via_flags = flags;
1929
1930         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1931         cl_env_put(env, &refcheck);
1932         RETURN(result);
1933 }
1934
1935 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1936                              __u64 flags, struct lov_user_md *lum, int lum_size)
1937 {
1938         struct lookup_intent oit = {
1939                 .it_op = IT_OPEN,
1940                 .it_flags = flags | MDS_OPEN_BY_FID,
1941         };
1942         int rc;
1943         ENTRY;
1944
1945         ll_inode_size_lock(inode);
1946         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1947         if (rc < 0)
1948                 GOTO(out_unlock, rc);
1949
1950         ll_release_openhandle(dentry, &oit);
1951
1952 out_unlock:
1953         ll_inode_size_unlock(inode);
1954         ll_intent_release(&oit);
1955
1956         RETURN(rc);
1957 }
1958
1959 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1960                              struct lov_mds_md **lmmp, int *lmm_size,
1961                              struct ptlrpc_request **request)
1962 {
1963         struct ll_sb_info *sbi = ll_i2sbi(inode);
1964         struct mdt_body  *body;
1965         struct lov_mds_md *lmm = NULL;
1966         struct ptlrpc_request *req = NULL;
1967         struct md_op_data *op_data;
1968         int rc, lmmsize;
1969
1970         rc = ll_get_default_mdsize(sbi, &lmmsize);
1971         if (rc)
1972                 RETURN(rc);
1973
1974         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1975                                      strlen(filename), lmmsize,
1976                                      LUSTRE_OPC_ANY, NULL);
1977         if (IS_ERR(op_data))
1978                 RETURN(PTR_ERR(op_data));
1979
1980         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1981         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1982         ll_finish_md_op_data(op_data);
1983         if (rc < 0) {
1984                 CDEBUG(D_INFO, "md_getattr_name failed "
1985                        "on %s: rc %d\n", filename, rc);
1986                 GOTO(out, rc);
1987         }
1988
1989         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1990         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1991
1992         lmmsize = body->mbo_eadatasize;
1993
1994         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1995                         lmmsize == 0) {
1996                 GOTO(out, rc = -ENODATA);
1997         }
1998
1999         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2000         LASSERT(lmm != NULL);
2001
2002         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2003             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2004             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2005                 GOTO(out, rc = -EPROTO);
2006
2007         /*
2008          * This is coming from the MDS, so is probably in
2009          * little endian.  We convert it to host endian before
2010          * passing it to userspace.
2011          */
2012         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2013                 int stripe_count;
2014
2015                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2016                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2017                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2018                         if (le32_to_cpu(lmm->lmm_pattern) &
2019                             LOV_PATTERN_F_RELEASED)
2020                                 stripe_count = 0;
2021                 }
2022
2023                 /* if function called for directory - we should
2024                  * avoid swab not existent lsm objects */
2025                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2026                         lustre_swab_lov_user_md_v1(
2027                                         (struct lov_user_md_v1 *)lmm);
2028                         if (S_ISREG(body->mbo_mode))
2029                                 lustre_swab_lov_user_md_objects(
2030                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2031                                     stripe_count);
2032                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2033                         lustre_swab_lov_user_md_v3(
2034                                         (struct lov_user_md_v3 *)lmm);
2035                         if (S_ISREG(body->mbo_mode))
2036                                 lustre_swab_lov_user_md_objects(
2037                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2038                                     stripe_count);
2039                 } else if (lmm->lmm_magic ==
2040                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2041                         lustre_swab_lov_comp_md_v1(
2042                                         (struct lov_comp_md_v1 *)lmm);
2043                 }
2044         }
2045
2046 out:
2047         *lmmp = lmm;
2048         *lmm_size = lmmsize;
2049         *request = req;
2050         return rc;
2051 }
2052
2053 static int ll_lov_setea(struct inode *inode, struct file *file,
2054                         void __user *arg)
2055 {
2056         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2057         struct lov_user_md      *lump;
2058         int                      lum_size = sizeof(struct lov_user_md) +
2059                                             sizeof(struct lov_user_ost_data);
2060         int                      rc;
2061         ENTRY;
2062
2063         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2064                 RETURN(-EPERM);
2065
2066         OBD_ALLOC_LARGE(lump, lum_size);
2067         if (lump == NULL)
2068                 RETURN(-ENOMEM);
2069
2070         if (copy_from_user(lump, arg, lum_size))
2071                 GOTO(out_lump, rc = -EFAULT);
2072
2073         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2074                                       lum_size);
2075         cl_lov_delay_create_clear(&file->f_flags);
2076
2077 out_lump:
2078         OBD_FREE_LARGE(lump, lum_size);
2079         RETURN(rc);
2080 }
2081
2082 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2083 {
2084         struct lu_env   *env;
2085         __u16           refcheck;
2086         int             rc;
2087         ENTRY;
2088
2089         env = cl_env_get(&refcheck);
2090         if (IS_ERR(env))
2091                 RETURN(PTR_ERR(env));
2092
2093         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2094         cl_env_put(env, &refcheck);
2095         RETURN(rc);
2096 }
2097
2098 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2099                             void __user *arg)
2100 {
2101         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2102         struct lov_user_md        *klum;
2103         int                        lum_size, rc;
2104         __u64                      flags = FMODE_WRITE;
2105         ENTRY;
2106
2107         rc = ll_copy_user_md(lum, &klum);
2108         if (rc < 0)
2109                 RETURN(rc);
2110
2111         lum_size = rc;
2112         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2113                                       lum_size);
2114         if (!rc) {
2115                 __u32 gen;
2116
2117                 rc = put_user(0, &lum->lmm_stripe_count);
2118                 if (rc)
2119                         GOTO(out, rc);
2120
2121                 rc = ll_layout_refresh(inode, &gen);
2122                 if (rc)
2123                         GOTO(out, rc);
2124
2125                 rc = ll_file_getstripe(inode, arg, lum_size);
2126         }
2127         cl_lov_delay_create_clear(&file->f_flags);
2128
2129 out:
2130         OBD_FREE(klum, lum_size);
2131         RETURN(rc);
2132 }
2133
2134 static int
2135 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2136 {
2137         struct ll_inode_info *lli = ll_i2info(inode);
2138         struct cl_object *obj = lli->lli_clob;
2139         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2140         struct ll_grouplock grouplock;
2141         int rc;
2142         ENTRY;
2143
2144         if (arg == 0) {
2145                 CWARN("group id for group lock must not be 0\n");
2146                 RETURN(-EINVAL);
2147         }
2148
2149         if (ll_file_nolock(file))
2150                 RETURN(-EOPNOTSUPP);
2151
2152         spin_lock(&lli->lli_lock);
2153         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2154                 CWARN("group lock already existed with gid %lu\n",
2155                       fd->fd_grouplock.lg_gid);
2156                 spin_unlock(&lli->lli_lock);
2157                 RETURN(-EINVAL);
2158         }
2159         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2160         spin_unlock(&lli->lli_lock);
2161
2162         /**
2163          * XXX: group lock needs to protect all OST objects while PFL
2164          * can add new OST objects during the IO, so we'd instantiate
2165          * all OST objects before getting its group lock.
2166          */
2167         if (obj) {
2168                 struct lu_env *env;
2169                 __u16 refcheck;
2170                 struct cl_layout cl = {
2171                         .cl_is_composite = false,
2172                 };
2173                 struct lu_extent ext = {
2174                         .e_start = 0,
2175                         .e_end = OBD_OBJECT_EOF,
2176                 };
2177
2178                 env = cl_env_get(&refcheck);
2179                 if (IS_ERR(env))
2180                         RETURN(PTR_ERR(env));
2181
2182                 rc = cl_object_layout_get(env, obj, &cl);
2183                 if (!rc && cl.cl_is_composite)
2184                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2185                                                     &ext);
2186
2187                 cl_env_put(env, &refcheck);
2188                 if (rc)
2189                         RETURN(rc);
2190         }
2191
2192         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2193                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2194         if (rc)
2195                 RETURN(rc);
2196
2197         spin_lock(&lli->lli_lock);
2198         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2199                 spin_unlock(&lli->lli_lock);
2200                 CERROR("another thread just won the race\n");
2201                 cl_put_grouplock(&grouplock);
2202                 RETURN(-EINVAL);
2203         }
2204
2205         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2206         fd->fd_grouplock = grouplock;
2207         spin_unlock(&lli->lli_lock);
2208
2209         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2210         RETURN(0);
2211 }
2212
2213 static int ll_put_grouplock(struct inode *inode, struct file *file,
2214                             unsigned long arg)
2215 {
2216         struct ll_inode_info   *lli = ll_i2info(inode);
2217         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2218         struct ll_grouplock     grouplock;
2219         ENTRY;
2220
2221         spin_lock(&lli->lli_lock);
2222         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2223                 spin_unlock(&lli->lli_lock);
2224                 CWARN("no group lock held\n");
2225                 RETURN(-EINVAL);
2226         }
2227
2228         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2229
2230         if (fd->fd_grouplock.lg_gid != arg) {
2231                 CWARN("group lock %lu doesn't match current id %lu\n",
2232                       arg, fd->fd_grouplock.lg_gid);
2233                 spin_unlock(&lli->lli_lock);
2234                 RETURN(-EINVAL);
2235         }
2236
2237         grouplock = fd->fd_grouplock;
2238         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2239         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2240         spin_unlock(&lli->lli_lock);
2241
2242         cl_put_grouplock(&grouplock);
2243         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2244         RETURN(0);
2245 }
2246
2247 /**
2248  * Close inode open handle
2249  *
2250  * \param dentry [in]     dentry which contains the inode
2251  * \param it     [in,out] intent which contains open info and result
2252  *
2253  * \retval 0     success
2254  * \retval <0    failure
2255  */
2256 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2257 {
2258         struct inode *inode = dentry->d_inode;
2259         struct obd_client_handle *och;
2260         int rc;
2261         ENTRY;
2262
2263         LASSERT(inode);
2264
2265         /* Root ? Do nothing. */
2266         if (dentry->d_inode->i_sb->s_root == dentry)
2267                 RETURN(0);
2268
2269         /* No open handle to close? Move away */
2270         if (!it_disposition(it, DISP_OPEN_OPEN))
2271                 RETURN(0);
2272
2273         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2274
2275         OBD_ALLOC(och, sizeof(*och));
2276         if (!och)
2277                 GOTO(out, rc = -ENOMEM);
2278
2279         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2280
2281         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2282 out:
2283         /* this one is in place of ll_file_open */
2284         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2285                 ptlrpc_req_finished(it->it_request);
2286                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2287         }
2288         RETURN(rc);
2289 }
2290
2291 /**
2292  * Get size for inode for which FIEMAP mapping is requested.
2293  * Make the FIEMAP get_info call and returns the result.
2294  * \param fiemap        kernel buffer to hold extens
2295  * \param num_bytes     kernel buffer size
2296  */
2297 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2298                         size_t num_bytes)
2299 {
2300         struct lu_env                   *env;
2301         __u16                           refcheck;
2302         int                             rc = 0;
2303         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2304         ENTRY;
2305
2306         /* Checks for fiemap flags */
2307         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2308                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2309                 return -EBADR;
2310         }
2311
2312         /* Check for FIEMAP_FLAG_SYNC */
2313         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2314                 rc = filemap_fdatawrite(inode->i_mapping);
2315                 if (rc)
2316                         return rc;
2317         }
2318
2319         env = cl_env_get(&refcheck);
2320         if (IS_ERR(env))
2321                 RETURN(PTR_ERR(env));
2322
2323         if (i_size_read(inode) == 0) {
2324                 rc = ll_glimpse_size(inode);
2325                 if (rc)
2326                         GOTO(out, rc);
2327         }
2328
2329         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2330         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2331         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2332
2333         /* If filesize is 0, then there would be no objects for mapping */
2334         if (fmkey.lfik_oa.o_size == 0) {
2335                 fiemap->fm_mapped_extents = 0;
2336                 GOTO(out, rc = 0);
2337         }
2338
2339         fmkey.lfik_fiemap = *fiemap;
2340
2341         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2342                               &fmkey, fiemap, &num_bytes);
2343 out:
2344         cl_env_put(env, &refcheck);
2345         RETURN(rc);
2346 }
2347
2348 int ll_fid2path(struct inode *inode, void __user *arg)
2349 {
2350         struct obd_export       *exp = ll_i2mdexp(inode);
2351         const struct getinfo_fid2path __user *gfin = arg;
2352         __u32                    pathlen;
2353         struct getinfo_fid2path *gfout;
2354         size_t                   outsize;
2355         int                      rc;
2356
2357         ENTRY;
2358
2359         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2360             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2361                 RETURN(-EPERM);
2362
2363         /* Only need to get the buflen */
2364         if (get_user(pathlen, &gfin->gf_pathlen))
2365                 RETURN(-EFAULT);
2366
2367         if (pathlen > PATH_MAX)
2368                 RETURN(-EINVAL);
2369
2370         outsize = sizeof(*gfout) + pathlen;
2371         OBD_ALLOC(gfout, outsize);
2372         if (gfout == NULL)
2373                 RETURN(-ENOMEM);
2374
2375         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2376                 GOTO(gf_free, rc = -EFAULT);
2377         /* append root FID after gfout to let MDT know the root FID so that it
2378          * can lookup the correct path, this is mainly for fileset.
2379          * old server without fileset mount support will ignore this. */
2380         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2381
2382         /* Call mdc_iocontrol */
2383         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2384         if (rc != 0)
2385                 GOTO(gf_free, rc);
2386
2387         if (copy_to_user(arg, gfout, outsize))
2388                 rc = -EFAULT;
2389
2390 gf_free:
2391         OBD_FREE(gfout, outsize);
2392         RETURN(rc);
2393 }
2394
2395 static int
2396 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2397 {
2398         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2399         struct lu_env *env;
2400         struct cl_io *io;
2401         __u16  refcheck;
2402         int result;
2403
2404         ENTRY;
2405
2406         ioc->idv_version = 0;
2407         ioc->idv_layout_version = UINT_MAX;
2408
2409         /* If no file object initialized, we consider its version is 0. */
2410         if (obj == NULL)
2411                 RETURN(0);
2412
2413         env = cl_env_get(&refcheck);
2414         if (IS_ERR(env))
2415                 RETURN(PTR_ERR(env));
2416
2417         io = vvp_env_thread_io(env);
2418         io->ci_obj = obj;
2419         io->u.ci_data_version.dv_data_version = 0;
2420         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2421         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2422
2423 restart:
2424         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2425                 result = cl_io_loop(env, io);
2426         else
2427                 result = io->ci_result;
2428
2429         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2430         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2431
2432         cl_io_fini(env, io);
2433
2434         if (unlikely(io->ci_need_restart))
2435                 goto restart;
2436
2437         cl_env_put(env, &refcheck);
2438
2439         RETURN(result);
2440 }
2441
2442 /*
2443  * Read the data_version for inode.
2444  *
2445  * This value is computed using stripe object version on OST.
2446  * Version is computed using server side locking.
2447  *
2448  * @param flags if do sync on the OST side;
2449  *              0: no sync
2450  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2451  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2452  */
2453 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2454 {
2455         struct ioc_data_version ioc = { .idv_flags = flags };
2456         int rc;
2457
2458         rc = ll_ioc_data_version(inode, &ioc);
2459         if (!rc)
2460                 *data_version = ioc.idv_version;
2461
2462         return rc;
2463 }
2464
2465 /*
2466  * Trigger a HSM release request for the provided inode.
2467  */
2468 int ll_hsm_release(struct inode *inode)
2469 {
2470         struct lu_env *env;
2471         struct obd_client_handle *och = NULL;
2472         __u64 data_version = 0;
2473         int rc;
2474         __u16 refcheck;
2475         ENTRY;
2476
2477         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2478                ll_get_fsname(inode->i_sb, NULL, 0),
2479                PFID(&ll_i2info(inode)->lli_fid));
2480
2481         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2482         if (IS_ERR(och))
2483                 GOTO(out, rc = PTR_ERR(och));
2484
2485         /* Grab latest data_version and [am]time values */
2486         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2487         if (rc != 0)
2488                 GOTO(out, rc);
2489
2490         env = cl_env_get(&refcheck);
2491         if (IS_ERR(env))
2492                 GOTO(out, rc = PTR_ERR(env));
2493
2494         rc = ll_merge_attr(env, inode);
2495         cl_env_put(env, &refcheck);
2496
2497         /* If error happen, we have the wrong size for a file.
2498          * Don't release it.
2499          */
2500         if (rc != 0)
2501                 GOTO(out, rc);
2502
2503         /* Release the file.
2504          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2505          * we still need it to pack l_remote_handle to MDT. */
2506         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2507                                        &data_version);
2508         och = NULL;
2509
2510         EXIT;
2511 out:
2512         if (och != NULL && !IS_ERR(och)) /* close the file */
2513                 ll_lease_close(och, inode, NULL);
2514
2515         return rc;
2516 }
2517
2518 struct ll_swap_stack {
2519         __u64                    dv1;
2520         __u64                    dv2;
2521         struct inode            *inode1;
2522         struct inode            *inode2;
2523         bool                     check_dv1;
2524         bool                     check_dv2;
2525 };
2526
2527 static int ll_swap_layouts(struct file *file1, struct file *file2,
2528                            struct lustre_swap_layouts *lsl)
2529 {
2530         struct mdc_swap_layouts  msl;
2531         struct md_op_data       *op_data;
2532         __u32                    gid;
2533         __u64                    dv;
2534         struct ll_swap_stack    *llss = NULL;
2535         int                      rc;
2536
2537         OBD_ALLOC_PTR(llss);
2538         if (llss == NULL)
2539                 RETURN(-ENOMEM);
2540
2541         llss->inode1 = file_inode(file1);
2542         llss->inode2 = file_inode(file2);
2543
2544         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2545         if (rc < 0)
2546                 GOTO(free, rc);
2547
2548         /* we use 2 bool because it is easier to swap than 2 bits */
2549         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2550                 llss->check_dv1 = true;
2551
2552         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2553                 llss->check_dv2 = true;
2554
2555         /* we cannot use lsl->sl_dvX directly because we may swap them */
2556         llss->dv1 = lsl->sl_dv1;
2557         llss->dv2 = lsl->sl_dv2;
2558
2559         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2560         if (rc == 0) /* same file, done! */
2561                 GOTO(free, rc);
2562
2563         if (rc < 0) { /* sequentialize it */
2564                 swap(llss->inode1, llss->inode2);
2565                 swap(file1, file2);
2566                 swap(llss->dv1, llss->dv2);
2567                 swap(llss->check_dv1, llss->check_dv2);
2568         }
2569
2570         gid = lsl->sl_gid;
2571         if (gid != 0) { /* application asks to flush dirty cache */
2572                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2573                 if (rc < 0)
2574                         GOTO(free, rc);
2575
2576                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2577                 if (rc < 0) {
2578                         ll_put_grouplock(llss->inode1, file1, gid);
2579                         GOTO(free, rc);
2580                 }
2581         }
2582
2583         /* ultimate check, before swaping the layouts we check if
2584          * dataversion has changed (if requested) */
2585         if (llss->check_dv1) {
2586                 rc = ll_data_version(llss->inode1, &dv, 0);
2587                 if (rc)
2588                         GOTO(putgl, rc);
2589                 if (dv != llss->dv1)
2590                         GOTO(putgl, rc = -EAGAIN);
2591         }
2592
2593         if (llss->check_dv2) {
2594                 rc = ll_data_version(llss->inode2, &dv, 0);
2595                 if (rc)
2596                         GOTO(putgl, rc);
2597                 if (dv != llss->dv2)
2598                         GOTO(putgl, rc = -EAGAIN);
2599         }
2600
2601         /* struct md_op_data is used to send the swap args to the mdt
2602          * only flags is missing, so we use struct mdc_swap_layouts
2603          * through the md_op_data->op_data */
2604         /* flags from user space have to be converted before they are send to
2605          * server, no flag is sent today, they are only used on the client */
2606         msl.msl_flags = 0;
2607         rc = -ENOMEM;
2608         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2609                                      0, LUSTRE_OPC_ANY, &msl);
2610         if (IS_ERR(op_data))
2611                 GOTO(free, rc = PTR_ERR(op_data));
2612
2613         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2614                            sizeof(*op_data), op_data, NULL);
2615         ll_finish_md_op_data(op_data);
2616
2617         if (rc < 0)
2618                 GOTO(putgl, rc);
2619
2620 putgl:
2621         if (gid != 0) {
2622                 ll_put_grouplock(llss->inode2, file2, gid);
2623                 ll_put_grouplock(llss->inode1, file1, gid);
2624         }
2625
2626 free:
2627         if (llss != NULL)
2628                 OBD_FREE_PTR(llss);
2629
2630         RETURN(rc);
2631 }
2632
2633 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2634 {
2635         struct md_op_data       *op_data;
2636         int                      rc;
2637         ENTRY;
2638
2639         /* Detect out-of range masks */
2640         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2641                 RETURN(-EINVAL);
2642
2643         /* Non-root users are forbidden to set or clear flags which are
2644          * NOT defined in HSM_USER_MASK. */
2645         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2646             !cfs_capable(CFS_CAP_SYS_ADMIN))
2647                 RETURN(-EPERM);
2648
2649         /* Detect out-of range archive id */
2650         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2651             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2652                 RETURN(-EINVAL);
2653
2654         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2655                                      LUSTRE_OPC_ANY, hss);
2656         if (IS_ERR(op_data))
2657                 RETURN(PTR_ERR(op_data));
2658
2659         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2660                            sizeof(*op_data), op_data, NULL);
2661
2662         ll_finish_md_op_data(op_data);
2663
2664         RETURN(rc);
2665 }
2666
2667 static int ll_hsm_import(struct inode *inode, struct file *file,
2668                          struct hsm_user_import *hui)
2669 {
2670         struct hsm_state_set    *hss = NULL;
2671         struct iattr            *attr = NULL;
2672         int                      rc;
2673         ENTRY;
2674
2675         if (!S_ISREG(inode->i_mode))
2676                 RETURN(-EINVAL);
2677
2678         /* set HSM flags */
2679         OBD_ALLOC_PTR(hss);
2680         if (hss == NULL)
2681                 GOTO(out, rc = -ENOMEM);
2682
2683         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2684         hss->hss_archive_id = hui->hui_archive_id;
2685         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2686         rc = ll_hsm_state_set(inode, hss);
2687         if (rc != 0)
2688                 GOTO(out, rc);
2689
2690         OBD_ALLOC_PTR(attr);
2691         if (attr == NULL)
2692                 GOTO(out, rc = -ENOMEM);
2693
2694         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2695         attr->ia_mode |= S_IFREG;
2696         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2697         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2698         attr->ia_size = hui->hui_size;
2699         attr->ia_mtime.tv_sec = hui->hui_mtime;
2700         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2701         attr->ia_atime.tv_sec = hui->hui_atime;
2702         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2703
2704         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2705                          ATTR_UID | ATTR_GID |
2706                          ATTR_MTIME | ATTR_MTIME_SET |
2707                          ATTR_ATIME | ATTR_ATIME_SET;
2708
2709         inode_lock(inode);
2710
2711         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2712         if (rc == -ENODATA)
2713                 rc = 0;
2714
2715         inode_unlock(inode);
2716
2717 out:
2718         if (hss != NULL)
2719                 OBD_FREE_PTR(hss);
2720
2721         if (attr != NULL)
2722                 OBD_FREE_PTR(attr);
2723
2724         RETURN(rc);
2725 }
2726
2727 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2728 {
2729         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2730                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2731 }
2732
2733 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2734 {
2735         struct inode *inode = file_inode(file);
2736         struct iattr ia = {
2737                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2738                             ATTR_MTIME | ATTR_MTIME_SET |
2739                             ATTR_CTIME,
2740                 .ia_atime = {
2741                         .tv_sec = lfu->lfu_atime_sec,
2742                         .tv_nsec = lfu->lfu_atime_nsec,
2743                 },
2744                 .ia_mtime = {
2745                         .tv_sec = lfu->lfu_mtime_sec,
2746                         .tv_nsec = lfu->lfu_mtime_nsec,
2747                 },
2748                 .ia_ctime = {
2749                         .tv_sec = lfu->lfu_ctime_sec,
2750                         .tv_nsec = lfu->lfu_ctime_nsec,
2751                 },
2752         };
2753         int rc;
2754         ENTRY;
2755
2756         if (!capable(CAP_SYS_ADMIN))
2757                 RETURN(-EPERM);
2758
2759         if (!S_ISREG(inode->i_mode))
2760                 RETURN(-EINVAL);
2761
2762         inode_lock(inode);
2763         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2764                             false);
2765         inode_unlock(inode);
2766
2767         RETURN(rc);
2768 }
2769
2770 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2771 {
2772         switch (mode) {
2773         case MODE_READ_USER:
2774                 return CLM_READ;
2775         case MODE_WRITE_USER:
2776                 return CLM_WRITE;
2777         default:
2778                 return -EINVAL;
2779         }
2780 }
2781
2782 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2783
2784 /* Used to allow the upper layers of the client to request an LDLM lock
2785  * without doing an actual read or write.
2786  *
2787  * Used for ladvise lockahead to manually request specific locks.
2788  *
2789  * \param[in] file      file this ladvise lock request is on
2790  * \param[in] ladvise   ladvise struct describing this lock request
2791  *
2792  * \retval 0            success, no detailed result available (sync requests
2793  *                      and requests sent to the server [not handled locally]
2794  *                      cannot return detailed results)
2795  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2796  *                                       see definitions for details.
2797  * \retval negative     negative errno on error
2798  */
2799 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2800 {
2801         struct lu_env *env = NULL;
2802         struct cl_io *io  = NULL;
2803         struct cl_lock *lock = NULL;
2804         struct cl_lock_descr *descr = NULL;
2805         struct dentry *dentry = file->f_path.dentry;
2806         struct inode *inode = dentry->d_inode;
2807         enum cl_lock_mode cl_mode;
2808         off_t start = ladvise->lla_start;
2809         off_t end = ladvise->lla_end;
2810         int result;
2811         __u16 refcheck;
2812
2813         ENTRY;
2814
2815         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2816                "start=%llu, end=%llu\n", dentry->d_name.len,
2817                dentry->d_name.name, dentry->d_inode,
2818                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2819                (__u64) end);
2820
2821         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2822         if (cl_mode < 0)
2823                 GOTO(out, result = cl_mode);
2824
2825         /* Get IO environment */
2826         result = cl_io_get(inode, &env, &io, &refcheck);
2827         if (result <= 0)
2828                 GOTO(out, result);
2829
2830         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2831         if (result > 0) {
2832                 /*
2833                  * nothing to do for this io. This currently happens when
2834                  * stripe sub-object's are not yet created.
2835                  */
2836                 result = io->ci_result;
2837         } else if (result == 0) {
2838                 lock = vvp_env_lock(env);
2839                 descr = &lock->cll_descr;
2840
2841                 descr->cld_obj   = io->ci_obj;
2842                 /* Convert byte offsets to pages */
2843                 descr->cld_start = cl_index(io->ci_obj, start);
2844                 descr->cld_end   = cl_index(io->ci_obj, end);
2845                 descr->cld_mode  = cl_mode;
2846                 /* CEF_MUST is used because we do not want to convert a
2847                  * lockahead request to a lockless lock */
2848                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2849                                        CEF_NONBLOCK;
2850
2851                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2852                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2853
2854                 result = cl_lock_request(env, io, lock);
2855
2856                 /* On success, we need to release the lock */
2857                 if (result >= 0)
2858                         cl_lock_release(env, lock);
2859         }
2860         cl_io_fini(env, io);
2861         cl_env_put(env, &refcheck);
2862
2863         /* -ECANCELED indicates a matching lock with a different extent
2864          * was already present, and -EEXIST indicates a matching lock
2865          * on exactly the same extent was already present.
2866          * We convert them to positive values for userspace to make
2867          * recognizing true errors easier.
2868          * Note we can only return these detailed results on async requests,
2869          * as sync requests look the same as i/o requests for locking. */
2870         if (result == -ECANCELED)
2871                 result = LLA_RESULT_DIFFERENT;
2872         else if (result == -EEXIST)
2873                 result = LLA_RESULT_SAME;
2874
2875 out:
2876         RETURN(result);
2877 }
2878 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2879
2880 static int ll_ladvise_sanity(struct inode *inode,
2881                              struct llapi_lu_ladvise *ladvise)
2882 {
2883         enum lu_ladvise_type advice = ladvise->lla_advice;
2884         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2885          * be in the first 32 bits of enum ladvise_flags */
2886         __u32 flags = ladvise->lla_peradvice_flags;
2887         /* 3 lines at 80 characters per line, should be plenty */
2888         int rc = 0;
2889
2890         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2891                 rc = -EINVAL;
2892                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2893                        "last supported advice is %s (value '%d'): rc = %d\n",
2894                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2895                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2896                 GOTO(out, rc);
2897         }
2898
2899         /* Per-advice checks */
2900         switch (advice) {
2901         case LU_LADVISE_LOCKNOEXPAND:
2902                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2903                         rc = -EINVAL;
2904                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2905                                "rc = %d\n",
2906                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2907                                ladvise_names[advice], rc);
2908                         GOTO(out, rc);
2909                 }
2910                 break;
2911         case LU_LADVISE_LOCKAHEAD:
2912                 /* Currently only READ and WRITE modes can be requested */
2913                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2914                     ladvise->lla_lockahead_mode == 0) {
2915                         rc = -EINVAL;
2916                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2917                                "rc = %d\n",
2918                                ll_get_fsname(inode->i_sb, NULL, 0),
2919                                ladvise->lla_lockahead_mode,
2920                                ladvise_names[advice], rc);
2921                         GOTO(out, rc);
2922                 }
2923         case LU_LADVISE_WILLREAD:
2924         case LU_LADVISE_DONTNEED:
2925         default:
2926                 /* Note fall through above - These checks apply to all advices
2927                  * except LOCKNOEXPAND */
2928                 if (flags & ~LF_DEFAULT_MASK) {
2929                         rc = -EINVAL;
2930                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2931                                "rc = %d\n",
2932                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2933                                ladvise_names[advice], rc);
2934                         GOTO(out, rc);
2935                 }
2936                 if (ladvise->lla_start >= ladvise->lla_end) {
2937                         rc = -EINVAL;
2938                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2939                                "for %s: rc = %d\n",
2940                                ll_get_fsname(inode->i_sb, NULL, 0),
2941                                ladvise->lla_start, ladvise->lla_end,
2942                                ladvise_names[advice], rc);
2943                         GOTO(out, rc);
2944                 }
2945                 break;
2946         }
2947
2948 out:
2949         return rc;
2950 }
2951 #undef ERRSIZE
2952
2953 /*
2954  * Give file access advices
2955  *
2956  * The ladvise interface is similar to Linux fadvise() system call, except it
2957  * forwards the advices directly from Lustre client to server. The server side
2958  * codes will apply appropriate read-ahead and caching techniques for the
2959  * corresponding files.
2960  *
2961  * A typical workload for ladvise is e.g. a bunch of different clients are
2962  * doing small random reads of a file, so prefetching pages into OSS cache
2963  * with big linear reads before the random IO is a net benefit. Fetching
2964  * all that data into each client cache with fadvise() may not be, due to
2965  * much more data being sent to the client.
2966  */
2967 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2968                       struct llapi_lu_ladvise *ladvise)
2969 {
2970         struct lu_env *env;
2971         struct cl_io *io;
2972         struct cl_ladvise_io *lio;
2973         int rc;
2974         __u16 refcheck;
2975         ENTRY;
2976
2977         env = cl_env_get(&refcheck);
2978         if (IS_ERR(env))
2979                 RETURN(PTR_ERR(env));
2980
2981         io = vvp_env_thread_io(env);
2982         io->ci_obj = ll_i2info(inode)->lli_clob;
2983
2984         /* initialize parameters for ladvise */
2985         lio = &io->u.ci_ladvise;
2986         lio->li_start = ladvise->lla_start;
2987         lio->li_end = ladvise->lla_end;
2988         lio->li_fid = ll_inode2fid(inode);
2989         lio->li_advice = ladvise->lla_advice;
2990         lio->li_flags = flags;
2991
2992         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2993                 rc = cl_io_loop(env, io);
2994         else
2995                 rc = io->ci_result;
2996
2997         cl_io_fini(env, io);
2998         cl_env_put(env, &refcheck);
2999         RETURN(rc);
3000 }
3001
3002 static int ll_lock_noexpand(struct file *file, int flags)
3003 {
3004         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3005
3006         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3007
3008         return 0;
3009 }
3010
3011 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3012                         unsigned long arg)
3013 {
3014         struct fsxattr fsxattr;
3015
3016         if (copy_from_user(&fsxattr,
3017                            (const struct fsxattr __user *)arg,
3018                            sizeof(fsxattr)))
3019                 RETURN(-EFAULT);
3020
3021         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3022         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3023                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3024         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3025         if (copy_to_user((struct fsxattr __user *)arg,
3026                          &fsxattr, sizeof(fsxattr)))
3027                 RETURN(-EFAULT);
3028
3029         RETURN(0);
3030 }
3031
3032 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3033                         unsigned long arg)
3034 {
3035
3036         struct md_op_data *op_data;
3037         struct ptlrpc_request *req = NULL;
3038         int rc = 0;
3039         struct fsxattr fsxattr;
3040         struct cl_object *obj;
3041         struct iattr *attr;
3042         int flags;
3043
3044         /* only root could change project ID */
3045         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3046                 RETURN(-EPERM);
3047
3048         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3049                                      LUSTRE_OPC_ANY, NULL);
3050         if (IS_ERR(op_data))
3051                 RETURN(PTR_ERR(op_data));
3052
3053         if (copy_from_user(&fsxattr,
3054                            (const struct fsxattr __user *)arg,
3055                            sizeof(fsxattr)))
3056                 GOTO(out_fsxattr, rc = -EFAULT);
3057
3058         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3059         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3060         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3061                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3062         op_data->op_projid = fsxattr.fsx_projid;
3063         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3064         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3065                         0, &req);
3066         ptlrpc_req_finished(req);
3067         if (rc)
3068                 GOTO(out_fsxattr, rc);
3069         ll_update_inode_flags(inode, op_data->op_attr_flags);
3070         obj = ll_i2info(inode)->lli_clob;
3071         if (obj == NULL)
3072                 GOTO(out_fsxattr, rc);
3073
3074         OBD_ALLOC_PTR(attr);
3075         if (attr == NULL)
3076                 GOTO(out_fsxattr, rc = -ENOMEM);
3077
3078         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3079                             fsxattr.fsx_xflags);
3080         OBD_FREE_PTR(attr);
3081 out_fsxattr:
3082         ll_finish_md_op_data(op_data);
3083         RETURN(rc);
3084 }
3085
3086 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3087                                  unsigned long arg)
3088 {
3089         struct inode            *inode = file_inode(file);
3090         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3091         struct ll_inode_info    *lli = ll_i2info(inode);
3092         struct obd_client_handle *och = NULL;
3093         struct split_param sp;
3094         bool lease_broken;
3095         fmode_t fmode = 0;
3096         enum mds_op_bias bias = 0;
3097         struct file *layout_file = NULL;
3098         void *data = NULL;
3099         size_t data_size = 0;
3100         long rc;
3101         ENTRY;
3102
3103         mutex_lock(&lli->lli_och_mutex);
3104         if (fd->fd_lease_och != NULL) {
3105                 och = fd->fd_lease_och;
3106                 fd->fd_lease_och = NULL;
3107         }
3108         mutex_unlock(&lli->lli_och_mutex);
3109
3110         if (och == NULL)
3111                 GOTO(out, rc = -ENOLCK);
3112
3113         fmode = och->och_flags;
3114
3115         switch (ioc->lil_flags) {
3116         case LL_LEASE_RESYNC_DONE:
3117                 if (ioc->lil_count > IOC_IDS_MAX)
3118                         GOTO(out, rc = -EINVAL);
3119
3120                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3121                 OBD_ALLOC(data, data_size);
3122                 if (!data)
3123                         GOTO(out, rc = -ENOMEM);
3124
3125                 if (copy_from_user(data, (void __user *)arg, data_size))
3126                         GOTO(out, rc = -EFAULT);
3127
3128                 bias = MDS_CLOSE_RESYNC_DONE;
3129                 break;
3130         case LL_LEASE_LAYOUT_MERGE: {
3131                 int fd;
3132
3133                 if (ioc->lil_count != 1)
3134                         GOTO(out, rc = -EINVAL);
3135
3136                 arg += sizeof(*ioc);
3137                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3138                         GOTO(out, rc = -EFAULT);
3139
3140                 layout_file = fget(fd);
3141                 if (!layout_file)
3142                         GOTO(out, rc = -EBADF);
3143
3144                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3145                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3146                         GOTO(out, rc = -EPERM);
3147
3148                 data = file_inode(layout_file);
3149                 bias = MDS_CLOSE_LAYOUT_MERGE;
3150                 break;
3151         }
3152         case LL_LEASE_LAYOUT_SPLIT: {
3153                 int fdv;
3154                 int mirror_id;
3155
3156                 if (ioc->lil_count != 2)
3157                         GOTO(out, rc = -EINVAL);
3158
3159                 arg += sizeof(*ioc);
3160                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3161                         GOTO(out, rc = -EFAULT);
3162
3163                 arg += sizeof(__u32);
3164                 if (copy_from_user(&mirror_id, (void __user *)arg,
3165                                    sizeof(__u32)))
3166                         GOTO(out, rc = -EFAULT);
3167
3168                 layout_file = fget(fdv);
3169                 if (!layout_file)
3170                         GOTO(out, rc = -EBADF);
3171
3172                 sp.sp_inode = file_inode(layout_file);
3173                 sp.sp_mirror_id = (__u16)mirror_id;
3174                 data = &sp;
3175                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3176                 break;
3177         }
3178         default:
3179                 /* without close intent */
3180                 break;
3181         }
3182
3183         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3184         if (rc < 0)
3185                 GOTO(out, rc);
3186
3187         rc = ll_lease_och_release(inode, file);
3188         if (rc < 0)
3189                 GOTO(out, rc);
3190
3191         if (lease_broken)
3192                 fmode = 0;
3193         EXIT;
3194
3195 out:
3196         switch (ioc->lil_flags) {
3197         case LL_LEASE_RESYNC_DONE:
3198                 if (data)
3199                         OBD_FREE(data, data_size);
3200                 break;
3201         case LL_LEASE_LAYOUT_MERGE:
3202         case LL_LEASE_LAYOUT_SPLIT:
3203                 if (layout_file)
3204                         fput(layout_file);
3205                 break;
3206         }
3207
3208         if (!rc)
3209                 rc = ll_lease_type_from_fmode(fmode);
3210         RETURN(rc);
3211 }
3212
3213 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3214                               unsigned long arg)
3215 {
3216         struct inode *inode = file_inode(file);
3217         struct ll_inode_info *lli = ll_i2info(inode);
3218         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3219         struct obd_client_handle *och = NULL;
3220         __u64 open_flags = 0;
3221         bool lease_broken;
3222         fmode_t fmode;
3223         long rc;
3224         ENTRY;
3225
3226         switch (ioc->lil_mode) {
3227         case LL_LEASE_WRLCK:
3228                 if (!(file->f_mode & FMODE_WRITE))
3229                         RETURN(-EPERM);
3230                 fmode = FMODE_WRITE;
3231                 break;
3232         case LL_LEASE_RDLCK:
3233                 if (!(file->f_mode & FMODE_READ))
3234                         RETURN(-EPERM);
3235                 fmode = FMODE_READ;
3236                 break;
3237         case LL_LEASE_UNLCK:
3238                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3239         default:
3240                 RETURN(-EINVAL);
3241         }
3242
3243         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3244
3245         /* apply for lease */
3246         if (ioc->lil_flags & LL_LEASE_RESYNC)
3247                 open_flags = MDS_OPEN_RESYNC;
3248         och = ll_lease_open(inode, file, fmode, open_flags);
3249         if (IS_ERR(och))
3250                 RETURN(PTR_ERR(och));
3251
3252         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3253                 rc = ll_lease_file_resync(och, inode);
3254                 if (rc) {
3255                         ll_lease_close(och, inode, NULL);
3256                         RETURN(rc);
3257                 }
3258                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3259                 if (rc) {
3260                         ll_lease_close(och, inode, NULL);
3261                         RETURN(rc);
3262                 }
3263         }
3264
3265         rc = 0;
3266         mutex_lock(&lli->lli_och_mutex);
3267         if (fd->fd_lease_och == NULL) {
3268                 fd->fd_lease_och = och;
3269                 och = NULL;
3270         }
3271         mutex_unlock(&lli->lli_och_mutex);
3272         if (och != NULL) {
3273                 /* impossible now that only excl is supported for now */
3274                 ll_lease_close(och, inode, &lease_broken);
3275                 rc = -EBUSY;
3276         }
3277         RETURN(rc);
3278 }
3279
3280 static long
3281 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3282 {
3283         struct inode            *inode = file_inode(file);
3284         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3285         int                      flags, rc;
3286         ENTRY;
3287
3288         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3289                PFID(ll_inode2fid(inode)), inode, cmd);
3290         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3291
3292         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3293         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3294                 RETURN(-ENOTTY);
3295
3296         switch (cmd) {
3297         case LL_IOC_GETFLAGS:
3298                 /* Get the current value of the file flags */
3299                 return put_user(fd->fd_flags, (int __user *)arg);
3300         case LL_IOC_SETFLAGS:
3301         case LL_IOC_CLRFLAGS:
3302                 /* Set or clear specific file flags */
3303                 /* XXX This probably needs checks to ensure the flags are
3304                  *     not abused, and to handle any flag side effects.
3305                  */
3306                 if (get_user(flags, (int __user *) arg))
3307                         RETURN(-EFAULT);
3308
3309                 if (cmd == LL_IOC_SETFLAGS) {
3310                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3311                             !(file->f_flags & O_DIRECT)) {
3312                                 CERROR("%s: unable to disable locking on "
3313                                        "non-O_DIRECT file\n", current->comm);
3314                                 RETURN(-EINVAL);
3315                         }
3316
3317                         fd->fd_flags |= flags;
3318                 } else {
3319                         fd->fd_flags &= ~flags;
3320                 }
3321                 RETURN(0);
3322         case LL_IOC_LOV_SETSTRIPE:
3323         case LL_IOC_LOV_SETSTRIPE_NEW:
3324                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3325         case LL_IOC_LOV_SETEA:
3326                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3327         case LL_IOC_LOV_SWAP_LAYOUTS: {
3328                 struct file *file2;
3329                 struct lustre_swap_layouts lsl;
3330
3331                 if (copy_from_user(&lsl, (char __user *)arg,
3332                                    sizeof(struct lustre_swap_layouts)))
3333                         RETURN(-EFAULT);
3334
3335                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3336                         RETURN(-EPERM);
3337
3338                 file2 = fget(lsl.sl_fd);
3339                 if (file2 == NULL)
3340                         RETURN(-EBADF);
3341
3342                 /* O_WRONLY or O_RDWR */
3343                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3344                         GOTO(out, rc = -EPERM);
3345
3346                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3347                         struct inode                    *inode2;
3348                         struct ll_inode_info            *lli;
3349                         struct obd_client_handle        *och = NULL;
3350
3351                         lli = ll_i2info(inode);
3352                         mutex_lock(&lli->lli_och_mutex);
3353                         if (fd->fd_lease_och != NULL) {
3354                                 och = fd->fd_lease_och;
3355                                 fd->fd_lease_och = NULL;
3356                         }
3357                         mutex_unlock(&lli->lli_och_mutex);
3358                         if (och == NULL)
3359                                 GOTO(out, rc = -ENOLCK);
3360                         inode2 = file_inode(file2);
3361                         rc = ll_swap_layouts_close(och, inode, inode2);
3362                 } else {
3363                         rc = ll_swap_layouts(file, file2, &lsl);
3364                 }
3365 out:
3366                 fput(file2);
3367                 RETURN(rc);
3368         }
3369         case LL_IOC_LOV_GETSTRIPE:
3370         case LL_IOC_LOV_GETSTRIPE_NEW:
3371                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3372         case FS_IOC_GETFLAGS:
3373         case FS_IOC_SETFLAGS:
3374                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3375         case FSFILT_IOC_GETVERSION:
3376         case FS_IOC_GETVERSION:
3377                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3378         /* We need to special case any other ioctls we want to handle,
3379          * to send them to the MDS/OST as appropriate and to properly
3380          * network encode the arg field. */
3381         case FS_IOC_SETVERSION:
3382                 RETURN(-ENOTSUPP);
3383
3384         case LL_IOC_GROUP_LOCK:
3385                 RETURN(ll_get_grouplock(inode, file, arg));
3386         case LL_IOC_GROUP_UNLOCK:
3387                 RETURN(ll_put_grouplock(inode, file, arg));
3388         case IOC_OBD_STATFS:
3389                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3390
3391         case LL_IOC_FLUSHCTX:
3392                 RETURN(ll_flush_ctx(inode));
3393         case LL_IOC_PATH2FID: {
3394                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3395                                  sizeof(struct lu_fid)))
3396                         RETURN(-EFAULT);
3397
3398                 RETURN(0);
3399         }
3400         case LL_IOC_GETPARENT:
3401                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3402
3403         case OBD_IOC_FID2PATH:
3404                 RETURN(ll_fid2path(inode, (void __user *)arg));
3405         case LL_IOC_DATA_VERSION: {
3406                 struct ioc_data_version idv;
3407                 int rc;
3408
3409                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3410                         RETURN(-EFAULT);
3411
3412                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3413                 rc = ll_ioc_data_version(inode, &idv);
3414
3415                 if (rc == 0 &&
3416                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3417                         RETURN(-EFAULT);
3418
3419                 RETURN(rc);
3420         }
3421
3422         case LL_IOC_GET_MDTIDX: {
3423                 int mdtidx;
3424
3425                 mdtidx = ll_get_mdt_idx(inode);
3426                 if (mdtidx < 0)
3427                         RETURN(mdtidx);
3428
3429                 if (put_user((int)mdtidx, (int __user *)arg))
3430                         RETURN(-EFAULT);
3431
3432                 RETURN(0);
3433         }
3434         case OBD_IOC_GETDTNAME:
3435         case OBD_IOC_GETMDNAME:
3436                 RETURN(ll_get_obd_name(inode, cmd, arg));
3437         case LL_IOC_HSM_STATE_GET: {
3438                 struct md_op_data       *op_data;
3439                 struct hsm_user_state   *hus;
3440                 int                      rc;
3441
3442                 OBD_ALLOC_PTR(hus);
3443                 if (hus == NULL)
3444                         RETURN(-ENOMEM);
3445
3446                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3447                                              LUSTRE_OPC_ANY, hus);
3448                 if (IS_ERR(op_data)) {
3449                         OBD_FREE_PTR(hus);
3450                         RETURN(PTR_ERR(op_data));
3451                 }
3452
3453                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3454                                    op_data, NULL);
3455
3456                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3457                         rc = -EFAULT;
3458
3459                 ll_finish_md_op_data(op_data);
3460                 OBD_FREE_PTR(hus);
3461                 RETURN(rc);
3462         }
3463         case LL_IOC_HSM_STATE_SET: {
3464                 struct hsm_state_set    *hss;
3465                 int                      rc;
3466
3467                 OBD_ALLOC_PTR(hss);
3468                 if (hss == NULL)
3469                         RETURN(-ENOMEM);
3470
3471                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3472                         OBD_FREE_PTR(hss);
3473                         RETURN(-EFAULT);
3474                 }
3475
3476                 rc = ll_hsm_state_set(inode, hss);
3477
3478                 OBD_FREE_PTR(hss);
3479                 RETURN(rc);
3480         }
3481         case LL_IOC_HSM_ACTION: {
3482                 struct md_op_data               *op_data;
3483                 struct hsm_current_action       *hca;
3484                 int                              rc;
3485
3486                 OBD_ALLOC_PTR(hca);
3487                 if (hca == NULL)
3488                         RETURN(-ENOMEM);
3489
3490                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3491                                              LUSTRE_OPC_ANY, hca);
3492                 if (IS_ERR(op_data)) {
3493                         OBD_FREE_PTR(hca);
3494                         RETURN(PTR_ERR(op_data));
3495                 }
3496
3497                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3498                                    op_data, NULL);
3499
3500                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3501                         rc = -EFAULT;
3502
3503                 ll_finish_md_op_data(op_data);
3504                 OBD_FREE_PTR(hca);
3505                 RETURN(rc);
3506         }
3507         case LL_IOC_SET_LEASE_OLD: {
3508                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3509
3510                 RETURN(ll_file_set_lease(file, &ioc, 0));
3511         }
3512         case LL_IOC_SET_LEASE: {
3513                 struct ll_ioc_lease ioc;
3514
3515                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3516                         RETURN(-EFAULT);
3517
3518                 RETURN(ll_file_set_lease(file, &ioc, arg));
3519         }
3520         case LL_IOC_GET_LEASE: {
3521                 struct ll_inode_info *lli = ll_i2info(inode);
3522                 struct ldlm_lock *lock = NULL;
3523                 fmode_t fmode = 0;
3524
3525                 mutex_lock(&lli->lli_och_mutex);
3526                 if (fd->fd_lease_och != NULL) {
3527                         struct obd_client_handle *och = fd->fd_lease_och;
3528
3529                         lock = ldlm_handle2lock(&och->och_lease_handle);
3530                         if (lock != NULL) {
3531                                 lock_res_and_lock(lock);
3532                                 if (!ldlm_is_cancel(lock))
3533                                         fmode = och->och_flags;
3534
3535                                 unlock_res_and_lock(lock);
3536                                 LDLM_LOCK_PUT(lock);
3537                         }
3538                 }
3539                 mutex_unlock(&lli->lli_och_mutex);
3540
3541                 RETURN(ll_lease_type_from_fmode(fmode));
3542         }
3543         case LL_IOC_HSM_IMPORT: {
3544                 struct hsm_user_import *hui;
3545
3546                 OBD_ALLOC_PTR(hui);
3547                 if (hui == NULL)
3548                         RETURN(-ENOMEM);
3549
3550                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3551                         OBD_FREE_PTR(hui);
3552                         RETURN(-EFAULT);
3553                 }
3554
3555                 rc = ll_hsm_import(inode, file, hui);
3556
3557                 OBD_FREE_PTR(hui);
3558                 RETURN(rc);
3559         }
3560         case LL_IOC_FUTIMES_3: {
3561                 struct ll_futimes_3 lfu;
3562
3563                 if (copy_from_user(&lfu,
3564                                    (const struct ll_futimes_3 __user *)arg,
3565                                    sizeof(lfu)))
3566                         RETURN(-EFAULT);
3567
3568                 RETURN(ll_file_futimes_3(file, &lfu));
3569         }
3570         case LL_IOC_LADVISE: {
3571                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3572                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3573                 int i;
3574                 int num_advise;
3575                 int alloc_size = sizeof(*k_ladvise_hdr);
3576
3577                 rc = 0;
3578                 u_ladvise_hdr = (void __user *)arg;
3579                 OBD_ALLOC_PTR(k_ladvise_hdr);
3580                 if (k_ladvise_hdr == NULL)
3581                         RETURN(-ENOMEM);
3582
3583                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3584                         GOTO(out_ladvise, rc = -EFAULT);
3585
3586                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3587                     k_ladvise_hdr->lah_count < 1)
3588                         GOTO(out_ladvise, rc = -EINVAL);
3589
3590                 num_advise = k_ladvise_hdr->lah_count;
3591                 if (num_advise >= LAH_COUNT_MAX)
3592                         GOTO(out_ladvise, rc = -EFBIG);
3593
3594                 OBD_FREE_PTR(k_ladvise_hdr);
3595                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3596                                       lah_advise[num_advise]);
3597                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3598                 if (k_ladvise_hdr == NULL)
3599                         RETURN(-ENOMEM);
3600
3601                 /*
3602                  * TODO: submit multiple advices to one server in a single RPC
3603                  */
3604                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3605                         GOTO(out_ladvise, rc = -EFAULT);
3606
3607                 for (i = 0; i < num_advise; i++) {
3608                         struct llapi_lu_ladvise *k_ladvise =
3609                                         &k_ladvise_hdr->lah_advise[i];
3610                         struct llapi_lu_ladvise __user *u_ladvise =
3611                                         &u_ladvise_hdr->lah_advise[i];
3612
3613                         rc = ll_ladvise_sanity(inode, k_ladvise);
3614                         if (rc)
3615                                 GOTO(out_ladvise, rc);
3616
3617                         switch (k_ladvise->lla_advice) {
3618                         case LU_LADVISE_LOCKNOEXPAND:
3619                                 rc = ll_lock_noexpand(file,
3620                                                k_ladvise->lla_peradvice_flags);
3621                                 GOTO(out_ladvise, rc);
3622                         case LU_LADVISE_LOCKAHEAD:
3623
3624                                 rc = ll_file_lock_ahead(file, k_ladvise);
3625
3626                                 if (rc < 0)
3627                                         GOTO(out_ladvise, rc);
3628
3629                                 if (put_user(rc,
3630                                              &u_ladvise->lla_lockahead_result))
3631                                         GOTO(out_ladvise, rc = -EFAULT);
3632                                 break;
3633                         default:
3634                                 rc = ll_ladvise(inode, file,
3635                                                 k_ladvise_hdr->lah_flags,
3636                                                 k_ladvise);
3637                                 if (rc)
3638                                         GOTO(out_ladvise, rc);
3639                                 break;
3640                         }
3641
3642                 }
3643
3644 out_ladvise:
3645                 OBD_FREE(k_ladvise_hdr, alloc_size);
3646                 RETURN(rc);
3647         }
3648         case LL_IOC_FLR_SET_MIRROR: {
3649                 /* mirror I/O must be direct to avoid polluting page cache
3650                  * by stale data. */
3651                 if (!(file->f_flags & O_DIRECT))
3652                         RETURN(-EINVAL);
3653
3654                 fd->fd_designated_mirror = (__u32)arg;
3655                 RETURN(0);
3656         }
3657         case LL_IOC_FSGETXATTR:
3658                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3659         case LL_IOC_FSSETXATTR:
3660                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3661         case BLKSSZGET:
3662                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3663         default:
3664                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3665                                      (void __user *)arg));
3666         }
3667 }
3668
3669 #ifndef HAVE_FILE_LLSEEK_SIZE
3670 static inline loff_t
3671 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3672 {
3673         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3674                 return -EINVAL;
3675         if (offset > maxsize)
3676                 return -EINVAL;
3677
3678         if (offset != file->f_pos) {
3679                 file->f_pos = offset;
3680                 file->f_version = 0;
3681         }
3682         return offset;
3683 }
3684
3685 static loff_t
3686 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3687                 loff_t maxsize, loff_t eof)
3688 {
3689         struct inode *inode = file_inode(file);
3690
3691         switch (origin) {
3692         case SEEK_END:
3693                 offset += eof;
3694                 break;
3695         case SEEK_CUR:
3696                 /*
3697                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3698                  * position-querying operation.  Avoid rewriting the "same"
3699                  * f_pos value back to the file because a concurrent read(),
3700                  * write() or lseek() might have altered it
3701                  */
3702                 if (offset == 0)
3703                         return file->f_pos;
3704                 /*
3705                  * f_lock protects against read/modify/write race with other
3706                  * SEEK_CURs. Note that parallel writes and reads behave
3707                  * like SEEK_SET.
3708                  */
3709                 inode_lock(inode);
3710                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3711                 inode_unlock(inode);
3712                 return offset;
3713         case SEEK_DATA:
3714                 /*
3715                  * In the generic case the entire file is data, so as long as
3716                  * offset isn't at the end of the file then the offset is data.
3717                  */
3718                 if (offset >= eof)
3719                         return -ENXIO;
3720                 break;
3721         case SEEK_HOLE:
3722                 /*
3723                  * There is a virtual hole at the end of the file, so as long as
3724                  * offset isn't i_size or larger, return i_size.
3725                  */
3726                 if (offset >= eof)
3727                         return -ENXIO;
3728                 offset = eof;
3729                 break;
3730         }
3731
3732         return llseek_execute(file, offset, maxsize);
3733 }
3734 #endif
3735
3736 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3737 {
3738         struct inode *inode = file_inode(file);
3739         loff_t retval, eof = 0;
3740
3741         ENTRY;
3742         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3743                            (origin == SEEK_CUR) ? file->f_pos : 0);
3744         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3745                PFID(ll_inode2fid(inode)), inode, retval, retval,
3746                origin);
3747         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3748
3749         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3750                 retval = ll_glimpse_size(inode);
3751                 if (retval != 0)
3752                         RETURN(retval);
3753                 eof = i_size_read(inode);
3754         }
3755
3756         retval = ll_generic_file_llseek_size(file, offset, origin,
3757                                           ll_file_maxbytes(inode), eof);
3758         RETURN(retval);
3759 }
3760
3761 static int ll_flush(struct file *file, fl_owner_t id)
3762 {
3763         struct inode *inode = file_inode(file);
3764         struct ll_inode_info *lli = ll_i2info(inode);
3765         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3766         int rc, err;
3767
3768         LASSERT(!S_ISDIR(inode->i_mode));
3769
3770         /* catch async errors that were recorded back when async writeback
3771          * failed for pages in this mapping. */
3772         rc = lli->lli_async_rc;
3773         lli->lli_async_rc = 0;
3774         if (lli->lli_clob != NULL) {
3775                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3776                 if (rc == 0)
3777                         rc = err;
3778         }
3779
3780         /* The application has been told write failure already.
3781          * Do not report failure again. */
3782         if (fd->fd_write_failed)
3783                 return 0;
3784         return rc ? -EIO : 0;
3785 }
3786
3787 /**
3788  * Called to make sure a portion of file has been written out.
3789  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3790  *
3791  * Return how many pages have been written.
3792  */
3793 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3794                        enum cl_fsync_mode mode, int ignore_layout)
3795 {
3796         struct lu_env *env;
3797         struct cl_io *io;
3798         struct cl_fsync_io *fio;
3799         int result;
3800         __u16 refcheck;
3801         ENTRY;
3802
3803         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3804             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3805                 RETURN(-EINVAL);
3806
3807         env = cl_env_get(&refcheck);
3808         if (IS_ERR(env))
3809                 RETURN(PTR_ERR(env));
3810
3811         io = vvp_env_thread_io(env);
3812         io->ci_obj = ll_i2info(inode)->lli_clob;
3813         io->ci_ignore_layout = ignore_layout;
3814
3815         /* initialize parameters for sync */
3816         fio = &io->u.ci_fsync;
3817         fio->fi_start = start;
3818         fio->fi_end = end;
3819         fio->fi_fid = ll_inode2fid(inode);
3820         fio->fi_mode = mode;
3821         fio->fi_nr_written = 0;
3822
3823         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3824                 result = cl_io_loop(env, io);
3825         else
3826                 result = io->ci_result;
3827         if (result == 0)
3828                 result = fio->fi_nr_written;
3829         cl_io_fini(env, io);
3830         cl_env_put(env, &refcheck);
3831
3832         RETURN(result);
3833 }
3834
3835 /*
3836  * When dentry is provided (the 'else' case), file_dentry() may be
3837  * null and dentry must be used directly rather than pulled from
3838  * file_dentry() as is done otherwise.
3839  */
3840
3841 #ifdef HAVE_FILE_FSYNC_4ARGS
3842 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3843 {
3844         struct dentry *dentry = file_dentry(file);
3845         bool lock_inode;
3846 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3847 int ll_fsync(struct file *file, int datasync)
3848 {
3849         struct dentry *dentry = file_dentry(file);
3850         loff_t start = 0;
3851         loff_t end = LLONG_MAX;
3852 #else
3853 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3854 {
3855         loff_t start = 0;
3856         loff_t end = LLONG_MAX;
3857 #endif
3858         struct inode *inode = dentry->d_inode;
3859         struct ll_inode_info *lli = ll_i2info(inode);
3860         struct ptlrpc_request *req;
3861         int rc, err;
3862         ENTRY;
3863
3864         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3865                PFID(ll_inode2fid(inode)), inode);
3866         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3867
3868 #ifdef HAVE_FILE_FSYNC_4ARGS
3869         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3870         lock_inode = !lli->lli_inode_locked;
3871         if (lock_inode)
3872                 inode_lock(inode);
3873 #else
3874         /* fsync's caller has already called _fdata{sync,write}, we want
3875          * that IO to finish before calling the osc and mdc sync methods */
3876         rc = filemap_fdatawait(inode->i_mapping);
3877 #endif
3878
3879         /* catch async errors that were recorded back when async writeback
3880          * failed for pages in this mapping. */
3881         if (!S_ISDIR(inode->i_mode)) {
3882                 err = lli->lli_async_rc;
3883                 lli->lli_async_rc = 0;
3884                 if (rc == 0)
3885                         rc = err;
3886                 if (lli->lli_clob != NULL) {
3887                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3888                         if (rc == 0)
3889                                 rc = err;
3890                 }
3891         }
3892
3893         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3894         if (!rc)
3895                 rc = err;
3896         if (!err)
3897                 ptlrpc_req_finished(req);
3898
3899         if (S_ISREG(inode->i_mode)) {
3900                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3901
3902                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3903                 if (rc == 0 && err < 0)
3904                         rc = err;
3905                 if (rc < 0)
3906                         fd->fd_write_failed = true;
3907                 else
3908                         fd->fd_write_failed = false;
3909         }
3910
3911 #ifdef HAVE_FILE_FSYNC_4ARGS
3912         if (lock_inode)
3913                 inode_unlock(inode);
3914 #endif
3915         RETURN(rc);
3916 }
3917
3918 static int
3919 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3920 {
3921         struct inode *inode = file_inode(file);
3922         struct ll_sb_info *sbi = ll_i2sbi(inode);
3923         struct ldlm_enqueue_info einfo = {
3924                 .ei_type        = LDLM_FLOCK,
3925                 .ei_cb_cp       = ldlm_flock_completion_ast,
3926                 .ei_cbdata      = file_lock,
3927         };
3928         struct md_op_data *op_data;
3929         struct lustre_handle lockh = { 0 };
3930         union ldlm_policy_data flock = { { 0 } };
3931         int fl_type = file_lock->fl_type;
3932         __u64 flags = 0;
3933         int rc;
3934         int rc2 = 0;
3935         ENTRY;
3936
3937         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3938                PFID(ll_inode2fid(inode)), file_lock);
3939
3940         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3941
3942         if (file_lock->fl_flags & FL_FLOCK) {
3943                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3944                 /* flocks are whole-file locks */
3945                 flock.l_flock.end = OFFSET_MAX;
3946                 /* For flocks owner is determined by the local file desctiptor*/
3947                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3948         } else if (file_lock->fl_flags & FL_POSIX) {
3949                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3950                 flock.l_flock.start = file_lock->fl_start;
3951                 flock.l_flock.end = file_lock->fl_end;
3952         } else {
3953                 RETURN(-EINVAL);
3954         }
3955         flock.l_flock.pid = file_lock->fl_pid;
3956
3957         /* Somewhat ugly workaround for svc lockd.
3958          * lockd installs custom fl_lmops->lm_compare_owner that checks
3959          * for the fl_owner to be the same (which it always is on local node
3960          * I guess between lockd processes) and then compares pid.
3961          * As such we assign pid to the owner field to make it all work,
3962          * conflict with normal locks is unlikely since pid space and
3963          * pointer space for current->files are not intersecting */
3964         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3965                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3966
3967         switch (fl_type) {
3968         case F_RDLCK:
3969                 einfo.ei_mode = LCK_PR;
3970                 break;
3971         case F_UNLCK:
3972                 /* An unlock request may or may not have any relation to
3973                  * existing locks so we may not be able to pass a lock handle
3974                  * via a normal ldlm_lock_cancel() request. The request may even
3975                  * unlock a byte range in the middle of an existing lock. In
3976                  * order to process an unlock request we need all of the same
3977                  * information that is given with a normal read or write record
3978                  * lock request. To avoid creating another ldlm unlock (cancel)
3979                  * message we'll treat a LCK_NL flock request as an unlock. */
3980                 einfo.ei_mode = LCK_NL;
3981                 break;
3982         case F_WRLCK:
3983                 einfo.ei_mode = LCK_PW;
3984                 break;
3985         default:
3986                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
3987                 RETURN (-ENOTSUPP);
3988         }
3989
3990         switch (cmd) {
3991         case F_SETLKW:
3992 #ifdef F_SETLKW64
3993         case F_SETLKW64:
3994 #endif
3995                 flags = 0;
3996                 break;
3997         case F_SETLK:
3998 #ifdef F_SETLK64
3999         case F_SETLK64:
4000 #endif
4001                 flags = LDLM_FL_BLOCK_NOWAIT;
4002                 break;
4003         case F_GETLK:
4004 #ifdef F_GETLK64
4005         case F_GETLK64:
4006 #endif
4007                 flags = LDLM_FL_TEST_LOCK;
4008                 break;
4009         default:
4010                 CERROR("unknown fcntl lock command: %d\n", cmd);
4011                 RETURN (-EINVAL);
4012         }
4013
4014         /* Save the old mode so that if the mode in the lock changes we
4015          * can decrement the appropriate reader or writer refcount. */
4016         file_lock->fl_type = einfo.ei_mode;
4017
4018         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4019                                      LUSTRE_OPC_ANY, NULL);
4020         if (IS_ERR(op_data))
4021                 RETURN(PTR_ERR(op_data));
4022
4023         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4024                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4025                flock.l_flock.pid, flags, einfo.ei_mode,
4026                flock.l_flock.start, flock.l_flock.end);
4027
4028         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4029                         flags);
4030
4031         /* Restore the file lock type if not TEST lock. */
4032         if (!(flags & LDLM_FL_TEST_LOCK))
4033                 file_lock->fl_type = fl_type;
4034
4035 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4036         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4037             !(flags & LDLM_FL_TEST_LOCK))
4038                 rc2  = locks_lock_file_wait(file, file_lock);
4039 #else
4040         if ((file_lock->fl_flags & FL_FLOCK) &&
4041             (rc == 0 || file_lock->fl_type == F_UNLCK))
4042                 rc2  = flock_lock_file_wait(file, file_lock);
4043         if ((file_lock->fl_flags & FL_POSIX) &&
4044             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4045             !(flags & LDLM_FL_TEST_LOCK))
4046                 rc2  = posix_lock_file_wait(file, file_lock);
4047 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4048
4049         if (rc2 && file_lock->fl_type != F_UNLCK) {
4050                 einfo.ei_mode = LCK_NL;
4051                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4052                            &lockh, flags);
4053                 rc = rc2;
4054         }
4055
4056         ll_finish_md_op_data(op_data);
4057
4058         RETURN(rc);
4059 }
4060
4061 int ll_get_fid_by_name(struct inode *parent, const char *name,
4062                        int namelen, struct lu_fid *fid,
4063                        struct inode **inode)
4064 {
4065         struct md_op_data       *op_data = NULL;
4066         struct mdt_body         *body;
4067         struct ptlrpc_request   *req;
4068         int                     rc;
4069         ENTRY;
4070
4071         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4072                                      LUSTRE_OPC_ANY, NULL);
4073         if (IS_ERR(op_data))
4074                 RETURN(PTR_ERR(op_data));
4075
4076         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4077         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4078         ll_finish_md_op_data(op_data);
4079         if (rc < 0)
4080                 RETURN(rc);
4081
4082         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4083         if (body == NULL)
4084                 GOTO(out_req, rc = -EFAULT);
4085         if (fid != NULL)
4086                 *fid = body->mbo_fid1;
4087
4088         if (inode != NULL)
4089                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4090 out_req:
4091         ptlrpc_req_finished(req);
4092         RETURN(rc);
4093 }
4094
4095 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4096                const char *name)
4097 {
4098         struct dentry *dchild = NULL;
4099         struct inode *child_inode = NULL;
4100         struct md_op_data *op_data;
4101         struct ptlrpc_request *request = NULL;
4102         struct obd_client_handle *och = NULL;
4103         struct qstr qstr;
4104         struct mdt_body *body;
4105         __u64 data_version = 0;
4106         size_t namelen = strlen(name);
4107         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4108         int rc;
4109         ENTRY;
4110
4111         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4112                PFID(ll_inode2fid(parent)), name,
4113                lum->lum_stripe_offset, lum->lum_stripe_count);
4114
4115         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4116             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4117                 lustre_swab_lmv_user_md(lum);
4118
4119         /* Get child FID first */
4120         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4121         qstr.name = name;
4122         qstr.len = namelen;
4123         dchild = d_lookup(file_dentry(file), &qstr);
4124         if (dchild) {
4125                 if (dchild->d_inode)
4126                         child_inode = igrab(dchild->d_inode);
4127                 dput(dchild);
4128         }
4129
4130         if (!child_inode) {
4131                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4132                                         &child_inode);
4133                 if (rc)
4134                         RETURN(rc);
4135         }
4136
4137         if (!child_inode)
4138                 RETURN(-ENOENT);
4139
4140         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4141               OBD_CONNECT2_DIR_MIGRATE)) {
4142                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4143                     ll_i2info(child_inode)->lli_lsm_md) {
4144                         CERROR("%s: MDT doesn't support stripe directory "
4145                                "migration!\n",
4146                                ll_get_fsname(parent->i_sb, NULL, 0));
4147                         GOTO(out_iput, rc = -EOPNOTSUPP);
4148                 }
4149         }
4150
4151         /*
4152          * lfs migrate command needs to be blocked on the client
4153          * by checking the migrate FID against the FID of the
4154          * filesystem root.
4155          */
4156         if (child_inode == parent->i_sb->s_root->d_inode)
4157                 GOTO(out_iput, rc = -EINVAL);
4158
4159         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4160                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4161         if (IS_ERR(op_data))
4162                 GOTO(out_iput, rc = PTR_ERR(op_data));
4163
4164         inode_lock(child_inode);
4165         op_data->op_fid3 = *ll_inode2fid(child_inode);
4166         if (!fid_is_sane(&op_data->op_fid3)) {
4167                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4168                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4169                        PFID(&op_data->op_fid3));
4170                 GOTO(out_unlock, rc = -EINVAL);
4171         }
4172
4173         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4174         op_data->op_data = lum;
4175         op_data->op_data_size = lumlen;
4176
4177 again:
4178         if (S_ISREG(child_inode->i_mode)) {
4179                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4180                 if (IS_ERR(och)) {
4181                         rc = PTR_ERR(och);
4182                         och = NULL;
4183                         GOTO(out_unlock, rc);
4184                 }
4185
4186                 rc = ll_data_version(child_inode, &data_version,
4187                                      LL_DV_WR_FLUSH);
4188                 if (rc != 0)
4189                         GOTO(out_close, rc);
4190
4191                 op_data->op_open_handle = och->och_open_handle;
4192                 op_data->op_data_version = data_version;
4193                 op_data->op_lease_handle = och->och_lease_handle;
4194                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4195
4196                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4197                 och->och_mod->mod_open_req->rq_replay = 0;
4198                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4199         }
4200
4201         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4202                        name, namelen, &request);
4203         if (rc == 0) {
4204                 LASSERT(request != NULL);
4205                 ll_update_times(request, parent);
4206
4207                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4208                 LASSERT(body != NULL);
4209
4210                 /* If the server does release layout lock, then we cleanup
4211                  * the client och here, otherwise release it in out_close: */
4212                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4213                         obd_mod_put(och->och_mod);
4214                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4215                                                   och);
4216                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4217                         OBD_FREE_PTR(och);
4218                         och = NULL;
4219                 }
4220         }
4221
4222         if (request != NULL) {
4223                 ptlrpc_req_finished(request);
4224                 request = NULL;
4225         }
4226
4227         /* Try again if the file layout has changed. */
4228         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4229                 goto again;
4230
4231 out_close:
4232         if (och)
4233                 ll_lease_close(och, child_inode, NULL);
4234         if (!rc)
4235                 clear_nlink(child_inode);
4236 out_unlock:
4237         inode_unlock(child_inode);
4238         ll_finish_md_op_data(op_data);
4239 out_iput:
4240         iput(child_inode);
4241         RETURN(rc);
4242 }
4243
4244 static int
4245 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4246 {
4247         ENTRY;
4248
4249         RETURN(-ENOSYS);
4250 }
4251
4252 /**
4253  * test if some locks matching bits and l_req_mode are acquired
4254  * - bits can be in different locks
4255  * - if found clear the common lock bits in *bits
4256  * - the bits not found, are kept in *bits
4257  * \param inode [IN]
4258  * \param bits [IN] searched lock bits [IN]
4259  * \param l_req_mode [IN] searched lock mode
4260  * \retval boolean, true iff all bits are found
4261  */
4262 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4263 {
4264         struct lustre_handle lockh;
4265         union ldlm_policy_data policy;
4266         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4267                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4268         struct lu_fid *fid;
4269         __u64 flags;
4270         int i;
4271         ENTRY;
4272
4273         if (!inode)
4274                RETURN(0);
4275
4276         fid = &ll_i2info(inode)->lli_fid;
4277         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4278                ldlm_lockname[mode]);
4279
4280         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4281         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4282                 policy.l_inodebits.bits = *bits & (1 << i);
4283                 if (policy.l_inodebits.bits == 0)
4284                         continue;
4285
4286                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4287                                   &policy, mode, &lockh)) {
4288                         struct ldlm_lock *lock;
4289
4290                         lock = ldlm_handle2lock(&lockh);
4291                         if (lock) {
4292                                 *bits &=
4293                                       ~(lock->l_policy_data.l_inodebits.bits);
4294                                 LDLM_LOCK_PUT(lock);
4295                         } else {
4296                                 *bits &= ~policy.l_inodebits.bits;
4297                         }
4298                 }
4299         }
4300         RETURN(*bits == 0);
4301 }
4302
4303 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4304                                struct lustre_handle *lockh, __u64 flags,
4305                                enum ldlm_mode mode)
4306 {
4307         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4308         struct lu_fid *fid;
4309         enum ldlm_mode rc;
4310         ENTRY;
4311
4312         fid = &ll_i2info(inode)->lli_fid;
4313         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4314
4315         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4316                            fid, LDLM_IBITS, &policy, mode, lockh);
4317
4318         RETURN(rc);
4319 }
4320
4321 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4322 {
4323         /* Already unlinked. Just update nlink and return success */
4324         if (rc == -ENOENT) {
4325                 clear_nlink(inode);
4326                 /* If it is striped directory, and there is bad stripe
4327                  * Let's revalidate the dentry again, instead of returning
4328                  * error */
4329                 if (S_ISDIR(inode->i_mode) &&
4330                     ll_i2info(inode)->lli_lsm_md != NULL)
4331                         return 0;
4332
4333                 /* This path cannot be hit for regular files unless in
4334                  * case of obscure races, so no need to to validate
4335                  * size. */
4336                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4337                         return 0;
4338         } else if (rc != 0) {
4339                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4340                              "%s: revalidate FID "DFID" error: rc = %d\n",
4341                              ll_get_fsname(inode->i_sb, NULL, 0),
4342                              PFID(ll_inode2fid(inode)), rc);
4343         }
4344
4345         return rc;
4346 }
4347
4348 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4349 {
4350         struct inode *inode = dentry->d_inode;
4351         struct obd_export *exp = ll_i2mdexp(inode);
4352         struct lookup_intent oit = {
4353                 .it_op = op,
4354         };
4355         struct ptlrpc_request *req = NULL;
4356         struct md_op_data *op_data;
4357         int rc = 0;
4358         ENTRY;
4359
4360         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4361                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4362
4363         /* Call getattr by fid, so do not provide name at all. */
4364         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4365                                      LUSTRE_OPC_ANY, NULL);
4366         if (IS_ERR(op_data))
4367                 RETURN(PTR_ERR(op_data));
4368
4369         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4370         ll_finish_md_op_data(op_data);
4371         if (rc < 0) {
4372                 rc = ll_inode_revalidate_fini(inode, rc);
4373                 GOTO(out, rc);
4374         }
4375
4376         rc = ll_revalidate_it_finish(req, &oit, dentry);
4377         if (rc != 0) {
4378                 ll_intent_release(&oit);
4379                 GOTO(out, rc);
4380         }
4381
4382         /* Unlinked? Unhash dentry, so it is not picked up later by
4383          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4384          * here to preserve get_cwd functionality on 2.6.
4385          * Bug 10503 */
4386         if (!dentry->d_inode->i_nlink) {
4387                 ll_lock_dcache(inode);
4388                 d_lustre_invalidate(dentry, 0);
4389                 ll_unlock_dcache(inode);
4390         }
4391
4392         ll_lookup_finish_locks(&oit, dentry);
4393 out:
4394         ptlrpc_req_finished(req);
4395
4396         return rc;
4397 }
4398
4399 static int ll_merge_md_attr(struct inode *inode)
4400 {
4401         struct cl_attr attr = { 0 };
4402         int rc;
4403
4404         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4405         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4406                            &attr, ll_md_blocking_ast);
4407         if (rc != 0)
4408                 RETURN(rc);
4409
4410         set_nlink(inode, attr.cat_nlink);
4411         inode->i_blocks = attr.cat_blocks;
4412         i_size_write(inode, attr.cat_size);
4413
4414         ll_i2info(inode)->lli_atime = attr.cat_atime;
4415         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4416         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4417
4418         RETURN(0);
4419 }
4420
4421 static inline dev_t ll_compat_encode_dev(dev_t dev)
4422 {
4423         /* The compat_sys_*stat*() syscalls will fail unless the
4424          * device majors and minors are both less than 256. Note that
4425          * the value returned here will be passed through
4426          * old_encode_dev() in cp_compat_stat(). And so we are not
4427          * trying to return a valid compat (u16) device number, just
4428          * one that will pass the old_valid_dev() check. */
4429
4430         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4431 }
4432
4433 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4434 int ll_getattr(const struct path *path, struct kstat *stat,
4435                u32 request_mask, unsigned int flags)
4436 {
4437         struct dentry *de = path->dentry;
4438 #else
4439 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4440 {
4441 #endif
4442         struct inode *inode = de->d_inode;
4443         struct ll_sb_info *sbi = ll_i2sbi(inode);
4444         struct ll_inode_info *lli = ll_i2info(inode);
4445         int rc;
4446
4447         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4448
4449         rc = ll_inode_revalidate(de, IT_GETATTR);
4450         if (rc < 0)
4451                 RETURN(rc);
4452
4453         if (S_ISREG(inode->i_mode)) {
4454                 /* In case of restore, the MDT has the right size and has
4455                  * already send it back without granting the layout lock,
4456                  * inode is up-to-date so glimpse is useless.
4457                  * Also to glimpse we need the layout, in case of a running
4458                  * restore the MDT holds the layout lock so the glimpse will
4459                  * block up to the end of restore (getattr will block)
4460                  */
4461                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4462                         rc = ll_glimpse_size(inode);
4463                         if (rc < 0)
4464                                 RETURN(rc);
4465                 }
4466         } else {
4467                 /* If object isn't regular a file then don't validate size. */
4468                 if (S_ISDIR(inode->i_mode) &&
4469                     lli->lli_lsm_md != NULL) {
4470                         rc = ll_merge_md_attr(inode);
4471                         if (rc < 0)
4472                                 RETURN(rc);
4473                 }
4474
4475                 LTIME_S(inode->i_atime) = lli->lli_atime;
4476                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4477                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4478         }
4479
4480         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4481
4482         if (ll_need_32bit_api(sbi)) {
4483                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4484                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4485                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4486         } else {
4487                 stat->ino = inode->i_ino;
4488                 stat->dev = inode->i_sb->s_dev;
4489                 stat->rdev = inode->i_rdev;
4490         }
4491
4492         stat->mode = inode->i_mode;
4493         stat->uid = inode->i_uid;
4494         stat->gid = inode->i_gid;
4495         stat->atime = inode->i_atime;
4496         stat->mtime = inode->i_mtime;
4497         stat->ctime = inode->i_ctime;
4498         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4499
4500         stat->nlink = inode->i_nlink;
4501         stat->size = i_size_read(inode);
4502         stat->blocks = inode->i_blocks;
4503
4504         return 0;
4505 }
4506
4507 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4508                      __u64 start, __u64 len)
4509 {
4510         int             rc;
4511         size_t          num_bytes;
4512         struct fiemap   *fiemap;
4513         unsigned int    extent_count = fieinfo->fi_extents_max;
4514
4515         num_bytes = sizeof(*fiemap) + (extent_count *
4516                                        sizeof(struct fiemap_extent));
4517         OBD_ALLOC_LARGE(fiemap, num_bytes);
4518
4519         if (fiemap == NULL)
4520                 RETURN(-ENOMEM);
4521
4522         fiemap->fm_flags = fieinfo->fi_flags;
4523         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4524         fiemap->fm_start = start;
4525         fiemap->fm_length = len;
4526         if (extent_count > 0 &&
4527             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4528                            sizeof(struct fiemap_extent)) != 0)
4529                 GOTO(out, rc = -EFAULT);
4530
4531         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4532
4533         fieinfo->fi_flags = fiemap->fm_flags;
4534         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4535         if (extent_count > 0 &&
4536             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4537                          fiemap->fm_mapped_extents *
4538                          sizeof(struct fiemap_extent)) != 0)
4539                 GOTO(out, rc = -EFAULT);
4540 out:
4541         OBD_FREE_LARGE(fiemap, num_bytes);
4542         return rc;
4543 }
4544
4545 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4546 {
4547         struct ll_inode_info *lli = ll_i2info(inode);
4548         struct posix_acl *acl = NULL;
4549         ENTRY;
4550
4551         spin_lock(&lli->lli_lock);
4552         /* VFS' acl_permission_check->check_acl will release the refcount */
4553         acl = posix_acl_dup(lli->lli_posix_acl);
4554         spin_unlock(&lli->lli_lock);
4555
4556         RETURN(acl);
4557 }
4558
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/*
 * set_acl inode operation: store (or, when \a acl is NULL, remove) the ACL of
 * the given \a type as an extended attribute through md_setxattr(), then
 * update or forget the cached ACL depending on the outcome.
 *
 * Returns 0 on success, -EINVAL for an unknown type, -EACCES when a default
 * ACL is set on a non-directory, -ENOMEM on allocation failure, or the error
 * of the helper that failed.
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *request = NULL;
        const char *xattr_name = NULL;
        size_t xattr_len = 0;
        char *xattr_buf = NULL;
        int rc = 0;
        ENTRY;

        if (type == ACL_TYPE_ACCESS) {
                xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
                /* may rewrite i_mode and replace acl */
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
        } else if (type == ACL_TYPE_DEFAULT) {
                xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* default ACLs exist on directories only */
                if (!S_ISDIR(inode->i_mode))
                        rc = acl ? -EACCES : 0;
        } else {
                rc = -EINVAL;
        }
        if (rc)
                return rc;

        if (acl) {
                /* serialize the ACL into its xattr representation */
                xattr_len = posix_acl_xattr_size(acl->a_count);
                xattr_buf = kmalloc(xattr_len, GFP_NOFS);
                if (xattr_buf == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, xattr_buf,
                                        xattr_len);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* no value means the xattr is to be removed */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         xattr_buf ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         xattr_name, xattr_buf, xattr_len, 0, 0, &request);

        ptlrpc_req_finished(request);
out_value:
        kfree(xattr_buf);
out:
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4618
4619 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4620 static int
4621 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4622 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4623 # else
4624 ll_check_acl(struct inode *inode, int mask)
4625 # endif
4626 {
4627 # ifdef CONFIG_FS_POSIX_ACL
4628         struct posix_acl *acl;
4629         int rc;
4630         ENTRY;
4631
4632 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4633         if (flags & IPERM_FLAG_RCU)
4634                 return -ECHILD;
4635 #  endif
4636         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4637
4638         if (!acl)
4639                 RETURN(-EAGAIN);
4640
4641         rc = posix_acl_permission(inode, acl, mask);
4642         posix_acl_release(acl);
4643
4644         RETURN(rc);
4645 # else /* !CONFIG_FS_POSIX_ACL */
4646         return -EAGAIN;
4647 # endif /* CONFIG_FS_POSIX_ACL */
4648 }
4649 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4650
4651 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4652 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4653 #else
4654 # ifdef HAVE_INODE_PERMISION_2ARGS
4655 int ll_inode_permission(struct inode *inode, int mask)
4656 # else
4657 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4658 # endif
4659 #endif
4660 {
4661         int rc = 0;
4662         struct ll_sb_info *sbi;
4663         struct root_squash_info *squash;
4664         struct cred *cred = NULL;
4665         const struct cred *old_cred = NULL;
4666         cfs_cap_t cap;
4667         bool squash_id = false;
4668         ENTRY;
4669
4670 #ifdef MAY_NOT_BLOCK
4671         if (mask & MAY_NOT_BLOCK)
4672                 return -ECHILD;
4673 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4674         if (flags & IPERM_FLAG_RCU)
4675                 return -ECHILD;
4676 #endif
4677
4678        /* as root inode are NOT getting validated in lookup operation,
4679         * need to do it before permission check. */
4680
4681         if (inode == inode->i_sb->s_root->d_inode) {
4682                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4683                 if (rc)
4684                         RETURN(rc);
4685         }
4686
4687         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4688                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4689
4690         /* squash fsuid/fsgid if needed */
4691         sbi = ll_i2sbi(inode);
4692         squash = &sbi->ll_squash;
4693         if (unlikely(squash->rsi_uid != 0 &&
4694                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4695                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4696                         squash_id = true;
4697         }
4698         if (squash_id) {
4699                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4700                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4701                        squash->rsi_uid, squash->rsi_gid);
4702
4703                 /* update current process's credentials
4704                  * and FS capability */
4705                 cred = prepare_creds();
4706                 if (cred == NULL)
4707                         RETURN(-ENOMEM);
4708
4709                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4710                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4711                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4712                         if ((1 << cap) & CFS_CAP_FS_MASK)
4713                                 cap_lower(cred->cap_effective, cap);
4714                 }
4715                 old_cred = override_creds(cred);
4716         }
4717
4718         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4719         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4720         /* restore current process's credentials and FS capability */
4721         if (squash_id) {
4722                 revert_creds(old_cred);
4723                 put_cred(cred);
4724         }
4725
4726         RETURN(rc);
4727 }
4728
4729 /* -o localflock - only provides locally consistent flock locks */
4730 struct file_operations ll_file_operations = {
4731 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4732 # ifdef HAVE_SYNC_READ_WRITE
4733         .read           = new_sync_read,
4734         .write          = new_sync_write,
4735 # endif
4736         .read_iter      = ll_file_read_iter,
4737         .write_iter     = ll_file_write_iter,
4738 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4739         .read           = ll_file_read,
4740         .aio_read       = ll_file_aio_read,
4741         .write          = ll_file_write,
4742         .aio_write      = ll_file_aio_write,
4743 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4744         .unlocked_ioctl = ll_file_ioctl,
4745         .open           = ll_file_open,
4746         .release        = ll_file_release,
4747         .mmap           = ll_file_mmap,
4748         .llseek         = ll_file_seek,
4749         .splice_read    = ll_file_splice_read,
4750         .fsync          = ll_fsync,
4751         .flush          = ll_flush
4752 };
4753
4754 struct file_operations ll_file_operations_flock = {
4755 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4756 # ifdef HAVE_SYNC_READ_WRITE
4757         .read           = new_sync_read,
4758         .write          = new_sync_write,
4759 # endif /* HAVE_SYNC_READ_WRITE */
4760         .read_iter      = ll_file_read_iter,
4761         .write_iter     = ll_file_write_iter,
4762 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4763         .read           = ll_file_read,
4764         .aio_read       = ll_file_aio_read,
4765         .write          = ll_file_write,
4766         .aio_write      = ll_file_aio_write,
4767 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4768         .unlocked_ioctl = ll_file_ioctl,
4769         .open           = ll_file_open,
4770         .release        = ll_file_release,
4771         .mmap           = ll_file_mmap,
4772         .llseek         = ll_file_seek,
4773         .splice_read    = ll_file_splice_read,
4774         .fsync          = ll_fsync,
4775         .flush          = ll_flush,
4776         .flock          = ll_file_flock,
4777         .lock           = ll_file_flock
4778 };
4779
4780 /* These are for -o noflock - to return ENOSYS on flock calls */
4781 struct file_operations ll_file_operations_noflock = {
4782 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4783 # ifdef HAVE_SYNC_READ_WRITE
4784         .read           = new_sync_read,
4785         .write          = new_sync_write,
4786 # endif /* HAVE_SYNC_READ_WRITE */
4787         .read_iter      = ll_file_read_iter,
4788         .write_iter     = ll_file_write_iter,
4789 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4790         .read           = ll_file_read,
4791         .aio_read       = ll_file_aio_read,
4792         .write          = ll_file_write,
4793         .aio_write      = ll_file_aio_write,
4794 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4795         .unlocked_ioctl = ll_file_ioctl,
4796         .open           = ll_file_open,
4797         .release        = ll_file_release,
4798         .mmap           = ll_file_mmap,
4799         .llseek         = ll_file_seek,
4800         .splice_read    = ll_file_splice_read,
4801         .fsync          = ll_fsync,
4802         .flush          = ll_flush,
4803         .flock          = ll_file_noflock,
4804         .lock           = ll_file_noflock
4805 };
4806
4807 struct inode_operations ll_file_inode_operations = {
4808         .setattr        = ll_setattr,
4809         .getattr        = ll_getattr,
4810         .permission     = ll_inode_permission,
4811 #ifdef HAVE_IOP_XATTR
4812         .setxattr       = ll_setxattr,
4813         .getxattr       = ll_getxattr,
4814         .removexattr    = ll_removexattr,
4815 #endif
4816         .listxattr      = ll_listxattr,
4817         .fiemap         = ll_fiemap,
4818 #ifdef HAVE_IOP_GET_ACL
4819         .get_acl        = ll_get_acl,
4820 #endif
4821 #ifdef HAVE_IOP_SET_ACL
4822         .set_acl        = ll_set_acl,
4823 #endif
4824 };
4825
4826 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4827 {
4828         struct ll_inode_info *lli = ll_i2info(inode);
4829         struct cl_object *obj = lli->lli_clob;
4830         struct lu_env *env;
4831         int rc;
4832         __u16 refcheck;
4833         ENTRY;
4834
4835         if (obj == NULL)
4836                 RETURN(0);
4837
4838         env = cl_env_get(&refcheck);
4839         if (IS_ERR(env))
4840                 RETURN(PTR_ERR(env));
4841
4842         rc = cl_conf_set(env, lli->lli_clob, conf);
4843         if (rc < 0)
4844                 GOTO(out, rc);
4845
4846         if (conf->coc_opc == OBJECT_CONF_SET) {
4847                 struct ldlm_lock *lock = conf->coc_lock;
4848                 struct cl_layout cl = {
4849                         .cl_layout_gen = 0,
4850                 };
4851
4852                 LASSERT(lock != NULL);
4853                 LASSERT(ldlm_has_layout(lock));
4854
4855                 /* it can only be allowed to match after layout is
4856                  * applied to inode otherwise false layout would be
4857                  * seen. Applying layout shoud happen before dropping
4858                  * the intent lock. */
4859                 ldlm_lock_allow_match(lock);
4860
4861                 rc = cl_object_layout_get(env, obj, &cl);
4862                 if (rc < 0)
4863                         GOTO(out, rc);
4864
4865                 CDEBUG(D_VFSTRACE,
4866                        DFID": layout version change: %u -> %u\n",
4867                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4868                        cl.cl_layout_gen);
4869                 ll_layout_version_set(lli, cl.cl_layout_gen);
4870         }
4871
4872 out:
4873         cl_env_put(env, &refcheck);
4874
4875         RETURN(rc);
4876 }
4877
4878 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4879 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4880
4881 {
4882         struct ll_sb_info *sbi = ll_i2sbi(inode);
4883         struct ptlrpc_request *req;
4884         struct mdt_body *body;
4885         void *lvbdata;
4886         void *lmm;
4887         int lmmsize;
4888         int rc;
4889         ENTRY;
4890
4891         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4892                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4893                lock->l_lvb_data, lock->l_lvb_len);
4894
4895         if (lock->l_lvb_data != NULL)
4896                 RETURN(0);
4897
4898         /* if layout lock was granted right away, the layout is returned
4899          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4900          * blocked and then granted via completion ast, we have to fetch
4901          * layout here. Please note that we can't use the LVB buffer in
4902          * completion AST because it doesn't have a large enough buffer */
4903         rc = ll_get_default_mdsize(sbi, &lmmsize);
4904         if (rc == 0)
4905                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4906                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4907         if (rc < 0)
4908                 RETURN(rc);
4909
4910         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4911         if (body == NULL)
4912                 GOTO(out, rc = -EPROTO);
4913
4914         lmmsize = body->mbo_eadatasize;
4915         if (lmmsize == 0) /* empty layout */
4916                 GOTO(out, rc = 0);
4917
4918         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4919         if (lmm == NULL)
4920                 GOTO(out, rc = -EFAULT);
4921
4922         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4923         if (lvbdata == NULL)
4924                 GOTO(out, rc = -ENOMEM);
4925
4926         memcpy(lvbdata, lmm, lmmsize);
4927         lock_res_and_lock(lock);
4928         if (unlikely(lock->l_lvb_data == NULL)) {
4929                 lock->l_lvb_type = LVB_T_LAYOUT;
4930                 lock->l_lvb_data = lvbdata;
4931                 lock->l_lvb_len = lmmsize;
4932                 lvbdata = NULL;
4933         }
4934         unlock_res_and_lock(lock);
4935
4936         if (lvbdata)
4937                 OBD_FREE_LARGE(lvbdata, lmmsize);
4938
4939         EXIT;
4940
4941 out:
4942         ptlrpc_req_finished(req);
4943         return rc;
4944 }
4945
4946 /**
4947  * Apply the layout to the inode. Layout lock is held and will be released
4948  * in this function.
4949  */
4950 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4951                               struct inode *inode)
4952 {
4953         struct ll_inode_info *lli = ll_i2info(inode);
4954         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4955         struct ldlm_lock *lock;
4956         struct cl_object_conf conf;
4957         int rc = 0;
4958         bool lvb_ready;
4959         bool wait_layout = false;
4960         ENTRY;
4961
4962         LASSERT(lustre_handle_is_used(lockh));
4963
4964         lock = ldlm_handle2lock(lockh);
4965         LASSERT(lock != NULL);
4966         LASSERT(ldlm_has_layout(lock));
4967
4968         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4969                    PFID(&lli->lli_fid), inode);
4970
4971         /* in case this is a caching lock and reinstate with new inode */
4972         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4973
4974         lock_res_and_lock(lock);
4975         lvb_ready = ldlm_is_lvb_ready(lock);
4976         unlock_res_and_lock(lock);
4977
4978         /* checking lvb_ready is racy but this is okay. The worst case is
4979          * that multi processes may configure the file on the same time. */
4980         if (lvb_ready)
4981                 GOTO(out, rc = 0);
4982
4983         rc = ll_layout_fetch(inode, lock);
4984         if (rc < 0)
4985                 GOTO(out, rc);
4986
4987         /* for layout lock, lmm is stored in lock's lvb.
4988          * lvb_data is immutable if the lock is held so it's safe to access it
4989          * without res lock.
4990          *
4991          * set layout to file. Unlikely this will fail as old layout was
4992          * surely eliminated */
4993         memset(&conf, 0, sizeof conf);
4994         conf.coc_opc = OBJECT_CONF_SET;
4995         conf.coc_inode = inode;
4996         conf.coc_lock = lock;
4997         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4998         conf.u.coc_layout.lb_len = lock->l_lvb_len;
4999         rc = ll_layout_conf(inode, &conf);
5000
5001         /* refresh layout failed, need to wait */
5002         wait_layout = rc == -EBUSY;
5003         EXIT;
5004 out:
5005         LDLM_LOCK_PUT(lock);
5006         ldlm_lock_decref(lockh, mode);
5007
5008         /* wait for IO to complete if it's still being used. */
5009         if (wait_layout) {
5010                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5011                        ll_get_fsname(inode->i_sb, NULL, 0),
5012                        PFID(&lli->lli_fid), inode);
5013
5014                 memset(&conf, 0, sizeof conf);
5015                 conf.coc_opc = OBJECT_CONF_WAIT;
5016                 conf.coc_inode = inode;
5017                 rc = ll_layout_conf(inode, &conf);
5018                 if (rc == 0)
5019                         rc = -EAGAIN;
5020
5021                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5022                        ll_get_fsname(inode->i_sb, NULL, 0),
5023                        PFID(&lli->lli_fid), rc);
5024         }
5025         RETURN(rc);
5026 }
5027
5028 /**
5029  * Issue layout intent RPC to MDS.
5030  * \param inode [in]    file inode
5031  * \param intent [in]   layout intent
5032  *
5033  * \retval 0    on success
5034  * \retval < 0  error code
5035  */
5036 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5037 {
5038         struct ll_inode_info  *lli = ll_i2info(inode);
5039         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5040         struct md_op_data     *op_data;
5041         struct lookup_intent it;
5042         struct ptlrpc_request *req;
5043         int rc;
5044         ENTRY;
5045
5046         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5047                                      0, 0, LUSTRE_OPC_ANY, NULL);
5048         if (IS_ERR(op_data))
5049                 RETURN(PTR_ERR(op_data));
5050
5051         op_data->op_data = intent;
5052         op_data->op_data_size = sizeof(*intent);
5053
5054         memset(&it, 0, sizeof(it));
5055         it.it_op = IT_LAYOUT;
5056         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5057             intent->li_opc == LAYOUT_INTENT_TRUNC)
5058                 it.it_flags = FMODE_WRITE;
5059
5060         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5061                           ll_get_fsname(inode->i_sb, NULL, 0),
5062                           PFID(&lli->lli_fid), inode);
5063
5064         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5065                             &ll_md_blocking_ast, 0);
5066         if (it.it_request != NULL)
5067                 ptlrpc_req_finished(it.it_request);
5068         it.it_request = NULL;
5069
5070         ll_finish_md_op_data(op_data);
5071
5072         /* set lock data in case this is a new lock */
5073         if (!rc)
5074                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5075
5076         ll_intent_drop_lock(&it);
5077
5078         RETURN(rc);
5079 }
5080
5081 /**
5082  * This function checks if there exists a LAYOUT lock on the client side,
5083  * or enqueues it if it doesn't have one in cache.
5084  *
5085  * This function will not hold layout lock so it may be revoked any time after
5086  * this function returns. Any operations depend on layout should be redone
5087  * in that case.
5088  *
5089  * This function should be called before lov_io_init() to get an uptodate
5090  * layout version, the caller should save the version number and after IO
5091  * is finished, this function should be called again to verify that layout
5092  * is not changed during IO time.
5093  */
5094 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5095 {
5096         struct ll_inode_info    *lli = ll_i2info(inode);
5097         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5098         struct lustre_handle lockh;
5099         struct layout_intent intent = {
5100                 .li_opc = LAYOUT_INTENT_ACCESS,
5101         };
5102         enum ldlm_mode mode;
5103         int rc;
5104         ENTRY;
5105
5106         *gen = ll_layout_version_get(lli);
5107         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5108                 RETURN(0);
5109
5110         /* sanity checks */
5111         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5112         LASSERT(S_ISREG(inode->i_mode));
5113
5114         /* take layout lock mutex to enqueue layout lock exclusively. */
5115         mutex_lock(&lli->lli_layout_mutex);
5116
5117         while (1) {
5118                 /* mostly layout lock is caching on the local side, so try to
5119                  * match it before grabbing layout lock mutex. */
5120                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5121                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5122                 if (mode != 0) { /* hit cached lock */
5123                         rc = ll_layout_lock_set(&lockh, mode, inode);
5124                         if (rc == -EAGAIN)
5125                                 continue;
5126                         break;
5127                 }
5128
5129                 rc = ll_layout_intent(inode, &intent);
5130                 if (rc != 0)
5131                         break;
5132         }
5133
5134         if (rc == 0)
5135                 *gen = ll_layout_version_get(lli);
5136         mutex_unlock(&lli->lli_layout_mutex);
5137
5138         RETURN(rc);
5139 }
5140
5141 /**
5142  * Issue layout intent RPC indicating where in a file an IO is about to write.
5143  *
5144  * \param[in] inode     file inode.
5145  * \param[in] ext       write range with start offset of fille in bytes where
5146  *                      an IO is about to write, and exclusive end offset in
5147  *                      bytes.
5148  *
5149  * \retval 0    on success
5150  * \retval < 0  error code
5151  */
5152 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5153                            struct lu_extent *ext)
5154 {
5155         struct layout_intent intent = {
5156                 .li_opc = opc,
5157                 .li_extent.e_start = ext->e_start,
5158                 .li_extent.e_end = ext->e_end,
5159         };
5160         int rc;
5161         ENTRY;
5162
5163         rc = ll_layout_intent(inode, &intent);
5164
5165         RETURN(rc);
5166 }
5167
5168 /**
5169  *  This function send a restore request to the MDT
5170  */
5171 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5172 {
5173         struct hsm_user_request *hur;
5174         int                      len, rc;
5175         ENTRY;
5176
5177         len = sizeof(struct hsm_user_request) +
5178               sizeof(struct hsm_user_item);
5179         OBD_ALLOC(hur, len);
5180         if (hur == NULL)
5181                 RETURN(-ENOMEM);
5182
5183         hur->hur_request.hr_action = HUA_RESTORE;
5184         hur->hur_request.hr_archive_id = 0;
5185         hur->hur_request.hr_flags = 0;
5186         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5187                sizeof(hur->hur_user_item[0].hui_fid));
5188         hur->hur_user_item[0].hui_extent.offset = offset;
5189         hur->hur_user_item[0].hui_extent.length = length;
5190         hur->hur_request.hr_itemcount = 1;
5191         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5192                            len, hur, NULL);
5193         OBD_FREE(hur, len);
5194         RETURN(rc);
5195 }