4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
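/* Allocate per-open file data from ll_file_data_slab; returns NULL if the
 * slab allocation fails. */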
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
104 ATTR_CTIME | ATTR_CTIME_SET;
105 op_data->op_attr_blocks = inode->i_blocks;
106 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
107 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
108 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
109 op_data->op_handle = och->och_fh;
111 if (och->och_flags & FMODE_WRITE &&
112 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
113 /* For HSM: if inode data has been modified, pack it so that
114 * MDT can set data dirty flag in the archive. */
115 op_data->op_bias |= MDS_DATA_MODIFIED;
121 * Perform a close, possibly with a bias.
122 * The meaning of "data" depends on the value of "bias".
124 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
125 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
128 static int ll_close_inode_openhandle(struct inode *inode,
129 struct obd_client_handle *och,
130 enum mds_op_bias bias, void *data)
132 struct obd_export *md_exp = ll_i2mdexp(inode);
133 const struct ll_inode_info *lli = ll_i2info(inode);
134 struct md_op_data *op_data;
135 struct ptlrpc_request *req = NULL;
139 if (class_exp2obd(md_exp) == NULL) {
140 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
141 ll_get_fsname(inode->i_sb, NULL, 0),
142 PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
147	/* We leak openhandle and request here on error, but there is not much to
148	 * be done in the OOM case since the app won't retry the close on error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
158 case MDS_CLOSE_LAYOUT_SPLIT:
159 case MDS_CLOSE_LAYOUT_SWAP: {
160 struct split_param *sp = data;
162 LASSERT(data != NULL);
163 op_data->op_bias |= bias;
164 op_data->op_data_version = 0;
165 op_data->op_lease_handle = och->och_lease_handle;
166 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
167 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
168 op_data->op_mirror_id = sp->sp_mirror_id;
170 op_data->op_fid2 = *ll_inode2fid(data);
175 case MDS_CLOSE_RESYNC_DONE: {
176 struct ll_ioc_lease *ioc = data;
178 LASSERT(data != NULL);
179 op_data->op_attr_blocks +=
180 ioc->lil_count * op_data->op_attr_blocks;
181 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
182 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
184 op_data->op_lease_handle = och->och_lease_handle;
185 op_data->op_data = &ioc->lil_ids[0];
186 op_data->op_data_size =
187 ioc->lil_count * sizeof(ioc->lil_ids[0]);
191 case MDS_HSM_RELEASE:
192 LASSERT(data != NULL);
193 op_data->op_bias |= MDS_HSM_RELEASE;
194 op_data->op_data_version = *(__u64 *)data;
195 op_data->op_lease_handle = och->och_lease_handle;
196 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
200 LASSERT(data == NULL);
204 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
205 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
206 if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
207 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
209 rc = md_close(md_exp, op_data, och->och_mod, &req);
210 if (rc != 0 && rc != -EINTR)
211 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
212 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
214 if (rc == 0 && op_data->op_bias & bias) {
215 struct mdt_body *body;
217 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
218 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
222 ll_finish_md_op_data(op_data);
226 md_clear_open_replay_data(md_exp, och);
227 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
230 ptlrpc_req_finished(req); /* This is close request */
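/* Drop the MDS open handle that matches @fmode (read, write or exec) once
 * the last local user of that handle is gone. */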
234 int ll_md_real_close(struct inode *inode, fmode_t fmode)
236 struct ll_inode_info *lli = ll_i2info(inode);
237 struct obd_client_handle **och_p;
238 struct obd_client_handle *och;
243 if (fmode & FMODE_WRITE) {
244 och_p = &lli->lli_mds_write_och;
245 och_usecount = &lli->lli_open_fd_write_count;
246 } else if (fmode & FMODE_EXEC) {
247 och_p = &lli->lli_mds_exec_och;
248 och_usecount = &lli->lli_open_fd_exec_count;
250 LASSERT(fmode & FMODE_READ);
251 och_p = &lli->lli_mds_read_och;
252 och_usecount = &lli->lli_open_fd_read_count;
255 mutex_lock(&lli->lli_och_mutex);
256 if (*och_usecount > 0) {
257 /* There are still users of this handle, so skip
259 mutex_unlock(&lli->lli_och_mutex);
265 mutex_unlock(&lli->lli_och_mutex);
268 /* There might be a race and this handle may already
270 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
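/* Per-descriptor close: release any group lock or lease still attached to
 * this file descriptor, update the open-by-mode counts, and call
 * ll_md_real_close() unless a cached OPEN lock lets us skip the MDS RPC. */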
276 static int ll_md_close(struct inode *inode, struct file *file)
278 union ldlm_policy_data policy = {
279 .l_inodebits = { MDS_INODELOCK_OPEN },
281 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
282 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
283 struct ll_inode_info *lli = ll_i2info(inode);
284 struct lustre_handle lockh;
285 enum ldlm_mode lockmode;
289 /* clear group lock, if present */
290 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
291 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
293 if (fd->fd_lease_och != NULL) {
296		/* Usually the lease is not released when the
297		 * application crashes, so we need to release it here. */
298 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
299 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
300 PFID(&lli->lli_fid), rc, lease_broken);
302 fd->fd_lease_och = NULL;
305 if (fd->fd_och != NULL) {
306 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
311	/* Let's see if we have a good enough OPEN lock on the file and if
312	 * we can skip talking to the MDS */
313 mutex_lock(&lli->lli_och_mutex);
314 if (fd->fd_omode & FMODE_WRITE) {
316 LASSERT(lli->lli_open_fd_write_count);
317 lli->lli_open_fd_write_count--;
318 } else if (fd->fd_omode & FMODE_EXEC) {
320 LASSERT(lli->lli_open_fd_exec_count);
321 lli->lli_open_fd_exec_count--;
324 LASSERT(lli->lli_open_fd_read_count);
325 lli->lli_open_fd_read_count--;
327 mutex_unlock(&lli->lli_och_mutex);
329 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
330 LDLM_IBITS, &policy, lockmode, &lockh))
331 rc = ll_md_real_close(inode, fd->fd_omode);
334 LUSTRE_FPRIVATE(file) = NULL;
335 ll_file_data_put(fd);
340 /* While this returns an error code, the caller (fput()) does not check it,
341  * so we need to make every effort to clean up all of our state here. Also,
342  * applications rarely check close errors, and even if an error is returned
343  * they will not retry the close call.
345 int ll_file_release(struct inode *inode, struct file *file)
347 struct ll_file_data *fd;
348 struct ll_sb_info *sbi = ll_i2sbi(inode);
349 struct ll_inode_info *lli = ll_i2info(inode);
353 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
354 PFID(ll_inode2fid(inode)), inode);
356 if (inode->i_sb->s_root != file_dentry(file))
357 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
358 fd = LUSTRE_FPRIVATE(file);
361	/* The last ref on @file, but maybe not the owner pid of the statahead,
362	 * because parent and child processes can share the same file handle. */
363 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
364 ll_deauthorize_statahead(inode, fd);
366 if (inode->i_sb->s_root == file_dentry(file)) {
367 LUSTRE_FPRIVATE(file) = NULL;
368 ll_file_data_put(fd);
372 if (!S_ISDIR(inode->i_mode)) {
373 if (lli->lli_clob != NULL)
374 lov_read_and_clear_async_rc(lli->lli_clob);
375 lli->lli_async_rc = 0;
378 rc = ll_md_close(inode, file);
380 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
381 libcfs_debug_dumplog();
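/* read_cache_page() callback: copy the inline Data-on-MDT data from the
 * niobuf into the page, zero-fill the rest of the page and mark it uptodate. */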
386 static inline int ll_dom_readpage(void *data, struct page *page)
388 struct niobuf_local *lnb = data;
391 kaddr = ll_kmap_atomic(page, KM_USER0);
392 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
393 if (lnb->lnb_len < PAGE_SIZE)
394 memset(kaddr + lnb->lnb_len, 0,
395 PAGE_SIZE - lnb->lnb_len);
396 flush_dcache_page(page);
397 SetPageUptodate(page);
398 ll_kunmap_atomic(kaddr, KM_USER0);
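/* If the open reply carried inline file data under a Data-on-MDT lock, copy
 * that data into the page cache so the first reads can be served locally. */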
404 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
405 struct lookup_intent *it)
407 struct ll_inode_info *lli = ll_i2info(inode);
408 struct cl_object *obj = lli->lli_clob;
409 struct address_space *mapping = inode->i_mapping;
411 struct niobuf_remote *rnb;
416 struct lustre_handle lockh;
417 struct ldlm_lock *lock;
418 unsigned long index, start;
419 struct niobuf_local lnb;
421 bool dom_lock = false;
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
439 env = cl_env_get(&refcheck);
443 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
445 GOTO(out_env, rc = -ENODATA);
447 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
448 data = (char *)rnb + sizeof(*rnb);
450 if (rnb == NULL || rnb->rnb_len == 0)
451 GOTO(out_env, rc = 0);
453 CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
454 rnb->rnb_len, i_size_read(inode));
456 io = vvp_env_thread_io(env);
458 io->ci_ignore_layout = 1;
459 rc = cl_io_init(env, io, CIT_MISC, obj);
463 lnb.lnb_file_offset = rnb->rnb_offset;
464 start = lnb.lnb_file_offset / PAGE_SIZE;
466 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
467 lnb.lnb_page_offset = 0;
471 lnb.lnb_data = data + (index << PAGE_SHIFT);
472 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
473 if (lnb.lnb_len > PAGE_SIZE)
474 lnb.lnb_len = PAGE_SIZE;
476 vmpage = read_cache_page(mapping, index + start,
477 ll_dom_readpage, &lnb);
478 if (IS_ERR(vmpage)) {
479 CWARN("%s: cannot fill page %lu for "DFID
480 " with data: rc = %li\n",
481 ll_get_fsname(inode->i_sb, NULL, 0),
482 index + start, PFID(lu_object_fid(&obj->co_lu)),
487 if (vmpage->mapping == NULL) {
490 /* page was truncated */
491 GOTO(out_io, rc = -ENODATA);
493 clp = cl_page_find(env, obj, vmpage->index, vmpage,
498 GOTO(out_io, rc = PTR_ERR(clp));
502 cl_page_export(env, clp, 1);
503 cl_page_put(env, clp);
507 } while (rnb->rnb_len > (index << PAGE_SHIFT));
513 cl_env_put(env, &refcheck);
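/* Send the IT_OPEN intent to the MDS for @de and set up the inode, the lock
 * data and any inline DoM data from the reply. */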
516 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
517 struct lookup_intent *itp)
519 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
520 struct dentry *parent = de->d_parent;
521 const char *name = NULL;
523 struct md_op_data *op_data;
524 struct ptlrpc_request *req = NULL;
528 LASSERT(parent != NULL);
529 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
531	/* If the server supports open-by-fid, or the file name is invalid, don't
532	 * pack the name in the open request */
533 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
534 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
535 name = de->d_name.name;
536 len = de->d_name.len;
539 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
540 name, len, 0, LUSTRE_OPC_ANY, NULL);
542 RETURN(PTR_ERR(op_data));
543 op_data->op_data = lmm;
544 op_data->op_data_size = lmmsize;
546 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
547 &ll_md_blocking_ast, 0);
548 ll_finish_md_op_data(op_data);
550	/* The reason for keeping our own exit path is to avoid flooding the log
551	 * with -ESTALE error messages.
553 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
554 it_open_error(DISP_OPEN_OPEN, itp))
556 ll_release_openhandle(de, itp);
560 if (it_disposition(itp, DISP_LOOKUP_NEG))
561 GOTO(out, rc = -ENOENT);
563 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
564 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
565 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
569 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
571 if (!rc && itp->it_lock_mode) {
572 ll_dom_finish_open(de->d_inode, req, itp);
573 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
577 ptlrpc_req_finished(req);
578 ll_intent_drop_lock(itp);
580 /* We did open by fid, but by the time we got to the server,
581 * the object disappeared. If this is a create, we cannot really
582 * tell the userspace that the file it was trying to create
583 * does not exist. Instead let's return -ESTALE, and the VFS will
584 * retry the create with LOOKUP_REVAL that we are going to catch
585 * in ll_revalidate_dentry() and use lookup then.
587 if (rc == -ENOENT && itp->it_op & IT_CREAT)
593 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
594 struct obd_client_handle *och)
596 struct mdt_body *body;
598 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
599 och->och_fh = body->mbo_handle;
600 och->och_fid = body->mbo_fid1;
601 och->och_lease_handle.cookie = it->it_lock_handle;
602 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
603 och->och_flags = it->it_flags;
605 return md_set_open_replay_data(md_exp, och, it);
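/* Finish the client-side part of an open: fill the open handle from the
 * intent reply (when one is provided) and attach @fd to the struct file. */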
608 static int ll_local_open(struct file *file, struct lookup_intent *it,
609 struct ll_file_data *fd, struct obd_client_handle *och)
611 struct inode *inode = file_inode(file);
614 LASSERT(!LUSTRE_FPRIVATE(file));
621 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
626 LUSTRE_FPRIVATE(file) = fd;
627 ll_readahead_init(inode, &fd->fd_ras);
628 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
630	/* initialize the ll_cl_context */
631 rwlock_init(&fd->fd_lock);
632 INIT_LIST_HEAD(&fd->fd_lccs);
637 /* Open a file, and (for the very first open) create objects on the OSTs at
638 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
639 * creation or open until ll_lov_setstripe() ioctl is called.
641 * If we already have the stripe MD locally then we don't request it in
642 * md_open() by passing lmm_size = 0.
644 * It is up to the application to ensure no other processes open this file
645 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
646 * used. We might be able to avoid races of that sort by getting lli_open_sem
647 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
648 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
650 int ll_file_open(struct inode *inode, struct file *file)
652 struct ll_inode_info *lli = ll_i2info(inode);
653 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
654 .it_flags = file->f_flags };
655 struct obd_client_handle **och_p = NULL;
656 __u64 *och_usecount = NULL;
657 struct ll_file_data *fd;
661 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
662 PFID(ll_inode2fid(inode)), inode, file->f_flags);
664 it = file->private_data; /* XXX: compat macro */
665 file->private_data = NULL; /* prevent ll_local_open assertion */
667 fd = ll_file_data_get();
669 GOTO(out_nofiledata, rc = -ENOMEM);
672 if (S_ISDIR(inode->i_mode))
673 ll_authorize_statahead(inode, fd);
675 if (inode->i_sb->s_root == file_dentry(file)) {
676 LUSTRE_FPRIVATE(file) = fd;
680 if (!it || !it->it_disposition) {
681 /* Convert f_flags into access mode. We cannot use file->f_mode,
682 * because everything but O_ACCMODE mask was stripped from
684 if ((oit.it_flags + 1) & O_ACCMODE)
686 if (file->f_flags & O_TRUNC)
687 oit.it_flags |= FMODE_WRITE;
689		/* The kernel only calls f_op->open in dentry_open. filp_open calls
690		 * dentry_open after open_namei has checked permissions. Only
691		 * nfsd_open calls dentry_open directly without checking permissions,
692		 * and because of that the code below is safe. */
693 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
694 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
696 /* We do not want O_EXCL here, presumably we opened the file
697 * already? XXX - NFS implications? */
698 oit.it_flags &= ~O_EXCL;
700		/* bug20584: if "it_flags" contains O_CREAT, the file will be
701		 * created if necessary, so "IT_CREAT" should be set to stay
702		 * consistent with it */
703 if (oit.it_flags & O_CREAT)
704 oit.it_op |= IT_CREAT;
710 /* Let's see if we have file open on MDS already. */
711 if (it->it_flags & FMODE_WRITE) {
712 och_p = &lli->lli_mds_write_och;
713 och_usecount = &lli->lli_open_fd_write_count;
714 } else if (it->it_flags & FMODE_EXEC) {
715 och_p = &lli->lli_mds_exec_och;
716 och_usecount = &lli->lli_open_fd_exec_count;
718 och_p = &lli->lli_mds_read_och;
719 och_usecount = &lli->lli_open_fd_read_count;
722 mutex_lock(&lli->lli_och_mutex);
723 if (*och_p) { /* Open handle is present */
724 if (it_disposition(it, DISP_OPEN_OPEN)) {
725			/* Well, there's an extra open request that we do not need;
726			 * let's close it somehow. This will decref the request. */
727 rc = it_open_error(DISP_OPEN_OPEN, it);
729 mutex_unlock(&lli->lli_och_mutex);
730 GOTO(out_openerr, rc);
733 ll_release_openhandle(file_dentry(file), it);
737 rc = ll_local_open(file, it, fd, NULL);
740 mutex_unlock(&lli->lli_och_mutex);
741 GOTO(out_openerr, rc);
744 LASSERT(*och_usecount == 0);
745 if (!it->it_disposition) {
746 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
747		/* We cannot just request a lock handle now; the new ELC code
748		 * means that one of the other OPEN locks for this file
749		 * could be cancelled, and since the blocking AST handler
750		 * would attempt to grab och_mutex as well, that would
751		 * result in a deadlock */
752 mutex_unlock(&lli->lli_och_mutex);
754 * Normally called under two situations:
756 * 2. A race/condition on MDS resulting in no open
757 * handle to be returned from LOOKUP|OPEN request,
758 * for example if the target entry was a symlink.
760 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
761 * marked by a bit set in ll_iget_for_nfs. Clear the
762 * bit so that it's not confusing later callers.
764		 * NB: when ldd is NULL, it must have come via the normal
765 * lookup path only, since ll_iget_for_nfs always calls
768 if (ldd && ldd->lld_nfs_dentry) {
769 ldd->lld_nfs_dentry = 0;
770 it->it_flags |= MDS_OPEN_LOCK;
774		 * Always specify MDS_OPEN_BY_FID because we don't want
775		 * to get a file with a different fid.
777 it->it_flags |= MDS_OPEN_BY_FID;
778 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
781 GOTO(out_openerr, rc);
785 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
787 GOTO(out_och_free, rc = -ENOMEM);
791 /* md_intent_lock() didn't get a request ref if there was an
792 * open error, so don't do cleanup on the request here
794	/* XXX (green): Shouldn't we bail out on any error here, not
795	 * just an open error? */
796 rc = it_open_error(DISP_OPEN_OPEN, it);
798 GOTO(out_och_free, rc);
800 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
801 "inode %p: disposition %x, status %d\n", inode,
802 it_disposition(it, ~0), it->it_status);
804 rc = ll_local_open(file, it, fd, *och_p);
806 GOTO(out_och_free, rc);
808 mutex_unlock(&lli->lli_och_mutex);
811	/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
812	 * a different kind of OPEN lock for this same inode gets cancelled
813	 * by ldlm_cancel_lru */
814 if (!S_ISREG(inode->i_mode))
815 GOTO(out_och_free, rc);
817 cl_lov_delay_create_clear(&file->f_flags);
818 GOTO(out_och_free, rc);
822 if (och_p && *och_p) {
823 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
824 *och_p = NULL; /* OBD_FREE writes some magic there */
827 mutex_unlock(&lli->lli_och_mutex);
830 if (lli->lli_opendir_key == fd)
831 ll_deauthorize_statahead(inode, fd);
833 ll_file_data_put(fd);
835 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
839 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
840 ptlrpc_req_finished(it->it_request);
841 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
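/* Blocking AST for lease locks: cancel the lease lock as soon as it is
 * blocked, which is how a lease gets broken. */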
847 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
848 struct ldlm_lock_desc *desc, void *data, int flag)
851 struct lustre_handle lockh;
855 case LDLM_CB_BLOCKING:
856 ldlm_lock2handle(lock, &lockh);
857 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
859 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
863 case LDLM_CB_CANCELING:
871 * When setting a lease on a file, we take ownership of the lli_mds_*_och
872 * and save it as fd->fd_och so as to force the client to reopen the file
873 * even if it has an open lock in cache already.
875 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
876 struct lustre_handle *old_handle)
878 struct ll_inode_info *lli = ll_i2info(inode);
879 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
880 struct obd_client_handle **och_p;
885 /* Get the openhandle of the file */
886 mutex_lock(&lli->lli_och_mutex);
887 if (fd->fd_lease_och != NULL)
888 GOTO(out_unlock, rc = -EBUSY);
890 if (fd->fd_och == NULL) {
891 if (file->f_mode & FMODE_WRITE) {
892 LASSERT(lli->lli_mds_write_och != NULL);
893 och_p = &lli->lli_mds_write_och;
894 och_usecount = &lli->lli_open_fd_write_count;
896 LASSERT(lli->lli_mds_read_och != NULL);
897 och_p = &lli->lli_mds_read_och;
898 och_usecount = &lli->lli_open_fd_read_count;
901 if (*och_usecount > 1)
902 GOTO(out_unlock, rc = -EBUSY);
909 *old_handle = fd->fd_och->och_fh;
913 mutex_unlock(&lli->lli_och_mutex);
918 * Release ownership on lli_mds_*_och when putting back a file lease.
920 static int ll_lease_och_release(struct inode *inode, struct file *file)
922 struct ll_inode_info *lli = ll_i2info(inode);
923 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
924 struct obd_client_handle **och_p;
925 struct obd_client_handle *old_och = NULL;
930 mutex_lock(&lli->lli_och_mutex);
931 if (file->f_mode & FMODE_WRITE) {
932 och_p = &lli->lli_mds_write_och;
933 och_usecount = &lli->lli_open_fd_write_count;
935 och_p = &lli->lli_mds_read_och;
936 och_usecount = &lli->lli_open_fd_read_count;
939	/* The file may have been opened by another process (broken lease), so
940	 * *och_p is not NULL. In this case we should simply increase the usecount
943 if (*och_p != NULL) {
944 old_och = fd->fd_och;
951 mutex_unlock(&lli->lli_och_mutex);
954 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
960 * Acquire a lease and open the file.
962 static struct obd_client_handle *
963 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
966 struct lookup_intent it = { .it_op = IT_OPEN };
967 struct ll_sb_info *sbi = ll_i2sbi(inode);
968 struct md_op_data *op_data;
969 struct ptlrpc_request *req = NULL;
970 struct lustre_handle old_handle = { 0 };
971 struct obd_client_handle *och = NULL;
976 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
977 RETURN(ERR_PTR(-EINVAL));
980 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
981 RETURN(ERR_PTR(-EPERM));
983 rc = ll_lease_och_acquire(inode, file, &old_handle);
990 RETURN(ERR_PTR(-ENOMEM));
992 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
993 LUSTRE_OPC_ANY, NULL);
995 GOTO(out, rc = PTR_ERR(op_data));
997 /* To tell the MDT this openhandle is from the same owner */
998 op_data->op_handle = old_handle;
1000 it.it_flags = fmode | open_flags;
1001 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1002 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1003 &ll_md_blocking_lease_ast,
1004			/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
1005			 * otherwise it can be cancelled, which may mislead applications that the lease is
1007			 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
1008			 * open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
1009			 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1010 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1011 ll_finish_md_op_data(op_data);
1012 ptlrpc_req_finished(req);
1014 GOTO(out_release_it, rc);
1016 if (it_disposition(&it, DISP_LOOKUP_NEG))
1017 GOTO(out_release_it, rc = -ENOENT);
1019 rc = it_open_error(DISP_OPEN_OPEN, &it);
1021 GOTO(out_release_it, rc);
1023 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1024 ll_och_fill(sbi->ll_md_exp, &it, och);
1026 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1027 GOTO(out_close, rc = -EOPNOTSUPP);
1029	/* lease already acquired, handle the lease lock */
1030 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1031 if (it.it_lock_mode == 0 ||
1032 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1033		/* an open lock must be returned for a lease */
1034 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1035 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1037 GOTO(out_close, rc = -EPROTO);
1040 ll_intent_release(&it);
1044 /* Cancel open lock */
1045 if (it.it_lock_mode != 0) {
1046 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1048 it.it_lock_mode = 0;
1049 och->och_lease_handle.cookie = 0ULL;
1051 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1053 CERROR("%s: error closing file "DFID": %d\n",
1054 ll_get_fsname(inode->i_sb, NULL, 0),
1055 PFID(&ll_i2info(inode)->lli_fid), rc2);
1056 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1058 ll_intent_release(&it);
1062 RETURN(ERR_PTR(rc));
1066 * Check whether a layout swap can be done between two inodes.
1068 * \param[in] inode1 First inode to check
1069 * \param[in] inode2 Second inode to check
1071 * \retval 0 on success, layout swap can be performed between both inodes
1072 * \retval negative error code if requirements are not met
1074 static int ll_check_swap_layouts_validity(struct inode *inode1,
1075 struct inode *inode2)
1077 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1080 if (inode_permission(inode1, MAY_WRITE) ||
1081 inode_permission(inode2, MAY_WRITE))
1084 if (inode1->i_sb != inode2->i_sb)
1090 static int ll_swap_layouts_close(struct obd_client_handle *och,
1091 struct inode *inode, struct inode *inode2)
1093 const struct lu_fid *fid1 = ll_inode2fid(inode);
1094 const struct lu_fid *fid2;
1098 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1099 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1101 rc = ll_check_swap_layouts_validity(inode, inode2);
1103 GOTO(out_free_och, rc);
1105 /* We now know that inode2 is a lustre inode */
1106 fid2 = ll_inode2fid(inode2);
1108 rc = lu_fid_cmp(fid1, fid2);
1110 GOTO(out_free_och, rc = -EINVAL);
1112 /* Close the file and {swap,merge} layouts between inode & inode2.
1113 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1114 * because we still need it to pack l_remote_handle to MDT. */
1115 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1118 och = NULL; /* freed in ll_close_inode_openhandle() */
1128 * Release the lease and close the file.
1129 * It also checks whether the lease was ever broken.
1131 static int ll_lease_close_intent(struct obd_client_handle *och,
1132 struct inode *inode,
1133 bool *lease_broken, enum mds_op_bias bias,
1136 struct ldlm_lock *lock;
1137 bool cancelled = true;
1141 lock = ldlm_handle2lock(&och->och_lease_handle);
1143 lock_res_and_lock(lock);
1144 cancelled = ldlm_is_cancel(lock);
1145 unlock_res_and_lock(lock);
1146 LDLM_LOCK_PUT(lock);
1149 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1150 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1152 if (lease_broken != NULL)
1153 *lease_broken = cancelled;
1155 if (!cancelled && !bias)
1156 ldlm_cli_cancel(&och->och_lease_handle, 0);
1158	if (cancelled) { /* no need to execute intent */
1163 rc = ll_close_inode_openhandle(inode, och, bias, data);
1167 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1170 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1174 * After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1176 static int ll_lease_file_resync(struct obd_client_handle *och,
1177 struct inode *inode)
1179 struct ll_sb_info *sbi = ll_i2sbi(inode);
1180 struct md_op_data *op_data;
1181 __u64 data_version_unused;
1185 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1186 LUSTRE_OPC_ANY, NULL);
1187 if (IS_ERR(op_data))
1188 RETURN(PTR_ERR(op_data));
1190	/* Before starting file resync, it's necessary to clean up the page cache
1191	 * in client memory, otherwise once the layout version is increased,
1192	 * writing back cached data will be denied by the OSTs. */
1193 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1197 op_data->op_handle = och->och_lease_handle;
1198 rc = md_file_resync(sbi->ll_md_exp, op_data);
1204 ll_finish_md_op_data(op_data);
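/* Merge the size, block count and timestamps obtained from the OSTs into
 * the inode attributes that came from the MDS. */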
1208 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1210 struct ll_inode_info *lli = ll_i2info(inode);
1211 struct cl_object *obj = lli->lli_clob;
1212 struct cl_attr *attr = vvp_env_thread_attr(env);
1220 ll_inode_size_lock(inode);
1222	/* Merge the timestamps most recently obtained from the MDS with the
1223	 * timestamps obtained from the OSTs.
1225	 * Do not overwrite the inode's atime because it may be refreshed
1226	 * by the file_accessed() function. If the read was served from cached
1227	 * data, there is no RPC to be sent, so atime may not be
1228	 * transferred to the OSTs at all. The MDT only updates atime at close
1229	 * time if it's at least 'mdd.*.atime_diff' older.
1230	 * All in all, atime in Lustre does not strictly comply with
1231	 * POSIX. Solving this problem would require sending an RPC to the MDT
1232	 * for each read, which would hurt performance. */
1233 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1234 LTIME_S(inode->i_atime) = lli->lli_atime;
1235 lli->lli_update_atime = 0;
1237 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1238 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1240 atime = LTIME_S(inode->i_atime);
1241 mtime = LTIME_S(inode->i_mtime);
1242 ctime = LTIME_S(inode->i_ctime);
1244 cl_object_attr_lock(obj);
1245 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1248 rc = cl_object_attr_get(env, obj, attr);
1249 cl_object_attr_unlock(obj);
1252 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1254 if (atime < attr->cat_atime)
1255 atime = attr->cat_atime;
1257 if (ctime < attr->cat_ctime)
1258 ctime = attr->cat_ctime;
1260 if (mtime < attr->cat_mtime)
1261 mtime = attr->cat_mtime;
1263 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1264 PFID(&lli->lli_fid), attr->cat_size);
1266 i_size_write(inode, attr->cat_size);
1267 inode->i_blocks = attr->cat_blocks;
1269 LTIME_S(inode->i_atime) = atime;
1270 LTIME_S(inode->i_mtime) = mtime;
1271 LTIME_S(inode->i_ctime) = ctime;
1274 ll_inode_size_unlock(inode);
1280 * Set the designated mirror for I/O.
1282 * So far only read, write, and truncate support issuing I/O to a
1283 * designated mirror.
1285 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1287 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1289	/* clear the layout version for generic (non-resync) I/O in case it
1290	 * carries a stale layout version due to an I/O restart */
1291 io->ci_layout_version = 0;
1293 /* FLR: disable non-delay for designated mirror I/O because obviously
1294 * only one mirror is available */
1295 if (fd->fd_designated_mirror > 0) {
1297 io->ci_designated_mirror = fd->fd_designated_mirror;
1298 io->ci_layout_version = fd->fd_layout_version;
1299 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1303	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1304 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
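/* Decide whether atime updates should be suppressed for this file,
 * mirroring the checks made by the kernel's file_accessed()/touch_atime(). */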
1307 static bool file_is_noatime(const struct file *file)
1309 const struct vfsmount *mnt = file->f_path.mnt;
1310 const struct inode *inode = file_inode((struct file *)file);
1312 /* Adapted from file_accessed() and touch_atime().*/
1313 if (file->f_flags & O_NOATIME)
1316 if (inode->i_flags & S_NOATIME)
1319 if (IS_NOATIME(inode))
1322 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1325 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1328 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1334 static int ll_file_io_ptask(struct cfs_ptask *ptask);
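/* Initialize a cl_io for a read or write: set up the iocb/iov state, choose
 * the lock mode, and apply noatime, parallel-I/O and FLR mirror settings. */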
1336 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1338 struct inode *inode = file_inode(file);
1339 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1341 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1342 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1343 io->u.ci_rw.rw_file = file;
1344 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1345 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1346 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1348 if (iot == CIT_WRITE) {
1349 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1350 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1351 file->f_flags & O_DIRECT ||
1354 io->ci_obj = ll_i2info(inode)->lli_clob;
1355 io->ci_lockreq = CILR_MAYBE;
1356 if (ll_file_nolock(file)) {
1357 io->ci_lockreq = CILR_NEVER;
1358 io->ci_no_srvlock = 1;
1359 } else if (file->f_flags & O_APPEND) {
1360 io->ci_lockreq = CILR_MANDATORY;
1362 io->ci_noatime = file_is_noatime(file);
1363 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1364 io->ci_pio = !io->u.ci_rw.rw_append;
1368	/* FLR: only use non-delay I/O for read, as there is only one
1369	 * available mirror for write. */
1370 io->ci_ndelay = !(iot == CIT_WRITE);
1372 ll_io_set_mirror(io, file);
1375 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1377 struct cl_io_pt *pt = ptask->pt_cbdata;
1378 struct file *file = pt->cip_file;
1381 loff_t pos = pt->cip_pos;
1386 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1387 file_dentry(file)->d_name.name,
1388 pt->cip_iot == CIT_READ ? "read" : "write",
1389 pos, pos + pt->cip_count);
1391 env = cl_env_get(&refcheck);
1393 RETURN(PTR_ERR(env));
1395 io = vvp_env_thread_io(env);
1396 ll_io_init(io, file, pt->cip_iot);
1397 io->u.ci_rw.rw_iter = pt->cip_iter;
1398 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1399 io->ci_pio = 0; /* It's already in parallel task */
1401 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1402 pt->cip_count - pt->cip_result);
1404 struct vvp_io *vio = vvp_env_io(env);
1406 vio->vui_io_subtype = IO_NORMAL;
1407 vio->vui_fd = LUSTRE_FPRIVATE(file);
1409 ll_cl_add(file, env, io, LCC_RW);
1410 rc = cl_io_loop(env, io);
1411 ll_cl_remove(file, env);
1413 /* cl_io_rw_init() handled IO */
1417 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1423 if (io->ci_nob > 0) {
1424 pt->cip_result += io->ci_nob;
1425 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1427 pt->cip_iocb.ki_pos = pos;
1428 #ifdef HAVE_KIOCB_KI_LEFT
1429 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1430 #elif defined(HAVE_KI_NBYTES)
1431 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1435 cl_io_fini(env, io);
1436 cl_env_put(env, &refcheck);
1438 pt->cip_need_restart = io->ci_need_restart;
1440 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1441 file_dentry(file)->d_name.name,
1442 pt->cip_iot == CIT_READ ? "read" : "write",
1443 pt->cip_result, rc);
1445 RETURN(pt->cip_result > 0 ? 0 : rc);
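/* Common back end for reads and writes: build the cl_io, take the range
 * lock when needed, run cl_io_loop(), and restart the I/O when it asks for
 * a restart (e.g. layout change or FLR mirror retry). */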
1449 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1450 struct file *file, enum cl_io_type iot,
1451 loff_t *ppos, size_t count)
1453 struct range_lock range;
1454 struct vvp_io *vio = vvp_env_io(env);
1455 struct inode *inode = file_inode(file);
1456 struct ll_inode_info *lli = ll_i2info(inode);
1457 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1462 unsigned retried = 0;
1463 bool restarted = false;
1467 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1468 file_dentry(file)->d_name.name,
1469 iot == CIT_READ ? "read" : "write", pos, pos + count);
1472 io = vvp_env_thread_io(env);
1473 ll_io_init(io, file, iot);
1474 if (args->via_io_subtype == IO_NORMAL) {
1475 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1476 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1478 if (args->via_io_subtype != IO_NORMAL || restarted)
1480 io->ci_ndelay_tried = retried;
1482 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1483 bool range_locked = false;
1485 if (file->f_flags & O_APPEND)
1486 range_lock_init(&range, 0, LUSTRE_EOF);
1488 range_lock_init(&range, pos, pos + count - 1);
1490 vio->vui_fd = LUSTRE_FPRIVATE(file);
1491 vio->vui_io_subtype = args->via_io_subtype;
1493 switch (vio->vui_io_subtype) {
1495			/* Direct IO reads must also take the range lock,
1496			 * or multiple reads will try to work on the same pages.
1497			 * See LU-6227 for details. */
1498 if (((iot == CIT_WRITE) ||
1499 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1500 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1501 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1503 rc = range_lock(&lli->lli_write_tree, &range);
1507 range_locked = true;
1511 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1512 vio->u.splice.vui_flags = args->u.splice.via_flags;
1515 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1519 ll_cl_add(file, env, io, LCC_RW);
1520 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1521 !lli->lli_inode_locked) {
1523 lli->lli_inode_locked = 1;
1525 rc = cl_io_loop(env, io);
1526 if (lli->lli_inode_locked) {
1527 lli->lli_inode_locked = 0;
1528 inode_unlock(inode);
1530 ll_cl_remove(file, env);
1533 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1535 range_unlock(&lli->lli_write_tree, &range);
1538 /* cl_io_rw_init() handled IO */
1542 if (io->ci_nob > 0) {
1543 result += io->ci_nob;
1544 count -= io->ci_nob;
1546 if (args->via_io_subtype == IO_NORMAL) {
1547 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1549 /* CLIO is too complicated. See LU-11069. */
1550 if (cl_io_is_append(io))
1551 pos = io->u.ci_rw.rw_iocb.ki_pos;
1555 args->u.normal.via_iocb->ki_pos = pos;
1556 #ifdef HAVE_KIOCB_KI_LEFT
1557 args->u.normal.via_iocb->ki_left = count;
1558 #elif defined(HAVE_KI_NBYTES)
1559 args->u.normal.via_iocb->ki_nbytes = count;
1563 pos = io->u.ci_rw.rw_range.cir_pos;
1567 cl_io_fini(env, io);
1570 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1571 file->f_path.dentry->d_name.name,
1572 iot, rc, result, io->ci_need_restart);
1574 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1576 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1577 file_dentry(file)->d_name.name,
1578 iot == CIT_READ ? "read" : "write",
1579 pos, pos + count, result, rc);
1580 /* preserve the tried count for FLR */
1581 retried = io->ci_ndelay_tried;
1586 if (iot == CIT_READ) {
1588 ll_stats_ops_tally(ll_i2sbi(inode),
1589 LPROC_LL_READ_BYTES, result);
1590 } else if (iot == CIT_WRITE) {
1592 ll_stats_ops_tally(ll_i2sbi(inode),
1593 LPROC_LL_WRITE_BYTES, result);
1594 fd->fd_write_failed = false;
1595 } else if (result == 0 && rc == 0) {
1598 fd->fd_write_failed = true;
1600 fd->fd_write_failed = false;
1601 } else if (rc != -ERESTARTSYS) {
1602 fd->fd_write_failed = true;
1606 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1607 file_dentry(file)->d_name.name,
1608 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1612 RETURN(result > 0 ? result : rc);
1616 * The purpose of fast read is to overcome the per-I/O overhead and improve
1617 * IOPS, especially for small I/O.
1619 * To serve a read request, CLIO has to create and initialize a cl_io and
1620 * then request a DLM lock. This has turned out to have significant overhead
1621 * and affects the performance of small I/O dramatically.
1623 * It's not necessary to create a cl_io for each I/O. With the help of read
1624 * ahead, most of the pages being read are already in the memory cache and
1625 * we can read those pages directly: if the pages exist, the corresponding
1626 * DLM lock must exist, so the page content must be valid.
1628 * In the fast read implementation, llite speculatively finds and reads pages
1629 * in the memory cache. There are three scenarios for fast read:
1630 *   - If the page exists and is uptodate, the kernel VM will provide the
1631 *     data and CLIO won't be involved;
1632 *   - If the page was brought into memory by read ahead, it will be exported
1633 *     and the read ahead parameters will be updated;
1634 *   - Otherwise the page is not in memory and we can't do a fast read.
1635 *     Therefore it will fall back to a normal read, i.e., a cl_io will be
1636 *     created and a DLM lock will be requested.
1638 * POSIX compliance: the POSIX standard states that read is intended to be
1639 * atomic. The Lustre read implementation is in line with the Linux kernel
1640 * read implementation, and neither complies with the POSIX standard in this
1641 * matter. Fast read doesn't make the situation worse on a single node, but it
1642 * may interleave write results from multiple nodes due to the short read handling in ll_file_aio_read().
1644 * \param env - lu_env
1645 * \param iocb - kiocb from kernel
1646 * \param iter - user space buffers where the data will be copied
1648 * \retval - number of bytes read, or an error code if an error occurred.
1651 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1655 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1658	/* NB: we can't do direct IO for fast read because it would need a lock
1659	 * to make the IO engine happy. */
1660 if (iocb->ki_filp->f_flags & O_DIRECT)
1663 result = generic_file_read_iter(iocb, iter);
1665	/* If the first page is not in the cache, generic_file_aio_read() will
1666	 * return -ENODATA.
1667	 * See the corresponding code in ll_readpage(). */
1668 if (result == -ENODATA)
1672 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1673 LPROC_LL_READ_BYTES, result);
1679 * Read from a file (through the page cache).
1681 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1684 struct vvp_io_args *args;
1689 result = ll_do_fast_read(iocb, to);
1690 if (result < 0 || iov_iter_count(to) == 0)
1693 env = cl_env_get(&refcheck);
1695 return PTR_ERR(env);
1697 args = ll_env_args(env, IO_NORMAL);
1698 args->u.normal.via_iter = to;
1699 args->u.normal.via_iocb = iocb;
1701 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1702 &iocb->ki_pos, iov_iter_count(to));
1705 else if (result == 0)
1708 cl_env_put(env, &refcheck);
1714 * A trick similar to ll_do_fast_read; this improves write speed for tiny writes.
1715 * If a page is already in the page cache and dirty (and some other things;
1716 * see ll_tiny_write_begin for these rules), then we can
1717 * write to it without doing a full I/O, because Lustre already knows about it
1718 * and will write it out. This saves a lot of processing time.
1720 * All writes here are within one page, so exclusion is handled by the page
1721 * lock on the vm page. We do not do tiny writes for writes which touch
1722 * multiple pages because it's very unlikely that multiple sequential pages
1723 * are already dirty.
1725 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
1726 * common and are unlikely to be to already-dirty pages.
1728 * Attribute updates are important here, we do them in ll_tiny_write_end.
1730 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1732 ssize_t count = iov_iter_count(iter);
1733 struct file *file = iocb->ki_filp;
1734 struct inode *inode = file_inode(file);
1739	/* Restrict writes to a single page and < PAGE_SIZE. See the comment at
1740	 * the top of the function for why.
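	 * For example, with 4096-byte pages a 100-byte write at file offset 4090
	 * crosses a page boundary ((4090 & 4095) + 100 > 4096), so it is rejected
	 * here and handled by the normal write path instead.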
1742 if (count >= PAGE_SIZE ||
1743 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1746 result = __generic_file_write_iter(iocb, iter);
1748 /* If the page is not already dirty, ll_tiny_write_begin returns
1749 * -ENODATA. We continue on to normal write.
1751 if (result == -ENODATA)
1755 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1757 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1760 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1766 * Write to a file (through the page cache).
1768 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1770 struct vvp_io_args *args;
1772 ssize_t rc_tiny = 0, rc_normal;
1777 /* NB: we can't do direct IO for tiny writes because they use the page
1778 * cache, we can't do sync writes because tiny writes can't flush
1779 * pages, and we can't do append writes because we can't guarantee the
1780 * required DLM locks are held to protect file size.
1782 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1783 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1784 rc_tiny = ll_do_tiny_write(iocb, from);
1786	/* In case of error, go on and try the normal write. Only stop if the tiny
1787	 * write completed the I/O.
1789 if (iov_iter_count(from) == 0)
1790 GOTO(out, rc_normal = rc_tiny);
1792 env = cl_env_get(&refcheck);
1794 return PTR_ERR(env);
1796 args = ll_env_args(env, IO_NORMAL);
1797 args->u.normal.via_iter = from;
1798 args->u.normal.via_iocb = iocb;
1800 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1801 &iocb->ki_pos, iov_iter_count(from));
1803 /* On success, combine bytes written. */
1804 if (rc_tiny >= 0 && rc_normal > 0)
1805 rc_normal += rc_tiny;
1806 /* On error, only return error from normal write if tiny write did not
1807 * write any bytes. Otherwise return bytes written by tiny write.
1809 else if (rc_tiny > 0)
1810 rc_normal = rc_tiny;
1812 cl_env_put(env, &refcheck);
1817 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1819 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1821 static int ll_file_get_iov_count(const struct iovec *iov,
1822 unsigned long *nr_segs, size_t *count)
1827 for (seg = 0; seg < *nr_segs; seg++) {
1828 const struct iovec *iv = &iov[seg];
1831 * If any segment has a negative length, or the cumulative
1832 * length ever wraps negative then return -EINVAL.
1835 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1837 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1842 cnt -= iv->iov_len; /* This segment is no good */
1849 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1850 unsigned long nr_segs, loff_t pos)
1857 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1861 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1862 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1863 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1864 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1865 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1867 result = ll_file_read_iter(iocb, &to);
1872 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1875 struct iovec iov = { .iov_base = buf, .iov_len = count };
1880 init_sync_kiocb(&kiocb, file);
1881 kiocb.ki_pos = *ppos;
1882 #ifdef HAVE_KIOCB_KI_LEFT
1883 kiocb.ki_left = count;
1884 #elif defined(HAVE_KI_NBYTES)
1885	kiocb.ki_nbytes = count;
1888 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1889 *ppos = kiocb.ki_pos;
1895 * Write to a file (through the page cache).
1898 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1899 unsigned long nr_segs, loff_t pos)
1901 struct iov_iter from;
1906 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1910 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1911 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1912 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1913 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1914 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1916 result = ll_file_write_iter(iocb, &from);
1921 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1922 size_t count, loff_t *ppos)
1924 struct iovec iov = { .iov_base = (void __user *)buf,
1931 init_sync_kiocb(&kiocb, file);
1932 kiocb.ki_pos = *ppos;
1933 #ifdef HAVE_KIOCB_KI_LEFT
1934 kiocb.ki_left = count;
1935 #elif defined(HAVE_KI_NBYTES)
1936 kiocb.ki_nbytes = count;
1939 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1940 *ppos = kiocb.ki_pos;
1944 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1947 * Send file content (through pagecache) somewhere with helper
1949 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1950 struct pipe_inode_info *pipe, size_t count,
1954 struct vvp_io_args *args;
1959 env = cl_env_get(&refcheck);
1961 RETURN(PTR_ERR(env));
1963 args = ll_env_args(env, IO_SPLICE);
1964 args->u.splice.via_pipe = pipe;
1965 args->u.splice.via_flags = flags;
1967 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1968 cl_env_put(env, &refcheck);
1972 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1973 __u64 flags, struct lov_user_md *lum, int lum_size)
1975 struct lookup_intent oit = {
1977 .it_flags = flags | MDS_OPEN_BY_FID,
1982 ll_inode_size_lock(inode);
1983 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1985 GOTO(out_unlock, rc);
1987 ll_release_openhandle(dentry, &oit);
1990 ll_inode_size_unlock(inode);
1991 ll_intent_release(&oit);
1996 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1997 struct lov_mds_md **lmmp, int *lmm_size,
1998 struct ptlrpc_request **request)
2000 struct ll_sb_info *sbi = ll_i2sbi(inode);
2001 struct mdt_body *body;
2002 struct lov_mds_md *lmm = NULL;
2003 struct ptlrpc_request *req = NULL;
2004 struct md_op_data *op_data;
2007 rc = ll_get_default_mdsize(sbi, &lmmsize);
2011 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2012 strlen(filename), lmmsize,
2013 LUSTRE_OPC_ANY, NULL);
2014 if (IS_ERR(op_data))
2015 RETURN(PTR_ERR(op_data));
2017 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2018 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2019 ll_finish_md_op_data(op_data);
2021 CDEBUG(D_INFO, "md_getattr_name failed "
2022 "on %s: rc %d\n", filename, rc);
2026 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2027 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2029 lmmsize = body->mbo_eadatasize;
2031 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2033 GOTO(out, rc = -ENODATA);
2036 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2037 LASSERT(lmm != NULL);
2039 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2040 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2041 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2042 GOTO(out, rc = -EPROTO);
2045 * This is coming from the MDS, so is probably in
2046 * little endian. We convert it to host endian before
2047 * passing it to userspace.
2049 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2052 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2053 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2054 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2055 if (le32_to_cpu(lmm->lmm_pattern) &
2056 LOV_PATTERN_F_RELEASED)
2060		/* if the function was called for a directory, we should
2061		 * avoid swabbing non-existent lsm objects */
2062 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2063 lustre_swab_lov_user_md_v1(
2064 (struct lov_user_md_v1 *)lmm);
2065 if (S_ISREG(body->mbo_mode))
2066 lustre_swab_lov_user_md_objects(
2067 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2069 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2070 lustre_swab_lov_user_md_v3(
2071 (struct lov_user_md_v3 *)lmm);
2072 if (S_ISREG(body->mbo_mode))
2073 lustre_swab_lov_user_md_objects(
2074 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2076 } else if (lmm->lmm_magic ==
2077 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2078 lustre_swab_lov_comp_md_v1(
2079 (struct lov_comp_md_v1 *)lmm);
2085 *lmm_size = lmmsize;
2090 static int ll_lov_setea(struct inode *inode, struct file *file,
2093 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2094 struct lov_user_md *lump;
2095 int lum_size = sizeof(struct lov_user_md) +
2096 sizeof(struct lov_user_ost_data);
2100 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2103 OBD_ALLOC_LARGE(lump, lum_size);
2107 if (copy_from_user(lump, arg, lum_size))
2108 GOTO(out_lump, rc = -EFAULT);
2110 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2112 cl_lov_delay_create_clear(&file->f_flags);
2115 OBD_FREE_LARGE(lump, lum_size);
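/* Copy the file's striping (LOV) layout out to the user-space buffer via
 * cl_object_getstripe(). */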
2119 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2126 env = cl_env_get(&refcheck);
2128 RETURN(PTR_ERR(env));
2130 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2131 cl_env_put(env, &refcheck);
2135 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2138 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2139 struct lov_user_md *klum;
2141 __u64 flags = FMODE_WRITE;
2144 rc = ll_copy_user_md(lum, &klum);
2149 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2154 rc = put_user(0, &lum->lmm_stripe_count);
2158 rc = ll_layout_refresh(inode, &gen);
2162 rc = ll_file_getstripe(inode, arg, lum_size);
2164 cl_lov_delay_create_clear(&file->f_flags);
2167 OBD_FREE(klum, lum_size);
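/* Take a group lock with group id @arg on behalf of this file descriptor;
 * composite (PFL) layouts are fully instantiated first so the lock covers
 * all OST objects. */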
2172 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2174 struct ll_inode_info *lli = ll_i2info(inode);
2175 struct cl_object *obj = lli->lli_clob;
2176 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2177 struct ll_grouplock grouplock;
2182 CWARN("group id for group lock must not be 0\n");
2186 if (ll_file_nolock(file))
2187 RETURN(-EOPNOTSUPP);
2189 spin_lock(&lli->lli_lock);
2190 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2191 CWARN("group lock already existed with gid %lu\n",
2192 fd->fd_grouplock.lg_gid);
2193 spin_unlock(&lli->lli_lock);
2196 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2197 spin_unlock(&lli->lli_lock);
2200	 * XXX: the group lock needs to protect all OST objects, while PFL
2201	 * can add new OST objects during the IO, so we instantiate
2202	 * all OST objects before taking the group lock.
2207 struct cl_layout cl = {
2208 .cl_is_composite = false,
2210 struct lu_extent ext = {
2212 .e_end = OBD_OBJECT_EOF,
2215 env = cl_env_get(&refcheck);
2217 RETURN(PTR_ERR(env));
2219 rc = cl_object_layout_get(env, obj, &cl);
2220 if (!rc && cl.cl_is_composite)
2221 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2224 cl_env_put(env, &refcheck);
2229 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2230 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2234 spin_lock(&lli->lli_lock);
2235 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2236 spin_unlock(&lli->lli_lock);
2237 CERROR("another thread just won the race\n");
2238 cl_put_grouplock(&grouplock);
2242 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2243 fd->fd_grouplock = grouplock;
2244 spin_unlock(&lli->lli_lock);
2246 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2250 static int ll_put_grouplock(struct inode *inode, struct file *file,
2253 struct ll_inode_info *lli = ll_i2info(inode);
2254 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2255 struct ll_grouplock grouplock;
2258 spin_lock(&lli->lli_lock);
2259 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2260 spin_unlock(&lli->lli_lock);
2261 CWARN("no group lock held\n");
2265 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2267 if (fd->fd_grouplock.lg_gid != arg) {
2268 CWARN("group lock %lu doesn't match current id %lu\n",
2269 arg, fd->fd_grouplock.lg_gid);
2270 spin_unlock(&lli->lli_lock);
2274 grouplock = fd->fd_grouplock;
2275 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2276 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2277 spin_unlock(&lli->lli_lock);
2279 cl_put_grouplock(&grouplock);
2280 CDEBUG(D_INFO, "group lock %lu released\n", arg);
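	/*
	 * Usage sketch (illustrative only, not part of the kernel code): an
	 * application takes and drops the group lock around its I/O with the
	 * same non-zero group id, e.g. assuming the Lustre user headers that
	 * define the ioctl numbers are available:
	 *
	 *	unsigned long gid = 1234;
	 *
	 *	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
	 *		... I/O shared by every process holding the same gid ...
	 *		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
	 *	}
	 */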
2285 * Close inode open handle
2287 * \param dentry [in] dentry which contains the inode
2288 * \param it [in,out] intent which contains open info and result
2291 * \retval <0 failure
2293 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2295 struct inode *inode = dentry->d_inode;
2296 struct obd_client_handle *och;
2302	/* Root? Do nothing. */
2303 if (dentry->d_inode->i_sb->s_root == dentry)
2306 /* No open handle to close? Move away */
2307 if (!it_disposition(it, DISP_OPEN_OPEN))
2310 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2312 OBD_ALLOC(och, sizeof(*och));
2314 GOTO(out, rc = -ENOMEM);
2316 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2318 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2320 /* this one is in place of ll_file_open */
2321 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2322 ptlrpc_req_finished(it->it_request);
2323 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2329 * Get the size of the inode for which the FIEMAP mapping is requested.
2330 * Make the FIEMAP get_info call and return the result.
2331 * \param fiemap	kernel buffer to hold extents
2332 * \param num_bytes kernel buffer size
2334 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2340 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2343 /* Checks for fiemap flags */
2344 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2345 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2349 /* Check for FIEMAP_FLAG_SYNC */
2350 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2351 rc = filemap_fdatawrite(inode->i_mapping);
2356 env = cl_env_get(&refcheck);
2358 RETURN(PTR_ERR(env));
2360 if (i_size_read(inode) == 0) {
2361 rc = ll_glimpse_size(inode);
2366 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2367 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2368 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2370	/* If the file size is 0, then there are no objects to map */
2371 if (fmkey.lfik_oa.o_size == 0) {
2372 fiemap->fm_mapped_extents = 0;
2376 fmkey.lfik_fiemap = *fiemap;
2378 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2379 &fmkey, fiemap, &num_bytes);
2381 cl_env_put(env, &refcheck);
2385 int ll_fid2path(struct inode *inode, void __user *arg)
2387 struct obd_export *exp = ll_i2mdexp(inode);
2388 const struct getinfo_fid2path __user *gfin = arg;
2390 struct getinfo_fid2path *gfout;
2396 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2397 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2400 /* Only need to get the buflen */
2401 if (get_user(pathlen, &gfin->gf_pathlen))
2404 if (pathlen > PATH_MAX)
2407 outsize = sizeof(*gfout) + pathlen;
2408 OBD_ALLOC(gfout, outsize);
2412 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2413 GOTO(gf_free, rc = -EFAULT);
2414	/* Append the root FID after gfout to let the MDT know the root FID so
2415	 * that it can look up the correct path; this is mainly for fileset.
2416	 * An old server without fileset mount support will ignore this. */
2417 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2419 /* Call mdc_iocontrol */
2420 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2424 if (copy_to_user(arg, gfout, outsize))
2428 OBD_FREE(gfout, outsize);
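/*
 * Usage sketch (illustrative): this ioctl is what "lfs fid2path" ends up
 * calling.  The lustreapi wrapper below and its exact signature are an
 * assumption, shown only to illustrate the FID-to-path direction of the
 * request:
 *
 *	char path[PATH_MAX];
 *	long long recno = -1;
 *	int linkno = 0;
 *
 *	rc = llapi_fid2path("/mnt/lustre", "[0x200000401:0x1:0x0]",
 *			    path, sizeof(path), &recno, &linkno);
 */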
2433 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2435 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2443 ioc->idv_version = 0;
2444 ioc->idv_layout_version = UINT_MAX;
2446	/* If no file object is initialized, we consider its version to be 0. */
2450 env = cl_env_get(&refcheck);
2452 RETURN(PTR_ERR(env));
2454 io = vvp_env_thread_io(env);
2456 io->u.ci_data_version.dv_data_version = 0;
2457 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2458 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2461 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2462 result = cl_io_loop(env, io);
2464 result = io->ci_result;
2466 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2467 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2469 cl_io_fini(env, io);
2471 if (unlikely(io->ci_need_restart))
2474 cl_env_put(env, &refcheck);
2480 * Read the data_version for inode.
2482 * This value is computed using stripe object versions on the OSTs.
2483 * The version is computed using server-side locking.
2485 * @param flags	whether to sync on the OST side;
2487 *		LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2488 *		LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2490 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2492 struct ioc_data_version ioc = { .idv_flags = flags };
2495 rc = ll_ioc_data_version(inode, &ioc);
2497 *data_version = ioc.idv_version;
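/*
 * Usage sketch (illustrative): LL_IOC_DATA_VERSION backs the lustreapi
 * data-version helper; the wrapper name and signature below are an
 * assumption rather than a definitive reference.  LL_DV_WR_FLUSH drops
 * cached pages so the returned version reflects what the OSTs hold:
 *
 *	__u64 dv = 0;
 *
 *	rc = llapi_get_data_version(fd, &dv, LL_DV_WR_FLUSH);
 */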
2503 * Trigger an HSM release request for the provided inode.
2505 int ll_hsm_release(struct inode *inode)
2508 struct obd_client_handle *och = NULL;
2509 __u64 data_version = 0;
2514 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2515 ll_get_fsname(inode->i_sb, NULL, 0),
2516 PFID(&ll_i2info(inode)->lli_fid));
2518 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2520 GOTO(out, rc = PTR_ERR(och));
2522 /* Grab latest data_version and [am]time values */
2523 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2527 env = cl_env_get(&refcheck);
2529 GOTO(out, rc = PTR_ERR(env));
2531 rc = ll_merge_attr(env, inode);
2532 cl_env_put(env, &refcheck);
2534	/* If an error happens, we have the wrong size for the file.
2540 /* Release the file.
2541 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2542 * we still need it to pack l_remote_handle to MDT. */
2543 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2549 if (och != NULL && !IS_ERR(och)) /* close the file */
2550 ll_lease_close(och, inode, NULL);
2555 struct ll_swap_stack {
2558 struct inode *inode1;
2559 struct inode *inode2;
2564 static int ll_swap_layouts(struct file *file1, struct file *file2,
2565 struct lustre_swap_layouts *lsl)
2567 struct mdc_swap_layouts msl;
2568 struct md_op_data *op_data;
2571 struct ll_swap_stack *llss = NULL;
2574 OBD_ALLOC_PTR(llss);
2578 llss->inode1 = file_inode(file1);
2579 llss->inode2 = file_inode(file2);
2581 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2585	/* we use 2 bools because they are easier to swap than 2 bits */
2586 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2587 llss->check_dv1 = true;
2589 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2590 llss->check_dv2 = true;
2592 /* we cannot use lsl->sl_dvX directly because we may swap them */
2593 llss->dv1 = lsl->sl_dv1;
2594 llss->dv2 = lsl->sl_dv2;
2596 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2597 if (rc == 0) /* same file, done! */
2600 if (rc < 0) { /* sequentialize it */
2601 swap(llss->inode1, llss->inode2);
2603 swap(llss->dv1, llss->dv2);
2604 swap(llss->check_dv1, llss->check_dv2);
2608 if (gid != 0) { /* application asks to flush dirty cache */
2609 rc = ll_get_grouplock(llss->inode1, file1, gid);
2613 rc = ll_get_grouplock(llss->inode2, file2, gid);
2615 ll_put_grouplock(llss->inode1, file1, gid);
2620	/* ultimate check: before swapping the layouts we check whether the
2621	 * data version has changed (if requested) */
2622 if (llss->check_dv1) {
2623 rc = ll_data_version(llss->inode1, &dv, 0);
2626 if (dv != llss->dv1)
2627 GOTO(putgl, rc = -EAGAIN);
2630 if (llss->check_dv2) {
2631 rc = ll_data_version(llss->inode2, &dv, 0);
2634 if (dv != llss->dv2)
2635 GOTO(putgl, rc = -EAGAIN);
2638	/* struct md_op_data is used to send the swap args to the MDT;
2639	 * only the flags are missing, so we pass struct mdc_swap_layouts
2640	 * through md_op_data->op_data */
2641	/* flags from user space have to be converted before they are sent to
2642	 * the server; no flag is sent today, they are only used on the client */
2645 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2646 0, LUSTRE_OPC_ANY, &msl);
2647 if (IS_ERR(op_data))
2648 GOTO(free, rc = PTR_ERR(op_data));
2650 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2651 sizeof(*op_data), op_data, NULL);
2652 ll_finish_md_op_data(op_data);
2659 ll_put_grouplock(llss->inode2, file2, gid);
2660 ll_put_grouplock(llss->inode1, file1, gid);
2670 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2672 struct md_op_data *op_data;
2676	/* Detect out-of-range masks */
2677 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2680 /* Non-root users are forbidden to set or clear flags which are
2681 * NOT defined in HSM_USER_MASK. */
2682 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2683 !cfs_capable(CFS_CAP_SYS_ADMIN))
2686	/* Detect out-of-range archive id */
2687 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2688 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2691 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2692 LUSTRE_OPC_ANY, hss);
2693 if (IS_ERR(op_data))
2694 RETURN(PTR_ERR(op_data));
2696 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2697 sizeof(*op_data), op_data, NULL);
2699 ll_finish_md_op_data(op_data);
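/*
 * Usage sketch (illustrative): this path is normally driven by
 * "lfs hsm_set" or the lustreapi HSM helpers; the wrapper call below is an
 * assumption about that API, shown only to illustrate the
 * setmask/clearmask/archive_id triple validated above:
 *
 *	rc = llapi_hsm_state_set("/mnt/lustre/file", HS_DIRTY, 0, 1);
 */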
2704 static int ll_hsm_import(struct inode *inode, struct file *file,
2705 struct hsm_user_import *hui)
2707 struct hsm_state_set *hss = NULL;
2708 struct iattr *attr = NULL;
2712 if (!S_ISREG(inode->i_mode))
2718 GOTO(out, rc = -ENOMEM);
2720 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2721 hss->hss_archive_id = hui->hui_archive_id;
2722 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2723 rc = ll_hsm_state_set(inode, hss);
2727 OBD_ALLOC_PTR(attr);
2729 GOTO(out, rc = -ENOMEM);
2731 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2732 attr->ia_mode |= S_IFREG;
2733 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2734 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2735 attr->ia_size = hui->hui_size;
2736 attr->ia_mtime.tv_sec = hui->hui_mtime;
2737 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2738 attr->ia_atime.tv_sec = hui->hui_atime;
2739 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2741 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2742 ATTR_UID | ATTR_GID |
2743 ATTR_MTIME | ATTR_MTIME_SET |
2744 ATTR_ATIME | ATTR_ATIME_SET;
2748 rc = ll_setattr_raw(file_dentry(file), attr, true);
2752 inode_unlock(inode);
2764 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2766 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2767 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
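/*
 * For example, FMODE_READ maps to LL_LEASE_RDLCK, FMODE_WRITE to
 * LL_LEASE_WRLCK, FMODE_READ | FMODE_WRITE to the OR of both bits, and any
 * other fmode yields 0.
 */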
2770 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2772 struct inode *inode = file_inode(file);
2774 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2775 ATTR_MTIME | ATTR_MTIME_SET |
2776 ATTR_CTIME | ATTR_CTIME_SET,
2778 .tv_sec = lfu->lfu_atime_sec,
2779 .tv_nsec = lfu->lfu_atime_nsec,
2782 .tv_sec = lfu->lfu_mtime_sec,
2783 .tv_nsec = lfu->lfu_mtime_nsec,
2786 .tv_sec = lfu->lfu_ctime_sec,
2787 .tv_nsec = lfu->lfu_ctime_nsec,
2793 if (!capable(CAP_SYS_ADMIN))
2796 if (!S_ISREG(inode->i_mode))
2800 rc = ll_setattr_raw(file_dentry(file), &ia, false);
2801 inode_unlock(inode);
2806 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2809 case MODE_READ_USER:
2811 case MODE_WRITE_USER:
2818 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2820 /* Used to allow the upper layers of the client to request an LDLM lock
2821 * without doing an actual read or write.
2823 * Used for ladvise lockahead to manually request specific locks.
2825 * \param[in] file file this ladvise lock request is on
2826 * \param[in] ladvise ladvise struct describing this lock request
2828 * \retval 0 success, no detailed result available (sync requests
2829 * and requests sent to the server [not handled locally]
2830 * cannot return detailed results)
2831 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2832 * see definitions for details.
2833 * \retval negative negative errno on error
2835 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2837 struct lu_env *env = NULL;
2838 struct cl_io *io = NULL;
2839 struct cl_lock *lock = NULL;
2840 struct cl_lock_descr *descr = NULL;
2841 struct dentry *dentry = file->f_path.dentry;
2842 struct inode *inode = dentry->d_inode;
2843 enum cl_lock_mode cl_mode;
2844 off_t start = ladvise->lla_start;
2845 off_t end = ladvise->lla_end;
2851 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2852 "start=%llu, end=%llu\n", dentry->d_name.len,
2853 dentry->d_name.name, dentry->d_inode,
2854 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2857 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2859 GOTO(out, result = cl_mode);
2861 /* Get IO environment */
2862 result = cl_io_get(inode, &env, &io, &refcheck);
2866 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2869 * nothing to do for this io. This currently happens when
2870 * stripe sub-object's are not yet created.
2872 result = io->ci_result;
2873 } else if (result == 0) {
2874 lock = vvp_env_lock(env);
2875 descr = &lock->cll_descr;
2877 descr->cld_obj = io->ci_obj;
2878 /* Convert byte offsets to pages */
2879 descr->cld_start = cl_index(io->ci_obj, start);
2880 descr->cld_end = cl_index(io->ci_obj, end);
2881 descr->cld_mode = cl_mode;
2882 /* CEF_MUST is used because we do not want to convert a
2883 * lockahead request to a lockless lock */
2884 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2887 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2888 descr->cld_enq_flags |= CEF_SPECULATIVE;
2890 result = cl_lock_request(env, io, lock);
2892 /* On success, we need to release the lock */
2894 cl_lock_release(env, lock);
2896 cl_io_fini(env, io);
2897 cl_env_put(env, &refcheck);
2899 /* -ECANCELED indicates a matching lock with a different extent
2900 * was already present, and -EEXIST indicates a matching lock
2901 * on exactly the same extent was already present.
2902 * We convert them to positive values for userspace to make
2903 * recognizing true errors easier.
2904 * Note we can only return these detailed results on async requests,
2905 * as sync requests look the same as i/o requests for locking. */
2906 if (result == -ECANCELED)
2907 result = LLA_RESULT_DIFFERENT;
2908 else if (result == -EEXIST)
2909 result = LLA_RESULT_SAME;
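/*
 * Caller-side sketch (illustrative): after an async LU_LADVISE_LOCKAHEAD
 * advice, userspace reads lla_lockahead_result and treats the positive
 * values above as information rather than errors (handler names below are
 * placeholders):
 *
 *	if (advise.lla_lockahead_result == LLA_RESULT_SAME)
 *		handle_same_extent();
 *	else if (advise.lla_lockahead_result == LLA_RESULT_DIFFERENT)
 *		handle_different_extent();
 *	else if (advise.lla_lockahead_result < 0)
 *		handle_error(advise.lla_lockahead_result);
 */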
2914 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2916 static int ll_ladvise_sanity(struct inode *inode,
2917 struct llapi_lu_ladvise *ladvise)
2919 enum lu_ladvise_type advice = ladvise->lla_advice;
2920	/* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
2921	 * be in the first 32 bits of enum ladvise_flags */
2922 __u32 flags = ladvise->lla_peradvice_flags;
2923 /* 3 lines at 80 characters per line, should be plenty */
2926 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2928 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2929 "last supported advice is %s (value '%d'): rc = %d\n",
2930 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2931 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2935 /* Per-advice checks */
2937 case LU_LADVISE_LOCKNOEXPAND:
2938 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2940 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2942 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2943 ladvise_names[advice], rc);
2947 case LU_LADVISE_LOCKAHEAD:
2948 /* Currently only READ and WRITE modes can be requested */
2949 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2950 ladvise->lla_lockahead_mode == 0) {
2952 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2954 ll_get_fsname(inode->i_sb, NULL, 0),
2955 ladvise->lla_lockahead_mode,
2956 ladvise_names[advice], rc);
2959 case LU_LADVISE_WILLREAD:
2960 case LU_LADVISE_DONTNEED:
2962 /* Note fall through above - These checks apply to all advices
2963 * except LOCKNOEXPAND */
2964 if (flags & ~LF_DEFAULT_MASK) {
2966 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2968 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2969 ladvise_names[advice], rc);
2972 if (ladvise->lla_start >= ladvise->lla_end) {
2974 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2975 "for %s: rc = %d\n",
2976 ll_get_fsname(inode->i_sb, NULL, 0),
2977 ladvise->lla_start, ladvise->lla_end,
2978 ladvise_names[advice], rc);
2990 * Give file access advices
2992 * The ladvise interface is similar to the Linux fadvise() system call, except
2993 * it forwards the advices directly from the Lustre client to the server. The
2994 * server-side code will apply appropriate read-ahead and caching techniques
2995 * for the corresponding files.
2997 * A typical workload for ladvise is, e.g., a bunch of different clients
2998 * doing small random reads of a file, so prefetching pages into OSS cache
2999 * with big linear reads before the random IO is a net benefit. Fetching
3000 * all that data into each client cache with fadvise() may not be, due to
3001 * much more data being sent to the client.
3003 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3004 struct llapi_lu_ladvise *ladvise)
3008 struct cl_ladvise_io *lio;
3013 env = cl_env_get(&refcheck);
3015 RETURN(PTR_ERR(env));
3017 io = vvp_env_thread_io(env);
3018 io->ci_obj = ll_i2info(inode)->lli_clob;
3020 /* initialize parameters for ladvise */
3021 lio = &io->u.ci_ladvise;
3022 lio->li_start = ladvise->lla_start;
3023 lio->li_end = ladvise->lla_end;
3024 lio->li_fid = ll_inode2fid(inode);
3025 lio->li_advice = ladvise->lla_advice;
3026 lio->li_flags = flags;
3028 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3029 rc = cl_io_loop(env, io);
3033 cl_io_fini(env, io);
3034 cl_env_put(env, &refcheck);
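/*
 * Usage sketch (illustrative): "lfs ladvise -a willread" exercises this
 * path.  The lustreapi call below is an assumption about the wrapper's
 * signature, shown only to illustrate the advice fields consumed here:
 *
 *	struct llapi_lu_ladvise advice = {
 *		.lla_advice = LU_LADVISE_WILLREAD,
 *		.lla_start  = 0,
 *		.lla_end    = 1 << 20,
 *	};
 *
 *	rc = llapi_ladvise(fd, 0, 1, &advice);
 */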
3038 static int ll_lock_noexpand(struct file *file, int flags)
3040 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3042 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3047 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3050 struct fsxattr fsxattr;
3052 if (copy_from_user(&fsxattr,
3053 (const struct fsxattr __user *)arg,
3057 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3058 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3059 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3060 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3061 if (copy_to_user((struct fsxattr __user *)arg,
3062 &fsxattr, sizeof(fsxattr)))
3068 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3072 struct md_op_data *op_data;
3073 struct ptlrpc_request *req = NULL;
3075 struct fsxattr fsxattr;
3076 struct cl_object *obj;
3080	/* only root can change the project ID */
3081 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3084 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3085 LUSTRE_OPC_ANY, NULL);
3086 if (IS_ERR(op_data))
3087 RETURN(PTR_ERR(op_data));
3089 if (copy_from_user(&fsxattr,
3090 (const struct fsxattr __user *)arg,
3092 GOTO(out_fsxattr, rc = -EFAULT);
3094 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3095 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3096 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3097 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3098 op_data->op_projid = fsxattr.fsx_projid;
3099 op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
3100 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3102 ptlrpc_req_finished(req);
3104 GOTO(out_fsxattr, rc);
3105 ll_update_inode_flags(inode, op_data->op_attr_flags);
3106 obj = ll_i2info(inode)->lli_clob;
3108 GOTO(out_fsxattr, rc);
3110 OBD_ALLOC_PTR(attr);
3112 GOTO(out_fsxattr, rc = -ENOMEM);
3114 attr->ia_valid = ATTR_ATTR_FLAG;
3115 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3118 ll_finish_md_op_data(op_data);
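/*
 * Usage sketch (illustrative): these handlers mirror the generic
 * FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR project-quota interface, e.g. from
 * userspace (assuming the usual struct fsxattr definition):
 *
 *	struct fsxattr fsx;
 *
 *	ioctl(fd, LL_IOC_FSGETXATTR, &fsx);
 *	fsx.fsx_projid = 1000;
 *	ioctl(fd, LL_IOC_FSSETXATTR, &fsx);
 */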
3122 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3125 struct inode *inode = file_inode(file);
3126 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3127 struct ll_inode_info *lli = ll_i2info(inode);
3128 struct obd_client_handle *och = NULL;
3129 struct split_param sp;
3132 enum mds_op_bias bias = 0;
3133 struct file *layout_file = NULL;
3135 size_t data_size = 0;
3139 mutex_lock(&lli->lli_och_mutex);
3140 if (fd->fd_lease_och != NULL) {
3141 och = fd->fd_lease_och;
3142 fd->fd_lease_och = NULL;
3144 mutex_unlock(&lli->lli_och_mutex);
3147 GOTO(out, rc = -ENOLCK);
3149 fmode = och->och_flags;
3151 switch (ioc->lil_flags) {
3152 case LL_LEASE_RESYNC_DONE:
3153 if (ioc->lil_count > IOC_IDS_MAX)
3154 GOTO(out, rc = -EINVAL);
3156 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3157 OBD_ALLOC(data, data_size);
3159 GOTO(out, rc = -ENOMEM);
3161 if (copy_from_user(data, (void __user *)arg, data_size))
3162 GOTO(out, rc = -EFAULT);
3164 bias = MDS_CLOSE_RESYNC_DONE;
3166 case LL_LEASE_LAYOUT_MERGE: {
3169 if (ioc->lil_count != 1)
3170 GOTO(out, rc = -EINVAL);
3172 arg += sizeof(*ioc);
3173 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3174 GOTO(out, rc = -EFAULT);
3176 layout_file = fget(fd);
3178 GOTO(out, rc = -EBADF);
3180 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3181 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3182 GOTO(out, rc = -EPERM);
3184 data = file_inode(layout_file);
3185 bias = MDS_CLOSE_LAYOUT_MERGE;
3188 case LL_LEASE_LAYOUT_SPLIT: {
3192 if (ioc->lil_count != 2)
3193 GOTO(out, rc = -EINVAL);
3195 arg += sizeof(*ioc);
3196 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3197 GOTO(out, rc = -EFAULT);
3199 arg += sizeof(__u32);
3200 if (copy_from_user(&mirror_id, (void __user *)arg,
3202 GOTO(out, rc = -EFAULT);
3204 layout_file = fget(fdv);
3206 GOTO(out, rc = -EBADF);
3208 sp.sp_inode = file_inode(layout_file);
3209 sp.sp_mirror_id = (__u16)mirror_id;
3211 bias = MDS_CLOSE_LAYOUT_SPLIT;
3215 /* without close intent */
3219 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3223 rc = ll_lease_och_release(inode, file);
3232 switch (ioc->lil_flags) {
3233 case LL_LEASE_RESYNC_DONE:
3235 OBD_FREE(data, data_size);
3237 case LL_LEASE_LAYOUT_MERGE:
3238 case LL_LEASE_LAYOUT_SPLIT:
3245 rc = ll_lease_type_from_fmode(fmode);
3249 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3252 struct inode *inode = file_inode(file);
3253 struct ll_inode_info *lli = ll_i2info(inode);
3254 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3255 struct obd_client_handle *och = NULL;
3256 __u64 open_flags = 0;
3262 switch (ioc->lil_mode) {
3263 case LL_LEASE_WRLCK:
3264 if (!(file->f_mode & FMODE_WRITE))
3266 fmode = FMODE_WRITE;
3268 case LL_LEASE_RDLCK:
3269 if (!(file->f_mode & FMODE_READ))
3273 case LL_LEASE_UNLCK:
3274 RETURN(ll_file_unlock_lease(file, ioc, arg));
3279 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3281 /* apply for lease */
3282 if (ioc->lil_flags & LL_LEASE_RESYNC)
3283 open_flags = MDS_OPEN_RESYNC;
3284 och = ll_lease_open(inode, file, fmode, open_flags);
3286 RETURN(PTR_ERR(och));
3288 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3289 rc = ll_lease_file_resync(och, inode);
3291 ll_lease_close(och, inode, NULL);
3294 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3296 ll_lease_close(och, inode, NULL);
3302 mutex_lock(&lli->lli_och_mutex);
3303 if (fd->fd_lease_och == NULL) {
3304 fd->fd_lease_och = och;
3307 mutex_unlock(&lli->lli_och_mutex);
3309		/* cannot happen for now since only exclusive leases are supported */
3310 ll_lease_close(och, inode, &lease_broken);
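/*
 * Usage sketch (illustrative): userspace requests a lease and later releases
 * it through LL_IOC_SET_LEASE with a struct ll_ioc_lease, and can query it
 * with LL_IOC_GET_LEASE; only fields already used in this file are shown:
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 *	... work under the lease ...
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 */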
3317 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3319 struct inode *inode = file_inode(file);
3320 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3324 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3325 PFID(ll_inode2fid(inode)), inode, cmd);
3326 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3328	/* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3329 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3333 case LL_IOC_GETFLAGS:
3334 /* Get the current value of the file flags */
3335 return put_user(fd->fd_flags, (int __user *)arg);
3336 case LL_IOC_SETFLAGS:
3337 case LL_IOC_CLRFLAGS:
3338 /* Set or clear specific file flags */
3339 /* XXX This probably needs checks to ensure the flags are
3340 * not abused, and to handle any flag side effects.
3342 if (get_user(flags, (int __user *) arg))
3345 if (cmd == LL_IOC_SETFLAGS) {
3346 if ((flags & LL_FILE_IGNORE_LOCK) &&
3347 !(file->f_flags & O_DIRECT)) {
3348 CERROR("%s: unable to disable locking on "
3349 "non-O_DIRECT file\n", current->comm);
3353 fd->fd_flags |= flags;
3355 fd->fd_flags &= ~flags;
3358 case LL_IOC_LOV_SETSTRIPE:
3359 case LL_IOC_LOV_SETSTRIPE_NEW:
3360 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3361 case LL_IOC_LOV_SETEA:
3362 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3363 case LL_IOC_LOV_SWAP_LAYOUTS: {
3365 struct lustre_swap_layouts lsl;
3367 if (copy_from_user(&lsl, (char __user *)arg,
3368 sizeof(struct lustre_swap_layouts)))
3371 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3374 file2 = fget(lsl.sl_fd);
3378 /* O_WRONLY or O_RDWR */
3379 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3380 GOTO(out, rc = -EPERM);
3382 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3383 struct inode *inode2;
3384 struct ll_inode_info *lli;
3385 struct obd_client_handle *och = NULL;
3387 lli = ll_i2info(inode);
3388 mutex_lock(&lli->lli_och_mutex);
3389 if (fd->fd_lease_och != NULL) {
3390 och = fd->fd_lease_och;
3391 fd->fd_lease_och = NULL;
3393 mutex_unlock(&lli->lli_och_mutex);
3395 GOTO(out, rc = -ENOLCK);
3396 inode2 = file_inode(file2);
3397 rc = ll_swap_layouts_close(och, inode, inode2);
3399 rc = ll_swap_layouts(file, file2, &lsl);
3405 case LL_IOC_LOV_GETSTRIPE:
3406 case LL_IOC_LOV_GETSTRIPE_NEW:
3407 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3408 case FS_IOC_GETFLAGS:
3409 case FS_IOC_SETFLAGS:
3410 RETURN(ll_iocontrol(inode, file, cmd, arg));
3411 case FSFILT_IOC_GETVERSION:
3412 case FS_IOC_GETVERSION:
3413 RETURN(put_user(inode->i_generation, (int __user *)arg));
3414	/* We need to special-case any other ioctls we want to handle,
3415 * to send them to the MDS/OST as appropriate and to properly
3416 * network encode the arg field. */
3417 case FS_IOC_SETVERSION:
3420 case LL_IOC_GROUP_LOCK:
3421 RETURN(ll_get_grouplock(inode, file, arg));
3422 case LL_IOC_GROUP_UNLOCK:
3423 RETURN(ll_put_grouplock(inode, file, arg));
3424 case IOC_OBD_STATFS:
3425 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3427 case LL_IOC_FLUSHCTX:
3428 RETURN(ll_flush_ctx(inode));
3429 case LL_IOC_PATH2FID: {
3430 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3431 sizeof(struct lu_fid)))
3436 case LL_IOC_GETPARENT:
3437 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3439 case OBD_IOC_FID2PATH:
3440 RETURN(ll_fid2path(inode, (void __user *)arg));
3441 case LL_IOC_DATA_VERSION: {
3442 struct ioc_data_version idv;
3445 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3448 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3449 rc = ll_ioc_data_version(inode, &idv);
3452 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3458 case LL_IOC_GET_MDTIDX: {
3461 mdtidx = ll_get_mdt_idx(inode);
3465 if (put_user((int)mdtidx, (int __user *)arg))
3470 case OBD_IOC_GETDTNAME:
3471 case OBD_IOC_GETMDNAME:
3472 RETURN(ll_get_obd_name(inode, cmd, arg));
3473 case LL_IOC_HSM_STATE_GET: {
3474 struct md_op_data *op_data;
3475 struct hsm_user_state *hus;
3482 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3483 LUSTRE_OPC_ANY, hus);
3484 if (IS_ERR(op_data)) {
3486 RETURN(PTR_ERR(op_data));
3489 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3492 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3495 ll_finish_md_op_data(op_data);
3499 case LL_IOC_HSM_STATE_SET: {
3500 struct hsm_state_set *hss;
3507 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3512 rc = ll_hsm_state_set(inode, hss);
3517 case LL_IOC_HSM_ACTION: {
3518 struct md_op_data *op_data;
3519 struct hsm_current_action *hca;
3526 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3527 LUSTRE_OPC_ANY, hca);
3528 if (IS_ERR(op_data)) {
3530 RETURN(PTR_ERR(op_data));
3533 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3536 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3539 ll_finish_md_op_data(op_data);
3543 case LL_IOC_SET_LEASE_OLD: {
3544 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3546 RETURN(ll_file_set_lease(file, &ioc, 0));
3548 case LL_IOC_SET_LEASE: {
3549 struct ll_ioc_lease ioc;
3551 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3554 RETURN(ll_file_set_lease(file, &ioc, arg));
3556 case LL_IOC_GET_LEASE: {
3557 struct ll_inode_info *lli = ll_i2info(inode);
3558 struct ldlm_lock *lock = NULL;
3561 mutex_lock(&lli->lli_och_mutex);
3562 if (fd->fd_lease_och != NULL) {
3563 struct obd_client_handle *och = fd->fd_lease_och;
3565 lock = ldlm_handle2lock(&och->och_lease_handle);
3567 lock_res_and_lock(lock);
3568 if (!ldlm_is_cancel(lock))
3569 fmode = och->och_flags;
3571 unlock_res_and_lock(lock);
3572 LDLM_LOCK_PUT(lock);
3575 mutex_unlock(&lli->lli_och_mutex);
3577 RETURN(ll_lease_type_from_fmode(fmode));
3579 case LL_IOC_HSM_IMPORT: {
3580 struct hsm_user_import *hui;
3586 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3591 rc = ll_hsm_import(inode, file, hui);
3596 case LL_IOC_FUTIMES_3: {
3597 struct ll_futimes_3 lfu;
3599 if (copy_from_user(&lfu,
3600 (const struct ll_futimes_3 __user *)arg,
3604 RETURN(ll_file_futimes_3(file, &lfu));
3606 case LL_IOC_LADVISE: {
3607 struct llapi_ladvise_hdr *k_ladvise_hdr;
3608 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3611 int alloc_size = sizeof(*k_ladvise_hdr);
3614 u_ladvise_hdr = (void __user *)arg;
3615 OBD_ALLOC_PTR(k_ladvise_hdr);
3616 if (k_ladvise_hdr == NULL)
3619 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3620 GOTO(out_ladvise, rc = -EFAULT);
3622 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3623 k_ladvise_hdr->lah_count < 1)
3624 GOTO(out_ladvise, rc = -EINVAL);
3626 num_advise = k_ladvise_hdr->lah_count;
3627 if (num_advise >= LAH_COUNT_MAX)
3628 GOTO(out_ladvise, rc = -EFBIG);
3630 OBD_FREE_PTR(k_ladvise_hdr);
3631 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3632 lah_advise[num_advise]);
3633 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3634 if (k_ladvise_hdr == NULL)
3638 * TODO: submit multiple advices to one server in a single RPC
3640 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3641 GOTO(out_ladvise, rc = -EFAULT);
3643 for (i = 0; i < num_advise; i++) {
3644 struct llapi_lu_ladvise *k_ladvise =
3645 &k_ladvise_hdr->lah_advise[i];
3646 struct llapi_lu_ladvise __user *u_ladvise =
3647 &u_ladvise_hdr->lah_advise[i];
3649 rc = ll_ladvise_sanity(inode, k_ladvise);
3651 GOTO(out_ladvise, rc);
3653 switch (k_ladvise->lla_advice) {
3654 case LU_LADVISE_LOCKNOEXPAND:
3655 rc = ll_lock_noexpand(file,
3656 k_ladvise->lla_peradvice_flags);
3657 GOTO(out_ladvise, rc);
3658 case LU_LADVISE_LOCKAHEAD:
3660 rc = ll_file_lock_ahead(file, k_ladvise);
3663 GOTO(out_ladvise, rc);
3666 &u_ladvise->lla_lockahead_result))
3667 GOTO(out_ladvise, rc = -EFAULT);
3670 rc = ll_ladvise(inode, file,
3671 k_ladvise_hdr->lah_flags,
3674 GOTO(out_ladvise, rc);
3681 OBD_FREE(k_ladvise_hdr, alloc_size);
3684 case LL_IOC_FLR_SET_MIRROR: {
3685 /* mirror I/O must be direct to avoid polluting page cache
3687 if (!(file->f_flags & O_DIRECT))
3690 fd->fd_designated_mirror = (__u32)arg;
3693 case LL_IOC_FSGETXATTR:
3694 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3695 case LL_IOC_FSSETXATTR:
3696 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3698 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3700 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3701 (void __user *)arg));
3705 #ifndef HAVE_FILE_LLSEEK_SIZE
3706 static inline loff_t
3707 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3709 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3711 if (offset > maxsize)
3714 if (offset != file->f_pos) {
3715 file->f_pos = offset;
3716 file->f_version = 0;
3722 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3723 loff_t maxsize, loff_t eof)
3725 struct inode *inode = file_inode(file);
3733 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3734 * position-querying operation. Avoid rewriting the "same"
3735 * f_pos value back to the file because a concurrent read(),
3736 * write() or lseek() might have altered it
3741 * f_lock protects against read/modify/write race with other
3742 * SEEK_CURs. Note that parallel writes and reads behave
3746 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3747 inode_unlock(inode);
3751 * In the generic case the entire file is data, so as long as
3752 * offset isn't at the end of the file then the offset is data.
3759 * There is a virtual hole at the end of the file, so as long as
3760 * offset isn't i_size or larger, return i_size.
3768 return llseek_execute(file, offset, maxsize);
3772 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3774 struct inode *inode = file_inode(file);
3775 loff_t retval, eof = 0;
3778 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3779 (origin == SEEK_CUR) ? file->f_pos : 0);
3780 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3781 PFID(ll_inode2fid(inode)), inode, retval, retval,
3783 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3785 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3786 retval = ll_glimpse_size(inode);
3789 eof = i_size_read(inode);
3792 retval = ll_generic_file_llseek_size(file, offset, origin,
3793 ll_file_maxbytes(inode), eof);
3797 static int ll_flush(struct file *file, fl_owner_t id)
3799 struct inode *inode = file_inode(file);
3800 struct ll_inode_info *lli = ll_i2info(inode);
3801 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3804 LASSERT(!S_ISDIR(inode->i_mode));
3806 /* catch async errors that were recorded back when async writeback
3807 * failed for pages in this mapping. */
3808 rc = lli->lli_async_rc;
3809 lli->lli_async_rc = 0;
3810 if (lli->lli_clob != NULL) {
3811 err = lov_read_and_clear_async_rc(lli->lli_clob);
3816	/* The application has already been told about the write failure.
3817	 * Do not report the failure again. */
3818 if (fd->fd_write_failed)
3820 return rc ? -EIO : 0;
3824 * Called to make sure a portion of file has been written out.
3825 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3827 * Return how many pages have been written.
3829 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3830 enum cl_fsync_mode mode, int ignore_layout)
3834 struct cl_fsync_io *fio;
3839 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3840 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3843 env = cl_env_get(&refcheck);
3845 RETURN(PTR_ERR(env));
3847 io = vvp_env_thread_io(env);
3848 io->ci_obj = ll_i2info(inode)->lli_clob;
3849 io->ci_ignore_layout = ignore_layout;
3851 /* initialize parameters for sync */
3852 fio = &io->u.ci_fsync;
3853 fio->fi_start = start;
3855 fio->fi_fid = ll_inode2fid(inode);
3856 fio->fi_mode = mode;
3857 fio->fi_nr_written = 0;
3859 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3860 result = cl_io_loop(env, io);
3862 result = io->ci_result;
3864 result = fio->fi_nr_written;
3865 cl_io_fini(env, io);
3866 cl_env_put(env, &refcheck);
3872 * When dentry is provided (the 'else' case), file_dentry() may be
3873 * null and dentry must be used directly rather than pulled from
3874 * file_dentry() as is done otherwise.
3877 #ifdef HAVE_FILE_FSYNC_4ARGS
3878 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3880 struct dentry *dentry = file_dentry(file);
3882 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3883 int ll_fsync(struct file *file, int datasync)
3885 struct dentry *dentry = file_dentry(file);
3887 loff_t end = LLONG_MAX;
3889 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3892 loff_t end = LLONG_MAX;
3894 struct inode *inode = dentry->d_inode;
3895 struct ll_inode_info *lli = ll_i2info(inode);
3896 struct ptlrpc_request *req;
3900 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3901 PFID(ll_inode2fid(inode)), inode);
3902 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3904 #ifdef HAVE_FILE_FSYNC_4ARGS
3905 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3906 lock_inode = !lli->lli_inode_locked;
3910 /* fsync's caller has already called _fdata{sync,write}, we want
3911 * that IO to finish before calling the osc and mdc sync methods */
3912 rc = filemap_fdatawait(inode->i_mapping);
3915 /* catch async errors that were recorded back when async writeback
3916 * failed for pages in this mapping. */
3917 if (!S_ISDIR(inode->i_mode)) {
3918 err = lli->lli_async_rc;
3919 lli->lli_async_rc = 0;
3922 if (lli->lli_clob != NULL) {
3923 err = lov_read_and_clear_async_rc(lli->lli_clob);
3929 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3933 ptlrpc_req_finished(req);
3935 if (S_ISREG(inode->i_mode)) {
3936 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3938 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3939 if (rc == 0 && err < 0)
3942 fd->fd_write_failed = true;
3944 fd->fd_write_failed = false;
3947 #ifdef HAVE_FILE_FSYNC_4ARGS
3949 inode_unlock(inode);
3955 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3957 struct inode *inode = file_inode(file);
3958 struct ll_sb_info *sbi = ll_i2sbi(inode);
3959 struct ldlm_enqueue_info einfo = {
3960 .ei_type = LDLM_FLOCK,
3961 .ei_cb_cp = ldlm_flock_completion_ast,
3962 .ei_cbdata = file_lock,
3964 struct md_op_data *op_data;
3965 struct lustre_handle lockh = { 0 };
3966 union ldlm_policy_data flock = { { 0 } };
3967 int fl_type = file_lock->fl_type;
3973 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3974 PFID(ll_inode2fid(inode)), file_lock);
3976 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3978 if (file_lock->fl_flags & FL_FLOCK) {
3979 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3980 /* flocks are whole-file locks */
3981 flock.l_flock.end = OFFSET_MAX;
3982		/* For flocks the owner is determined by the local file descriptor */
3983 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3984 } else if (file_lock->fl_flags & FL_POSIX) {
3985 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3986 flock.l_flock.start = file_lock->fl_start;
3987 flock.l_flock.end = file_lock->fl_end;
3991 flock.l_flock.pid = file_lock->fl_pid;
3993	/* Somewhat ugly workaround for svc lockd.
3994	 * lockd installs a custom fl_lmops->lm_compare_owner that checks
3995	 * that the fl_owner is the same (which it always is on the local node,
3996	 * I guess, between lockd processes) and then compares the pid.
3997	 * As such we assign the pid to the owner field to make it all work;
3998	 * a conflict with normal locks is unlikely since the pid space and
3999	 * the pointer space for current->files do not intersect */
4000 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4001 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4005 einfo.ei_mode = LCK_PR;
4008 /* An unlock request may or may not have any relation to
4009 * existing locks so we may not be able to pass a lock handle
4010 * via a normal ldlm_lock_cancel() request. The request may even
4011 * unlock a byte range in the middle of an existing lock. In
4012 * order to process an unlock request we need all of the same
4013 * information that is given with a normal read or write record
4014 * lock request. To avoid creating another ldlm unlock (cancel)
4015 * message we'll treat a LCK_NL flock request as an unlock. */
4016 einfo.ei_mode = LCK_NL;
4019 einfo.ei_mode = LCK_PW;
4022 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4037 flags = LDLM_FL_BLOCK_NOWAIT;
4043 flags = LDLM_FL_TEST_LOCK;
4046 CERROR("unknown fcntl lock command: %d\n", cmd);
4050 /* Save the old mode so that if the mode in the lock changes we
4051 * can decrement the appropriate reader or writer refcount. */
4052 file_lock->fl_type = einfo.ei_mode;
4054 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4055 LUSTRE_OPC_ANY, NULL);
4056 if (IS_ERR(op_data))
4057 RETURN(PTR_ERR(op_data));
4059 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4060 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4061 flock.l_flock.pid, flags, einfo.ei_mode,
4062 flock.l_flock.start, flock.l_flock.end);
4064 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4067 /* Restore the file lock type if not TEST lock. */
4068 if (!(flags & LDLM_FL_TEST_LOCK))
4069 file_lock->fl_type = fl_type;
4071 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4072 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4073 !(flags & LDLM_FL_TEST_LOCK))
4074 rc2 = locks_lock_file_wait(file, file_lock);
4076 if ((file_lock->fl_flags & FL_FLOCK) &&
4077 (rc == 0 || file_lock->fl_type == F_UNLCK))
4078 rc2 = flock_lock_file_wait(file, file_lock);
4079 if ((file_lock->fl_flags & FL_POSIX) &&
4080 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4081 !(flags & LDLM_FL_TEST_LOCK))
4082 rc2 = posix_lock_file_wait(file, file_lock);
4083 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4085 if (rc2 && file_lock->fl_type != F_UNLCK) {
4086 einfo.ei_mode = LCK_NL;
4087 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4092 ll_finish_md_op_data(op_data);
4097 int ll_get_fid_by_name(struct inode *parent, const char *name,
4098 int namelen, struct lu_fid *fid,
4099 struct inode **inode)
4101 struct md_op_data *op_data = NULL;
4102 struct mdt_body *body;
4103 struct ptlrpc_request *req;
4107 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4108 LUSTRE_OPC_ANY, NULL);
4109 if (IS_ERR(op_data))
4110 RETURN(PTR_ERR(op_data));
4112 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4113 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4114 ll_finish_md_op_data(op_data);
4118 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4120 GOTO(out_req, rc = -EFAULT);
4122 *fid = body->mbo_fid1;
4125 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4127 ptlrpc_req_finished(req);
4131 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4134 struct dentry *dchild = NULL;
4135 struct inode *child_inode = NULL;
4136 struct md_op_data *op_data;
4137 struct ptlrpc_request *request = NULL;
4138 struct obd_client_handle *och = NULL;
4140 struct mdt_body *body;
4141 __u64 data_version = 0;
4142 size_t namelen = strlen(name);
4143 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4147 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4148 PFID(ll_inode2fid(parent)), name,
4149 lum->lum_stripe_offset, lum->lum_stripe_count);
4151 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4152 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4153 lustre_swab_lmv_user_md(lum);
4155 /* Get child FID first */
4156 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4159 dchild = d_lookup(file_dentry(file), &qstr);
4161 if (dchild->d_inode)
4162 child_inode = igrab(dchild->d_inode);
4167 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4176 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4177 OBD_CONNECT2_DIR_MIGRATE)) {
4178 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4179 ll_i2info(child_inode)->lli_lsm_md) {
4180 CERROR("%s: MDT doesn't support stripe directory "
4182 ll_get_fsname(parent->i_sb, NULL, 0));
4183 GOTO(out_iput, rc = -EOPNOTSUPP);
4188 * lfs migrate command needs to be blocked on the client
4189 * by checking the migrate FID against the FID of the
4192 if (child_inode == parent->i_sb->s_root->d_inode)
4193 GOTO(out_iput, rc = -EINVAL);
4195 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4196 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4197 if (IS_ERR(op_data))
4198 GOTO(out_iput, rc = PTR_ERR(op_data));
4200 inode_lock(child_inode);
4201 op_data->op_fid3 = *ll_inode2fid(child_inode);
4202 if (!fid_is_sane(&op_data->op_fid3)) {
4203 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4204 ll_get_fsname(parent->i_sb, NULL, 0), name,
4205 PFID(&op_data->op_fid3));
4206 GOTO(out_unlock, rc = -EINVAL);
4209 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4210 op_data->op_data = lum;
4211 op_data->op_data_size = lumlen;
4214 if (S_ISREG(child_inode->i_mode)) {
4215 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4219 GOTO(out_unlock, rc);
4222 rc = ll_data_version(child_inode, &data_version,
4225 GOTO(out_close, rc);
4227 op_data->op_handle = och->och_fh;
4228 op_data->op_data_version = data_version;
4229 op_data->op_lease_handle = och->och_lease_handle;
4230 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4232 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4233 och->och_mod->mod_open_req->rq_replay = 0;
4234 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4237 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4238 name, namelen, &request);
4240 LASSERT(request != NULL);
4241 ll_update_times(request, parent);
4243 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4244 LASSERT(body != NULL);
4246	/* If the server does release the layout lock, then we clean up
4247	 * the client och here, otherwise release it in out_close: */
4248 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4249 obd_mod_put(och->och_mod);
4250 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4252 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4258 if (request != NULL) {
4259 ptlrpc_req_finished(request);
4263 /* Try again if the file layout has changed. */
4264 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4269 ll_lease_close(och, child_inode, NULL);
4271 clear_nlink(child_inode);
4273 inode_unlock(child_inode);
4274 ll_finish_md_op_data(op_data);
4281 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4289 * Test if some locks matching bits and l_req_mode are acquired
4290 * - bits can be in different locks
4291 * - if found, clear the common lock bits in *bits
4292 * - the bits not found are kept in *bits
4294 * \param bits		[IN] searched lock bits
4295 * \param l_req_mode	[IN] searched lock mode
4296 * \retval boolean, true iff all bits are found
4298 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4300 struct lustre_handle lockh;
4301 union ldlm_policy_data policy;
4302 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4303 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4312 fid = &ll_i2info(inode)->lli_fid;
4313 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4314 ldlm_lockname[mode]);
4316 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4317 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4318 policy.l_inodebits.bits = *bits & (1 << i);
4319 if (policy.l_inodebits.bits == 0)
4322 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4323 &policy, mode, &lockh)) {
4324 struct ldlm_lock *lock;
4326 lock = ldlm_handle2lock(&lockh);
4329 ~(lock->l_policy_data.l_inodebits.bits);
4330 LDLM_LOCK_PUT(lock);
4332 *bits &= ~policy.l_inodebits.bits;
4339 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4340 struct lustre_handle *lockh, __u64 flags,
4341 enum ldlm_mode mode)
4343 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4348 fid = &ll_i2info(inode)->lli_fid;
4349 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4351 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4352 fid, LDLM_IBITS, &policy, mode, lockh);
4357 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4359 /* Already unlinked. Just update nlink and return success */
4360 if (rc == -ENOENT) {
4362		/* If it is a striped directory and there is a bad stripe,
4363		 * let's revalidate the dentry again, instead of returning
4365 if (S_ISDIR(inode->i_mode) &&
4366 ll_i2info(inode)->lli_lsm_md != NULL)
4369		/* This path cannot be hit for regular files unless in
4370		 * case of obscure races, so no need to validate
4372 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4374 } else if (rc != 0) {
4375 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4376 "%s: revalidate FID "DFID" error: rc = %d\n",
4377 ll_get_fsname(inode->i_sb, NULL, 0),
4378 PFID(ll_inode2fid(inode)), rc);
4384 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4386 struct inode *inode = dentry->d_inode;
4387 struct obd_export *exp = ll_i2mdexp(inode);
4388 struct lookup_intent oit = {
4391 struct ptlrpc_request *req = NULL;
4392 struct md_op_data *op_data;
4396 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4397 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4399 /* Call getattr by fid, so do not provide name at all. */
4400 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4401 LUSTRE_OPC_ANY, NULL);
4402 if (IS_ERR(op_data))
4403 RETURN(PTR_ERR(op_data));
4405 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4406 ll_finish_md_op_data(op_data);
4408 rc = ll_inode_revalidate_fini(inode, rc);
4412 rc = ll_revalidate_it_finish(req, &oit, dentry);
4414 ll_intent_release(&oit);
4418 /* Unlinked? Unhash dentry, so it is not picked up later by
4419 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4420 * here to preserve get_cwd functionality on 2.6.
4422 if (!dentry->d_inode->i_nlink) {
4423 ll_lock_dcache(inode);
4424 d_lustre_invalidate(dentry, 0);
4425 ll_unlock_dcache(inode);
4428 ll_lookup_finish_locks(&oit, dentry);
4430 ptlrpc_req_finished(req);
4435 static int ll_merge_md_attr(struct inode *inode)
4437 struct cl_attr attr = { 0 };
4440 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4441 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4442 &attr, ll_md_blocking_ast);
4446 set_nlink(inode, attr.cat_nlink);
4447 inode->i_blocks = attr.cat_blocks;
4448 i_size_write(inode, attr.cat_size);
4450 ll_i2info(inode)->lli_atime = attr.cat_atime;
4451 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4452 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4457 static inline dev_t ll_compat_encode_dev(dev_t dev)
4459 /* The compat_sys_*stat*() syscalls will fail unless the
4460 * device majors and minors are both less than 256. Note that
4461 * the value returned here will be passed through
4462 * old_encode_dev() in cp_compat_stat(). And so we are not
4463 * trying to return a valid compat (u16) device number, just
4464 * one that will pass the old_valid_dev() check. */
4466 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
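/*
 * For example, a device with major 0x123 and minor 0x456 is squashed to
 * major 0x23 / minor 0x56: no longer the original device, but enough to
 * pass the old_valid_dev() check mentioned above.
 */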
4469 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4470 int ll_getattr(const struct path *path, struct kstat *stat,
4471 u32 request_mask, unsigned int flags)
4473 struct dentry *de = path->dentry;
4475 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4478 struct inode *inode = de->d_inode;
4479 struct ll_sb_info *sbi = ll_i2sbi(inode);
4480 struct ll_inode_info *lli = ll_i2info(inode);
4483 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4485 rc = ll_inode_revalidate(de, IT_GETATTR);
4489 if (S_ISREG(inode->i_mode)) {
4490		/* In case of restore, the MDT has the right size and has
4491		 * already sent it back without granting the layout lock;
4492		 * the inode is up-to-date so glimpse is useless.
4493		 * Also, to glimpse we need the layout; in case of a running
4494		 * restore the MDT holds the layout lock so the glimpse will
4495		 * block up to the end of restore (getattr will block)
4497 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4498 rc = ll_glimpse_size(inode);
4503	/* If the object isn't a regular file then don't validate its size. */
4504 if (S_ISDIR(inode->i_mode) &&
4505 lli->lli_lsm_md != NULL) {
4506 rc = ll_merge_md_attr(inode);
4511 LTIME_S(inode->i_atime) = lli->lli_atime;
4512 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4513 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4516 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4518 if (ll_need_32bit_api(sbi)) {
4519 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4520 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4521 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4523 stat->ino = inode->i_ino;
4524 stat->dev = inode->i_sb->s_dev;
4525 stat->rdev = inode->i_rdev;
4528 stat->mode = inode->i_mode;
4529 stat->uid = inode->i_uid;
4530 stat->gid = inode->i_gid;
4531 stat->atime = inode->i_atime;
4532 stat->mtime = inode->i_mtime;
4533 stat->ctime = inode->i_ctime;
4534 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4536 stat->nlink = inode->i_nlink;
4537 stat->size = i_size_read(inode);
4538 stat->blocks = inode->i_blocks;
4543 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4544 __u64 start, __u64 len)
4548 struct fiemap *fiemap;
4549 unsigned int extent_count = fieinfo->fi_extents_max;
4551 num_bytes = sizeof(*fiemap) + (extent_count *
4552 sizeof(struct fiemap_extent));
4553 OBD_ALLOC_LARGE(fiemap, num_bytes);
4558 fiemap->fm_flags = fieinfo->fi_flags;
4559 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4560 fiemap->fm_start = start;
4561 fiemap->fm_length = len;
4562 if (extent_count > 0 &&
4563 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4564 sizeof(struct fiemap_extent)) != 0)
4565 GOTO(out, rc = -EFAULT);
4567 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4569 fieinfo->fi_flags = fiemap->fm_flags;
4570 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4571 if (extent_count > 0 &&
4572 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4573 fiemap->fm_mapped_extents *
4574 sizeof(struct fiemap_extent)) != 0)
4575 GOTO(out, rc = -EFAULT);
4577 OBD_FREE_LARGE(fiemap, num_bytes);
4581 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4583 struct ll_inode_info *lli = ll_i2info(inode);
4584 struct posix_acl *acl = NULL;
4587 spin_lock(&lli->lli_lock);
4588 /* VFS' acl_permission_check->check_acl will release the refcount */
4589 acl = posix_acl_dup(lli->lli_posix_acl);
4590 spin_unlock(&lli->lli_lock);
4595 #ifdef HAVE_IOP_SET_ACL
4596 #ifdef CONFIG_FS_POSIX_ACL
4597 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4599 struct ll_sb_info *sbi = ll_i2sbi(inode);
4600 struct ptlrpc_request *req = NULL;
4601 const char *name = NULL;
4603 size_t value_size = 0;
4608 case ACL_TYPE_ACCESS:
4609 name = XATTR_NAME_POSIX_ACL_ACCESS;
4611 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4614 case ACL_TYPE_DEFAULT:
4615 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4616 if (!S_ISDIR(inode->i_mode))
4617 rc = acl ? -EACCES : 0;
4628 value_size = posix_acl_xattr_size(acl->a_count);
4629 value = kmalloc(value_size, GFP_NOFS);
4631 GOTO(out, rc = -ENOMEM);
4633 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4635 GOTO(out_value, rc);
4638 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4639 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4640 name, value, value_size, 0, 0, &req);
4642 ptlrpc_req_finished(req);
4647 forget_cached_acl(inode, type);
4649 set_cached_acl(inode, type, acl);
4652 #endif /* CONFIG_FS_POSIX_ACL */
4653 #endif /* HAVE_IOP_SET_ACL */
4655 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4657 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4658 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4660 ll_check_acl(struct inode *inode, int mask)
4663 # ifdef CONFIG_FS_POSIX_ACL
4664 struct posix_acl *acl;
4668 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4669 if (flags & IPERM_FLAG_RCU)
4672 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4677 rc = posix_acl_permission(inode, acl, mask);
4678 posix_acl_release(acl);
4681 # else /* !CONFIG_FS_POSIX_ACL */
4683 # endif /* CONFIG_FS_POSIX_ACL */
4685 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4687 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4688 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4690 # ifdef HAVE_INODE_PERMISION_2ARGS
4691 int ll_inode_permission(struct inode *inode, int mask)
4693 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4698 struct ll_sb_info *sbi;
4699 struct root_squash_info *squash;
4700 struct cred *cred = NULL;
4701 const struct cred *old_cred = NULL;
4703 bool squash_id = false;
4706 #ifdef MAY_NOT_BLOCK
4707 if (mask & MAY_NOT_BLOCK)
4709 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4710 if (flags & IPERM_FLAG_RCU)
4714	/* as the root inode is NOT validated in the lookup operation,
4715	 * we need to do it before the permission check. */
4717 if (inode == inode->i_sb->s_root->d_inode) {
4718 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4723 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4724 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4726 /* squash fsuid/fsgid if needed */
4727 sbi = ll_i2sbi(inode);
4728 squash = &sbi->ll_squash;
4729 if (unlikely(squash->rsi_uid != 0 &&
4730 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4731 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4735 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4736 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4737 squash->rsi_uid, squash->rsi_gid);
4739 /* update the current process's credentials
4740 * and drop its FS capabilities */
4741 cred = prepare_creds();
4745 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4746 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4747 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4748 if ((1 << cap) & CFS_CAP_FS_MASK)
4749 cap_lower(cred->cap_effective, cap);
4751 old_cred = override_creds(cred);
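/*
 * With the squashed credentials in force, the permission check below runs
 * as the configured non-root identity and without filesystem capabilities;
 * revert_creds() restores the caller's credentials once the check is done.
 */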
4754 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4755 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4756 /* restore the current process's credentials and FS capabilities */
4758 revert_creds(old_cred);
4765 /* -o localflock - only provides locally consistent flock locks */
4766 struct file_operations ll_file_operations = {
4767 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4768 # ifdef HAVE_SYNC_READ_WRITE
4769 .read = new_sync_read,
4770 .write = new_sync_write,
4772 .read_iter = ll_file_read_iter,
4773 .write_iter = ll_file_write_iter,
4774 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4775 .read = ll_file_read,
4776 .aio_read = ll_file_aio_read,
4777 .write = ll_file_write,
4778 .aio_write = ll_file_aio_write,
4779 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4780 .unlocked_ioctl = ll_file_ioctl,
4781 .open = ll_file_open,
4782 .release = ll_file_release,
4783 .mmap = ll_file_mmap,
4784 .llseek = ll_file_seek,
4785 .splice_read = ll_file_splice_read,
4790 struct file_operations ll_file_operations_flock = {
4791 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4792 # ifdef HAVE_SYNC_READ_WRITE
4793 .read = new_sync_read,
4794 .write = new_sync_write,
4795 # endif /* HAVE_SYNC_READ_WRITE */
4796 .read_iter = ll_file_read_iter,
4797 .write_iter = ll_file_write_iter,
4798 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4799 .read = ll_file_read,
4800 .aio_read = ll_file_aio_read,
4801 .write = ll_file_write,
4802 .aio_write = ll_file_aio_write,
4803 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4804 .unlocked_ioctl = ll_file_ioctl,
4805 .open = ll_file_open,
4806 .release = ll_file_release,
4807 .mmap = ll_file_mmap,
4808 .llseek = ll_file_seek,
4809 .splice_read = ll_file_splice_read,
4812 .flock = ll_file_flock,
4813 .lock = ll_file_flock
4816 /* These are for -o noflock - to return ENOSYS on flock calls */
4817 struct file_operations ll_file_operations_noflock = {
4818 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4819 # ifdef HAVE_SYNC_READ_WRITE
4820 .read = new_sync_read,
4821 .write = new_sync_write,
4822 # endif /* HAVE_SYNC_READ_WRITE */
4823 .read_iter = ll_file_read_iter,
4824 .write_iter = ll_file_write_iter,
4825 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4826 .read = ll_file_read,
4827 .aio_read = ll_file_aio_read,
4828 .write = ll_file_write,
4829 .aio_write = ll_file_aio_write,
4830 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4831 .unlocked_ioctl = ll_file_ioctl,
4832 .open = ll_file_open,
4833 .release = ll_file_release,
4834 .mmap = ll_file_mmap,
4835 .llseek = ll_file_seek,
4836 .splice_read = ll_file_splice_read,
4839 .flock = ll_file_noflock,
4840 .lock = ll_file_noflock
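/*
 * Three variants of the file operations are installed depending on the
 * flock-related mount option: ll_file_operations ("-o localflock") keeps
 * flock locks local to this client, ll_file_operations_flock ("-o flock")
 * routes them through ll_file_flock so they are consistent across clients,
 * and ll_file_operations_noflock ("-o noflock") fails them with ENOSYS.
 * The three tables differ only in their .flock and .lock handlers.
 */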
4843 struct inode_operations ll_file_inode_operations = {
4844 .setattr = ll_setattr,
4845 .getattr = ll_getattr,
4846 .permission = ll_inode_permission,
4847 #ifdef HAVE_IOP_XATTR
4848 .setxattr = ll_setxattr,
4849 .getxattr = ll_getxattr,
4850 .removexattr = ll_removexattr,
4852 .listxattr = ll_listxattr,
4853 .fiemap = ll_fiemap,
4854 #ifdef HAVE_IOP_GET_ACL
4855 .get_acl = ll_get_acl,
4857 #ifdef HAVE_IOP_SET_ACL
4858 .set_acl = ll_set_acl,
4862 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4864 struct ll_inode_info *lli = ll_i2info(inode);
4865 struct cl_object *obj = lli->lli_clob;
4874 env = cl_env_get(&refcheck);
4876 RETURN(PTR_ERR(env));
4878 rc = cl_conf_set(env, lli->lli_clob, conf);
4882 if (conf->coc_opc == OBJECT_CONF_SET) {
4883 struct ldlm_lock *lock = conf->coc_lock;
4884 struct cl_layout cl = {
4888 LASSERT(lock != NULL);
4889 LASSERT(ldlm_has_layout(lock));
4891 /* the lock can only be allowed to match after the layout has been
4892 * applied to the inode; otherwise a stale layout would be
4893 * seen. Applying the layout should happen before dropping
4894 * the intent lock. */
4895 ldlm_lock_allow_match(lock);
4897 rc = cl_object_layout_get(env, obj, &cl);
4902 DFID": layout version change: %u -> %u\n",
4903 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4905 ll_layout_version_set(lli, cl.cl_layout_gen);
4909 cl_env_put(env, &refcheck);
4914 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4915 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4918 struct ll_sb_info *sbi = ll_i2sbi(inode);
4919 struct ptlrpc_request *req;
4920 struct mdt_body *body;
4927 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4928 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4929 lock->l_lvb_data, lock->l_lvb_len);
4931 if (lock->l_lvb_data != NULL)
4934 /* if the layout lock was granted right away, the layout is returned
4935 * within the DLM_LVB of the DLM reply; otherwise, if the lock was ever
4936 * blocked and then granted via a completion AST, we have to fetch the
4937 * layout here. Note that we cannot use the LVB buffer from the
4938 * completion AST because it is not large enough. */
4939 rc = ll_get_default_mdsize(sbi, &lmmsize);
4941 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4942 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4946 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4948 GOTO(out, rc = -EPROTO);
4950 lmmsize = body->mbo_eadatasize;
4951 if (lmmsize == 0) /* empty layout */
4954 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4956 GOTO(out, rc = -EFAULT);
4958 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4959 if (lvbdata == NULL)
4960 GOTO(out, rc = -ENOMEM);
4962 memcpy(lvbdata, lmm, lmmsize);
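/*
 * Hand the copied layout over to the lock as its LVB, unless another thread
 * has already installed one; once attached, the buffer belongs to the lock
 * and lives as long as the lock does.
 */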
4963 lock_res_and_lock(lock);
4964 if (unlikely(lock->l_lvb_data == NULL)) {
4965 lock->l_lvb_type = LVB_T_LAYOUT;
4966 lock->l_lvb_data = lvbdata;
4967 lock->l_lvb_len = lmmsize;
4970 unlock_res_and_lock(lock);
4973 OBD_FREE_LARGE(lvbdata, lmmsize);
4978 ptlrpc_req_finished(req);
4983 * Apply the layout to the inode. The layout lock is held and will be released before this function returns.
4986 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4987 struct inode *inode)
4989 struct ll_inode_info *lli = ll_i2info(inode);
4990 struct ll_sb_info *sbi = ll_i2sbi(inode);
4991 struct ldlm_lock *lock;
4992 struct cl_object_conf conf;
4995 bool wait_layout = false;
4998 LASSERT(lustre_handle_is_used(lockh));
5000 lock = ldlm_handle2lock(lockh);
5001 LASSERT(lock != NULL);
5002 LASSERT(ldlm_has_layout(lock));
5004 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5005 PFID(&lli->lli_fid), inode);
5007 /* in case this is a cached lock, reinstate it with the new inode */
5008 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5010 lock_res_and_lock(lock);
5011 lvb_ready = ldlm_is_lvb_ready(lock);
5012 unlock_res_and_lock(lock);
5014 /* checking lvb_ready is racy, but that is okay. The worst case is
5015 * that multiple processes may configure the file at the same time. */
5019 rc = ll_layout_fetch(inode, lock);
5023 /* for a layout lock, the lmm is stored in the lock's LVB.
5024 * lvb_data is immutable while the lock is held, so it is safe to access it without taking the resource lock.
5027 * set the layout on the file. This is unlikely to fail, as the old layout was
5028 * surely eliminated. */
5029 memset(&conf, 0, sizeof conf);
5030 conf.coc_opc = OBJECT_CONF_SET;
5031 conf.coc_inode = inode;
5032 conf.coc_lock = lock;
5033 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5034 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5035 rc = ll_layout_conf(inode, &conf);
5037 /* refresh layout failed, need to wait */
5038 wait_layout = rc == -EBUSY;
5041 LDLM_LOCK_PUT(lock);
5042 ldlm_lock_decref(lockh, mode);
5044 /* wait for IO to complete if the old layout is still in use. */
5046 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5047 ll_get_fsname(inode->i_sb, NULL, 0),
5048 PFID(&lli->lli_fid), inode);
5050 memset(&conf, 0, sizeof conf);
5051 conf.coc_opc = OBJECT_CONF_WAIT;
5052 conf.coc_inode = inode;
5053 rc = ll_layout_conf(inode, &conf);
5057 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5058 ll_get_fsname(inode->i_sb, NULL, 0),
5059 PFID(&lli->lli_fid), rc);
5065 * Issue layout intent RPC to MDS.
5066 * \param inode [in] file inode
5067 * \param intent [in] layout intent
5069 * \retval 0 on success
5070 * \retval < 0 error code
5072 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5074 struct ll_inode_info *lli = ll_i2info(inode);
5075 struct ll_sb_info *sbi = ll_i2sbi(inode);
5076 struct md_op_data *op_data;
5077 struct lookup_intent it;
5078 struct ptlrpc_request *req;
5082 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5083 0, 0, LUSTRE_OPC_ANY, NULL);
5084 if (IS_ERR(op_data))
5085 RETURN(PTR_ERR(op_data));
5087 op_data->op_data = intent;
5088 op_data->op_data_size = sizeof(*intent);
5090 memset(&it, 0, sizeof(it));
5091 it.it_op = IT_LAYOUT;
5092 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5093 intent->li_opc == LAYOUT_INTENT_TRUNC)
5094 it.it_flags = FMODE_WRITE;
5096 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5097 ll_get_fsname(inode->i_sb, NULL, 0),
5098 PFID(&lli->lli_fid), inode);
5100 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5101 &ll_md_blocking_ast, 0);
5102 if (it.it_request != NULL)
5103 ptlrpc_req_finished(it.it_request);
5104 it.it_request = NULL;
5106 ll_finish_md_op_data(op_data);
5108 /* set lock data in case this is a new lock */
5110 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5112 ll_intent_drop_lock(&it);
5118 * This function checks whether a LAYOUT lock exists on the client side,
5119 * and enqueues one if none is cached.
5121 * This function does not keep the layout lock held, so the layout may be
5122 * revoked at any time after this function returns; any operation that depends on the layout should be redone in that case.
5125 * This function should be called before lov_io_init() to get an up-to-date
5126 * layout version; the caller should save the version number, and after the IO
5127 * is finished call this function again to verify that the layout was not
5128 * changed during the IO (an illustrative call pattern is sketched in the comment after this function).
5130 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5132 struct ll_inode_info *lli = ll_i2info(inode);
5133 struct ll_sb_info *sbi = ll_i2sbi(inode);
5134 struct lustre_handle lockh;
5135 struct layout_intent intent = {
5136 .li_opc = LAYOUT_INTENT_ACCESS,
5138 enum ldlm_mode mode;
5142 *gen = ll_layout_version_get(lli);
5143 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5147 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5148 LASSERT(S_ISREG(inode->i_mode));
5150 /* take layout lock mutex to enqueue layout lock exclusively. */
5151 mutex_lock(&lli->lli_layout_mutex);
5154 /* the layout lock is usually cached on the local side, so try to
5155 * match it before grabbing the layout lock mutex. */
5156 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5157 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5158 if (mode != 0) { /* hit cached lock */
5159 rc = ll_layout_lock_set(&lockh, mode, inode);
5165 rc = ll_layout_intent(inode, &intent);
5171 *gen = ll_layout_version_get(lli);
5172 mutex_unlock(&lli->lli_layout_mutex);
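/*
 * Illustrative call pattern (a sketch only; my_do_io() is a hypothetical
 * helper, not part of this file): sample the layout generation before
 * starting IO and check it again afterwards, redoing the IO if the layout
 * changed in between.
 *
 *	__u32 gen, gen2;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen);
 *	if (rc == 0)
 *		rc = my_do_io(inode, pos, count);
 *	if (rc == 0) {
 *		rc = ll_layout_refresh(inode, &gen2);
 *		if (rc == 0 && gen2 != gen)
 *			rc = -EAGAIN;
 *	}
 */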
5178 * Issue a layout intent RPC indicating where in a file an IO is about to write.
5180 * \param[in] inode file inode.
5181 * \param[in] ext write range, with the start offset in the file (in bytes)
5182 * where the IO is about to write, and the exclusive end offset in bytes.
5185 * \retval 0 on success
5186 * \retval < 0 error code
5188 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5189 struct lu_extent *ext)
5191 struct layout_intent intent = {
5193 .li_extent.e_start = ext->e_start,
5194 .li_extent.e_end = ext->e_end,
5199 rc = ll_layout_intent(inode, &intent);
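/*
 * Advertising the write range ahead of the IO lets the MDT prepare the
 * layout for just that range; for composite (PFL) layouts this typically
 * means instantiating only the components the write will touch.
 */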
5205 * This function sends a restore request to the MDT.
5207 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5209 struct hsm_user_request *hur;
5213 len = sizeof(struct hsm_user_request) +
5214 sizeof(struct hsm_user_item);
5215 OBD_ALLOC(hur, len);
5219 hur->hur_request.hr_action = HUA_RESTORE;
5220 hur->hur_request.hr_archive_id = 0;
5221 hur->hur_request.hr_flags = 0;
5222 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5223 sizeof(hur->hur_user_item[0].hui_fid));
5224 hur->hur_user_item[0].hui_extent.offset = offset;
5225 hur->hur_user_item[0].hui_extent.length = length;
5226 hur->hur_request.hr_itemcount = 1;
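/*
 * Submit a single-item HSM RESTORE request for this file's FID covering
 * the given byte range; the HSM coordinator on the MDT then drives the
 * actual restore from the archive.
 */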
5227 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,