lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 108                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 109         op_data->op_handle = och->och_fh;
 110
 111         if (och->och_flags & FMODE_WRITE &&
 112             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 113                 /* For HSM: if inode data has been modified, pack it so that
 114                  * MDT can set data dirty flag in the archive. */
 115                 op_data->op_bias |= MDS_DATA_MODIFIED;
 116
 117         EXIT;
 118 }
 119
 120 /**
 121  * Perform a close, possibly with a bias.
 122  * The meaning of "data" depends on the value of "bias".
 123  *
 124  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 125  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 126  * swap layouts with.
 127  */
 128 static int ll_close_inode_openhandle(struct inode *inode,
 129                                      struct obd_client_handle *och,
 130                                      enum mds_op_bias bias, void *data)
 131 {
 132         struct obd_export *md_exp = ll_i2mdexp(inode);
 133         const struct ll_inode_info *lli = ll_i2info(inode);
 134         struct md_op_data *op_data;
 135         struct ptlrpc_request *req = NULL;
 136         int rc;
 137         ENTRY;
 138
 139         if (class_exp2obd(md_exp) == NULL) {
 140                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 141                        ll_get_fsname(inode->i_sb, NULL, 0),
 142                        PFID(&lli->lli_fid));
 143                 GOTO(out, rc = 0);
 144         }
 145
 146         OBD_ALLOC_PTR(op_data);
 147         /* We leak openhandle and request here on error, but not much to be
 148          * done in OOM case since app won't retry close on error either. */
 149         if (op_data == NULL)
 150                 GOTO(out, rc = -ENOMEM);
 151
 152         ll_prepare_close(inode, op_data, och);
 153         switch (bias) {
 154         case MDS_CLOSE_LAYOUT_MERGE:
 155                 /* merge blocks from the victim inode */
 156                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 157                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 158         case MDS_CLOSE_LAYOUT_SPLIT:
 159         case MDS_CLOSE_LAYOUT_SWAP: {
 160                 struct split_param *sp = data;
 161
 162                 LASSERT(data != NULL);
 163                 op_data->op_bias |= bias;
 164                 op_data->op_data_version = 0;
 165                 op_data->op_lease_handle = och->och_lease_handle;
 166                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 167                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 168                         op_data->op_mirror_id = sp->sp_mirror_id;
 169                 } else {
 170                         op_data->op_fid2 = *ll_inode2fid(data);
 171                 }
 172                 break;
 173         }
 174
 175         case MDS_CLOSE_RESYNC_DONE: {
 176                 struct ll_ioc_lease *ioc = data;
 177
 178                 LASSERT(data != NULL);
 179                 op_data->op_attr_blocks +=
 180                         ioc->lil_count * op_data->op_attr_blocks;
 181                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 182                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 183
 184                 op_data->op_lease_handle = och->och_lease_handle;
 185                 op_data->op_data = &ioc->lil_ids[0];
 186                 op_data->op_data_size =
 187                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 188                 break;
 189         }
 190
 191         case MDS_HSM_RELEASE:
 192                 LASSERT(data != NULL);
 193                 op_data->op_bias |= MDS_HSM_RELEASE;
 194                 op_data->op_data_version = *(__u64 *)data;
 195                 op_data->op_lease_handle = och->och_lease_handle;
 196                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 197                 break;
 198
 199         default:
 200                 LASSERT(data == NULL);
 201                 break;
 202         }
 203
 204         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 205                 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
 206         if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
 207                 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
 208
 209         rc = md_close(md_exp, op_data, och->och_mod, &req);
 210         if (rc != 0 && rc != -EINTR)
 211                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 212                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 213
 214         if (rc == 0 && op_data->op_bias & bias) {
 215                 struct mdt_body *body;
 216
 217                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 218                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 219                         rc = -EBUSY;
 220         }
 221
 222         ll_finish_md_op_data(op_data);
 223         EXIT;
 224 out:
 225
 226         md_clear_open_replay_data(md_exp, och);
 227         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 228         OBD_FREE_PTR(och);
 229
 230         ptlrpc_req_finished(req);       /* This is close request */
 231         return rc;
 232 }
 233
 234 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 235 {
 236         struct ll_inode_info *lli = ll_i2info(inode);
 237         struct obd_client_handle **och_p;
 238         struct obd_client_handle *och;
 239         __u64 *och_usecount;
 240         int rc = 0;
 241         ENTRY;
 242
 243         if (fmode & FMODE_WRITE) {
 244                 och_p = &lli->lli_mds_write_och;
 245                 och_usecount = &lli->lli_open_fd_write_count;
 246         } else if (fmode & FMODE_EXEC) {
 247                 och_p = &lli->lli_mds_exec_och;
 248                 och_usecount = &lli->lli_open_fd_exec_count;
 249         } else {
 250                 LASSERT(fmode & FMODE_READ);
 251                 och_p = &lli->lli_mds_read_och;
 252                 och_usecount = &lli->lli_open_fd_read_count;
 253         }
 254
 255         mutex_lock(&lli->lli_och_mutex);
 256         if (*och_usecount > 0) {
 257                 /* There are still users of this handle, so skip
 258                  * freeing it. */
 259                 mutex_unlock(&lli->lli_och_mutex);
 260                 RETURN(0);
 261         }
 262
 263         och = *och_p;
 264         *och_p = NULL;
 265         mutex_unlock(&lli->lli_och_mutex);
 266
 267         if (och != NULL) {
 268                 /* There might be a race and this handle may already
 269                  * be closed. */
 270                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 271         }
 272
 273         RETURN(rc);
 274 }
 275
 276 static int ll_md_close(struct inode *inode, struct file *file)
 277 {
 278         union ldlm_policy_data policy = {
 279                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 280         };
 281         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 282         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 283         struct ll_inode_info *lli = ll_i2info(inode);
 284         struct lustre_handle lockh;
 285         enum ldlm_mode lockmode;
 286         int rc = 0;
 287         ENTRY;
 288
 289         /* clear group lock, if present */
 290         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 291                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 292
 293         if (fd->fd_lease_och != NULL) {
 294                 bool lease_broken;
 295
 296                 /* Usually the lease is not released when the
 297                  * application crashed, we need to release here. */
 298                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 299                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 300                         PFID(&lli->lli_fid), rc, lease_broken);
 301
 302                 fd->fd_lease_och = NULL;
 303         }
 304
 305         if (fd->fd_och != NULL) {
 306                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 307                 fd->fd_och = NULL;
 308                 GOTO(out, rc);
 309         }
 310
 311         /* Let's see if we have good enough OPEN lock on the file and if
 312            we can skip talking to MDS */
 313         mutex_lock(&lli->lli_och_mutex);
 314         if (fd->fd_omode & FMODE_WRITE) {
 315                 lockmode = LCK_CW;
 316                 LASSERT(lli->lli_open_fd_write_count);
 317                 lli->lli_open_fd_write_count--;
 318         } else if (fd->fd_omode & FMODE_EXEC) {
 319                 lockmode = LCK_PR;
 320                 LASSERT(lli->lli_open_fd_exec_count);
 321                 lli->lli_open_fd_exec_count--;
 322         } else {
 323                 lockmode = LCK_CR;
 324                 LASSERT(lli->lli_open_fd_read_count);
 325                 lli->lli_open_fd_read_count--;
 326         }
 327         mutex_unlock(&lli->lli_och_mutex);
 328
 329         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 330                            LDLM_IBITS, &policy, lockmode, &lockh))
 331                 rc = ll_md_real_close(inode, fd->fd_omode);
 332
 333 out:
 334         LUSTRE_FPRIVATE(file) = NULL;
 335         ll_file_data_put(fd);
 336
 337         RETURN(rc);
 338 }
 339
 340 /* While this returns an error code, fput() the caller does not, so we need
 341  * to make every effort to clean up all of our state here.  Also, applications
 342  * rarely check close errors and even if an error is returned they will not
 343  * re-try the close call.
 344  */
 345 int ll_file_release(struct inode *inode, struct file *file)
 346 {
 347         struct ll_file_data *fd;
 348         struct ll_sb_info *sbi = ll_i2sbi(inode);
 349         struct ll_inode_info *lli = ll_i2info(inode);
 350         int rc;
 351         ENTRY;
 352
 353         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 354                PFID(ll_inode2fid(inode)), inode);
 355
 356         if (inode->i_sb->s_root != file_dentry(file))
 357                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 358         fd = LUSTRE_FPRIVATE(file);
 359         LASSERT(fd != NULL);
 360
 361         /* The last ref on @file, maybe not the the owner pid of statahead,
 362          * because parent and child process can share the same file handle. */
 363         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 364                 ll_deauthorize_statahead(inode, fd);
 365
 366         if (inode->i_sb->s_root == file_dentry(file)) {
 367                 LUSTRE_FPRIVATE(file) = NULL;
 368                 ll_file_data_put(fd);
 369                 RETURN(0);
 370         }
 371
 372         if (!S_ISDIR(inode->i_mode)) {
 373                 if (lli->lli_clob != NULL)
 374                         lov_read_and_clear_async_rc(lli->lli_clob);
 375                 lli->lli_async_rc = 0;
 376         }
 377
 378         rc = ll_md_close(inode, file);
 379
 380         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 381                 libcfs_debug_dumplog();
 382
 383         RETURN(rc);
 384 }
 385
 386 static inline int ll_dom_readpage(void *data, struct page *page)
 387 {
 388         struct niobuf_local *lnb = data;
 389         void *kaddr;
 390
 391         kaddr = ll_kmap_atomic(page, KM_USER0);
 392         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 393         if (lnb->lnb_len < PAGE_SIZE)
 394                 memset(kaddr + lnb->lnb_len, 0,
 395                        PAGE_SIZE - lnb->lnb_len);
 396         flush_dcache_page(page);
 397         SetPageUptodate(page);
 398         ll_kunmap_atomic(kaddr, KM_USER0);
 399         unlock_page(page);
 400
 401         return 0;
 402 }
 403
 404 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 405                         struct lookup_intent *it)
 406 {
 407         struct ll_inode_info *lli = ll_i2info(inode);
 408         struct cl_object *obj = lli->lli_clob;
 409         struct address_space *mapping = inode->i_mapping;
 410         struct page *vmpage;
 411         struct niobuf_remote *rnb;
 412         char *data;
 413         struct lu_env *env;
 414         struct cl_io *io;
 415         __u16 refcheck;
 416         struct lustre_handle lockh;
 417         struct ldlm_lock *lock;
 418         unsigned long index, start;
 419         struct niobuf_local lnb;
 420         int rc;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
 431                 if (lock != NULL)
 432                         dom_lock = ldlm_has_dom(lock);
 433                 LDLM_LOCK_PUT(lock);
 434         }
 435
 436         if (!dom_lock)
 437                 RETURN_EXIT;
 438
 439         env = cl_env_get(&refcheck);
 440         if (IS_ERR(env))
 441                 RETURN_EXIT;
 442
 443         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 444                                    RCL_SERVER))
 445                 GOTO(out_env, rc = -ENODATA);
 446
 447         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 448         data = (char *)rnb + sizeof(*rnb);
 449
 450         if (rnb == NULL || rnb->rnb_len == 0)
 451                 GOTO(out_env, rc = 0);
 452
 453         CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
 454                rnb->rnb_len, i_size_read(inode));
 455
 456         io = vvp_env_thread_io(env);
 457         io->ci_obj = obj;
 458         io->ci_ignore_layout = 1;
 459         rc = cl_io_init(env, io, CIT_MISC, obj);
 460         if (rc)
 461                 GOTO(out_io, rc);
 462
 463         lnb.lnb_file_offset = rnb->rnb_offset;
 464         start = lnb.lnb_file_offset / PAGE_SIZE;
 465         index = 0;
 466         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 467         lnb.lnb_page_offset = 0;
 468         do {
 469                 struct cl_page *clp;
 470
 471                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 472                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 473                 if (lnb.lnb_len > PAGE_SIZE)
 474                         lnb.lnb_len = PAGE_SIZE;
 475
 476                 vmpage = read_cache_page(mapping, index + start,
 477                                          ll_dom_readpage, &lnb);
 478                 if (IS_ERR(vmpage)) {
 479                         CWARN("%s: cannot fill page %lu for "DFID
 480                               " with data: rc = %li\n",
 481                               ll_get_fsname(inode->i_sb, NULL, 0),
 482                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 483                               PTR_ERR(vmpage));
 484                         break;
 485                 }
 486                 lock_page(vmpage);
 487                 if (vmpage->mapping == NULL) {
 488                         unlock_page(vmpage);
 489                         put_page(vmpage);
 490                         /* page was truncated */
 491                         GOTO(out_io, rc = -ENODATA);
 492                 }
 493                 clp = cl_page_find(env, obj, vmpage->index, vmpage,
 494                                    CPT_CACHEABLE);
 495                 if (IS_ERR(clp)) {
 496                         unlock_page(vmpage);
 497                         put_page(vmpage);
 498                         GOTO(out_io, rc = PTR_ERR(clp));
 499                 }
 500
 501                 /* export page */
 502                 cl_page_export(env, clp, 1);
 503                 cl_page_put(env, clp);
 504                 unlock_page(vmpage);
 505                 put_page(vmpage);
 506                 index++;
 507         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 508         rc = 0;
 509         EXIT;
 510 out_io:
 511         cl_io_fini(env, io);
 512 out_env:
 513         cl_env_put(env, &refcheck);
 514 }
 515
 516 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 517                                 struct lookup_intent *itp)
 518 {
 519         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 520         struct dentry *parent = de->d_parent;
 521         const char *name = NULL;
 522         int len = 0;
 523         struct md_op_data *op_data;
 524         struct ptlrpc_request *req = NULL;
 525         int rc;
 526         ENTRY;
 527
 528         LASSERT(parent != NULL);
 529         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 530
 531         /* if server supports open-by-fid, or file name is invalid, don't pack
 532          * name in open request */
 533         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 534             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 535                 name = de->d_name.name;
 536                 len = de->d_name.len;
 537         }
 538
 539         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 540                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 541         if (IS_ERR(op_data))
 542                 RETURN(PTR_ERR(op_data));
 543         op_data->op_data = lmm;
 544         op_data->op_data_size = lmmsize;
 545
 546         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 547                             &ll_md_blocking_ast, 0);
 548         ll_finish_md_op_data(op_data);
 549         if (rc == -ESTALE) {
 550                 /* reason for keep own exit path - don`t flood log
 551                  * with messages with -ESTALE errors.
 552                  */
 553                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 554                      it_open_error(DISP_OPEN_OPEN, itp))
 555                         GOTO(out, rc);
 556                 ll_release_openhandle(de, itp);
 557                 GOTO(out, rc);
 558         }
 559
 560         if (it_disposition(itp, DISP_LOOKUP_NEG))
 561                 GOTO(out, rc = -ENOENT);
 562
 563         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 564                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 565                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 566                 GOTO(out, rc);
 567         }
 568
 569         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 570
 571         if (!rc && itp->it_lock_mode) {
 572                 ll_dom_finish_open(de->d_inode, req, itp);
 573                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 574         }
 575
 576 out:
 577         ptlrpc_req_finished(req);
 578         ll_intent_drop_lock(itp);
 579
 580         /* We did open by fid, but by the time we got to the server,
 581          * the object disappeared. If this is a create, we cannot really
 582          * tell the userspace that the file it was trying to create
 583          * does not exist. Instead let's return -ESTALE, and the VFS will
 584          * retry the create with LOOKUP_REVAL that we are going to catch
 585          * in ll_revalidate_dentry() and use lookup then.
 586          */
 587         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 588                 rc = -ESTALE;
 589
 590         RETURN(rc);
 591 }
 592
 593 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 594                        struct obd_client_handle *och)
 595 {
 596         struct mdt_body *body;
 597
 598         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 599         och->och_fh = body->mbo_handle;
 600         och->och_fid = body->mbo_fid1;
 601         och->och_lease_handle.cookie = it->it_lock_handle;
 602         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 603         och->och_flags = it->it_flags;
 604
 605         return md_set_open_replay_data(md_exp, och, it);
 606 }
 607
 608 static int ll_local_open(struct file *file, struct lookup_intent *it,
 609                          struct ll_file_data *fd, struct obd_client_handle *och)
 610 {
 611         struct inode *inode = file_inode(file);
 612         ENTRY;
 613
 614         LASSERT(!LUSTRE_FPRIVATE(file));
 615
 616         LASSERT(fd != NULL);
 617
 618         if (och) {
 619                 int rc;
 620
 621                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 622                 if (rc != 0)
 623                         RETURN(rc);
 624         }
 625
 626         LUSTRE_FPRIVATE(file) = fd;
 627         ll_readahead_init(inode, &fd->fd_ras);
 628         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 629
 630         /* ll_cl_context initialize */
 631         rwlock_init(&fd->fd_lock);
 632         INIT_LIST_HEAD(&fd->fd_lccs);
 633
 634         RETURN(0);
 635 }
 636
 637 /* Open a file, and (for the very first open) create objects on the OSTs at
 638  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 639  * creation or open until ll_lov_setstripe() ioctl is called.
 640  *
 641  * If we already have the stripe MD locally then we don't request it in
 642  * md_open(), by passing a lmm_size = 0.
 643  *
 644  * It is up to the application to ensure no other processes open this file
 645  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 646  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 647  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 648  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 649  */
 650 int ll_file_open(struct inode *inode, struct file *file)
 651 {
 652         struct ll_inode_info *lli = ll_i2info(inode);
 653         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 654                                           .it_flags = file->f_flags };
 655         struct obd_client_handle **och_p = NULL;
 656         __u64 *och_usecount = NULL;
 657         struct ll_file_data *fd;
 658         int rc = 0;
 659         ENTRY;
 660
 661         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 662                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 663
 664         it = file->private_data; /* XXX: compat macro */
 665         file->private_data = NULL; /* prevent ll_local_open assertion */
 666
 667         fd = ll_file_data_get();
 668         if (fd == NULL)
 669                 GOTO(out_nofiledata, rc = -ENOMEM);
 670
 671         fd->fd_file = file;
 672         if (S_ISDIR(inode->i_mode))
 673                 ll_authorize_statahead(inode, fd);
 674
 675         if (inode->i_sb->s_root == file_dentry(file)) {
 676                 LUSTRE_FPRIVATE(file) = fd;
 677                 RETURN(0);
 678         }
 679
 680         if (!it || !it->it_disposition) {
 681                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 682                  * because everything but O_ACCMODE mask was stripped from
 683                  * there */
 684                 if ((oit.it_flags + 1) & O_ACCMODE)
 685                         oit.it_flags++;
 686                 if (file->f_flags & O_TRUNC)
 687                         oit.it_flags |= FMODE_WRITE;
 688
 689                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 690                  * dentry_open after call to open_namei that checks permissions.
 691                  * Only nfsd_open call dentry_open directly without checking
 692                  * permissions and because of that this code below is safe. */
 693                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 694                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 695
 696                 /* We do not want O_EXCL here, presumably we opened the file
 697                  * already? XXX - NFS implications? */
 698                 oit.it_flags &= ~O_EXCL;
 699
 700                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 701                  * created if necessary, then "IT_CREAT" should be set to keep
 702                  * consistent with it */
 703                 if (oit.it_flags & O_CREAT)
 704                         oit.it_op |= IT_CREAT;
 705
 706                 it = &oit;
 707         }
 708
 709 restart:
 710         /* Let's see if we have file open on MDS already. */
 711         if (it->it_flags & FMODE_WRITE) {
 712                 och_p = &lli->lli_mds_write_och;
 713                 och_usecount = &lli->lli_open_fd_write_count;
 714         } else if (it->it_flags & FMODE_EXEC) {
 715                 och_p = &lli->lli_mds_exec_och;
 716                 och_usecount = &lli->lli_open_fd_exec_count;
 717          } else {
 718                 och_p = &lli->lli_mds_read_och;
 719                 och_usecount = &lli->lli_open_fd_read_count;
 720         }
 721
 722         mutex_lock(&lli->lli_och_mutex);
 723         if (*och_p) { /* Open handle is present */
 724                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 725                         /* Well, there's extra open request that we do not need,
 726                            let's close it somehow. This will decref request. */
 727                         rc = it_open_error(DISP_OPEN_OPEN, it);
 728                         if (rc) {
 729                                 mutex_unlock(&lli->lli_och_mutex);
 730                                 GOTO(out_openerr, rc);
 731                         }
 732
 733                         ll_release_openhandle(file_dentry(file), it);
 734                 }
 735                 (*och_usecount)++;
 736
 737                 rc = ll_local_open(file, it, fd, NULL);
 738                 if (rc) {
 739                         (*och_usecount)--;
 740                         mutex_unlock(&lli->lli_och_mutex);
 741                         GOTO(out_openerr, rc);
 742                 }
 743         } else {
 744                 LASSERT(*och_usecount == 0);
 745                 if (!it->it_disposition) {
 746                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 747                         /* We cannot just request lock handle now, new ELC code
 748                            means that one of other OPEN locks for this file
 749                            could be cancelled, and since blocking ast handler
 750                            would attempt to grab och_mutex as well, that would
 751                            result in a deadlock */
 752                         mutex_unlock(&lli->lli_och_mutex);
 753                         /*
 754                          * Normally called under two situations:
 755                          * 1. NFS export.
 756                          * 2. A race/condition on MDS resulting in no open
 757                          *    handle to be returned from LOOKUP|OPEN request,
 758                          *    for example if the target entry was a symlink.
 759                          *
 760                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 761                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 762                          *  bit so that it's not confusing later callers.
 763                          *
 764                          *  NB; when ldd is NULL, it must have come via normal
 765                          *  lookup path only, since ll_iget_for_nfs always calls
 766                          *  ll_d_init().
 767                          */
 768                         if (ldd && ldd->lld_nfs_dentry) {
 769                                 ldd->lld_nfs_dentry = 0;
 770                                 it->it_flags |= MDS_OPEN_LOCK;
 771                         }
 772
 773                          /*
 774                          * Always specify MDS_OPEN_BY_FID because we don't want
 775                          * to get file with different fid.
 776                          */
 777                         it->it_flags |= MDS_OPEN_BY_FID;
 778                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 779                                                  it);
 780                         if (rc)
 781                                 GOTO(out_openerr, rc);
 782
 783                         goto restart;
 784                 }
 785                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 786                 if (!*och_p)
 787                         GOTO(out_och_free, rc = -ENOMEM);
 788
 789                 (*och_usecount)++;
 790
 791                 /* md_intent_lock() didn't get a request ref if there was an
 792                  * open error, so don't do cleanup on the request here
 793                  * (bug 3430) */
 794                 /* XXX (green): Should not we bail out on any error here, not
 795                  * just open error? */
 796                 rc = it_open_error(DISP_OPEN_OPEN, it);
 797                 if (rc != 0)
 798                         GOTO(out_och_free, rc);
 799
 800                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 801                          "inode %p: disposition %x, status %d\n", inode,
 802                          it_disposition(it, ~0), it->it_status);
 803
 804                 rc = ll_local_open(file, it, fd, *och_p);
 805                 if (rc)
 806                         GOTO(out_och_free, rc);
 807         }
 808         mutex_unlock(&lli->lli_och_mutex);
 809         fd = NULL;
 810
 811         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 812            different kind of OPEN lock for this same inode gets cancelled
 813            by ldlm_cancel_lru */
 814         if (!S_ISREG(inode->i_mode))
 815                 GOTO(out_och_free, rc);
 816
 817         cl_lov_delay_create_clear(&file->f_flags);
 818         GOTO(out_och_free, rc);
 819
 820 out_och_free:
 821         if (rc) {
 822                 if (och_p && *och_p) {
 823                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 824                         *och_p = NULL; /* OBD_FREE writes some magic there */
 825                         (*och_usecount)--;
 826                 }
 827                 mutex_unlock(&lli->lli_och_mutex);
 828
 829 out_openerr:
 830                 if (lli->lli_opendir_key == fd)
 831                         ll_deauthorize_statahead(inode, fd);
 832                 if (fd != NULL)
 833                         ll_file_data_put(fd);
 834         } else {
 835                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 836         }
 837
 838 out_nofiledata:
 839         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 840                 ptlrpc_req_finished(it->it_request);
 841                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 842         }
 843
 844         return rc;
 845 }
 846
 847 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 848                         struct ldlm_lock_desc *desc, void *data, int flag)
 849 {
 850         int rc;
 851         struct lustre_handle lockh;
 852         ENTRY;
 853
 854         switch (flag) {
 855         case LDLM_CB_BLOCKING:
 856                 ldlm_lock2handle(lock, &lockh);
 857                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 858                 if (rc < 0) {
 859                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 860                         RETURN(rc);
 861                 }
 862                 break;
 863         case LDLM_CB_CANCELING:
 864                 /* do nothing */
 865                 break;
 866         }
 867         RETURN(0);
 868 }
 869
 870 /**
 871  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 872  * and save it as fd->fd_och so as to force client to reopen the file even
 873  * if it has an open lock in cache already.
 874  */
 875 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 876                                 struct lustre_handle *old_handle)
 877 {
 878         struct ll_inode_info *lli = ll_i2info(inode);
 879         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 880         struct obd_client_handle **och_p;
 881         __u64 *och_usecount;
 882         int rc = 0;
 883         ENTRY;
 884
 885         /* Get the openhandle of the file */
 886         mutex_lock(&lli->lli_och_mutex);
 887         if (fd->fd_lease_och != NULL)
 888                 GOTO(out_unlock, rc = -EBUSY);
 889
 890         if (fd->fd_och == NULL) {
 891                 if (file->f_mode & FMODE_WRITE) {
 892                         LASSERT(lli->lli_mds_write_och != NULL);
 893                         och_p = &lli->lli_mds_write_och;
 894                         och_usecount = &lli->lli_open_fd_write_count;
 895                 } else {
 896                         LASSERT(lli->lli_mds_read_och != NULL);
 897                         och_p = &lli->lli_mds_read_och;
 898                         och_usecount = &lli->lli_open_fd_read_count;
 899                 }
 900
 901                 if (*och_usecount > 1)
 902                         GOTO(out_unlock, rc = -EBUSY);
 903
 904                 fd->fd_och = *och_p;
 905                 *och_usecount = 0;
 906                 *och_p = NULL;
 907         }
 908
 909         *old_handle = fd->fd_och->och_fh;
 910
 911         EXIT;
 912 out_unlock:
 913         mutex_unlock(&lli->lli_och_mutex);
 914         return rc;
 915 }
 916
 917 /**
 918  * Release ownership on lli_mds_*_och when putting back a file lease.
 919  */
 920 static int ll_lease_och_release(struct inode *inode, struct file *file)
 921 {
 922         struct ll_inode_info *lli = ll_i2info(inode);
 923         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 924         struct obd_client_handle **och_p;
 925         struct obd_client_handle *old_och = NULL;
 926         __u64 *och_usecount;
 927         int rc = 0;
 928         ENTRY;
 929
 930         mutex_lock(&lli->lli_och_mutex);
 931         if (file->f_mode & FMODE_WRITE) {
 932                 och_p = &lli->lli_mds_write_och;
 933                 och_usecount = &lli->lli_open_fd_write_count;
 934         } else {
 935                 och_p = &lli->lli_mds_read_och;
 936                 och_usecount = &lli->lli_open_fd_read_count;
 937         }
 938
 939         /* The file may have been open by another process (broken lease) so
 940          * *och_p is not NULL. In this case we should simply increase usecount
 941          * and close fd_och.
 942          */
 943         if (*och_p != NULL) {
 944                 old_och = fd->fd_och;
 945                 (*och_usecount)++;
 946         } else {
 947                 *och_p = fd->fd_och;
 948                 *och_usecount = 1;
 949         }
 950         fd->fd_och = NULL;
 951         mutex_unlock(&lli->lli_och_mutex);
 952
 953         if (old_och != NULL)
 954                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 955
 956         RETURN(rc);
 957 }
 958
 959 /**
 960  * Acquire a lease and open the file.
 961  */
 962 static struct obd_client_handle *
 963 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 964               __u64 open_flags)
 965 {
 966         struct lookup_intent it = { .it_op = IT_OPEN };
 967         struct ll_sb_info *sbi = ll_i2sbi(inode);
 968         struct md_op_data *op_data;
 969         struct ptlrpc_request *req = NULL;
 970         struct lustre_handle old_handle = { 0 };
 971         struct obd_client_handle *och = NULL;
 972         int rc;
 973         int rc2;
 974         ENTRY;
 975
 976         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 977                 RETURN(ERR_PTR(-EINVAL));
 978
 979         if (file != NULL) {
 980                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 981                         RETURN(ERR_PTR(-EPERM));
 982
 983                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 984                 if (rc)
 985                         RETURN(ERR_PTR(rc));
 986         }
 987
 988         OBD_ALLOC_PTR(och);
 989         if (och == NULL)
 990                 RETURN(ERR_PTR(-ENOMEM));
 991
 992         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 993                                         LUSTRE_OPC_ANY, NULL);
 994         if (IS_ERR(op_data))
 995                 GOTO(out, rc = PTR_ERR(op_data));
 996
 997         /* To tell the MDT this openhandle is from the same owner */
 998         op_data->op_handle = old_handle;
 999
1000         it.it_flags = fmode | open_flags;
1001         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1002         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1003                             &ll_md_blocking_lease_ast,
1004         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1005          * it can be cancelled which may mislead applications that the lease is
1006          * broken;
1007          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1008          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1009          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1010                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1011         ll_finish_md_op_data(op_data);
1012         ptlrpc_req_finished(req);
1013         if (rc < 0)
1014                 GOTO(out_release_it, rc);
1015
1016         if (it_disposition(&it, DISP_LOOKUP_NEG))
1017                 GOTO(out_release_it, rc = -ENOENT);
1018
1019         rc = it_open_error(DISP_OPEN_OPEN, &it);
1020         if (rc)
1021                 GOTO(out_release_it, rc);
1022
1023         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1024         ll_och_fill(sbi->ll_md_exp, &it, och);
1025
1026         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1027                 GOTO(out_close, rc = -EOPNOTSUPP);
1028
1029         /* already get lease, handle lease lock */
1030         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1031         if (it.it_lock_mode == 0 ||
1032             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1033                 /* open lock must return for lease */
1034                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1035                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1036                         it.it_lock_bits);
1037                 GOTO(out_close, rc = -EPROTO);
1038         }
1039
1040         ll_intent_release(&it);
1041         RETURN(och);
1042
1043 out_close:
1044         /* Cancel open lock */
1045         if (it.it_lock_mode != 0) {
1046                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1047                                             it.it_lock_mode);
1048                 it.it_lock_mode = 0;
1049                 och->och_lease_handle.cookie = 0ULL;
1050         }
1051         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1052         if (rc2 < 0)
1053                 CERROR("%s: error closing file "DFID": %d\n",
1054                        ll_get_fsname(inode->i_sb, NULL, 0),
1055                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1056         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1057 out_release_it:
1058         ll_intent_release(&it);
1059 out:
1060         if (och != NULL)
1061                 OBD_FREE_PTR(och);
1062         RETURN(ERR_PTR(rc));
1063 }
1064
1065 /**
1066  * Check whether a layout swap can be done between two inodes.
1067  *
1068  * \param[in] inode1  First inode to check
1069  * \param[in] inode2  Second inode to check
1070  *
1071  * \retval 0 on success, layout swap can be performed between both inodes
1072  * \retval negative error code if requirements are not met
1073  */
1074 static int ll_check_swap_layouts_validity(struct inode *inode1,
1075                                           struct inode *inode2)
1076 {
1077         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1078                 return -EINVAL;
1079
1080         if (inode_permission(inode1, MAY_WRITE) ||
1081             inode_permission(inode2, MAY_WRITE))
1082                 return -EPERM;
1083
1084         if (inode1->i_sb != inode2->i_sb)
1085                 return -EXDEV;
1086
1087         return 0;
1088 }
1089
1090 static int ll_swap_layouts_close(struct obd_client_handle *och,
1091                                  struct inode *inode, struct inode *inode2)
1092 {
1093         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1094         const struct lu_fid     *fid2;
1095         int                      rc;
1096         ENTRY;
1097
1098         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1099                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1100
1101         rc = ll_check_swap_layouts_validity(inode, inode2);
1102         if (rc < 0)
1103                 GOTO(out_free_och, rc);
1104
1105         /* We now know that inode2 is a lustre inode */
1106         fid2 = ll_inode2fid(inode2);
1107
1108         rc = lu_fid_cmp(fid1, fid2);
1109         if (rc == 0)
1110                 GOTO(out_free_och, rc = -EINVAL);
1111
1112         /* Close the file and {swap,merge} layouts between inode & inode2.
1113          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1114          * because we still need it to pack l_remote_handle to MDT. */
1115         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1116                                        inode2);
1117
1118         och = NULL; /* freed in ll_close_inode_openhandle() */
1119
1120 out_free_och:
1121         if (och != NULL)
1122                 OBD_FREE_PTR(och);
1123
1124         RETURN(rc);
1125 }
1126
1127 /**
1128  * Release lease and close the file.
1129  * It will check if the lease has ever broken.
1130  */
1131 static int ll_lease_close_intent(struct obd_client_handle *och,
1132                                  struct inode *inode,
1133                                  bool *lease_broken, enum mds_op_bias bias,
1134                                  void *data)
1135 {
1136         struct ldlm_lock *lock;
1137         bool cancelled = true;
1138         int rc;
1139         ENTRY;
1140
1141         lock = ldlm_handle2lock(&och->och_lease_handle);
1142         if (lock != NULL) {
1143                 lock_res_and_lock(lock);
1144                 cancelled = ldlm_is_cancel(lock);
1145                 unlock_res_and_lock(lock);
1146                 LDLM_LOCK_PUT(lock);
1147         }
1148
1149         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1150                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1151
1152         if (lease_broken != NULL)
1153                 *lease_broken = cancelled;
1154
1155         if (!cancelled && !bias)
1156                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1157
1158         if (cancelled) { /* no need to excute intent */
1159                 bias = 0;
1160                 data = NULL;
1161         }
1162
1163         rc = ll_close_inode_openhandle(inode, och, bias, data);
1164         RETURN(rc);
1165 }
1166
1167 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1168                           bool *lease_broken)
1169 {
1170         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1171 }
1172
1173 /**
1174  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1175  */
1176 static int ll_lease_file_resync(struct obd_client_handle *och,
1177                                 struct inode *inode)
1178 {
1179         struct ll_sb_info *sbi = ll_i2sbi(inode);
1180         struct md_op_data *op_data;
1181         __u64 data_version_unused;
1182         int rc;
1183         ENTRY;
1184
1185         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1186                                      LUSTRE_OPC_ANY, NULL);
1187         if (IS_ERR(op_data))
1188                 RETURN(PTR_ERR(op_data));
1189
1190         /* before starting file resync, it's necessary to clean up page cache
1191          * in client memory, otherwise once the layout version is increased,
1192          * writing back cached data will be denied the OSTs. */
1193         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1194         if (rc)
1195                 GOTO(out, rc);
1196
1197         op_data->op_handle = och->och_lease_handle;
1198         rc = md_file_resync(sbi->ll_md_exp, op_data);
1199         if (rc)
1200                 GOTO(out, rc);
1201
1202         EXIT;
1203 out:
1204         ll_finish_md_op_data(op_data);
1205         return rc;
1206 }
1207
1208 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1209 {
1210         struct ll_inode_info *lli = ll_i2info(inode);
1211         struct cl_object *obj = lli->lli_clob;
1212         struct cl_attr *attr = vvp_env_thread_attr(env);
1213         s64 atime;
1214         s64 mtime;
1215         s64 ctime;
1216         int rc = 0;
1217
1218         ENTRY;
1219
1220         ll_inode_size_lock(inode);
1221
1222         /* Merge timestamps the most recently obtained from MDS with
1223          * timestamps obtained from OSTs.
1224          *
1225          * Do not overwrite atime of inode because it may be refreshed
1226          * by file_accessed() function. If the read was served by cache
1227          * data, there is no RPC to be sent so that atime may not be
1228          * transferred to OSTs at all. MDT only updates atime at close time
1229          * if it's at least 'mdd.*.atime_diff' older.
1230          * All in all, the atime in Lustre does not strictly comply with
1231          * POSIX. Solving this problem needs to send an RPC to MDT for each
1232          * read, this will hurt performance. */
1233         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1234                 LTIME_S(inode->i_atime) = lli->lli_atime;
1235                 lli->lli_update_atime = 0;
1236         }
1237         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1238         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1239
1240         atime = LTIME_S(inode->i_atime);
1241         mtime = LTIME_S(inode->i_mtime);
1242         ctime = LTIME_S(inode->i_ctime);
1243
1244         cl_object_attr_lock(obj);
1245         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1246                 rc = -EINVAL;
1247         else
1248                 rc = cl_object_attr_get(env, obj, attr);
1249         cl_object_attr_unlock(obj);
1250
1251         if (rc != 0)
1252                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1253
1254         if (atime < attr->cat_atime)
1255                 atime = attr->cat_atime;
1256
1257         if (ctime < attr->cat_ctime)
1258                 ctime = attr->cat_ctime;
1259
1260         if (mtime < attr->cat_mtime)
1261                 mtime = attr->cat_mtime;
1262
1263         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1264                PFID(&lli->lli_fid), attr->cat_size);
1265
1266         i_size_write(inode, attr->cat_size);
1267         inode->i_blocks = attr->cat_blocks;
1268
1269         LTIME_S(inode->i_atime) = atime;
1270         LTIME_S(inode->i_mtime) = mtime;
1271         LTIME_S(inode->i_ctime) = ctime;
1272
1273 out_size_unlock:
1274         ll_inode_size_unlock(inode);
1275
1276         RETURN(rc);
1277 }
1278
1279 /**
1280  * Set designated mirror for I/O.
1281  *
1282  * So far only read, write, and truncated can support to issue I/O to
1283  * designated mirror.
1284  */
1285 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1286 {
1287         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1288
1289         /* clear layout version for generic(non-resync) I/O in case it carries
1290          * stale layout version due to I/O restart */
1291         io->ci_layout_version = 0;
1292
1293         /* FLR: disable non-delay for designated mirror I/O because obviously
1294          * only one mirror is available */
1295         if (fd->fd_designated_mirror > 0) {
1296                 io->ci_ndelay = 0;
1297                 io->ci_designated_mirror = fd->fd_designated_mirror;
1298                 io->ci_layout_version = fd->fd_layout_version;
1299                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1300                                  * io to ptasks */
1301         }
1302
1303         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1304                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1305 }
1306
1307 static bool file_is_noatime(const struct file *file)
1308 {
1309         const struct vfsmount *mnt = file->f_path.mnt;
1310         const struct inode *inode = file_inode((struct file *)file);
1311
1312         /* Adapted from file_accessed() and touch_atime().*/
1313         if (file->f_flags & O_NOATIME)
1314                 return true;
1315
1316         if (inode->i_flags & S_NOATIME)
1317                 return true;
1318
1319         if (IS_NOATIME(inode))
1320                 return true;
1321
1322         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1323                 return true;
1324
1325         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1326                 return true;
1327
1328         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1329                 return true;
1330
1331         return false;
1332 }
1333
1334 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1335
1336 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1337 {
1338         struct inode *inode = file_inode(file);
1339         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1340
1341         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1342         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1343         io->u.ci_rw.rw_file = file;
1344         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1345         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1346         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1347
1348         if (iot == CIT_WRITE) {
1349                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1350                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1351                                            file->f_flags & O_DIRECT ||
1352                                            IS_SYNC(inode));
1353         }
1354         io->ci_obj = ll_i2info(inode)->lli_clob;
1355         io->ci_lockreq = CILR_MAYBE;
1356         if (ll_file_nolock(file)) {
1357                 io->ci_lockreq = CILR_NEVER;
1358                 io->ci_no_srvlock = 1;
1359         } else if (file->f_flags & O_APPEND) {
1360                 io->ci_lockreq = CILR_MANDATORY;
1361         }
1362         io->ci_noatime = file_is_noatime(file);
1363         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1364                 io->ci_pio = !io->u.ci_rw.rw_append;
1365         else
1366                 io->ci_pio = 0;
1367
1368         /* FLR: only use non-delay I/O for read as there is only one
1369          * avaliable mirror for write. */
1370         io->ci_ndelay = !(iot == CIT_WRITE);
1371
1372         ll_io_set_mirror(io, file);
1373 }
1374
1375 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1376 {
1377         struct cl_io_pt *pt = ptask->pt_cbdata;
1378         struct file *file = pt->cip_file;
1379         struct lu_env *env;
1380         struct cl_io *io;
1381         loff_t pos = pt->cip_pos;
1382         int rc;
1383         __u16 refcheck;
1384         ENTRY;
1385
1386         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1387                 file_dentry(file)->d_name.name,
1388                 pt->cip_iot == CIT_READ ? "read" : "write",
1389                 pos, pos + pt->cip_count);
1390
1391         env = cl_env_get(&refcheck);
1392         if (IS_ERR(env))
1393                 RETURN(PTR_ERR(env));
1394
1395         io = vvp_env_thread_io(env);
1396         ll_io_init(io, file, pt->cip_iot);
1397         io->u.ci_rw.rw_iter = pt->cip_iter;
1398         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1399         io->ci_pio = 0; /* It's already in parallel task */
1400
1401         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1402                            pt->cip_count - pt->cip_result);
1403         if (!rc) {
1404                 struct vvp_io *vio = vvp_env_io(env);
1405
1406                 vio->vui_io_subtype = IO_NORMAL;
1407                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1408
1409                 ll_cl_add(file, env, io, LCC_RW);
1410                 rc = cl_io_loop(env, io);
1411                 ll_cl_remove(file, env);
1412         } else {
1413                 /* cl_io_rw_init() handled IO */
1414                 rc = io->ci_result;
1415         }
1416
1417         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1418                 if (io->ci_nob > 0)
1419                         io->ci_nob /= 2;
1420                 rc = -EIO;
1421         }
1422
1423         if (io->ci_nob > 0) {
1424                 pt->cip_result += io->ci_nob;
1425                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1426                 pos += io->ci_nob;
1427                 pt->cip_iocb.ki_pos = pos;
1428 #ifdef HAVE_KIOCB_KI_LEFT
1429                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1430 #elif defined(HAVE_KI_NBYTES)
1431                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1432 #endif
1433         }
1434
1435         cl_io_fini(env, io);
1436         cl_env_put(env, &refcheck);
1437
1438         pt->cip_need_restart = io->ci_need_restart;
1439
1440         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1441                 file_dentry(file)->d_name.name,
1442                 pt->cip_iot == CIT_READ ? "read" : "write",
1443                 pt->cip_result, rc);
1444
1445         RETURN(pt->cip_result > 0 ? 0 : rc);
1446 }
1447
1448 static ssize_t
1449 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1450                    struct file *file, enum cl_io_type iot,
1451                    loff_t *ppos, size_t count)
1452 {
1453         struct range_lock       range;
1454         struct vvp_io           *vio = vvp_env_io(env);
1455         struct inode            *inode = file_inode(file);
1456         struct ll_inode_info    *lli = ll_i2info(inode);
1457         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1458         struct cl_io            *io;
1459         loff_t                  pos = *ppos;
1460         ssize_t                 result = 0;
1461         int                     rc = 0;
1462         unsigned                retried = 0;
1463         bool                    restarted = false;
1464
1465         ENTRY;
1466
1467         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1468                 file_dentry(file)->d_name.name,
1469                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1470
1471 restart:
1472         io = vvp_env_thread_io(env);
1473         ll_io_init(io, file, iot);
1474         if (args->via_io_subtype == IO_NORMAL) {
1475                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1476                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1477         }
1478         if (args->via_io_subtype != IO_NORMAL || restarted)
1479                 io->ci_pio = 0;
1480         io->ci_ndelay_tried = retried;
1481
1482         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1483                 bool range_locked = false;
1484
1485                 if (file->f_flags & O_APPEND)
1486                         range_lock_init(&range, 0, LUSTRE_EOF);
1487                 else
1488                         range_lock_init(&range, pos, pos + count - 1);
1489
1490                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1491                 vio->vui_io_subtype = args->via_io_subtype;
1492
1493                 switch (vio->vui_io_subtype) {
1494                 case IO_NORMAL:
1495                         /* Direct IO reads must also take range lock,
1496                          * or multiple reads will try to work on the same pages
1497                          * See LU-6227 for details. */
1498                         if (((iot == CIT_WRITE) ||
1499                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1500                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1501                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1502                                        RL_PARA(&range));
1503                                 rc = range_lock(&lli->lli_write_tree, &range);
1504                                 if (rc < 0)
1505                                         GOTO(out, rc);
1506
1507                                 range_locked = true;
1508                         }
1509                         break;
1510                 case IO_SPLICE:
1511                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1512                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1513                         break;
1514                 default:
1515                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1516                         LBUG();
1517                 }
1518
1519                 ll_cl_add(file, env, io, LCC_RW);
1520                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1521                     !lli->lli_inode_locked) {
1522                         inode_lock(inode);
1523                         lli->lli_inode_locked = 1;
1524                 }
1525                 rc = cl_io_loop(env, io);
1526                 if (lli->lli_inode_locked) {
1527                         lli->lli_inode_locked = 0;
1528                         inode_unlock(inode);
1529                 }
1530                 ll_cl_remove(file, env);
1531
1532                 if (range_locked) {
1533                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1534                                RL_PARA(&range));
1535                         range_unlock(&lli->lli_write_tree, &range);
1536                 }
1537         } else {
1538                 /* cl_io_rw_init() handled IO */
1539                 rc = io->ci_result;
1540         }
1541
1542         if (io->ci_nob > 0) {
1543                 result += io->ci_nob;
1544                 count  -= io->ci_nob;
1545
1546                 if (args->via_io_subtype == IO_NORMAL) {
1547                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1548
1549                         /* CLIO is too complicated. See LU-11069. */
1550                         if (cl_io_is_append(io))
1551                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1552                         else
1553                                 pos += io->ci_nob;
1554
1555                         args->u.normal.via_iocb->ki_pos = pos;
1556 #ifdef HAVE_KIOCB_KI_LEFT
1557                         args->u.normal.via_iocb->ki_left = count;
1558 #elif defined(HAVE_KI_NBYTES)
1559                         args->u.normal.via_iocb->ki_nbytes = count;
1560 #endif
1561                 } else {
1562                         /* for splice */
1563                         pos = io->u.ci_rw.rw_range.cir_pos;
1564                 }
1565         }
1566 out:
1567         cl_io_fini(env, io);
1568
1569         CDEBUG(D_VFSTRACE,
1570                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1571                file->f_path.dentry->d_name.name,
1572                iot, rc, result, io->ci_need_restart);
1573
1574         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1575                 CDEBUG(D_VFSTRACE,
1576                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1577                         file_dentry(file)->d_name.name,
1578                         iot == CIT_READ ? "read" : "write",
1579                         pos, pos + count, result, rc);
1580                 /* preserve the tried count for FLR */
1581                 retried = io->ci_ndelay_tried;
1582                 restarted = true;
1583                 goto restart;
1584         }
1585
1586         if (iot == CIT_READ) {
1587                 if (result > 0)
1588                         ll_stats_ops_tally(ll_i2sbi(inode),
1589                                            LPROC_LL_READ_BYTES, result);
1590         } else if (iot == CIT_WRITE) {
1591                 if (result > 0) {
1592                         ll_stats_ops_tally(ll_i2sbi(inode),
1593                                            LPROC_LL_WRITE_BYTES, result);
1594                         fd->fd_write_failed = false;
1595                 } else if (result == 0 && rc == 0) {
1596                         rc = io->ci_result;
1597                         if (rc < 0)
1598                                 fd->fd_write_failed = true;
1599                         else
1600                                 fd->fd_write_failed = false;
1601                 } else if (rc != -ERESTARTSYS) {
1602                         fd->fd_write_failed = true;
1603                 }
1604         }
1605
1606         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1607                 file_dentry(file)->d_name.name,
1608                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1609
1610         *ppos = pos;
1611
1612         RETURN(result > 0 ? result : rc);
1613 }
1614
1615 /**
1616  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1617  * especially for small I/O.
1618  *
1619  * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request DLM lock. This has turned out to have significant overhead
1621  * and affects the performance of small I/O dramatically.
1622  *
1623  * It's not necessary to create a cl_io for each I/O. Under the help of read
1624  * ahead, most of the pages being read are already in memory cache and we can
1625  * read those pages directly because if the pages exist, the corresponding DLM
1626  * lock must exist so that page content must be valid.
1627  *
1628  * In fast read implementation, the llite speculatively finds and reads pages
1629  * in memory cache. There are three scenarios for fast read:
1630  *   - If the page exists and is uptodate, kernel VM will provide the data and
1631  *     CLIO won't be intervened;
1632  *   - If the page was brought into memory by read ahead, it will be exported
1633  *     and read ahead parameters will be updated;
1634  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1635  *     it will go back and invoke normal read, i.e., a cl_io will be created
1636  *     and DLM lock will be requested.
1637  *
1638  * POSIX compliance: posix standard states that read is intended to be atomic.
1639  * Lustre read implementation is in line with Linux kernel read implementation
1640  * and neither of them complies with POSIX standard in this matter. Fast read
1641  * doesn't make the situation worse on single node but it may interleave write
1642  * results from multiple nodes due to short read handling in ll_file_aio_read().
1643  *
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes that have been read, or error code if an error
 *           occurred.
1649  */
1650 static ssize_t
1651 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1652 {
1653         ssize_t result;
1654
1655         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1656                 return 0;
1657
1658         /* NB: we can't do direct IO for fast read because it will need a lock
1659          * to make IO engine happy. */
1660         if (iocb->ki_filp->f_flags & O_DIRECT)
1661                 return 0;
1662
1663         result = generic_file_read_iter(iocb, iter);
1664
1665         /* If the first page is not in cache, generic_file_aio_read() will be
1666          * returned with -ENODATA.
1667          * See corresponding code in ll_readpage(). */
1668         if (result == -ENODATA)
1669                 result = 0;
1670
1671         if (result > 0)
1672                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1673                                 LPROC_LL_READ_BYTES, result);
1674
1675         return result;
1676 }
1677
1678 /*
1679  * Read from a file (through the page cache).
1680  */
1681 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1682 {
1683         struct lu_env *env;
1684         struct vvp_io_args *args;
1685         ssize_t result;
1686         ssize_t rc2;
1687         __u16 refcheck;
1688
1689         result = ll_do_fast_read(iocb, to);
1690         if (result < 0 || iov_iter_count(to) == 0)
1691                 GOTO(out, result);
1692
1693         env = cl_env_get(&refcheck);
1694         if (IS_ERR(env))
1695                 return PTR_ERR(env);
1696
1697         args = ll_env_args(env, IO_NORMAL);
1698         args->u.normal.via_iter = to;
1699         args->u.normal.via_iocb = iocb;
1700
1701         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1702                                  &iocb->ki_pos, iov_iter_count(to));
1703         if (rc2 > 0)
1704                 result += rc2;
1705         else if (result == 0)
1706                 result = rc2;
1707
1708         cl_env_put(env, &refcheck);
1709 out:
1710         return result;
1711 }
1712
1713 /**
1714  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1715  * If a page is already in the page cache and dirty (and some other things -
1716  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1717  * write to it without doing a full I/O, because Lustre already knows about it
1718  * and will write it out.  This saves a lot of processing time.
1719  *
1720  * All writes here are within one page, so exclusion is handled by the page
1721  * lock on the vm page.  We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
1724  *
1725  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1726  * and are unlikely to be to already dirty pages.
1727  *
1728  * Attribute updates are important here, we do them in ll_tiny_write_end.
1729  */
1730 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1731 {
1732         ssize_t count = iov_iter_count(iter);
1733         struct file *file = iocb->ki_filp;
1734         struct inode *inode = file_inode(file);
1735         ssize_t result = 0;
1736
1737         ENTRY;
1738
1739         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1740          * of function for why.
1741          */
1742         if (count >= PAGE_SIZE ||
1743             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1744                 RETURN(0);
1745
1746         result = __generic_file_write_iter(iocb, iter);
1747
1748         /* If the page is not already dirty, ll_tiny_write_begin returns
1749          * -ENODATA.  We continue on to normal write.
1750          */
1751         if (result == -ENODATA)
1752                 result = 0;
1753
1754         if (result > 0) {
1755                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1756                                    result);
1757                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1758         }
1759
1760         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1761
1762         RETURN(result);
1763 }
1764
1765 /*
1766  * Write to a file (through the page cache).
1767  */
1768 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1769 {
1770         struct vvp_io_args *args;
1771         struct lu_env *env;
1772         ssize_t rc_tiny = 0, rc_normal;
1773         __u16 refcheck;
1774
1775         ENTRY;
1776
1777         /* NB: we can't do direct IO for tiny writes because they use the page
1778          * cache, we can't do sync writes because tiny writes can't flush
1779          * pages, and we can't do append writes because we can't guarantee the
1780          * required DLM locks are held to protect file size.
1781          */
1782         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1783             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1784                 rc_tiny = ll_do_tiny_write(iocb, from);
1785
1786         /* In case of error, go on and try normal write - Only stop if tiny
1787          * write completed I/O.
1788          */
1789         if (iov_iter_count(from) == 0)
1790                 GOTO(out, rc_normal = rc_tiny);
1791
1792         env = cl_env_get(&refcheck);
1793         if (IS_ERR(env))
1794                 return PTR_ERR(env);
1795
1796         args = ll_env_args(env, IO_NORMAL);
1797         args->u.normal.via_iter = from;
1798         args->u.normal.via_iocb = iocb;
1799
1800         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1801                                     &iocb->ki_pos, iov_iter_count(from));
1802
1803         /* On success, combine bytes written. */
1804         if (rc_tiny >= 0 && rc_normal > 0)
1805                 rc_normal += rc_tiny;
1806         /* On error, only return error from normal write if tiny write did not
1807          * write any bytes.  Otherwise return bytes written by tiny write.
1808          */
1809         else if (rc_tiny > 0)
1810                 rc_normal = rc_tiny;
1811
1812         cl_env_put(env, &refcheck);
1813 out:
1814         RETURN(rc_normal);
1815 }
1816
1817 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1818 /*
1819  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1820  */
1821 static int ll_file_get_iov_count(const struct iovec *iov,
1822                                  unsigned long *nr_segs, size_t *count)
1823 {
1824         size_t cnt = 0;
1825         unsigned long seg;
1826
1827         for (seg = 0; seg < *nr_segs; seg++) {
1828                 const struct iovec *iv = &iov[seg];
1829
1830                 /*
1831                  * If any segment has a negative length, or the cumulative
1832                  * length ever wraps negative then return -EINVAL.
1833                  */
1834                 cnt += iv->iov_len;
1835                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1836                         return -EINVAL;
1837                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1838                         continue;
1839                 if (seg == 0)
1840                         return -EFAULT;
1841                 *nr_segs = seg;
1842                 cnt -= iv->iov_len;     /* This segment is no good */
1843                 break;
1844         }
1845         *count = cnt;
1846         return 0;
1847 }
1848
1849 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1850                                 unsigned long nr_segs, loff_t pos)
1851 {
1852         struct iov_iter to;
1853         size_t iov_count;
1854         ssize_t result;
1855         ENTRY;
1856
1857         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1858         if (result)
1859                 RETURN(result);
1860
1861 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1862         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1863 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1864         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1865 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1866
1867         result = ll_file_read_iter(iocb, &to);
1868
1869         RETURN(result);
1870 }
1871
1872 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1873                             loff_t *ppos)
1874 {
1875         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1876         struct kiocb   kiocb;
1877         ssize_t        result;
1878         ENTRY;
1879
1880         init_sync_kiocb(&kiocb, file);
1881         kiocb.ki_pos = *ppos;
1882 #ifdef HAVE_KIOCB_KI_LEFT
1883         kiocb.ki_left = count;
1884 #elif defined(HAVE_KI_NBYTES)
1885         kiocb.i_nbytes = count;
1886 #endif
1887
1888         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1889         *ppos = kiocb.ki_pos;
1890
1891         RETURN(result);
1892 }
1893
1894 /*
1895  * Write to a file (through the page cache).
1896  * AIO stuff
1897  */
1898 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1899                                  unsigned long nr_segs, loff_t pos)
1900 {
1901         struct iov_iter from;
1902         size_t iov_count;
1903         ssize_t result;
1904         ENTRY;
1905
1906         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1907         if (result)
1908                 RETURN(result);
1909
1910 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1911         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1912 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1913         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1914 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1915
1916         result = ll_file_write_iter(iocb, &from);
1917
1918         RETURN(result);
1919 }
1920
1921 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1922                              size_t count, loff_t *ppos)
1923 {
1924         struct iovec   iov = { .iov_base = (void __user *)buf,
1925                                .iov_len = count };
1926         struct kiocb   kiocb;
1927         ssize_t        result;
1928
1929         ENTRY;
1930
1931         init_sync_kiocb(&kiocb, file);
1932         kiocb.ki_pos = *ppos;
1933 #ifdef HAVE_KIOCB_KI_LEFT
1934         kiocb.ki_left = count;
1935 #elif defined(HAVE_KI_NBYTES)
1936         kiocb.ki_nbytes = count;
1937 #endif
1938
1939         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1940         *ppos = kiocb.ki_pos;
1941
1942         RETURN(result);
1943 }
1944 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1945
1946 /*
1947  * Send file content (through pagecache) somewhere with helper
1948  */
1949 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1950                                    struct pipe_inode_info *pipe, size_t count,
1951                                    unsigned int flags)
1952 {
1953         struct lu_env      *env;
1954         struct vvp_io_args *args;
1955         ssize_t             result;
1956         __u16               refcheck;
1957         ENTRY;
1958
1959         env = cl_env_get(&refcheck);
1960         if (IS_ERR(env))
1961                 RETURN(PTR_ERR(env));
1962
1963         args = ll_env_args(env, IO_SPLICE);
1964         args->u.splice.via_pipe = pipe;
1965         args->u.splice.via_flags = flags;
1966
1967         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1968         cl_env_put(env, &refcheck);
1969         RETURN(result);
1970 }
1971
1972 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1973                              __u64 flags, struct lov_user_md *lum, int lum_size)
1974 {
1975         struct lookup_intent oit = {
1976                 .it_op = IT_OPEN,
1977                 .it_flags = flags | MDS_OPEN_BY_FID,
1978         };
1979         int rc;
1980         ENTRY;
1981
1982         ll_inode_size_lock(inode);
1983         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1984         if (rc < 0)
1985                 GOTO(out_unlock, rc);
1986
1987         ll_release_openhandle(dentry, &oit);
1988
1989 out_unlock:
1990         ll_inode_size_unlock(inode);
1991         ll_intent_release(&oit);
1992
1993         RETURN(rc);
1994 }
1995
1996 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1997                              struct lov_mds_md **lmmp, int *lmm_size,
1998                              struct ptlrpc_request **request)
1999 {
2000         struct ll_sb_info *sbi = ll_i2sbi(inode);
2001         struct mdt_body  *body;
2002         struct lov_mds_md *lmm = NULL;
2003         struct ptlrpc_request *req = NULL;
2004         struct md_op_data *op_data;
2005         int rc, lmmsize;
2006
2007         rc = ll_get_default_mdsize(sbi, &lmmsize);
2008         if (rc)
2009                 RETURN(rc);
2010
2011         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2012                                      strlen(filename), lmmsize,
2013                                      LUSTRE_OPC_ANY, NULL);
2014         if (IS_ERR(op_data))
2015                 RETURN(PTR_ERR(op_data));
2016
2017         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2018         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2019         ll_finish_md_op_data(op_data);
2020         if (rc < 0) {
2021                 CDEBUG(D_INFO, "md_getattr_name failed "
2022                        "on %s: rc %d\n", filename, rc);
2023                 GOTO(out, rc);
2024         }
2025
2026         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2027         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2028
2029         lmmsize = body->mbo_eadatasize;
2030
2031         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2032                         lmmsize == 0) {
2033                 GOTO(out, rc = -ENODATA);
2034         }
2035
2036         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2037         LASSERT(lmm != NULL);
2038
2039         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2040             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2041             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2042                 GOTO(out, rc = -EPROTO);
2043
2044         /*
2045          * This is coming from the MDS, so is probably in
2046          * little endian.  We convert it to host endian before
2047          * passing it to userspace.
2048          */
2049         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2050                 int stripe_count;
2051
2052                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2053                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2054                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2055                         if (le32_to_cpu(lmm->lmm_pattern) &
2056                             LOV_PATTERN_F_RELEASED)
2057                                 stripe_count = 0;
2058                 }
2059
2060                 /* if function called for directory - we should
2061                  * avoid swab not existent lsm objects */
2062                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2063                         lustre_swab_lov_user_md_v1(
2064                                         (struct lov_user_md_v1 *)lmm);
2065                         if (S_ISREG(body->mbo_mode))
2066                                 lustre_swab_lov_user_md_objects(
2067                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2068                                     stripe_count);
2069                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2070                         lustre_swab_lov_user_md_v3(
2071                                         (struct lov_user_md_v3 *)lmm);
2072                         if (S_ISREG(body->mbo_mode))
2073                                 lustre_swab_lov_user_md_objects(
2074                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2075                                     stripe_count);
2076                 } else if (lmm->lmm_magic ==
2077                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2078                         lustre_swab_lov_comp_md_v1(
2079                                         (struct lov_comp_md_v1 *)lmm);
2080                 }
2081         }
2082
2083 out:
2084         *lmmp = lmm;
2085         *lmm_size = lmmsize;
2086         *request = req;
2087         return rc;
2088 }
2089
2090 static int ll_lov_setea(struct inode *inode, struct file *file,
2091                         void __user *arg)
2092 {
2093         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2094         struct lov_user_md      *lump;
2095         int                      lum_size = sizeof(struct lov_user_md) +
2096                                             sizeof(struct lov_user_ost_data);
2097         int                      rc;
2098         ENTRY;
2099
2100         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2101                 RETURN(-EPERM);
2102
2103         OBD_ALLOC_LARGE(lump, lum_size);
2104         if (lump == NULL)
2105                 RETURN(-ENOMEM);
2106
2107         if (copy_from_user(lump, arg, lum_size))
2108                 GOTO(out_lump, rc = -EFAULT);
2109
2110         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2111                                       lum_size);
2112         cl_lov_delay_create_clear(&file->f_flags);
2113
2114 out_lump:
2115         OBD_FREE_LARGE(lump, lum_size);
2116         RETURN(rc);
2117 }
2118
2119 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2120 {
2121         struct lu_env   *env;
2122         __u16           refcheck;
2123         int             rc;
2124         ENTRY;
2125
2126         env = cl_env_get(&refcheck);
2127         if (IS_ERR(env))
2128                 RETURN(PTR_ERR(env));
2129
2130         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2131         cl_env_put(env, &refcheck);
2132         RETURN(rc);
2133 }
2134
2135 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2136                             void __user *arg)
2137 {
2138         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2139         struct lov_user_md        *klum;
2140         int                        lum_size, rc;
2141         __u64                      flags = FMODE_WRITE;
2142         ENTRY;
2143
2144         rc = ll_copy_user_md(lum, &klum);
2145         if (rc < 0)
2146                 RETURN(rc);
2147
2148         lum_size = rc;
2149         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2150                                       lum_size);
2151         if (!rc) {
2152                 __u32 gen;
2153
2154                 rc = put_user(0, &lum->lmm_stripe_count);
2155                 if (rc)
2156                         GOTO(out, rc);
2157
2158                 rc = ll_layout_refresh(inode, &gen);
2159                 if (rc)
2160                         GOTO(out, rc);
2161
2162                 rc = ll_file_getstripe(inode, arg, lum_size);
2163         }
2164         cl_lov_delay_create_clear(&file->f_flags);
2165
2166 out:
2167         OBD_FREE(klum, lum_size);
2168         RETURN(rc);
2169 }
2170
2171 static int
2172 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2173 {
2174         struct ll_inode_info *lli = ll_i2info(inode);
2175         struct cl_object *obj = lli->lli_clob;
2176         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2177         struct ll_grouplock grouplock;
2178         int rc;
2179         ENTRY;
2180
2181         if (arg == 0) {
2182                 CWARN("group id for group lock must not be 0\n");
2183                 RETURN(-EINVAL);
2184         }
2185
2186         if (ll_file_nolock(file))
2187                 RETURN(-EOPNOTSUPP);
2188
2189         spin_lock(&lli->lli_lock);
2190         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2191                 CWARN("group lock already existed with gid %lu\n",
2192                       fd->fd_grouplock.lg_gid);
2193                 spin_unlock(&lli->lli_lock);
2194                 RETURN(-EINVAL);
2195         }
2196         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2197         spin_unlock(&lli->lli_lock);
2198
2199         /**
2200          * XXX: group lock needs to protect all OST objects while PFL
2201          * can add new OST objects during the IO, so we'd instantiate
2202          * all OST objects before getting its group lock.
2203          */
2204         if (obj) {
2205                 struct lu_env *env;
2206                 __u16 refcheck;
2207                 struct cl_layout cl = {
2208                         .cl_is_composite = false,
2209                 };
2210                 struct lu_extent ext = {
2211                         .e_start = 0,
2212                         .e_end = OBD_OBJECT_EOF,
2213                 };
2214
2215                 env = cl_env_get(&refcheck);
2216                 if (IS_ERR(env))
2217                         RETURN(PTR_ERR(env));
2218
2219                 rc = cl_object_layout_get(env, obj, &cl);
2220                 if (!rc && cl.cl_is_composite)
2221                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2222                                                     &ext);
2223
2224                 cl_env_put(env, &refcheck);
2225                 if (rc)
2226                         RETURN(rc);
2227         }
2228
2229         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2230                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2231         if (rc)
2232                 RETURN(rc);
2233
2234         spin_lock(&lli->lli_lock);
2235         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2236                 spin_unlock(&lli->lli_lock);
2237                 CERROR("another thread just won the race\n");
2238                 cl_put_grouplock(&grouplock);
2239                 RETURN(-EINVAL);
2240         }
2241
2242         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2243         fd->fd_grouplock = grouplock;
2244         spin_unlock(&lli->lli_lock);
2245
2246         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2247         RETURN(0);
2248 }
2249
2250 static int ll_put_grouplock(struct inode *inode, struct file *file,
2251                             unsigned long arg)
2252 {
2253         struct ll_inode_info   *lli = ll_i2info(inode);
2254         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2255         struct ll_grouplock     grouplock;
2256         ENTRY;
2257
2258         spin_lock(&lli->lli_lock);
2259         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2260                 spin_unlock(&lli->lli_lock);
2261                 CWARN("no group lock held\n");
2262                 RETURN(-EINVAL);
2263         }
2264
2265         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2266
2267         if (fd->fd_grouplock.lg_gid != arg) {
2268                 CWARN("group lock %lu doesn't match current id %lu\n",
2269                       arg, fd->fd_grouplock.lg_gid);
2270                 spin_unlock(&lli->lli_lock);
2271                 RETURN(-EINVAL);
2272         }
2273
2274         grouplock = fd->fd_grouplock;
2275         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2276         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2277         spin_unlock(&lli->lli_lock);
2278
2279         cl_put_grouplock(&grouplock);
2280         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2281         RETURN(0);
2282 }
2283
2284 /**
2285  * Close inode open handle
2286  *
2287  * \param dentry [in]     dentry which contains the inode
2288  * \param it     [in,out] intent which contains open info and result
2289  *
2290  * \retval 0     success
2291  * \retval <0    failure
2292  */
2293 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2294 {
2295         struct inode *inode = dentry->d_inode;
2296         struct obd_client_handle *och;
2297         int rc;
2298         ENTRY;
2299
2300         LASSERT(inode);
2301
2302         /* Root ? Do nothing. */
2303         if (dentry->d_inode->i_sb->s_root == dentry)
2304                 RETURN(0);
2305
2306         /* No open handle to close? Move away */
2307         if (!it_disposition(it, DISP_OPEN_OPEN))
2308                 RETURN(0);
2309
2310         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2311
2312         OBD_ALLOC(och, sizeof(*och));
2313         if (!och)
2314                 GOTO(out, rc = -ENOMEM);
2315
2316         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2317
2318         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2319 out:
2320         /* this one is in place of ll_file_open */
2321         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2322                 ptlrpc_req_finished(it->it_request);
2323                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2324         }
2325         RETURN(rc);
2326 }
2327
2328 /**
2329  * Get size for inode for which FIEMAP mapping is requested.
2330  * Make the FIEMAP get_info call and returns the result.
2331  * \param fiemap        kernel buffer to hold extens
2332  * \param num_bytes     kernel buffer size
2333  */
2334 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2335                         size_t num_bytes)
2336 {
2337         struct lu_env                   *env;
2338         __u16                           refcheck;
2339         int                             rc = 0;
2340         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2341         ENTRY;
2342
2343         /* Checks for fiemap flags */
2344         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2345                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2346                 return -EBADR;
2347         }
2348
2349         /* Check for FIEMAP_FLAG_SYNC */
2350         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2351                 rc = filemap_fdatawrite(inode->i_mapping);
2352                 if (rc)
2353                         return rc;
2354         }
2355
2356         env = cl_env_get(&refcheck);
2357         if (IS_ERR(env))
2358                 RETURN(PTR_ERR(env));
2359
2360         if (i_size_read(inode) == 0) {
2361                 rc = ll_glimpse_size(inode);
2362                 if (rc)
2363                         GOTO(out, rc);
2364         }
2365
2366         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2367         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2368         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2369
2370         /* If filesize is 0, then there would be no objects for mapping */
2371         if (fmkey.lfik_oa.o_size == 0) {
2372                 fiemap->fm_mapped_extents = 0;
2373                 GOTO(out, rc = 0);
2374         }
2375
2376         fmkey.lfik_fiemap = *fiemap;
2377
2378         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2379                               &fmkey, fiemap, &num_bytes);
2380 out:
2381         cl_env_put(env, &refcheck);
2382         RETURN(rc);
2383 }
2384
2385 int ll_fid2path(struct inode *inode, void __user *arg)
2386 {
2387         struct obd_export       *exp = ll_i2mdexp(inode);
2388         const struct getinfo_fid2path __user *gfin = arg;
2389         __u32                    pathlen;
2390         struct getinfo_fid2path *gfout;
2391         size_t                   outsize;
2392         int                      rc;
2393
2394         ENTRY;
2395
2396         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2397             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2398                 RETURN(-EPERM);
2399
2400         /* Only need to get the buflen */
2401         if (get_user(pathlen, &gfin->gf_pathlen))
2402                 RETURN(-EFAULT);
2403
2404         if (pathlen > PATH_MAX)
2405                 RETURN(-EINVAL);
2406
2407         outsize = sizeof(*gfout) + pathlen;
2408         OBD_ALLOC(gfout, outsize);
2409         if (gfout == NULL)
2410                 RETURN(-ENOMEM);
2411
2412         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2413                 GOTO(gf_free, rc = -EFAULT);
2414         /* append root FID after gfout to let MDT know the root FID so that it
2415          * can lookup the correct path, this is mainly for fileset.
2416          * old server without fileset mount support will ignore this. */
2417         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2418
2419         /* Call mdc_iocontrol */
2420         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2421         if (rc != 0)
2422                 GOTO(gf_free, rc);
2423
2424         if (copy_to_user(arg, gfout, outsize))
2425                 rc = -EFAULT;
2426
2427 gf_free:
2428         OBD_FREE(gfout, outsize);
2429         RETURN(rc);
2430 }
2431
2432 static int
2433 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2434 {
2435         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2436         struct lu_env *env;
2437         struct cl_io *io;
2438         __u16  refcheck;
2439         int result;
2440
2441         ENTRY;
2442
2443         ioc->idv_version = 0;
2444         ioc->idv_layout_version = UINT_MAX;
2445
2446         /* If no file object initialized, we consider its version is 0. */
2447         if (obj == NULL)
2448                 RETURN(0);
2449
2450         env = cl_env_get(&refcheck);
2451         if (IS_ERR(env))
2452                 RETURN(PTR_ERR(env));
2453
2454         io = vvp_env_thread_io(env);
2455         io->ci_obj = obj;
2456         io->u.ci_data_version.dv_data_version = 0;
2457         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2458         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2459
2460 restart:
2461         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2462                 result = cl_io_loop(env, io);
2463         else
2464                 result = io->ci_result;
2465
2466         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2467         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2468
2469         cl_io_fini(env, io);
2470
2471         if (unlikely(io->ci_need_restart))
2472                 goto restart;
2473
2474         cl_env_put(env, &refcheck);
2475
2476         RETURN(result);
2477 }
2478
2479 /*
2480  * Read the data_version for inode.
2481  *
2482  * This value is computed using stripe object version on OST.
2483  * Version is computed using server side locking.
2484  *
2485  * @param flags if do sync on the OST side;
2486  *              0: no sync
2487  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2488  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2489  */
2490 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2491 {
2492         struct ioc_data_version ioc = { .idv_flags = flags };
2493         int rc;
2494
2495         rc = ll_ioc_data_version(inode, &ioc);
2496         if (!rc)
2497                 *data_version = ioc.idv_version;
2498
2499         return rc;
2500 }
2501
2502 /*
2503  * Trigger a HSM release request for the provided inode.
2504  */
2505 int ll_hsm_release(struct inode *inode)
2506 {
2507         struct lu_env *env;
2508         struct obd_client_handle *och = NULL;
2509         __u64 data_version = 0;
2510         int rc;
2511         __u16 refcheck;
2512         ENTRY;
2513
2514         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2515                ll_get_fsname(inode->i_sb, NULL, 0),
2516                PFID(&ll_i2info(inode)->lli_fid));
2517
2518         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2519         if (IS_ERR(och))
2520                 GOTO(out, rc = PTR_ERR(och));
2521
2522         /* Grab latest data_version and [am]time values */
2523         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2524         if (rc != 0)
2525                 GOTO(out, rc);
2526
2527         env = cl_env_get(&refcheck);
2528         if (IS_ERR(env))
2529                 GOTO(out, rc = PTR_ERR(env));
2530
2531         rc = ll_merge_attr(env, inode);
2532         cl_env_put(env, &refcheck);
2533
2534         /* If error happen, we have the wrong size for a file.
2535          * Don't release it.
2536          */
2537         if (rc != 0)
2538                 GOTO(out, rc);
2539
2540         /* Release the file.
2541          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2542          * we still need it to pack l_remote_handle to MDT. */
2543         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2544                                        &data_version);
2545         och = NULL;
2546
2547         EXIT;
2548 out:
2549         if (och != NULL && !IS_ERR(och)) /* close the file */
2550                 ll_lease_close(och, inode, NULL);
2551
2552         return rc;
2553 }
2554
2555 struct ll_swap_stack {
2556         __u64                    dv1;
2557         __u64                    dv2;
2558         struct inode            *inode1;
2559         struct inode            *inode2;
2560         bool                     check_dv1;
2561         bool                     check_dv2;
2562 };
2563
2564 static int ll_swap_layouts(struct file *file1, struct file *file2,
2565                            struct lustre_swap_layouts *lsl)
2566 {
2567         struct mdc_swap_layouts  msl;
2568         struct md_op_data       *op_data;
2569         __u32                    gid;
2570         __u64                    dv;
2571         struct ll_swap_stack    *llss = NULL;
2572         int                      rc;
2573
2574         OBD_ALLOC_PTR(llss);
2575         if (llss == NULL)
2576                 RETURN(-ENOMEM);
2577
2578         llss->inode1 = file_inode(file1);
2579         llss->inode2 = file_inode(file2);
2580
2581         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2582         if (rc < 0)
2583                 GOTO(free, rc);
2584
2585         /* we use 2 bool because it is easier to swap than 2 bits */
2586         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2587                 llss->check_dv1 = true;
2588
2589         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2590                 llss->check_dv2 = true;
2591
2592         /* we cannot use lsl->sl_dvX directly because we may swap them */
2593         llss->dv1 = lsl->sl_dv1;
2594         llss->dv2 = lsl->sl_dv2;
2595
2596         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2597         if (rc == 0) /* same file, done! */
2598                 GOTO(free, rc);
2599
2600         if (rc < 0) { /* sequentialize it */
2601                 swap(llss->inode1, llss->inode2);
2602                 swap(file1, file2);
2603                 swap(llss->dv1, llss->dv2);
2604                 swap(llss->check_dv1, llss->check_dv2);
2605         }
2606
2607         gid = lsl->sl_gid;
2608         if (gid != 0) { /* application asks to flush dirty cache */
2609                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2610                 if (rc < 0)
2611                         GOTO(free, rc);
2612
2613                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2614                 if (rc < 0) {
2615                         ll_put_grouplock(llss->inode1, file1, gid);
2616                         GOTO(free, rc);
2617                 }
2618         }
2619
2620         /* ultimate check, before swaping the layouts we check if
2621          * dataversion has changed (if requested) */
2622         if (llss->check_dv1) {
2623                 rc = ll_data_version(llss->inode1, &dv, 0);
2624                 if (rc)
2625                         GOTO(putgl, rc);
2626                 if (dv != llss->dv1)
2627                         GOTO(putgl, rc = -EAGAIN);
2628         }
2629
2630         if (llss->check_dv2) {
2631                 rc = ll_data_version(llss->inode2, &dv, 0);
2632                 if (rc)
2633                         GOTO(putgl, rc);
2634                 if (dv != llss->dv2)
2635                         GOTO(putgl, rc = -EAGAIN);
2636         }
2637
2638         /* struct md_op_data is used to send the swap args to the mdt
2639          * only flags is missing, so we use struct mdc_swap_layouts
2640          * through the md_op_data->op_data */
2641         /* flags from user space have to be converted before they are send to
2642          * server, no flag is sent today, they are only used on the client */
2643         msl.msl_flags = 0;
2644         rc = -ENOMEM;
2645         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2646                                      0, LUSTRE_OPC_ANY, &msl);
2647         if (IS_ERR(op_data))
2648                 GOTO(free, rc = PTR_ERR(op_data));
2649
2650         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2651                            sizeof(*op_data), op_data, NULL);
2652         ll_finish_md_op_data(op_data);
2653
2654         if (rc < 0)
2655                 GOTO(putgl, rc);
2656
2657 putgl:
2658         if (gid != 0) {
2659                 ll_put_grouplock(llss->inode2, file2, gid);
2660                 ll_put_grouplock(llss->inode1, file1, gid);
2661         }
2662
2663 free:
2664         if (llss != NULL)
2665                 OBD_FREE_PTR(llss);
2666
2667         RETURN(rc);
2668 }
2669
2670 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2671 {
2672         struct md_op_data       *op_data;
2673         int                      rc;
2674         ENTRY;
2675
2676         /* Detect out-of range masks */
2677         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2678                 RETURN(-EINVAL);
2679
2680         /* Non-root users are forbidden to set or clear flags which are
2681          * NOT defined in HSM_USER_MASK. */
2682         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2683             !cfs_capable(CFS_CAP_SYS_ADMIN))
2684                 RETURN(-EPERM);
2685
2686         /* Detect out-of range archive id */
2687         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2688             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2689                 RETURN(-EINVAL);
2690
2691         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2692                                      LUSTRE_OPC_ANY, hss);
2693         if (IS_ERR(op_data))
2694                 RETURN(PTR_ERR(op_data));
2695
2696         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2697                            sizeof(*op_data), op_data, NULL);
2698
2699         ll_finish_md_op_data(op_data);
2700
2701         RETURN(rc);
2702 }
2703
2704 static int ll_hsm_import(struct inode *inode, struct file *file,
2705                          struct hsm_user_import *hui)
2706 {
2707         struct hsm_state_set    *hss = NULL;
2708         struct iattr            *attr = NULL;
2709         int                      rc;
2710         ENTRY;
2711
2712         if (!S_ISREG(inode->i_mode))
2713                 RETURN(-EINVAL);
2714
2715         /* set HSM flags */
2716         OBD_ALLOC_PTR(hss);
2717         if (hss == NULL)
2718                 GOTO(out, rc = -ENOMEM);
2719
2720         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2721         hss->hss_archive_id = hui->hui_archive_id;
2722         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2723         rc = ll_hsm_state_set(inode, hss);
2724         if (rc != 0)
2725                 GOTO(out, rc);
2726
2727         OBD_ALLOC_PTR(attr);
2728         if (attr == NULL)
2729                 GOTO(out, rc = -ENOMEM);
2730
2731         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2732         attr->ia_mode |= S_IFREG;
2733         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2734         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2735         attr->ia_size = hui->hui_size;
2736         attr->ia_mtime.tv_sec = hui->hui_mtime;
2737         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2738         attr->ia_atime.tv_sec = hui->hui_atime;
2739         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2740
2741         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2742                          ATTR_UID | ATTR_GID |
2743                          ATTR_MTIME | ATTR_MTIME_SET |
2744                          ATTR_ATIME | ATTR_ATIME_SET;
2745
2746         inode_lock(inode);
2747
2748         rc = ll_setattr_raw(file_dentry(file), attr, true);
2749         if (rc == -ENODATA)
2750                 rc = 0;
2751
2752         inode_unlock(inode);
2753
2754 out:
2755         if (hss != NULL)
2756                 OBD_FREE_PTR(hss);
2757
2758         if (attr != NULL)
2759                 OBD_FREE_PTR(attr);
2760
2761         RETURN(rc);
2762 }
2763
2764 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2765 {
2766         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2767                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2768 }
2769
2770 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2771 {
2772         struct inode *inode = file_inode(file);
2773         struct iattr ia = {
2774                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2775                             ATTR_MTIME | ATTR_MTIME_SET |
2776                             ATTR_CTIME | ATTR_CTIME_SET,
2777                 .ia_atime = {
2778                         .tv_sec = lfu->lfu_atime_sec,
2779                         .tv_nsec = lfu->lfu_atime_nsec,
2780                 },
2781                 .ia_mtime = {
2782                         .tv_sec = lfu->lfu_mtime_sec,
2783                         .tv_nsec = lfu->lfu_mtime_nsec,
2784                 },
2785                 .ia_ctime = {
2786                         .tv_sec = lfu->lfu_ctime_sec,
2787                         .tv_nsec = lfu->lfu_ctime_nsec,
2788                 },
2789         };
2790         int rc;
2791         ENTRY;
2792
2793         if (!capable(CAP_SYS_ADMIN))
2794                 RETURN(-EPERM);
2795
2796         if (!S_ISREG(inode->i_mode))
2797                 RETURN(-EINVAL);
2798
2799         inode_lock(inode);
2800         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2801         inode_unlock(inode);
2802
2803         RETURN(rc);
2804 }
2805
2806 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2807 {
2808         switch (mode) {
2809         case MODE_READ_USER:
2810                 return CLM_READ;
2811         case MODE_WRITE_USER:
2812                 return CLM_WRITE;
2813         default:
2814                 return -EINVAL;
2815         }
2816 }
2817
2818 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2819
2820 /* Used to allow the upper layers of the client to request an LDLM lock
2821  * without doing an actual read or write.
2822  *
2823  * Used for ladvise lockahead to manually request specific locks.
2824  *
2825  * \param[in] file      file this ladvise lock request is on
2826  * \param[in] ladvise   ladvise struct describing this lock request
2827  *
2828  * \retval 0            success, no detailed result available (sync requests
2829  *                      and requests sent to the server [not handled locally]
2830  *                      cannot return detailed results)
2831  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2832  *                                       see definitions for details.
2833  * \retval negative     negative errno on error
2834  */
2835 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2836 {
2837         struct lu_env *env = NULL;
2838         struct cl_io *io  = NULL;
2839         struct cl_lock *lock = NULL;
2840         struct cl_lock_descr *descr = NULL;
2841         struct dentry *dentry = file->f_path.dentry;
2842         struct inode *inode = dentry->d_inode;
2843         enum cl_lock_mode cl_mode;
2844         off_t start = ladvise->lla_start;
2845         off_t end = ladvise->lla_end;
2846         int result;
2847         __u16 refcheck;
2848
2849         ENTRY;
2850
2851         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2852                "start=%llu, end=%llu\n", dentry->d_name.len,
2853                dentry->d_name.name, dentry->d_inode,
2854                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2855                (__u64) end);
2856
2857         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2858         if (cl_mode < 0)
2859                 GOTO(out, result = cl_mode);
2860
2861         /* Get IO environment */
2862         result = cl_io_get(inode, &env, &io, &refcheck);
2863         if (result <= 0)
2864                 GOTO(out, result);
2865
2866         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2867         if (result > 0) {
2868                 /*
2869                  * nothing to do for this io. This currently happens when
2870                  * stripe sub-object's are not yet created.
2871                  */
2872                 result = io->ci_result;
2873         } else if (result == 0) {
2874                 lock = vvp_env_lock(env);
2875                 descr = &lock->cll_descr;
2876
2877                 descr->cld_obj   = io->ci_obj;
2878                 /* Convert byte offsets to pages */
2879                 descr->cld_start = cl_index(io->ci_obj, start);
2880                 descr->cld_end   = cl_index(io->ci_obj, end);
2881                 descr->cld_mode  = cl_mode;
2882                 /* CEF_MUST is used because we do not want to convert a
2883                  * lockahead request to a lockless lock */
2884                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2885                                        CEF_NONBLOCK;
2886
2887                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2888                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2889
2890                 result = cl_lock_request(env, io, lock);
2891
2892                 /* On success, we need to release the lock */
2893                 if (result >= 0)
2894                         cl_lock_release(env, lock);
2895         }
2896         cl_io_fini(env, io);
2897         cl_env_put(env, &refcheck);
2898
2899         /* -ECANCELED indicates a matching lock with a different extent
2900          * was already present, and -EEXIST indicates a matching lock
2901          * on exactly the same extent was already present.
2902          * We convert them to positive values for userspace to make
2903          * recognizing true errors easier.
2904          * Note we can only return these detailed results on async requests,
2905          * as sync requests look the same as i/o requests for locking. */
2906         if (result == -ECANCELED)
2907                 result = LLA_RESULT_DIFFERENT;
2908         else if (result == -EEXIST)
2909                 result = LLA_RESULT_SAME;
2910
2911 out:
2912         RETURN(result);
2913 }
2914 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2915
2916 static int ll_ladvise_sanity(struct inode *inode,
2917                              struct llapi_lu_ladvise *ladvise)
2918 {
2919         enum lu_ladvise_type advice = ladvise->lla_advice;
2920         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2921          * be in the first 32 bits of enum ladvise_flags */
2922         __u32 flags = ladvise->lla_peradvice_flags;
2923         /* 3 lines at 80 characters per line, should be plenty */
2924         int rc = 0;
2925
2926         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2927                 rc = -EINVAL;
2928                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2929                        "last supported advice is %s (value '%d'): rc = %d\n",
2930                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2931                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2932                 GOTO(out, rc);
2933         }
2934
2935         /* Per-advice checks */
2936         switch (advice) {
2937         case LU_LADVISE_LOCKNOEXPAND:
2938                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2939                         rc = -EINVAL;
2940                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2941                                "rc = %d\n",
2942                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2943                                ladvise_names[advice], rc);
2944                         GOTO(out, rc);
2945                 }
2946                 break;
2947         case LU_LADVISE_LOCKAHEAD:
2948                 /* Currently only READ and WRITE modes can be requested */
2949                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2950                     ladvise->lla_lockahead_mode == 0) {
2951                         rc = -EINVAL;
2952                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2953                                "rc = %d\n",
2954                                ll_get_fsname(inode->i_sb, NULL, 0),
2955                                ladvise->lla_lockahead_mode,
2956                                ladvise_names[advice], rc);
2957                         GOTO(out, rc);
2958                 }
2959         case LU_LADVISE_WILLREAD:
2960         case LU_LADVISE_DONTNEED:
2961         default:
2962                 /* Note fall through above - These checks apply to all advices
2963                  * except LOCKNOEXPAND */
2964                 if (flags & ~LF_DEFAULT_MASK) {
2965                         rc = -EINVAL;
2966                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2967                                "rc = %d\n",
2968                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2969                                ladvise_names[advice], rc);
2970                         GOTO(out, rc);
2971                 }
2972                 if (ladvise->lla_start >= ladvise->lla_end) {
2973                         rc = -EINVAL;
2974                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2975                                "for %s: rc = %d\n",
2976                                ll_get_fsname(inode->i_sb, NULL, 0),
2977                                ladvise->lla_start, ladvise->lla_end,
2978                                ladvise_names[advice], rc);
2979                         GOTO(out, rc);
2980                 }
2981                 break;
2982         }
2983
2984 out:
2985         return rc;
2986 }
2987 #undef ERRSIZE
2988
2989 /*
2990  * Give file access advices
2991  *
2992  * The ladvise interface is similar to Linux fadvise() system call, except it
2993  * forwards the advices directly from Lustre client to server. The server side
2994  * codes will apply appropriate read-ahead and caching techniques for the
2995  * corresponding files.
2996  *
2997  * A typical workload for ladvise is e.g. a bunch of different clients are
2998  * doing small random reads of a file, so prefetching pages into OSS cache
2999  * with big linear reads before the random IO is a net benefit. Fetching
3000  * all that data into each client cache with fadvise() may not be, due to
3001  * much more data being sent to the client.
3002  */
3003 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3004                       struct llapi_lu_ladvise *ladvise)
3005 {
3006         struct lu_env *env;
3007         struct cl_io *io;
3008         struct cl_ladvise_io *lio;
3009         int rc;
3010         __u16 refcheck;
3011         ENTRY;
3012
3013         env = cl_env_get(&refcheck);
3014         if (IS_ERR(env))
3015                 RETURN(PTR_ERR(env));
3016
3017         io = vvp_env_thread_io(env);
3018         io->ci_obj = ll_i2info(inode)->lli_clob;
3019
3020         /* initialize parameters for ladvise */
3021         lio = &io->u.ci_ladvise;
3022         lio->li_start = ladvise->lla_start;
3023         lio->li_end = ladvise->lla_end;
3024         lio->li_fid = ll_inode2fid(inode);
3025         lio->li_advice = ladvise->lla_advice;
3026         lio->li_flags = flags;
3027
3028         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3029                 rc = cl_io_loop(env, io);
3030         else
3031                 rc = io->ci_result;
3032
3033         cl_io_fini(env, io);
3034         cl_env_put(env, &refcheck);
3035         RETURN(rc);
3036 }
3037
3038 static int ll_lock_noexpand(struct file *file, int flags)
3039 {
3040         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3041
3042         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3043
3044         return 0;
3045 }
3046
3047 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3048                         unsigned long arg)
3049 {
3050         struct fsxattr fsxattr;
3051
3052         if (copy_from_user(&fsxattr,
3053                            (const struct fsxattr __user *)arg,
3054                            sizeof(fsxattr)))
3055                 RETURN(-EFAULT);
3056
3057         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3058         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3059                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3060         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3061         if (copy_to_user((struct fsxattr __user *)arg,
3062                          &fsxattr, sizeof(fsxattr)))
3063                 RETURN(-EFAULT);
3064
3065         RETURN(0);
3066 }
3067
3068 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3069                         unsigned long arg)
3070 {
3071
3072         struct md_op_data *op_data;
3073         struct ptlrpc_request *req = NULL;
3074         int rc = 0;
3075         struct fsxattr fsxattr;
3076         struct cl_object *obj;
3077         int flags;
3078
3079         /* only root could change project ID */
3080         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3081                 RETURN(-EPERM);
3082
3083         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3084                                      LUSTRE_OPC_ANY, NULL);
3085         if (IS_ERR(op_data))
3086                 RETURN(PTR_ERR(op_data));
3087
3088         if (copy_from_user(&fsxattr,
3089                            (const struct fsxattr __user *)arg,
3090                            sizeof(fsxattr)))
3091                 GOTO(out_fsxattr1, rc = -EFAULT);
3092
3093         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3094         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3095         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3096                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3097         op_data->op_projid = fsxattr.fsx_projid;
3098         op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
3099         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3100                         0, &req);
3101         ptlrpc_req_finished(req);
3102
3103         obj = ll_i2info(inode)->lli_clob;
3104         if (obj) {
3105                 struct iattr *attr;
3106
3107                 ll_update_inode_flags(inode, op_data->op_attr_flags);
3108                 OBD_ALLOC_PTR(attr);
3109                 if (attr == NULL)
3110                         GOTO(out_fsxattr1, rc = -ENOMEM);
3111                 attr->ia_valid = ATTR_ATTR_FLAG;
3112                 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3113
3114                 OBD_FREE_PTR(attr);
3115         }
3116 out_fsxattr1:
3117         ll_finish_md_op_data(op_data);
3118         RETURN(rc);
3119 }
3120
3121 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3122                                  unsigned long arg)
3123 {
3124         struct inode            *inode = file_inode(file);
3125         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3126         struct ll_inode_info    *lli = ll_i2info(inode);
3127         struct obd_client_handle *och = NULL;
3128         struct split_param sp;
3129         bool lease_broken;
3130         fmode_t fmode = 0;
3131         enum mds_op_bias bias = 0;
3132         struct file *layout_file = NULL;
3133         void *data = NULL;
3134         size_t data_size = 0;
3135         long rc;
3136         ENTRY;
3137
3138         mutex_lock(&lli->lli_och_mutex);
3139         if (fd->fd_lease_och != NULL) {
3140                 och = fd->fd_lease_och;
3141                 fd->fd_lease_och = NULL;
3142         }
3143         mutex_unlock(&lli->lli_och_mutex);
3144
3145         if (och == NULL)
3146                 GOTO(out, rc = -ENOLCK);
3147
3148         fmode = och->och_flags;
3149
3150         switch (ioc->lil_flags) {
3151         case LL_LEASE_RESYNC_DONE:
3152                 if (ioc->lil_count > IOC_IDS_MAX)
3153                         GOTO(out, rc = -EINVAL);
3154
3155                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3156                 OBD_ALLOC(data, data_size);
3157                 if (!data)
3158                         GOTO(out, rc = -ENOMEM);
3159
3160                 if (copy_from_user(data, (void __user *)arg, data_size))
3161                         GOTO(out, rc = -EFAULT);
3162
3163                 bias = MDS_CLOSE_RESYNC_DONE;
3164                 break;
3165         case LL_LEASE_LAYOUT_MERGE: {
3166                 int fd;
3167
3168                 if (ioc->lil_count != 1)
3169                         GOTO(out, rc = -EINVAL);
3170
3171                 arg += sizeof(*ioc);
3172                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3173                         GOTO(out, rc = -EFAULT);
3174
3175                 layout_file = fget(fd);
3176                 if (!layout_file)
3177                         GOTO(out, rc = -EBADF);
3178
3179                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3180                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3181                         GOTO(out, rc = -EPERM);
3182
3183                 data = file_inode(layout_file);
3184                 bias = MDS_CLOSE_LAYOUT_MERGE;
3185                 break;
3186         }
3187         case LL_LEASE_LAYOUT_SPLIT: {
3188                 int fdv;
3189                 int mirror_id;
3190
3191                 if (ioc->lil_count != 2)
3192                         GOTO(out, rc = -EINVAL);
3193
3194                 arg += sizeof(*ioc);
3195                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3196                         GOTO(out, rc = -EFAULT);
3197
3198                 arg += sizeof(__u32);
3199                 if (copy_from_user(&mirror_id, (void __user *)arg,
3200                                    sizeof(__u32)))
3201                         GOTO(out, rc = -EFAULT);
3202
3203                 layout_file = fget(fdv);
3204                 if (!layout_file)
3205                         GOTO(out, rc = -EBADF);
3206
3207                 sp.sp_inode = file_inode(layout_file);
3208                 sp.sp_mirror_id = (__u16)mirror_id;
3209                 data = &sp;
3210                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3211                 break;
3212         }
3213         default:
3214                 /* without close intent */
3215                 break;
3216         }
3217
3218         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3219         if (rc < 0)
3220                 GOTO(out, rc);
3221
3222         rc = ll_lease_och_release(inode, file);
3223         if (rc < 0)
3224                 GOTO(out, rc);
3225
3226         if (lease_broken)
3227                 fmode = 0;
3228         EXIT;
3229
3230 out:
3231         switch (ioc->lil_flags) {
3232         case LL_LEASE_RESYNC_DONE:
3233                 if (data)
3234                         OBD_FREE(data, data_size);
3235                 break;
3236         case LL_LEASE_LAYOUT_MERGE:
3237         case LL_LEASE_LAYOUT_SPLIT:
3238                 if (layout_file)
3239                         fput(layout_file);
3240                 break;
3241         }
3242
3243         if (!rc)
3244                 rc = ll_lease_type_from_fmode(fmode);
3245         RETURN(rc);
3246 }
3247
3248 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3249                               unsigned long arg)
3250 {
3251         struct inode *inode = file_inode(file);
3252         struct ll_inode_info *lli = ll_i2info(inode);
3253         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3254         struct obd_client_handle *och = NULL;
3255         __u64 open_flags = 0;
3256         bool lease_broken;
3257         fmode_t fmode;
3258         long rc;
3259         ENTRY;
3260
3261         switch (ioc->lil_mode) {
3262         case LL_LEASE_WRLCK:
3263                 if (!(file->f_mode & FMODE_WRITE))
3264                         RETURN(-EPERM);
3265                 fmode = FMODE_WRITE;
3266                 break;
3267         case LL_LEASE_RDLCK:
3268                 if (!(file->f_mode & FMODE_READ))
3269                         RETURN(-EPERM);
3270                 fmode = FMODE_READ;
3271                 break;
3272         case LL_LEASE_UNLCK:
3273                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3274         default:
3275                 RETURN(-EINVAL);
3276         }
3277
3278         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3279
3280         /* apply for lease */
3281         if (ioc->lil_flags & LL_LEASE_RESYNC)
3282                 open_flags = MDS_OPEN_RESYNC;
3283         och = ll_lease_open(inode, file, fmode, open_flags);
3284         if (IS_ERR(och))
3285                 RETURN(PTR_ERR(och));
3286
3287         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3288                 rc = ll_lease_file_resync(och, inode);
3289                 if (rc) {
3290                         ll_lease_close(och, inode, NULL);
3291                         RETURN(rc);
3292                 }
3293                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3294                 if (rc) {
3295                         ll_lease_close(och, inode, NULL);
3296                         RETURN(rc);
3297                 }
3298         }
3299
3300         rc = 0;
3301         mutex_lock(&lli->lli_och_mutex);
3302         if (fd->fd_lease_och == NULL) {
3303                 fd->fd_lease_och = och;
3304                 och = NULL;
3305         }
3306         mutex_unlock(&lli->lli_och_mutex);
3307         if (och != NULL) {
3308                 /* impossible now that only excl is supported for now */
3309                 ll_lease_close(och, inode, &lease_broken);
3310                 rc = -EBUSY;
3311         }
3312         RETURN(rc);
3313 }
3314
3315 static long
3316 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3317 {
3318         struct inode            *inode = file_inode(file);
3319         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3320         int                      flags, rc;
3321         ENTRY;
3322
3323         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3324                PFID(ll_inode2fid(inode)), inode, cmd);
3325         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3326
3327         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3328         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3329                 RETURN(-ENOTTY);
3330
3331         switch (cmd) {
3332         case LL_IOC_GETFLAGS:
3333                 /* Get the current value of the file flags */
3334                 return put_user(fd->fd_flags, (int __user *)arg);
3335         case LL_IOC_SETFLAGS:
3336         case LL_IOC_CLRFLAGS:
3337                 /* Set or clear specific file flags */
3338                 /* XXX This probably needs checks to ensure the flags are
3339                  *     not abused, and to handle any flag side effects.
3340                  */
3341                 if (get_user(flags, (int __user *) arg))
3342                         RETURN(-EFAULT);
3343
3344                 if (cmd == LL_IOC_SETFLAGS) {
3345                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3346                             !(file->f_flags & O_DIRECT)) {
3347                                 CERROR("%s: unable to disable locking on "
3348                                        "non-O_DIRECT file\n", current->comm);
3349                                 RETURN(-EINVAL);
3350                         }
3351
3352                         fd->fd_flags |= flags;
3353                 } else {
3354                         fd->fd_flags &= ~flags;
3355                 }
3356                 RETURN(0);
3357         case LL_IOC_LOV_SETSTRIPE:
3358         case LL_IOC_LOV_SETSTRIPE_NEW:
3359                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3360         case LL_IOC_LOV_SETEA:
3361                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3362         case LL_IOC_LOV_SWAP_LAYOUTS: {
3363                 struct file *file2;
3364                 struct lustre_swap_layouts lsl;
3365
3366                 if (copy_from_user(&lsl, (char __user *)arg,
3367                                    sizeof(struct lustre_swap_layouts)))
3368                         RETURN(-EFAULT);
3369
3370                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3371                         RETURN(-EPERM);
3372
3373                 file2 = fget(lsl.sl_fd);
3374                 if (file2 == NULL)
3375                         RETURN(-EBADF);
3376
3377                 /* O_WRONLY or O_RDWR */
3378                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3379                         GOTO(out, rc = -EPERM);
3380
3381                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3382                         struct inode                    *inode2;
3383                         struct ll_inode_info            *lli;
3384                         struct obd_client_handle        *och = NULL;
3385
3386                         lli = ll_i2info(inode);
3387                         mutex_lock(&lli->lli_och_mutex);
3388                         if (fd->fd_lease_och != NULL) {
3389                                 och = fd->fd_lease_och;
3390                                 fd->fd_lease_och = NULL;
3391                         }
3392                         mutex_unlock(&lli->lli_och_mutex);
3393                         if (och == NULL)
3394                                 GOTO(out, rc = -ENOLCK);
3395                         inode2 = file_inode(file2);
3396                         rc = ll_swap_layouts_close(och, inode, inode2);
3397                 } else {
3398                         rc = ll_swap_layouts(file, file2, &lsl);
3399                 }
3400 out:
3401                 fput(file2);
3402                 RETURN(rc);
3403         }
3404         case LL_IOC_LOV_GETSTRIPE:
3405         case LL_IOC_LOV_GETSTRIPE_NEW:
3406                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3407         case FS_IOC_GETFLAGS:
3408         case FS_IOC_SETFLAGS:
3409                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3410         case FSFILT_IOC_GETVERSION:
3411         case FS_IOC_GETVERSION:
3412                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3413         /* We need to special case any other ioctls we want to handle,
3414          * to send them to the MDS/OST as appropriate and to properly
3415          * network encode the arg field. */
3416         case FS_IOC_SETVERSION:
3417                 RETURN(-ENOTSUPP);
3418
3419         case LL_IOC_GROUP_LOCK:
3420                 RETURN(ll_get_grouplock(inode, file, arg));
3421         case LL_IOC_GROUP_UNLOCK:
3422                 RETURN(ll_put_grouplock(inode, file, arg));
3423         case IOC_OBD_STATFS:
3424                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3425
3426         case LL_IOC_FLUSHCTX:
3427                 RETURN(ll_flush_ctx(inode));
3428         case LL_IOC_PATH2FID: {
3429                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3430                                  sizeof(struct lu_fid)))
3431                         RETURN(-EFAULT);
3432
3433                 RETURN(0);
3434         }
3435         case LL_IOC_GETPARENT:
3436                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3437
3438         case OBD_IOC_FID2PATH:
3439                 RETURN(ll_fid2path(inode, (void __user *)arg));
3440         case LL_IOC_DATA_VERSION: {
3441                 struct ioc_data_version idv;
3442                 int rc;
3443
3444                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3445                         RETURN(-EFAULT);
3446
3447                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3448                 rc = ll_ioc_data_version(inode, &idv);
3449
3450                 if (rc == 0 &&
3451                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3452                         RETURN(-EFAULT);
3453
3454                 RETURN(rc);
3455         }
3456
3457         case LL_IOC_GET_MDTIDX: {
3458                 int mdtidx;
3459
3460                 mdtidx = ll_get_mdt_idx(inode);
3461                 if (mdtidx < 0)
3462                         RETURN(mdtidx);
3463
3464                 if (put_user((int)mdtidx, (int __user *)arg))
3465                         RETURN(-EFAULT);
3466
3467                 RETURN(0);
3468         }
3469         case OBD_IOC_GETDTNAME:
3470         case OBD_IOC_GETMDNAME:
3471                 RETURN(ll_get_obd_name(inode, cmd, arg));
3472         case LL_IOC_HSM_STATE_GET: {
3473                 struct md_op_data       *op_data;
3474                 struct hsm_user_state   *hus;
3475                 int                      rc;
3476
3477                 OBD_ALLOC_PTR(hus);
3478                 if (hus == NULL)
3479                         RETURN(-ENOMEM);
3480
3481                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3482                                              LUSTRE_OPC_ANY, hus);
3483                 if (IS_ERR(op_data)) {
3484                         OBD_FREE_PTR(hus);
3485                         RETURN(PTR_ERR(op_data));
3486                 }
3487
3488                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3489                                    op_data, NULL);
3490
3491                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3492                         rc = -EFAULT;
3493
3494                 ll_finish_md_op_data(op_data);
3495                 OBD_FREE_PTR(hus);
3496                 RETURN(rc);
3497         }
3498         case LL_IOC_HSM_STATE_SET: {
3499                 struct hsm_state_set    *hss;
3500                 int                      rc;
3501
3502                 OBD_ALLOC_PTR(hss);
3503                 if (hss == NULL)
3504                         RETURN(-ENOMEM);
3505
3506                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3507                         OBD_FREE_PTR(hss);
3508                         RETURN(-EFAULT);
3509                 }
3510
3511                 rc = ll_hsm_state_set(inode, hss);
3512
3513                 OBD_FREE_PTR(hss);
3514                 RETURN(rc);
3515         }
3516         case LL_IOC_HSM_ACTION: {
3517                 struct md_op_data               *op_data;
3518                 struct hsm_current_action       *hca;
3519                 int                              rc;
3520
3521                 OBD_ALLOC_PTR(hca);
3522                 if (hca == NULL)
3523                         RETURN(-ENOMEM);
3524
3525                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3526                                              LUSTRE_OPC_ANY, hca);
3527                 if (IS_ERR(op_data)) {
3528                         OBD_FREE_PTR(hca);
3529                         RETURN(PTR_ERR(op_data));
3530                 }
3531
3532                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3533                                    op_data, NULL);
3534
3535                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3536                         rc = -EFAULT;
3537
3538                 ll_finish_md_op_data(op_data);
3539                 OBD_FREE_PTR(hca);
3540                 RETURN(rc);
3541         }
3542         case LL_IOC_SET_LEASE_OLD: {
3543                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3544
3545                 RETURN(ll_file_set_lease(file, &ioc, 0));
3546         }
3547         case LL_IOC_SET_LEASE: {
3548                 struct ll_ioc_lease ioc;
3549
3550                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3551                         RETURN(-EFAULT);
3552
3553                 RETURN(ll_file_set_lease(file, &ioc, arg));
3554         }
3555         case LL_IOC_GET_LEASE: {
3556                 struct ll_inode_info *lli = ll_i2info(inode);
3557                 struct ldlm_lock *lock = NULL;
3558                 fmode_t fmode = 0;
3559
3560                 mutex_lock(&lli->lli_och_mutex);
3561                 if (fd->fd_lease_och != NULL) {
3562                         struct obd_client_handle *och = fd->fd_lease_och;
3563
3564                         lock = ldlm_handle2lock(&och->och_lease_handle);
3565                         if (lock != NULL) {
3566                                 lock_res_and_lock(lock);
3567                                 if (!ldlm_is_cancel(lock))
3568                                         fmode = och->och_flags;
3569
3570                                 unlock_res_and_lock(lock);
3571                                 LDLM_LOCK_PUT(lock);
3572                         }
3573                 }
3574                 mutex_unlock(&lli->lli_och_mutex);
3575
3576                 RETURN(ll_lease_type_from_fmode(fmode));
3577         }
3578         case LL_IOC_HSM_IMPORT: {
3579                 struct hsm_user_import *hui;
3580
3581                 OBD_ALLOC_PTR(hui);
3582                 if (hui == NULL)
3583                         RETURN(-ENOMEM);
3584
3585                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3586                         OBD_FREE_PTR(hui);
3587                         RETURN(-EFAULT);
3588                 }
3589
3590                 rc = ll_hsm_import(inode, file, hui);
3591
3592                 OBD_FREE_PTR(hui);
3593                 RETURN(rc);
3594         }
3595         case LL_IOC_FUTIMES_3: {
3596                 struct ll_futimes_3 lfu;
3597
3598                 if (copy_from_user(&lfu,
3599                                    (const struct ll_futimes_3 __user *)arg,
3600                                    sizeof(lfu)))
3601                         RETURN(-EFAULT);
3602
3603                 RETURN(ll_file_futimes_3(file, &lfu));
3604         }
3605         case LL_IOC_LADVISE: {
3606                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3607                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3608                 int i;
3609                 int num_advise;
3610                 int alloc_size = sizeof(*k_ladvise_hdr);
3611
3612                 rc = 0;
3613                 u_ladvise_hdr = (void __user *)arg;
3614                 OBD_ALLOC_PTR(k_ladvise_hdr);
3615                 if (k_ladvise_hdr == NULL)
3616                         RETURN(-ENOMEM);
3617
3618                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3619                         GOTO(out_ladvise, rc = -EFAULT);
3620
3621                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3622                     k_ladvise_hdr->lah_count < 1)
3623                         GOTO(out_ladvise, rc = -EINVAL);
3624
3625                 num_advise = k_ladvise_hdr->lah_count;
3626                 if (num_advise >= LAH_COUNT_MAX)
3627                         GOTO(out_ladvise, rc = -EFBIG);
3628
3629                 OBD_FREE_PTR(k_ladvise_hdr);
3630                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3631                                       lah_advise[num_advise]);
3632                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3633                 if (k_ladvise_hdr == NULL)
3634                         RETURN(-ENOMEM);
3635
3636                 /*
3637                  * TODO: submit multiple advices to one server in a single RPC
3638                  */
3639                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3640                         GOTO(out_ladvise, rc = -EFAULT);
3641
3642                 for (i = 0; i < num_advise; i++) {
3643                         struct llapi_lu_ladvise *k_ladvise =
3644                                         &k_ladvise_hdr->lah_advise[i];
3645                         struct llapi_lu_ladvise __user *u_ladvise =
3646                                         &u_ladvise_hdr->lah_advise[i];
3647
3648                         rc = ll_ladvise_sanity(inode, k_ladvise);
3649                         if (rc)
3650                                 GOTO(out_ladvise, rc);
3651
3652                         switch (k_ladvise->lla_advice) {
3653                         case LU_LADVISE_LOCKNOEXPAND:
3654                                 rc = ll_lock_noexpand(file,
3655                                                k_ladvise->lla_peradvice_flags);
3656                                 GOTO(out_ladvise, rc);
3657                         case LU_LADVISE_LOCKAHEAD:
3658
3659                                 rc = ll_file_lock_ahead(file, k_ladvise);
3660
3661                                 if (rc < 0)
3662                                         GOTO(out_ladvise, rc);
3663
3664                                 if (put_user(rc,
3665                                              &u_ladvise->lla_lockahead_result))
3666                                         GOTO(out_ladvise, rc = -EFAULT);
3667                                 break;
3668                         default:
3669                                 rc = ll_ladvise(inode, file,
3670                                                 k_ladvise_hdr->lah_flags,
3671                                                 k_ladvise);
3672                                 if (rc)
3673                                         GOTO(out_ladvise, rc);
3674                                 break;
3675                         }
3676
3677                 }
3678
3679 out_ladvise:
3680                 OBD_FREE(k_ladvise_hdr, alloc_size);
3681                 RETURN(rc);
3682         }
3683         case LL_IOC_FLR_SET_MIRROR: {
3684                 /* mirror I/O must be direct to avoid polluting page cache
3685                  * by stale data. */
3686                 if (!(file->f_flags & O_DIRECT))
3687                         RETURN(-EINVAL);
3688
3689                 fd->fd_designated_mirror = (__u32)arg;
3690                 RETURN(0);
3691         }
3692         case LL_IOC_FSGETXATTR:
3693                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3694         case LL_IOC_FSSETXATTR:
3695                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3696         case BLKSSZGET:
3697                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3698         default:
3699                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3700                                      (void __user *)arg));
3701         }
3702 }
3703
3704 #ifndef HAVE_FILE_LLSEEK_SIZE
3705 static inline loff_t
3706 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3707 {
3708         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3709                 return -EINVAL;
3710         if (offset > maxsize)
3711                 return -EINVAL;
3712
3713         if (offset != file->f_pos) {
3714                 file->f_pos = offset;
3715                 file->f_version = 0;
3716         }
3717         return offset;
3718 }
3719
3720 static loff_t
3721 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3722                 loff_t maxsize, loff_t eof)
3723 {
3724         struct inode *inode = file_inode(file);
3725
3726         switch (origin) {
3727         case SEEK_END:
3728                 offset += eof;
3729                 break;
3730         case SEEK_CUR:
3731                 /*
3732                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3733                  * position-querying operation.  Avoid rewriting the "same"
3734                  * f_pos value back to the file because a concurrent read(),
3735                  * write() or lseek() might have altered it
3736                  */
3737                 if (offset == 0)
3738                         return file->f_pos;
3739                 /*
3740                  * f_lock protects against read/modify/write race with other
3741                  * SEEK_CURs. Note that parallel writes and reads behave
3742                  * like SEEK_SET.
3743                  */
3744                 inode_lock(inode);
3745                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3746                 inode_unlock(inode);
3747                 return offset;
3748         case SEEK_DATA:
3749                 /*
3750                  * In the generic case the entire file is data, so as long as
3751                  * offset isn't at the end of the file then the offset is data.
3752                  */
3753                 if (offset >= eof)
3754                         return -ENXIO;
3755                 break;
3756         case SEEK_HOLE:
3757                 /*
3758                  * There is a virtual hole at the end of the file, so as long as
3759                  * offset isn't i_size or larger, return i_size.
3760                  */
3761                 if (offset >= eof)
3762                         return -ENXIO;
3763                 offset = eof;
3764                 break;
3765         }
3766
3767         return llseek_execute(file, offset, maxsize);
3768 }
3769 #endif
3770
3771 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3772 {
3773         struct inode *inode = file_inode(file);
3774         loff_t retval, eof = 0;
3775
3776         ENTRY;
3777         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3778                            (origin == SEEK_CUR) ? file->f_pos : 0);
3779         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3780                PFID(ll_inode2fid(inode)), inode, retval, retval,
3781                origin);
3782         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3783
3784         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3785                 retval = ll_glimpse_size(inode);
3786                 if (retval != 0)
3787                         RETURN(retval);
3788                 eof = i_size_read(inode);
3789         }
3790
3791         retval = ll_generic_file_llseek_size(file, offset, origin,
3792                                           ll_file_maxbytes(inode), eof);
3793         RETURN(retval);
3794 }
3795
3796 static int ll_flush(struct file *file, fl_owner_t id)
3797 {
3798         struct inode *inode = file_inode(file);
3799         struct ll_inode_info *lli = ll_i2info(inode);
3800         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3801         int rc, err;
3802
3803         LASSERT(!S_ISDIR(inode->i_mode));
3804
3805         /* catch async errors that were recorded back when async writeback
3806          * failed for pages in this mapping. */
3807         rc = lli->lli_async_rc;
3808         lli->lli_async_rc = 0;
3809         if (lli->lli_clob != NULL) {
3810                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3811                 if (rc == 0)
3812                         rc = err;
3813         }
3814
3815         /* The application has been told write failure already.
3816          * Do not report failure again. */
3817         if (fd->fd_write_failed)
3818                 return 0;
3819         return rc ? -EIO : 0;
3820 }
3821
3822 /**
3823  * Called to make sure a portion of file has been written out.
3824  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3825  *
3826  * Return how many pages have been written.
3827  */
3828 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3829                        enum cl_fsync_mode mode, int ignore_layout)
3830 {
3831         struct lu_env *env;
3832         struct cl_io *io;
3833         struct cl_fsync_io *fio;
3834         int result;
3835         __u16 refcheck;
3836         ENTRY;
3837
3838         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3839             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3840                 RETURN(-EINVAL);
3841
3842         env = cl_env_get(&refcheck);
3843         if (IS_ERR(env))
3844                 RETURN(PTR_ERR(env));
3845
3846         io = vvp_env_thread_io(env);
3847         io->ci_obj = ll_i2info(inode)->lli_clob;
3848         io->ci_ignore_layout = ignore_layout;
3849
3850         /* initialize parameters for sync */
3851         fio = &io->u.ci_fsync;
3852         fio->fi_start = start;
3853         fio->fi_end = end;
3854         fio->fi_fid = ll_inode2fid(inode);
3855         fio->fi_mode = mode;
3856         fio->fi_nr_written = 0;
3857
3858         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3859                 result = cl_io_loop(env, io);
3860         else
3861                 result = io->ci_result;
3862         if (result == 0)
3863                 result = fio->fi_nr_written;
3864         cl_io_fini(env, io);
3865         cl_env_put(env, &refcheck);
3866
3867         RETURN(result);
3868 }
3869
3870 /*
3871  * When dentry is provided (the 'else' case), file_dentry() may be
3872  * null and dentry must be used directly rather than pulled from
3873  * file_dentry() as is done otherwise.
3874  */
3875
3876 #ifdef HAVE_FILE_FSYNC_4ARGS
3877 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3878 {
3879         struct dentry *dentry = file_dentry(file);
3880         bool lock_inode;
3881 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3882 int ll_fsync(struct file *file, int datasync)
3883 {
3884         struct dentry *dentry = file_dentry(file);
3885         loff_t start = 0;
3886         loff_t end = LLONG_MAX;
3887 #else
3888 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3889 {
3890         loff_t start = 0;
3891         loff_t end = LLONG_MAX;
3892 #endif
3893         struct inode *inode = dentry->d_inode;
3894         struct ll_inode_info *lli = ll_i2info(inode);
3895         struct ptlrpc_request *req;
3896         int rc, err;
3897         ENTRY;
3898
3899         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3900                PFID(ll_inode2fid(inode)), inode);
3901         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3902
3903 #ifdef HAVE_FILE_FSYNC_4ARGS
3904         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3905         lock_inode = !lli->lli_inode_locked;
3906         if (lock_inode)
3907                 inode_lock(inode);
3908 #else
3909         /* fsync's caller has already called _fdata{sync,write}, we want
3910          * that IO to finish before calling the osc and mdc sync methods */
3911         rc = filemap_fdatawait(inode->i_mapping);
3912 #endif
3913
3914         /* catch async errors that were recorded back when async writeback
3915          * failed for pages in this mapping. */
3916         if (!S_ISDIR(inode->i_mode)) {
3917                 err = lli->lli_async_rc;
3918                 lli->lli_async_rc = 0;
3919                 if (rc == 0)
3920                         rc = err;
3921                 if (lli->lli_clob != NULL) {
3922                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3923                         if (rc == 0)
3924                                 rc = err;
3925                 }
3926         }
3927
3928         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3929         if (!rc)
3930                 rc = err;
3931         if (!err)
3932                 ptlrpc_req_finished(req);
3933
3934         if (S_ISREG(inode->i_mode)) {
3935                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3936
3937                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3938                 if (rc == 0 && err < 0)
3939                         rc = err;
3940                 if (rc < 0)
3941                         fd->fd_write_failed = true;
3942                 else
3943                         fd->fd_write_failed = false;
3944         }
3945
3946 #ifdef HAVE_FILE_FSYNC_4ARGS
3947         if (lock_inode)
3948                 inode_unlock(inode);
3949 #endif
3950         RETURN(rc);
3951 }
3952
3953 static int
3954 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3955 {
3956         struct inode *inode = file_inode(file);
3957         struct ll_sb_info *sbi = ll_i2sbi(inode);
3958         struct ldlm_enqueue_info einfo = {
3959                 .ei_type        = LDLM_FLOCK,
3960                 .ei_cb_cp       = ldlm_flock_completion_ast,
3961                 .ei_cbdata      = file_lock,
3962         };
3963         struct md_op_data *op_data;
3964         struct lustre_handle lockh = { 0 };
3965         union ldlm_policy_data flock = { { 0 } };
3966         int fl_type = file_lock->fl_type;
3967         __u64 flags = 0;
3968         int rc;
3969         int rc2 = 0;
3970         ENTRY;
3971
3972         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3973                PFID(ll_inode2fid(inode)), file_lock);
3974
3975         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3976
3977         if (file_lock->fl_flags & FL_FLOCK) {
3978                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3979                 /* flocks are whole-file locks */
3980                 flock.l_flock.end = OFFSET_MAX;
3981                 /* For flocks owner is determined by the local file desctiptor*/
3982                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3983         } else if (file_lock->fl_flags & FL_POSIX) {
3984                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3985                 flock.l_flock.start = file_lock->fl_start;
3986                 flock.l_flock.end = file_lock->fl_end;
3987         } else {
3988                 RETURN(-EINVAL);
3989         }
3990         flock.l_flock.pid = file_lock->fl_pid;
3991
3992         /* Somewhat ugly workaround for svc lockd.
3993          * lockd installs custom fl_lmops->lm_compare_owner that checks
3994          * for the fl_owner to be the same (which it always is on local node
3995          * I guess between lockd processes) and then compares pid.
3996          * As such we assign pid to the owner field to make it all work,
3997          * conflict with normal locks is unlikely since pid space and
3998          * pointer space for current->files are not intersecting */
3999         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4000                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4001
4002         switch (fl_type) {
4003         case F_RDLCK:
4004                 einfo.ei_mode = LCK_PR;
4005                 break;
4006         case F_UNLCK:
4007                 /* An unlock request may or may not have any relation to
4008                  * existing locks so we may not be able to pass a lock handle
4009                  * via a normal ldlm_lock_cancel() request. The request may even
4010                  * unlock a byte range in the middle of an existing lock. In
4011                  * order to process an unlock request we need all of the same
4012                  * information that is given with a normal read or write record
4013                  * lock request. To avoid creating another ldlm unlock (cancel)
4014                  * message we'll treat a LCK_NL flock request as an unlock. */
4015                 einfo.ei_mode = LCK_NL;
4016                 break;
4017         case F_WRLCK:
4018                 einfo.ei_mode = LCK_PW;
4019                 break;
4020         default:
4021                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4022                 RETURN (-ENOTSUPP);
4023         }
4024
4025         switch (cmd) {
4026         case F_SETLKW:
4027 #ifdef F_SETLKW64
4028         case F_SETLKW64:
4029 #endif
4030                 flags = 0;
4031                 break;
4032         case F_SETLK:
4033 #ifdef F_SETLK64
4034         case F_SETLK64:
4035 #endif
4036                 flags = LDLM_FL_BLOCK_NOWAIT;
4037                 break;
4038         case F_GETLK:
4039 #ifdef F_GETLK64
4040         case F_GETLK64:
4041 #endif
4042                 flags = LDLM_FL_TEST_LOCK;
4043                 break;
4044         default:
4045                 CERROR("unknown fcntl lock command: %d\n", cmd);
4046                 RETURN (-EINVAL);
4047         }
4048
4049         /* Save the old mode so that if the mode in the lock changes we
4050          * can decrement the appropriate reader or writer refcount. */
4051         file_lock->fl_type = einfo.ei_mode;
4052
4053         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4054                                      LUSTRE_OPC_ANY, NULL);
4055         if (IS_ERR(op_data))
4056                 RETURN(PTR_ERR(op_data));
4057
4058         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4059                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4060                flock.l_flock.pid, flags, einfo.ei_mode,
4061                flock.l_flock.start, flock.l_flock.end);
4062
4063         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4064                         flags);
4065
4066         /* Restore the file lock type if not TEST lock. */
4067         if (!(flags & LDLM_FL_TEST_LOCK))
4068                 file_lock->fl_type = fl_type;
4069
4070 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4071         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4072             !(flags & LDLM_FL_TEST_LOCK))
4073                 rc2  = locks_lock_file_wait(file, file_lock);
4074 #else
4075         if ((file_lock->fl_flags & FL_FLOCK) &&
4076             (rc == 0 || file_lock->fl_type == F_UNLCK))
4077                 rc2  = flock_lock_file_wait(file, file_lock);
4078         if ((file_lock->fl_flags & FL_POSIX) &&
4079             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4080             !(flags & LDLM_FL_TEST_LOCK))
4081                 rc2  = posix_lock_file_wait(file, file_lock);
4082 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4083
4084         if (rc2 && file_lock->fl_type != F_UNLCK) {
4085                 einfo.ei_mode = LCK_NL;
4086                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4087                            &lockh, flags);
4088                 rc = rc2;
4089         }
4090
4091         ll_finish_md_op_data(op_data);
4092
4093         RETURN(rc);
4094 }
4095
4096 int ll_get_fid_by_name(struct inode *parent, const char *name,
4097                        int namelen, struct lu_fid *fid,
4098                        struct inode **inode)
4099 {
4100         struct md_op_data       *op_data = NULL;
4101         struct mdt_body         *body;
4102         struct ptlrpc_request   *req;
4103         int                     rc;
4104         ENTRY;
4105
4106         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4107                                      LUSTRE_OPC_ANY, NULL);
4108         if (IS_ERR(op_data))
4109                 RETURN(PTR_ERR(op_data));
4110
4111         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4112         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4113         ll_finish_md_op_data(op_data);
4114         if (rc < 0)
4115                 RETURN(rc);
4116
4117         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4118         if (body == NULL)
4119                 GOTO(out_req, rc = -EFAULT);
4120         if (fid != NULL)
4121                 *fid = body->mbo_fid1;
4122
4123         if (inode != NULL)
4124                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4125 out_req:
4126         ptlrpc_req_finished(req);
4127         RETURN(rc);
4128 }
4129
/**
 * Migrate the entry \a name of directory \a parent as described by \a lum.
 *
 * The child inode is taken from the dcache when d_lookup() finds it and is
 * otherwise fetched from the MDS with ll_get_fid_by_name(). The request
 * itself is sent with md_rename() using the same name as source and target,
 * with CLI_MIGRATE | CLI_SET_MEA set in the op_data client flags. For a
 * regular file a write lease is opened first and the file's data version is
 * passed along with the MDS_CLOSE_MIGRATE bias.
 *
 * \param[in] parent	directory inode that contains \a name
 * \param[in] file	open file whose dentry is used for the dcache lookup
 * \param[in] lum	user-supplied LMV layout; byte-swapped in place when
 *			its magic matches neither expected value
 * \param[in] name	NUL-terminated name of the entry to migrate
 *
 * \retval 0 on success (the child's nlink is then cleared)
 * \retval -ENOENT if no child inode could be obtained
 * \retval -EINVAL if the child is the filesystem root or its FID is insane
 * \retval other negative errno from the helpers called below
 */
4130 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4131                const char *name)
4132 {
4133         struct dentry *dchild = NULL;
4134         struct inode *child_inode = NULL;
4135         struct md_op_data *op_data;
4136         struct ptlrpc_request *request = NULL;
4137         struct obd_client_handle *och = NULL;
4138         struct qstr qstr;
4139         struct mdt_body *body;
4140         __u64 data_version = 0;
4141         size_t namelen = strlen(name);
             /* NOTE(review): lumlen is computed here, before the conditional
              * lustre_swab_lmv_user_md() below, so it is derived from the
              * fields as they were passed in - confirm this is intended. */
4142         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4143         int rc;
4144         ENTRY;
4145
4146         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4147                PFID(ll_inode2fid(parent)), name,
4148                lum->lum_stripe_offset, lum->lum_stripe_count);
4149
             /* swab the layout in place if its magic is neither of the two
              * expected little-endian values */
4150         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4151             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4152                 lustre_swab_lmv_user_md(lum);
4153
4154         /* Get child FID first */
4155         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4156         qstr.name = name;
4157         qstr.len = namelen;
4158         dchild = d_lookup(file_dentry(file), &qstr);
4159         if (dchild) {
                     /* igrab() takes the inode reference that is dropped by
                      * iput() at out_iput */
4160                 if (dchild->d_inode)
4161                         child_inode = igrab(dchild->d_inode);
4162                 dput(dchild);
4163         }
4164
             /* not available from the dcache: ask the MDS */
4165         if (!child_inode) {
4166                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4167                                         &child_inode);
4168                 if (rc)
4169                         RETURN(rc);
4170         }
4171
4172         if (!child_inode)
4173                 RETURN(-ENOENT);
4174
4175         /*
4176          * lfs migrate command needs to be blocked on the client
4177          * by checking the migrate FID against the FID of the
4178          * filesystem root.
4179          */
4180         if (child_inode == parent->i_sb->s_root->d_inode)
4181                 GOTO(out_iput, rc = -EINVAL);
4182
4183         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4184                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4185         if (IS_ERR(op_data))
4186                 GOTO(out_iput, rc = PTR_ERR(op_data));
4187
             /* the child stays locked until out_unlock */
4188         inode_lock(child_inode);
4189         op_data->op_fid3 = *ll_inode2fid(child_inode);
4190         if (!fid_is_sane(&op_data->op_fid3)) {
4191                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4192                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4193                        PFID(&op_data->op_fid3));
4194                 GOTO(out_unlock, rc = -EINVAL);
4195         }
4196
4197         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4198         op_data->op_data = lum;
4199         op_data->op_data_size = lumlen;
4200
4201 again:
             /* regular files: open a write lease and send the data version
              * and both handles along with the request */
4202         if (S_ISREG(child_inode->i_mode)) {
4203                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4204                 if (IS_ERR(och)) {
4205                         rc = PTR_ERR(och);
                             /* reset so the cleanup path never sees an
                              * error pointer */
4206                         och = NULL;
4207                         GOTO(out_unlock, rc);
4208                 }
4209
4210                 rc = ll_data_version(child_inode, &data_version,
4211                                      LL_DV_WR_FLUSH);
4212                 if (rc != 0)
4213                         GOTO(out_close, rc);
4214
4215                 op_data->op_handle = och->och_fh;
4216                 op_data->op_data_version = data_version;
4217                 op_data->op_lease_handle = och->och_lease_handle;
4218                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4219
                     /* clear the replay flag of the lease open request,
                      * under that request's own lock */
4220                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4221                 och->och_mod->mod_open_req->rq_replay = 0;
4222                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4223         }
4224
             /* same name on both sides: the rename request carries the
              * migrate operation flagged in op_cli_flags above */
4225         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4226                        name, namelen, &request);
4227         if (rc == 0) {
4228                 LASSERT(request != NULL);
4229                 ll_update_times(request, parent);
4230
4231                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4232                 LASSERT(body != NULL);
4233
4234                 /* If the server does release layout lock, then we cleanup
4235                  * the client och here, otherwise release it in out_close: */
4236                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4237                         obd_mod_put(och->och_mod);
4238                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4239                                                   och);
4240                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4241                         OBD_FREE_PTR(och);
4242                         och = NULL;
4243                 }
4244         }
4245
4246         if (request != NULL) {
4247                 ptlrpc_req_finished(request);
4248                 request = NULL;
4249         }
4250
4251         /* Try again if the file layout has changed. */
             /* NOTE(review): on this retry och is still set (it is only
              * released above when rc == 0) and is overwritten by the next
              * ll_lease_open() - looks like the previous handle is never
              * closed; confirm against the server's -EAGAIN semantics. */
4252         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4253                 goto again;
4254
4255 out_close:
4256         if (och)
4257                 ll_lease_close(och, child_inode, NULL);
             /* on success the local child inode gets its link count zeroed */
4258         if (!rc)
4259                 clear_nlink(child_inode);
4260 out_unlock:
4261         inode_unlock(child_inode);
4262         ll_finish_md_op_data(op_data);
4263 out_iput:
4264         iput(child_inode);
4265         RETURN(rc);
4266 }
4267
4268 static int
4269 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4270 {
4271         ENTRY;
4272
4273         RETURN(-ENOSYS);
4274 }
4275
4276 /**
4277  * test if some locks matching bits and l_req_mode are acquired
4278  * - bits can be in different locks
4279  * - if found clear the common lock bits in *bits
4280  * - the bits not found, are kept in *bits
4281  * \param inode [IN]
4282  * \param bits [IN] searched lock bits [IN]
4283  * \param l_req_mode [IN] searched lock mode
4284  * \retval boolean, true iff all bits are found
4285  */
4286 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4287 {
4288         struct lustre_handle lockh;
4289         union ldlm_policy_data policy;
4290         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4291                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4292         struct lu_fid *fid;
4293         __u64 flags;
4294         int i;
4295         ENTRY;
4296
4297         if (!inode)
4298                RETURN(0);
4299
4300         fid = &ll_i2info(inode)->lli_fid;
4301         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4302                ldlm_lockname[mode]);
4303
4304         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4305         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4306                 policy.l_inodebits.bits = *bits & (1 << i);
4307                 if (policy.l_inodebits.bits == 0)
4308                         continue;
4309
4310                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4311                                   &policy, mode, &lockh)) {
4312                         struct ldlm_lock *lock;
4313
4314                         lock = ldlm_handle2lock(&lockh);
4315                         if (lock) {
4316                                 *bits &=
4317                                       ~(lock->l_policy_data.l_inodebits.bits);
4318                                 LDLM_LOCK_PUT(lock);
4319                         } else {
4320                                 *bits &= ~policy.l_inodebits.bits;
4321                         }
4322                 }
4323         }
4324         RETURN(*bits == 0);
4325 }
4326
4327 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4328                                struct lustre_handle *lockh, __u64 flags,
4329                                enum ldlm_mode mode)
4330 {
4331         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4332         struct lu_fid *fid;
4333         enum ldlm_mode rc;
4334         ENTRY;
4335
4336         fid = &ll_i2info(inode)->lli_fid;
4337         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4338
4339         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4340                            fid, LDLM_IBITS, &policy, mode, lockh);
4341
4342         RETURN(rc);
4343 }
4344
4345 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4346 {
4347         /* Already unlinked. Just update nlink and return success */
4348         if (rc == -ENOENT) {
4349                 clear_nlink(inode);
4350                 /* If it is striped directory, and there is bad stripe
4351                  * Let's revalidate the dentry again, instead of returning
4352                  * error */
4353                 if (S_ISDIR(inode->i_mode) &&
4354                     ll_i2info(inode)->lli_lsm_md != NULL)
4355                         return 0;
4356
4357                 /* This path cannot be hit for regular files unless in
4358                  * case of obscure races, so no need to to validate
4359                  * size. */
4360                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4361                         return 0;
4362         } else if (rc != 0) {
4363                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4364                              "%s: revalidate FID "DFID" error: rc = %d\n",
4365                              ll_get_fsname(inode->i_sb, NULL, 0),
4366                              PFID(ll_inode2fid(inode)), rc);
4367         }
4368
4369         return rc;
4370 }
4371
4372 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4373 {
4374         struct inode *inode = dentry->d_inode;
4375         struct obd_export *exp = ll_i2mdexp(inode);
4376         struct lookup_intent oit = {
4377                 .it_op = op,
4378         };
4379         struct ptlrpc_request *req = NULL;
4380         struct md_op_data *op_data;
4381         int rc = 0;
4382         ENTRY;
4383
4384         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4385                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4386
4387         /* Call getattr by fid, so do not provide name at all. */
4388         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4389                                      LUSTRE_OPC_ANY, NULL);
4390         if (IS_ERR(op_data))
4391                 RETURN(PTR_ERR(op_data));
4392
4393         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4394         ll_finish_md_op_data(op_data);
4395         if (rc < 0) {
4396                 rc = ll_inode_revalidate_fini(inode, rc);
4397                 GOTO(out, rc);
4398         }
4399
4400         rc = ll_revalidate_it_finish(req, &oit, dentry);
4401         if (rc != 0) {
4402                 ll_intent_release(&oit);
4403                 GOTO(out, rc);
4404         }
4405
4406         /* Unlinked? Unhash dentry, so it is not picked up later by
4407          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4408          * here to preserve get_cwd functionality on 2.6.
4409          * Bug 10503 */
4410         if (!dentry->d_inode->i_nlink) {
4411                 ll_lock_dcache(inode);
4412                 d_lustre_invalidate(dentry, 0);
4413                 ll_unlock_dcache(inode);
4414         }
4415
4416         ll_lookup_finish_locks(&oit, dentry);
4417 out:
4418         ptlrpc_req_finished(req);
4419
4420         return rc;
4421 }
4422
4423 static int ll_merge_md_attr(struct inode *inode)
4424 {
4425         struct cl_attr attr = { 0 };
4426         int rc;
4427
4428         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4429         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4430                            &attr, ll_md_blocking_ast);
4431         if (rc != 0)
4432                 RETURN(rc);
4433
4434         set_nlink(inode, attr.cat_nlink);
4435         inode->i_blocks = attr.cat_blocks;
4436         i_size_write(inode, attr.cat_size);
4437
4438         ll_i2info(inode)->lli_atime = attr.cat_atime;
4439         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4440         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4441
4442         RETURN(0);
4443 }
4444
4445 static inline dev_t ll_compat_encode_dev(dev_t dev)
4446 {
4447         /* The compat_sys_*stat*() syscalls will fail unless the
4448          * device majors and minors are both less than 256. Note that
4449          * the value returned here will be passed through
4450          * old_encode_dev() in cp_compat_stat(). And so we are not
4451          * trying to return a valid compat (u16) device number, just
4452          * one that will pass the old_valid_dev() check. */
4453
4454         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4455 }
4456
4457 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4458 int ll_getattr(const struct path *path, struct kstat *stat,
4459                u32 request_mask, unsigned int flags)
4460 {
4461         struct dentry *de = path->dentry;
4462 #else
4463 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4464 {
4465 #endif
4466         struct inode *inode = de->d_inode;
4467         struct ll_sb_info *sbi = ll_i2sbi(inode);
4468         struct ll_inode_info *lli = ll_i2info(inode);
4469         int rc;
4470
4471         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4472
4473         rc = ll_inode_revalidate(de, IT_GETATTR);
4474         if (rc < 0)
4475                 RETURN(rc);
4476
4477         if (S_ISREG(inode->i_mode)) {
4478                 /* In case of restore, the MDT has the right size and has
4479                  * already send it back without granting the layout lock,
4480                  * inode is up-to-date so glimpse is useless.
4481                  * Also to glimpse we need the layout, in case of a running
4482                  * restore the MDT holds the layout lock so the glimpse will
4483                  * block up to the end of restore (getattr will block)
4484                  */
4485                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4486                         rc = ll_glimpse_size(inode);
4487                         if (rc < 0)
4488                                 RETURN(rc);
4489                 }
4490         } else {
4491                 /* If object isn't regular a file then don't validate size. */
4492                 if (S_ISDIR(inode->i_mode) &&
4493                     lli->lli_lsm_md != NULL) {
4494                         rc = ll_merge_md_attr(inode);
4495                         if (rc < 0)
4496                                 RETURN(rc);
4497                 }
4498
4499                 LTIME_S(inode->i_atime) = lli->lli_atime;
4500                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4501                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4502         }
4503
4504         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4505
4506         if (ll_need_32bit_api(sbi)) {
4507                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4508                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4509                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4510         } else {
4511                 stat->ino = inode->i_ino;
4512                 stat->dev = inode->i_sb->s_dev;
4513                 stat->rdev = inode->i_rdev;
4514         }
4515
4516         stat->mode = inode->i_mode;
4517         stat->uid = inode->i_uid;
4518         stat->gid = inode->i_gid;
4519         stat->atime = inode->i_atime;
4520         stat->mtime = inode->i_mtime;
4521         stat->ctime = inode->i_ctime;
4522         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4523
4524         stat->nlink = inode->i_nlink;
4525         stat->size = i_size_read(inode);
4526         stat->blocks = inode->i_blocks;
4527
4528         return 0;
4529 }
4530
4531 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4532                      __u64 start, __u64 len)
4533 {
4534         int             rc;
4535         size_t          num_bytes;
4536         struct fiemap   *fiemap;
4537         unsigned int    extent_count = fieinfo->fi_extents_max;
4538
4539         num_bytes = sizeof(*fiemap) + (extent_count *
4540                                        sizeof(struct fiemap_extent));
4541         OBD_ALLOC_LARGE(fiemap, num_bytes);
4542
4543         if (fiemap == NULL)
4544                 RETURN(-ENOMEM);
4545
4546         fiemap->fm_flags = fieinfo->fi_flags;
4547         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4548         fiemap->fm_start = start;
4549         fiemap->fm_length = len;
4550         if (extent_count > 0 &&
4551             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4552                            sizeof(struct fiemap_extent)) != 0)
4553                 GOTO(out, rc = -EFAULT);
4554
4555         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4556
4557         fieinfo->fi_flags = fiemap->fm_flags;
4558         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4559         if (extent_count > 0 &&
4560             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4561                          fiemap->fm_mapped_extents *
4562                          sizeof(struct fiemap_extent)) != 0)
4563                 GOTO(out, rc = -EFAULT);
4564 out:
4565         OBD_FREE_LARGE(fiemap, num_bytes);
4566         return rc;
4567 }
4568
4569 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4570 {
4571         struct ll_inode_info *lli = ll_i2info(inode);
4572         struct posix_acl *acl = NULL;
4573         ENTRY;
4574
4575         spin_lock(&lli->lli_lock);
4576         /* VFS' acl_permission_check->check_acl will release the refcount */
4577         acl = posix_acl_dup(lli->lli_posix_acl);
4578         spin_unlock(&lli->lli_lock);
4579
4580         RETURN(acl);
4581 }
4582
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/*
 * ->set_acl() handler: store \a acl as the extended attribute that
 * corresponds to \a type, or remove that attribute when there is no ACL
 * to store.
 *
 * \retval 0 on success; the ACL is then put into the inode's ACL cache
 * \retval -EINVAL for an unknown \a type
 * \retval -EACCES when a default ACL is set on a non-directory
 * \retval -ENOMEM when the xattr buffer cannot be allocated
 * \retval other negative errno from the helpers; except for the early
 *	   validation failures, the cached ACL is forgotten on error
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req = NULL;
	const char *xattr_name = NULL;
	char *xattr_buf = NULL;
	size_t xattr_len = 0;
	int rc = 0;
	ENTRY;

	if (type == ACL_TYPE_ACCESS) {
		xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
		/* may update i_mode and the acl pointer itself */
		if (acl != NULL)
			rc = posix_acl_update_mode(inode, &inode->i_mode,
						   &acl);
	} else if (type == ACL_TYPE_DEFAULT) {
		xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
		/* default ACLs only make sense on directories */
		if (!S_ISDIR(inode->i_mode) && acl != NULL)
			rc = -EACCES;
	} else {
		rc = -EINVAL;
	}
	if (rc != 0)
		return rc;

	if (acl != NULL) {
		xattr_len = posix_acl_xattr_size(acl->a_count);
		xattr_buf = kmalloc(xattr_len, GFP_NOFS);
		if (xattr_buf == NULL)
			GOTO(out, rc = -ENOMEM);

		rc = posix_acl_to_xattr(&init_user_ns, acl, xattr_buf,
					xattr_len);
		if (rc < 0)
			GOTO(out_value, rc);
	}

	/* no buffer means there is nothing to store: remove the xattr */
	rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
			 xattr_buf != NULL ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
			 xattr_name, xattr_buf, xattr_len, 0, 0, &req);

	ptlrpc_req_finished(req);
out_value:
	kfree(xattr_buf);
out:
	if (rc != 0)
		forget_cached_acl(inode, type);
	else
		set_cached_acl(inode, type, acl);
	RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4642
4643 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4644 static int
4645 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4646 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4647 # else
4648 ll_check_acl(struct inode *inode, int mask)
4649 # endif
4650 {
4651 # ifdef CONFIG_FS_POSIX_ACL
4652         struct posix_acl *acl;
4653         int rc;
4654         ENTRY;
4655
4656 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4657         if (flags & IPERM_FLAG_RCU)
4658                 return -ECHILD;
4659 #  endif
4660         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4661
4662         if (!acl)
4663                 RETURN(-EAGAIN);
4664
4665         rc = posix_acl_permission(inode, acl, mask);
4666         posix_acl_release(acl);
4667
4668         RETURN(rc);
4669 # else /* !CONFIG_FS_POSIX_ACL */
4670         return -EAGAIN;
4671 # endif /* CONFIG_FS_POSIX_ACL */
4672 }
4673 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4674
4675 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4676 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4677 #else
4678 # ifdef HAVE_INODE_PERMISION_2ARGS
4679 int ll_inode_permission(struct inode *inode, int mask)
4680 # else
4681 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4682 # endif
4683 #endif
4684 {
4685         int rc = 0;
4686         struct ll_sb_info *sbi;
4687         struct root_squash_info *squash;
4688         struct cred *cred = NULL;
4689         const struct cred *old_cred = NULL;
4690         cfs_cap_t cap;
4691         bool squash_id = false;
4692         ENTRY;
4693
4694 #ifdef MAY_NOT_BLOCK
4695         if (mask & MAY_NOT_BLOCK)
4696                 return -ECHILD;
4697 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4698         if (flags & IPERM_FLAG_RCU)
4699                 return -ECHILD;
4700 #endif
4701
4702        /* as root inode are NOT getting validated in lookup operation,
4703         * need to do it before permission check. */
4704
4705         if (inode == inode->i_sb->s_root->d_inode) {
4706                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4707                 if (rc)
4708                         RETURN(rc);
4709         }
4710
4711         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4712                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4713
4714         /* squash fsuid/fsgid if needed */
4715         sbi = ll_i2sbi(inode);
4716         squash = &sbi->ll_squash;
4717         if (unlikely(squash->rsi_uid != 0 &&
4718                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4719                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4720                         squash_id = true;
4721         }
4722         if (squash_id) {
4723                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4724                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4725                        squash->rsi_uid, squash->rsi_gid);
4726
4727                 /* update current process's credentials
4728                  * and FS capability */
4729                 cred = prepare_creds();
4730                 if (cred == NULL)
4731                         RETURN(-ENOMEM);
4732
4733                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4734                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4735                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4736                         if ((1 << cap) & CFS_CAP_FS_MASK)
4737                                 cap_lower(cred->cap_effective, cap);
4738                 }
4739                 old_cred = override_creds(cred);
4740         }
4741
4742         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4743         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4744         /* restore current process's credentials and FS capability */
4745         if (squash_id) {
4746                 revert_creds(old_cred);
4747                 put_cred(cred);
4748         }
4749
4750         RETURN(rc);
4751 }
4752
4753 /* -o localflock - only provides locally consistent flock locks */
4754 struct file_operations ll_file_operations = {
4755 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4756 # ifdef HAVE_SYNC_READ_WRITE
4757         .read           = new_sync_read,
4758         .write          = new_sync_write,
4759 # endif
4760         .read_iter      = ll_file_read_iter,
4761         .write_iter     = ll_file_write_iter,
4762 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4763         .read           = ll_file_read,
4764         .aio_read       = ll_file_aio_read,
4765         .write          = ll_file_write,
4766         .aio_write      = ll_file_aio_write,
4767 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4768         .unlocked_ioctl = ll_file_ioctl,
4769         .open           = ll_file_open,
4770         .release        = ll_file_release,
4771         .mmap           = ll_file_mmap,
4772         .llseek         = ll_file_seek,
4773         .splice_read    = ll_file_splice_read,
4774         .fsync          = ll_fsync,
4775         .flush          = ll_flush
4776 };
4777
4778 struct file_operations ll_file_operations_flock = {
4779 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4780 # ifdef HAVE_SYNC_READ_WRITE
4781         .read           = new_sync_read,
4782         .write          = new_sync_write,
4783 # endif /* HAVE_SYNC_READ_WRITE */
4784         .read_iter      = ll_file_read_iter,
4785         .write_iter     = ll_file_write_iter,
4786 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4787         .read           = ll_file_read,
4788         .aio_read       = ll_file_aio_read,
4789         .write          = ll_file_write,
4790         .aio_write      = ll_file_aio_write,
4791 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4792         .unlocked_ioctl = ll_file_ioctl,
4793         .open           = ll_file_open,
4794         .release        = ll_file_release,
4795         .mmap           = ll_file_mmap,
4796         .llseek         = ll_file_seek,
4797         .splice_read    = ll_file_splice_read,
4798         .fsync          = ll_fsync,
4799         .flush          = ll_flush,
4800         .flock          = ll_file_flock,
4801         .lock           = ll_file_flock
4802 };
4803
4804 /* These are for -o noflock - to return ENOSYS on flock calls */
4805 struct file_operations ll_file_operations_noflock = {
4806 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4807 # ifdef HAVE_SYNC_READ_WRITE
4808         .read           = new_sync_read,
4809         .write          = new_sync_write,
4810 # endif /* HAVE_SYNC_READ_WRITE */
4811         .read_iter      = ll_file_read_iter,
4812         .write_iter     = ll_file_write_iter,
4813 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4814         .read           = ll_file_read,
4815         .aio_read       = ll_file_aio_read,
4816         .write          = ll_file_write,
4817         .aio_write      = ll_file_aio_write,
4818 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4819         .unlocked_ioctl = ll_file_ioctl,
4820         .open           = ll_file_open,
4821         .release        = ll_file_release,
4822         .mmap           = ll_file_mmap,
4823         .llseek         = ll_file_seek,
4824         .splice_read    = ll_file_splice_read,
4825         .fsync          = ll_fsync,
4826         .flush          = ll_flush,
4827         .flock          = ll_file_noflock,
4828         .lock           = ll_file_noflock
4829 };
4830
4831 struct inode_operations ll_file_inode_operations = {
4832         .setattr        = ll_setattr,
4833         .getattr        = ll_getattr,
4834         .permission     = ll_inode_permission,
4835 #ifdef HAVE_IOP_XATTR
4836         .setxattr       = ll_setxattr,
4837         .getxattr       = ll_getxattr,
4838         .removexattr    = ll_removexattr,
4839 #endif
4840         .listxattr      = ll_listxattr,
4841         .fiemap         = ll_fiemap,
4842 #ifdef HAVE_IOP_GET_ACL
4843         .get_acl        = ll_get_acl,
4844 #endif
4845 #ifdef HAVE_IOP_SET_ACL
4846         .set_acl        = ll_set_acl,
4847 #endif
4848 };
4849
4850 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4851 {
4852         struct ll_inode_info *lli = ll_i2info(inode);
4853         struct cl_object *obj = lli->lli_clob;
4854         struct lu_env *env;
4855         int rc;
4856         __u16 refcheck;
4857         ENTRY;
4858
4859         if (obj == NULL)
4860                 RETURN(0);
4861
4862         env = cl_env_get(&refcheck);
4863         if (IS_ERR(env))
4864                 RETURN(PTR_ERR(env));
4865
4866         rc = cl_conf_set(env, lli->lli_clob, conf);
4867         if (rc < 0)
4868                 GOTO(out, rc);
4869
4870         if (conf->coc_opc == OBJECT_CONF_SET) {
4871                 struct ldlm_lock *lock = conf->coc_lock;
4872                 struct cl_layout cl = {
4873                         .cl_layout_gen = 0,
4874                 };
4875
4876                 LASSERT(lock != NULL);
4877                 LASSERT(ldlm_has_layout(lock));
4878
4879                 /* it can only be allowed to match after layout is
4880                  * applied to inode otherwise false layout would be
4881                  * seen. Applying layout shoud happen before dropping
4882                  * the intent lock. */
4883                 ldlm_lock_allow_match(lock);
4884
4885                 rc = cl_object_layout_get(env, obj, &cl);
4886                 if (rc < 0)
4887                         GOTO(out, rc);
4888
4889                 CDEBUG(D_VFSTRACE,
4890                        DFID": layout version change: %u -> %u\n",
4891                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4892                        cl.cl_layout_gen);
4893                 ll_layout_version_set(lli, cl.cl_layout_gen);
4894         }
4895
4896 out:
4897         cl_env_put(env, &refcheck);
4898
4899         RETURN(rc);
4900 }
4901
4902 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4903 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4904
4905 {
4906         struct ll_sb_info *sbi = ll_i2sbi(inode);
4907         struct ptlrpc_request *req;
4908         struct mdt_body *body;
4909         void *lvbdata;
4910         void *lmm;
4911         int lmmsize;
4912         int rc;
4913         ENTRY;
4914
4915         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4916                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4917                lock->l_lvb_data, lock->l_lvb_len);
4918
4919         if (lock->l_lvb_data != NULL)
4920                 RETURN(0);
4921
4922         /* if layout lock was granted right away, the layout is returned
4923          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4924          * blocked and then granted via completion ast, we have to fetch
4925          * layout here. Please note that we can't use the LVB buffer in
4926          * completion AST because it doesn't have a large enough buffer */
4927         rc = ll_get_default_mdsize(sbi, &lmmsize);
4928         if (rc == 0)
4929                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4930                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4931         if (rc < 0)
4932                 RETURN(rc);
4933
4934         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4935         if (body == NULL)
4936                 GOTO(out, rc = -EPROTO);
4937
4938         lmmsize = body->mbo_eadatasize;
4939         if (lmmsize == 0) /* empty layout */
4940                 GOTO(out, rc = 0);
4941
4942         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4943         if (lmm == NULL)
4944                 GOTO(out, rc = -EFAULT);
4945
4946         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4947         if (lvbdata == NULL)
4948                 GOTO(out, rc = -ENOMEM);
4949
4950         memcpy(lvbdata, lmm, lmmsize);
4951         lock_res_and_lock(lock);
4952         if (unlikely(lock->l_lvb_data == NULL)) {
4953                 lock->l_lvb_type = LVB_T_LAYOUT;
4954                 lock->l_lvb_data = lvbdata;
4955                 lock->l_lvb_len = lmmsize;
4956                 lvbdata = NULL;
4957         }
4958         unlock_res_and_lock(lock);
4959
4960         if (lvbdata)
4961                 OBD_FREE_LARGE(lvbdata, lmmsize);
4962
4963         EXIT;
4964
4965 out:
4966         ptlrpc_req_finished(req);
4967         return rc;
4968 }
4969
4970 /**
4971  * Apply the layout to the inode. Layout lock is held and will be released
4972  * in this function.
4973  */
4974 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4975                               struct inode *inode)
4976 {
4977         struct ll_inode_info *lli = ll_i2info(inode);
4978         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4979         struct ldlm_lock *lock;
4980         struct cl_object_conf conf;
4981         int rc = 0;
4982         bool lvb_ready;
4983         bool wait_layout = false;
4984         ENTRY;
4985
4986         LASSERT(lustre_handle_is_used(lockh));
4987
4988         lock = ldlm_handle2lock(lockh);
4989         LASSERT(lock != NULL);
4990         LASSERT(ldlm_has_layout(lock));
4991
4992         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4993                    PFID(&lli->lli_fid), inode);
4994
4995         /* in case this is a caching lock and reinstate with new inode */
4996         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4997
4998         lock_res_and_lock(lock);
4999         lvb_ready = ldlm_is_lvb_ready(lock);
5000         unlock_res_and_lock(lock);
5001
5002         /* checking lvb_ready is racy but this is okay. The worst case is
5003          * that multi processes may configure the file on the same time. */
5004         if (lvb_ready)
5005                 GOTO(out, rc = 0);
5006
5007         rc = ll_layout_fetch(inode, lock);
5008         if (rc < 0)
5009                 GOTO(out, rc);
5010
5011         /* for layout lock, lmm is stored in lock's lvb.
5012          * lvb_data is immutable if the lock is held so it's safe to access it
5013          * without res lock.
5014          *
5015          * set layout to file. Unlikely this will fail as old layout was
5016          * surely eliminated */
5017         memset(&conf, 0, sizeof conf);
5018         conf.coc_opc = OBJECT_CONF_SET;
5019         conf.coc_inode = inode;
5020         conf.coc_lock = lock;
5021         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5022         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5023         rc = ll_layout_conf(inode, &conf);
5024
5025         /* refresh layout failed, need to wait */
5026         wait_layout = rc == -EBUSY;
5027         EXIT;
5028 out:
5029         LDLM_LOCK_PUT(lock);
5030         ldlm_lock_decref(lockh, mode);
5031
5032         /* wait for IO to complete if it's still being used. */
5033         if (wait_layout) {
5034                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5035                        ll_get_fsname(inode->i_sb, NULL, 0),
5036                        PFID(&lli->lli_fid), inode);
5037
5038                 memset(&conf, 0, sizeof conf);
5039                 conf.coc_opc = OBJECT_CONF_WAIT;
5040                 conf.coc_inode = inode;
5041                 rc = ll_layout_conf(inode, &conf);
5042                 if (rc == 0)
5043                         rc = -EAGAIN;
5044
5045                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5046                        ll_get_fsname(inode->i_sb, NULL, 0),
5047                        PFID(&lli->lli_fid), rc);
5048         }
5049         RETURN(rc);
5050 }
5051
5052 /**
5053  * Issue layout intent RPC to MDS.
5054  * \param inode [in]    file inode
5055  * \param intent [in]   layout intent
5056  *
5057  * \retval 0    on success
5058  * \retval < 0  error code
5059  */
5060 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5061 {
5062         struct ll_inode_info  *lli = ll_i2info(inode);
5063         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5064         struct md_op_data     *op_data;
5065         struct lookup_intent it;
5066         struct ptlrpc_request *req;
5067         int rc;
5068         ENTRY;
5069
5070         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5071                                      0, 0, LUSTRE_OPC_ANY, NULL);
5072         if (IS_ERR(op_data))
5073                 RETURN(PTR_ERR(op_data));
5074
5075         op_data->op_data = intent;
5076         op_data->op_data_size = sizeof(*intent);
5077
5078         memset(&it, 0, sizeof(it));
5079         it.it_op = IT_LAYOUT;
5080         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5081             intent->li_opc == LAYOUT_INTENT_TRUNC)
5082                 it.it_flags = FMODE_WRITE;
5083
5084         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5085                           ll_get_fsname(inode->i_sb, NULL, 0),
5086                           PFID(&lli->lli_fid), inode);
5087
5088         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5089                             &ll_md_blocking_ast, 0);
5090         if (it.it_request != NULL)
5091                 ptlrpc_req_finished(it.it_request);
5092         it.it_request = NULL;
5093
5094         ll_finish_md_op_data(op_data);
5095
5096         /* set lock data in case this is a new lock */
5097         if (!rc)
5098                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5099
5100         ll_intent_drop_lock(&it);
5101
5102         RETURN(rc);
5103 }
5104
5105 /**
5106  * This function checks if there exists a LAYOUT lock on the client side,
5107  * or enqueues it if it doesn't have one in cache.
5108  *
5109  * This function will not hold layout lock so it may be revoked any time after
5110  * this function returns. Any operations depend on layout should be redone
5111  * in that case.
5112  *
5113  * This function should be called before lov_io_init() to get an uptodate
5114  * layout version, the caller should save the version number and after IO
5115  * is finished, this function should be called again to verify that layout
5116  * is not changed during IO time.
5117  */
5118 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5119 {
5120         struct ll_inode_info    *lli = ll_i2info(inode);
5121         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5122         struct lustre_handle lockh;
5123         struct layout_intent intent = {
5124                 .li_opc = LAYOUT_INTENT_ACCESS,
5125         };
5126         enum ldlm_mode mode;
5127         int rc;
5128         ENTRY;
5129
5130         *gen = ll_layout_version_get(lli);
5131         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5132                 RETURN(0);
5133
5134         /* sanity checks */
5135         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5136         LASSERT(S_ISREG(inode->i_mode));
5137
5138         /* take layout lock mutex to enqueue layout lock exclusively. */
5139         mutex_lock(&lli->lli_layout_mutex);
5140
5141         while (1) {
5142                 /* mostly layout lock is caching on the local side, so try to
5143                  * match it before grabbing layout lock mutex. */
5144                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5145                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5146                 if (mode != 0) { /* hit cached lock */
5147                         rc = ll_layout_lock_set(&lockh, mode, inode);
5148                         if (rc == -EAGAIN)
5149                                 continue;
5150                         break;
5151                 }
5152
5153                 rc = ll_layout_intent(inode, &intent);
5154                 if (rc != 0)
5155                         break;
5156         }
5157
5158         if (rc == 0)
5159                 *gen = ll_layout_version_get(lli);
5160         mutex_unlock(&lli->lli_layout_mutex);
5161
5162         RETURN(rc);
5163 }
5164
5165 /**
5166  * Issue layout intent RPC indicating where in a file an IO is about to write.
5167  *
5168  * \param[in] inode     file inode.
5169  * \param[in] ext       write range with start offset of fille in bytes where
5170  *                      an IO is about to write, and exclusive end offset in
5171  *                      bytes.
5172  *
5173  * \retval 0    on success
5174  * \retval < 0  error code
5175  */
5176 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5177                            struct lu_extent *ext)
5178 {
5179         struct layout_intent intent = {
5180                 .li_opc = opc,
5181                 .li_extent.e_start = ext->e_start,
5182                 .li_extent.e_end = ext->e_end,
5183         };
5184         int rc;
5185         ENTRY;
5186
5187         rc = ll_layout_intent(inode, &intent);
5188
5189         RETURN(rc);
5190 }
5191
5192 /**
5193  *  This function send a restore request to the MDT
5194  */
5195 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5196 {
5197         struct hsm_user_request *hur;
5198         int                      len, rc;
5199         ENTRY;
5200
5201         len = sizeof(struct hsm_user_request) +
5202               sizeof(struct hsm_user_item);
5203         OBD_ALLOC(hur, len);
5204         if (hur == NULL)
5205                 RETURN(-ENOMEM);
5206
5207         hur->hur_request.hr_action = HUA_RESTORE;
5208         hur->hur_request.hr_archive_id = 0;
5209         hur->hur_request.hr_flags = 0;
5210         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5211                sizeof(hur->hur_user_item[0].hui_fid));
5212         hur->hur_user_item[0].hui_extent.offset = offset;
5213         hur->hur_user_item[0].hui_extent.length = length;
5214         hur->hur_request.hr_itemcount = 1;
5215         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5216                            len, hur, NULL);
5217         OBD_FREE(hur, len);
5218         RETURN(rc);
5219 }