lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                      ATTR_MTIME | ATTR_MTIME_SET |
 104                                      ATTR_CTIME | ATTR_CTIME_SET;
 105         op_data->op_attr_blocks = inode->i_blocks;
 106         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 107         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 108                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 109         op_data->op_handle = och->och_fh;
 110
 111         if (och->och_flags & FMODE_WRITE &&
 112             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 113                 /* For HSM: if inode data has been modified, pack it so that
 114                  * MDT can set data dirty flag in the archive. */
 115                 op_data->op_bias |= MDS_DATA_MODIFIED;
 116
 117         EXIT;
 118 }
 119
 120 /**
 121  * Perform a close, possibly with a bias.
 122  * The meaning of "data" depends on the value of "bias".
 123  *
 124  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 125  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 126  * swap layouts with.
 127  */
 128 static int ll_close_inode_openhandle(struct inode *inode,
 129                                      struct obd_client_handle *och,
 130                                      enum mds_op_bias bias, void *data)
 131 {
 132         struct obd_export *md_exp = ll_i2mdexp(inode);
 133         const struct ll_inode_info *lli = ll_i2info(inode);
 134         struct md_op_data *op_data;
 135         struct ptlrpc_request *req = NULL;
 136         int rc;
 137         ENTRY;
 138
 139         if (class_exp2obd(md_exp) == NULL) {
 140                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 141                        ll_get_fsname(inode->i_sb, NULL, 0),
 142                        PFID(&lli->lli_fid));
 143                 GOTO(out, rc = 0);
 144         }
 145
 146         OBD_ALLOC_PTR(op_data);
 147         /* We leak openhandle and request here on error, but not much to be
 148          * done in OOM case since app won't retry close on error either. */
 149         if (op_data == NULL)
 150                 GOTO(out, rc = -ENOMEM);
 151
 152         ll_prepare_close(inode, op_data, och);
 153         switch (bias) {
 154         case MDS_CLOSE_LAYOUT_MERGE:
 155                 /* merge blocks from the victim inode */
 156                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 157                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 158         case MDS_CLOSE_LAYOUT_SPLIT:
 159         case MDS_CLOSE_LAYOUT_SWAP: {
 160                 struct split_param *sp = data;
 161
 162                 LASSERT(data != NULL);
 163                 op_data->op_bias |= bias;
 164                 op_data->op_data_version = 0;
 165                 op_data->op_lease_handle = och->och_lease_handle;
 166                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 167                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 168                         op_data->op_mirror_id = sp->sp_mirror_id;
 169                 } else {
 170                         op_data->op_fid2 = *ll_inode2fid(data);
 171                 }
 172                 break;
 173         }
 174
 175         case MDS_CLOSE_RESYNC_DONE: {
 176                 struct ll_ioc_lease *ioc = data;
 177
 178                 LASSERT(data != NULL);
 179                 op_data->op_attr_blocks +=
 180                         ioc->lil_count * op_data->op_attr_blocks;
 181                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 182                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 183
 184                 op_data->op_lease_handle = och->och_lease_handle;
 185                 op_data->op_data = &ioc->lil_ids[0];
 186                 op_data->op_data_size =
 187                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 188                 break;
 189         }
 190
 191         case MDS_HSM_RELEASE:
 192                 LASSERT(data != NULL);
 193                 op_data->op_bias |= MDS_HSM_RELEASE;
 194                 op_data->op_data_version = *(__u64 *)data;
 195                 op_data->op_lease_handle = och->och_lease_handle;
 196                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
 197                 break;
 198
 199         default:
 200                 LASSERT(data == NULL);
 201                 break;
 202         }
 203
 204         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 205                 op_data->op_attr.ia_valid |= MDS_ATTR_LSIZE;
 206         if (!(op_data->op_attr.ia_valid & ATTR_BLOCKS))
 207                 op_data->op_attr.ia_valid |= MDS_ATTR_LBLOCKS;
 208
 209         rc = md_close(md_exp, op_data, och->och_mod, &req);
 210         if (rc != 0 && rc != -EINTR)
 211                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 212                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 213
 214         if (rc == 0 && op_data->op_bias & bias) {
 215                 struct mdt_body *body;
 216
 217                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 218                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 219                         rc = -EBUSY;
 220         }
 221
 222         ll_finish_md_op_data(op_data);
 223         EXIT;
 224 out:
 225
 226         md_clear_open_replay_data(md_exp, och);
 227         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 228         OBD_FREE_PTR(och);
 229
 230         ptlrpc_req_finished(req);       /* This is close request */
 231         return rc;
 232 }
 233
 234 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 235 {
 236         struct ll_inode_info *lli = ll_i2info(inode);
 237         struct obd_client_handle **och_p;
 238         struct obd_client_handle *och;
 239         __u64 *och_usecount;
 240         int rc = 0;
 241         ENTRY;
 242
 243         if (fmode & FMODE_WRITE) {
 244                 och_p = &lli->lli_mds_write_och;
 245                 och_usecount = &lli->lli_open_fd_write_count;
 246         } else if (fmode & FMODE_EXEC) {
 247                 och_p = &lli->lli_mds_exec_och;
 248                 och_usecount = &lli->lli_open_fd_exec_count;
 249         } else {
 250                 LASSERT(fmode & FMODE_READ);
 251                 och_p = &lli->lli_mds_read_och;
 252                 och_usecount = &lli->lli_open_fd_read_count;
 253         }
 254
 255         mutex_lock(&lli->lli_och_mutex);
 256         if (*och_usecount > 0) {
 257                 /* There are still users of this handle, so skip
 258                  * freeing it. */
 259                 mutex_unlock(&lli->lli_och_mutex);
 260                 RETURN(0);
 261         }
 262
 263         och = *och_p;
 264         *och_p = NULL;
 265         mutex_unlock(&lli->lli_och_mutex);
 266
 267         if (och != NULL) {
 268                 /* There might be a race and this handle may already
 269                  * be closed. */
 270                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 271         }
 272
 273         RETURN(rc);
 274 }
 275
 276 static int ll_md_close(struct inode *inode, struct file *file)
 277 {
 278         union ldlm_policy_data policy = {
 279                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 280         };
 281         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 282         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 283         struct ll_inode_info *lli = ll_i2info(inode);
 284         struct lustre_handle lockh;
 285         enum ldlm_mode lockmode;
 286         int rc = 0;
 287         ENTRY;
 288
 289         /* clear group lock, if present */
 290         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 291                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 292
 293         if (fd->fd_lease_och != NULL) {
 294                 bool lease_broken;
 295
 296                 /* Usually the lease is not released when the
 297                  * application crashed, we need to release here. */
 298                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 299                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 300                         PFID(&lli->lli_fid), rc, lease_broken);
 301
 302                 fd->fd_lease_och = NULL;
 303         }
 304
 305         if (fd->fd_och != NULL) {
 306                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 307                 fd->fd_och = NULL;
 308                 GOTO(out, rc);
 309         }
 310
 311         /* Let's see if we have good enough OPEN lock on the file and if
 312            we can skip talking to MDS */
 313         mutex_lock(&lli->lli_och_mutex);
 314         if (fd->fd_omode & FMODE_WRITE) {
 315                 lockmode = LCK_CW;
 316                 LASSERT(lli->lli_open_fd_write_count);
 317                 lli->lli_open_fd_write_count--;
 318         } else if (fd->fd_omode & FMODE_EXEC) {
 319                 lockmode = LCK_PR;
 320                 LASSERT(lli->lli_open_fd_exec_count);
 321                 lli->lli_open_fd_exec_count--;
 322         } else {
 323                 lockmode = LCK_CR;
 324                 LASSERT(lli->lli_open_fd_read_count);
 325                 lli->lli_open_fd_read_count--;
 326         }
 327         mutex_unlock(&lli->lli_och_mutex);
 328
 329         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 330                            LDLM_IBITS, &policy, lockmode, &lockh))
 331                 rc = ll_md_real_close(inode, fd->fd_omode);
 332
 333 out:
 334         LUSTRE_FPRIVATE(file) = NULL;
 335         ll_file_data_put(fd);
 336
 337         RETURN(rc);
 338 }
 339
 340 /* While this returns an error code, fput() the caller does not, so we need
 341  * to make every effort to clean up all of our state here.  Also, applications
 342  * rarely check close errors and even if an error is returned they will not
 343  * re-try the close call.
 344  */
 345 int ll_file_release(struct inode *inode, struct file *file)
 346 {
 347         struct ll_file_data *fd;
 348         struct ll_sb_info *sbi = ll_i2sbi(inode);
 349         struct ll_inode_info *lli = ll_i2info(inode);
 350         int rc;
 351         ENTRY;
 352
 353         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 354                PFID(ll_inode2fid(inode)), inode);
 355
 356         if (inode->i_sb->s_root != file_dentry(file))
 357                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 358         fd = LUSTRE_FPRIVATE(file);
 359         LASSERT(fd != NULL);
 360
 361         /* The last ref on @file, maybe not the the owner pid of statahead,
 362          * because parent and child process can share the same file handle. */
 363         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 364                 ll_deauthorize_statahead(inode, fd);
 365
 366         if (inode->i_sb->s_root == file_dentry(file)) {
 367                 LUSTRE_FPRIVATE(file) = NULL;
 368                 ll_file_data_put(fd);
 369                 RETURN(0);
 370         }
 371
 372         if (!S_ISDIR(inode->i_mode)) {
 373                 if (lli->lli_clob != NULL)
 374                         lov_read_and_clear_async_rc(lli->lli_clob);
 375                 lli->lli_async_rc = 0;
 376         }
 377
 378         rc = ll_md_close(inode, file);
 379
 380         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 381                 libcfs_debug_dumplog();
 382
 383         RETURN(rc);
 384 }
 385
 386 static inline int ll_dom_readpage(void *data, struct page *page)
 387 {
 388         struct niobuf_local *lnb = data;
 389         void *kaddr;
 390
 391         kaddr = ll_kmap_atomic(page, KM_USER0);
 392         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 393         if (lnb->lnb_len < PAGE_SIZE)
 394                 memset(kaddr + lnb->lnb_len, 0,
 395                        PAGE_SIZE - lnb->lnb_len);
 396         flush_dcache_page(page);
 397         SetPageUptodate(page);
 398         ll_kunmap_atomic(kaddr, KM_USER0);
 399         unlock_page(page);
 400
 401         return 0;
 402 }
 403
 404 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 405                         struct lookup_intent *it)
 406 {
 407         struct ll_inode_info *lli = ll_i2info(inode);
 408         struct cl_object *obj = lli->lli_clob;
 409         struct address_space *mapping = inode->i_mapping;
 410         struct page *vmpage;
 411         struct niobuf_remote *rnb;
 412         char *data;
 413         struct lu_env *env;
 414         struct cl_io *io;
 415         __u16 refcheck;
 416         struct lustre_handle lockh;
 417         struct ldlm_lock *lock;
 418         unsigned long index, start;
 419         struct niobuf_local lnb;
 420         int rc;
 421         bool dom_lock = false;
 422
 423         ENTRY;
 424
 425         if (obj == NULL)
 426                 RETURN_EXIT;
 427
 428         if (it->it_lock_mode != 0) {
 429                 lockh.cookie = it->it_lock_handle;
 430                 lock = ldlm_handle2lock(&lockh);
 431                 if (lock != NULL)
 432                         dom_lock = ldlm_has_dom(lock);
 433                 LDLM_LOCK_PUT(lock);
 434         }
 435
 436         if (!dom_lock)
 437                 RETURN_EXIT;
 438
 439         env = cl_env_get(&refcheck);
 440         if (IS_ERR(env))
 441                 RETURN_EXIT;
 442
 443         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 444                                    RCL_SERVER))
 445                 GOTO(out_env, rc = -ENODATA);
 446
 447         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 448         data = (char *)rnb + sizeof(*rnb);
 449
 450         if (rnb == NULL || rnb->rnb_len == 0)
 451                 GOTO(out_env, rc = 0);
 452
 453         CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
 454                rnb->rnb_len, i_size_read(inode));
 455
 456         io = vvp_env_thread_io(env);
 457         io->ci_obj = obj;
 458         io->ci_ignore_layout = 1;
 459         rc = cl_io_init(env, io, CIT_MISC, obj);
 460         if (rc)
 461                 GOTO(out_io, rc);
 462
 463         lnb.lnb_file_offset = rnb->rnb_offset;
 464         start = lnb.lnb_file_offset / PAGE_SIZE;
 465         index = 0;
 466         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 467         lnb.lnb_page_offset = 0;
 468         do {
 469                 struct cl_page *clp;
 470
 471                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 472                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 473                 if (lnb.lnb_len > PAGE_SIZE)
 474                         lnb.lnb_len = PAGE_SIZE;
 475
 476                 vmpage = read_cache_page(mapping, index + start,
 477                                          ll_dom_readpage, &lnb);
 478                 if (IS_ERR(vmpage)) {
 479                         CWARN("%s: cannot fill page %lu for "DFID
 480                               " with data: rc = %li\n",
 481                               ll_get_fsname(inode->i_sb, NULL, 0),
 482                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 483                               PTR_ERR(vmpage));
 484                         break;
 485                 }
 486                 lock_page(vmpage);
 487                 clp = cl_page_find(env, obj, vmpage->index, vmpage,
 488                                    CPT_CACHEABLE);
 489                 if (IS_ERR(clp)) {
 490                         unlock_page(vmpage);
 491                         put_page(vmpage);
 492                         GOTO(out_io, rc = PTR_ERR(clp));
 493                 }
 494
 495                 /* export page */
 496                 cl_page_export(env, clp, 1);
 497                 cl_page_put(env, clp);
 498                 unlock_page(vmpage);
 499                 put_page(vmpage);
 500                 index++;
 501         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 502         rc = 0;
 503         EXIT;
 504 out_io:
 505         cl_io_fini(env, io);
 506 out_env:
 507         cl_env_put(env, &refcheck);
 508 }
 509
 510 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 511                                 struct lookup_intent *itp)
 512 {
 513         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 514         struct dentry *parent = de->d_parent;
 515         const char *name = NULL;
 516         int len = 0;
 517         struct md_op_data *op_data;
 518         struct ptlrpc_request *req = NULL;
 519         int rc;
 520         ENTRY;
 521
 522         LASSERT(parent != NULL);
 523         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 524
 525         /* if server supports open-by-fid, or file name is invalid, don't pack
 526          * name in open request */
 527         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 528             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 529                 name = de->d_name.name;
 530                 len = de->d_name.len;
 531         }
 532
 533         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 534                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 535         if (IS_ERR(op_data))
 536                 RETURN(PTR_ERR(op_data));
 537         op_data->op_data = lmm;
 538         op_data->op_data_size = lmmsize;
 539
 540         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 541                             &ll_md_blocking_ast, 0);
 542         ll_finish_md_op_data(op_data);
 543         if (rc == -ESTALE) {
 544                 /* reason for keep own exit path - don`t flood log
 545                  * with messages with -ESTALE errors.
 546                  */
 547                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 548                      it_open_error(DISP_OPEN_OPEN, itp))
 549                         GOTO(out, rc);
 550                 ll_release_openhandle(de, itp);
 551                 GOTO(out, rc);
 552         }
 553
 554         if (it_disposition(itp, DISP_LOOKUP_NEG))
 555                 GOTO(out, rc = -ENOENT);
 556
 557         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 558                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 559                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 560                 GOTO(out, rc);
 561         }
 562
 563         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 564
 565         if (!rc && itp->it_lock_mode) {
 566                 ll_dom_finish_open(de->d_inode, req, itp);
 567                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 568         }
 569
 570 out:
 571         ptlrpc_req_finished(req);
 572         ll_intent_drop_lock(itp);
 573
 574         /* We did open by fid, but by the time we got to the server,
 575          * the object disappeared. If this is a create, we cannot really
 576          * tell the userspace that the file it was trying to create
 577          * does not exist. Instead let's return -ESTALE, and the VFS will
 578          * retry the create with LOOKUP_REVAL that we are going to catch
 579          * in ll_revalidate_dentry() and use lookup then.
 580          */
 581         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 582                 rc = -ESTALE;
 583
 584         RETURN(rc);
 585 }
 586
 587 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 588                        struct obd_client_handle *och)
 589 {
 590         struct mdt_body *body;
 591
 592         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 593         och->och_fh = body->mbo_handle;
 594         och->och_fid = body->mbo_fid1;
 595         och->och_lease_handle.cookie = it->it_lock_handle;
 596         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 597         och->och_flags = it->it_flags;
 598
 599         return md_set_open_replay_data(md_exp, och, it);
 600 }
 601
 602 static int ll_local_open(struct file *file, struct lookup_intent *it,
 603                          struct ll_file_data *fd, struct obd_client_handle *och)
 604 {
 605         struct inode *inode = file_inode(file);
 606         ENTRY;
 607
 608         LASSERT(!LUSTRE_FPRIVATE(file));
 609
 610         LASSERT(fd != NULL);
 611
 612         if (och) {
 613                 int rc;
 614
 615                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 616                 if (rc != 0)
 617                         RETURN(rc);
 618         }
 619
 620         LUSTRE_FPRIVATE(file) = fd;
 621         ll_readahead_init(inode, &fd->fd_ras);
 622         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 623
 624         /* ll_cl_context initialize */
 625         rwlock_init(&fd->fd_lock);
 626         INIT_LIST_HEAD(&fd->fd_lccs);
 627
 628         RETURN(0);
 629 }
 630
 631 /* Open a file, and (for the very first open) create objects on the OSTs at
 632  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 633  * creation or open until ll_lov_setstripe() ioctl is called.
 634  *
 635  * If we already have the stripe MD locally then we don't request it in
 636  * md_open(), by passing a lmm_size = 0.
 637  *
 638  * It is up to the application to ensure no other processes open this file
 639  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 640  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 641  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 642  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 643  */
 644 int ll_file_open(struct inode *inode, struct file *file)
 645 {
 646         struct ll_inode_info *lli = ll_i2info(inode);
 647         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 648                                           .it_flags = file->f_flags };
 649         struct obd_client_handle **och_p = NULL;
 650         __u64 *och_usecount = NULL;
 651         struct ll_file_data *fd;
 652         int rc = 0;
 653         ENTRY;
 654
 655         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 656                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 657
 658         it = file->private_data; /* XXX: compat macro */
 659         file->private_data = NULL; /* prevent ll_local_open assertion */
 660
 661         fd = ll_file_data_get();
 662         if (fd == NULL)
 663                 GOTO(out_nofiledata, rc = -ENOMEM);
 664
 665         fd->fd_file = file;
 666         if (S_ISDIR(inode->i_mode))
 667                 ll_authorize_statahead(inode, fd);
 668
 669         if (inode->i_sb->s_root == file_dentry(file)) {
 670                 LUSTRE_FPRIVATE(file) = fd;
 671                 RETURN(0);
 672         }
 673
 674         if (!it || !it->it_disposition) {
 675                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 676                  * because everything but O_ACCMODE mask was stripped from
 677                  * there */
 678                 if ((oit.it_flags + 1) & O_ACCMODE)
 679                         oit.it_flags++;
 680                 if (file->f_flags & O_TRUNC)
 681                         oit.it_flags |= FMODE_WRITE;
 682
 683                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 684                  * dentry_open after call to open_namei that checks permissions.
 685                  * Only nfsd_open call dentry_open directly without checking
 686                  * permissions and because of that this code below is safe. */
 687                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 688                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 689
 690                 /* We do not want O_EXCL here, presumably we opened the file
 691                  * already? XXX - NFS implications? */
 692                 oit.it_flags &= ~O_EXCL;
 693
 694                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 695                  * created if necessary, then "IT_CREAT" should be set to keep
 696                  * consistent with it */
 697                 if (oit.it_flags & O_CREAT)
 698                         oit.it_op |= IT_CREAT;
 699
 700                 it = &oit;
 701         }
 702
 703 restart:
 704         /* Let's see if we have file open on MDS already. */
 705         if (it->it_flags & FMODE_WRITE) {
 706                 och_p = &lli->lli_mds_write_och;
 707                 och_usecount = &lli->lli_open_fd_write_count;
 708         } else if (it->it_flags & FMODE_EXEC) {
 709                 och_p = &lli->lli_mds_exec_och;
 710                 och_usecount = &lli->lli_open_fd_exec_count;
 711          } else {
 712                 och_p = &lli->lli_mds_read_och;
 713                 och_usecount = &lli->lli_open_fd_read_count;
 714         }
 715
 716         mutex_lock(&lli->lli_och_mutex);
 717         if (*och_p) { /* Open handle is present */
 718                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 719                         /* Well, there's extra open request that we do not need,
 720                            let's close it somehow. This will decref request. */
 721                         rc = it_open_error(DISP_OPEN_OPEN, it);
 722                         if (rc) {
 723                                 mutex_unlock(&lli->lli_och_mutex);
 724                                 GOTO(out_openerr, rc);
 725                         }
 726
 727                         ll_release_openhandle(file_dentry(file), it);
 728                 }
 729                 (*och_usecount)++;
 730
 731                 rc = ll_local_open(file, it, fd, NULL);
 732                 if (rc) {
 733                         (*och_usecount)--;
 734                         mutex_unlock(&lli->lli_och_mutex);
 735                         GOTO(out_openerr, rc);
 736                 }
 737         } else {
 738                 LASSERT(*och_usecount == 0);
 739                 if (!it->it_disposition) {
 740                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 741                         /* We cannot just request lock handle now, new ELC code
 742                            means that one of other OPEN locks for this file
 743                            could be cancelled, and since blocking ast handler
 744                            would attempt to grab och_mutex as well, that would
 745                            result in a deadlock */
 746                         mutex_unlock(&lli->lli_och_mutex);
 747                         /*
 748                          * Normally called under two situations:
 749                          * 1. NFS export.
 750                          * 2. A race/condition on MDS resulting in no open
 751                          *    handle to be returned from LOOKUP|OPEN request,
 752                          *    for example if the target entry was a symlink.
 753                          *
 754                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 755                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 756                          *  bit so that it's not confusing later callers.
 757                          *
 758                          *  NB; when ldd is NULL, it must have come via normal
 759                          *  lookup path only, since ll_iget_for_nfs always calls
 760                          *  ll_d_init().
 761                          */
 762                         if (ldd && ldd->lld_nfs_dentry) {
 763                                 ldd->lld_nfs_dentry = 0;
 764                                 it->it_flags |= MDS_OPEN_LOCK;
 765                         }
 766
 767                          /*
 768                          * Always specify MDS_OPEN_BY_FID because we don't want
 769                          * to get file with different fid.
 770                          */
 771                         it->it_flags |= MDS_OPEN_BY_FID;
 772                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 773                                                  it);
 774                         if (rc)
 775                                 GOTO(out_openerr, rc);
 776
 777                         goto restart;
 778                 }
 779                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 780                 if (!*och_p)
 781                         GOTO(out_och_free, rc = -ENOMEM);
 782
 783                 (*och_usecount)++;
 784
 785                 /* md_intent_lock() didn't get a request ref if there was an
 786                  * open error, so don't do cleanup on the request here
 787                  * (bug 3430) */
 788                 /* XXX (green): Should not we bail out on any error here, not
 789                  * just open error? */
 790                 rc = it_open_error(DISP_OPEN_OPEN, it);
 791                 if (rc != 0)
 792                         GOTO(out_och_free, rc);
 793
 794                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 795                          "inode %p: disposition %x, status %d\n", inode,
 796                          it_disposition(it, ~0), it->it_status);
 797
 798                 rc = ll_local_open(file, it, fd, *och_p);
 799                 if (rc)
 800                         GOTO(out_och_free, rc);
 801         }
 802         mutex_unlock(&lli->lli_och_mutex);
 803         fd = NULL;
 804
 805         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 806            different kind of OPEN lock for this same inode gets cancelled
 807            by ldlm_cancel_lru */
 808         if (!S_ISREG(inode->i_mode))
 809                 GOTO(out_och_free, rc);
 810
 811         cl_lov_delay_create_clear(&file->f_flags);
 812         GOTO(out_och_free, rc);
 813
 814 out_och_free:
 815         if (rc) {
 816                 if (och_p && *och_p) {
 817                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 818                         *och_p = NULL; /* OBD_FREE writes some magic there */
 819                         (*och_usecount)--;
 820                 }
 821                 mutex_unlock(&lli->lli_och_mutex);
 822
 823 out_openerr:
 824                 if (lli->lli_opendir_key == fd)
 825                         ll_deauthorize_statahead(inode, fd);
 826                 if (fd != NULL)
 827                         ll_file_data_put(fd);
 828         } else {
 829                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 830         }
 831
 832 out_nofiledata:
 833         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 834                 ptlrpc_req_finished(it->it_request);
 835                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 836         }
 837
 838         return rc;
 839 }
 840
 841 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 842                         struct ldlm_lock_desc *desc, void *data, int flag)
 843 {
 844         int rc;
 845         struct lustre_handle lockh;
 846         ENTRY;
 847
 848         switch (flag) {
 849         case LDLM_CB_BLOCKING:
 850                 ldlm_lock2handle(lock, &lockh);
 851                 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
 852                 if (rc < 0) {
 853                         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
 854                         RETURN(rc);
 855                 }
 856                 break;
 857         case LDLM_CB_CANCELING:
 858                 /* do nothing */
 859                 break;
 860         }
 861         RETURN(0);
 862 }
 863
 864 /**
 865  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 866  * and save it as fd->fd_och so as to force client to reopen the file even
 867  * if it has an open lock in cache already.
 868  */
 869 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 870                                 struct lustre_handle *old_handle)
 871 {
 872         struct ll_inode_info *lli = ll_i2info(inode);
 873         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 874         struct obd_client_handle **och_p;
 875         __u64 *och_usecount;
 876         int rc = 0;
 877         ENTRY;
 878
 879         /* Get the openhandle of the file */
 880         mutex_lock(&lli->lli_och_mutex);
 881         if (fd->fd_lease_och != NULL)
 882                 GOTO(out_unlock, rc = -EBUSY);
 883
 884         if (fd->fd_och == NULL) {
 885                 if (file->f_mode & FMODE_WRITE) {
 886                         LASSERT(lli->lli_mds_write_och != NULL);
 887                         och_p = &lli->lli_mds_write_och;
 888                         och_usecount = &lli->lli_open_fd_write_count;
 889                 } else {
 890                         LASSERT(lli->lli_mds_read_och != NULL);
 891                         och_p = &lli->lli_mds_read_och;
 892                         och_usecount = &lli->lli_open_fd_read_count;
 893                 }
 894
 895                 if (*och_usecount > 1)
 896                         GOTO(out_unlock, rc = -EBUSY);
 897
 898                 fd->fd_och = *och_p;
 899                 *och_usecount = 0;
 900                 *och_p = NULL;
 901         }
 902
 903         *old_handle = fd->fd_och->och_fh;
 904
 905         EXIT;
 906 out_unlock:
 907         mutex_unlock(&lli->lli_och_mutex);
 908         return rc;
 909 }
 910
 911 /**
 912  * Release ownership on lli_mds_*_och when putting back a file lease.
 913  */
 914 static int ll_lease_och_release(struct inode *inode, struct file *file)
 915 {
 916         struct ll_inode_info *lli = ll_i2info(inode);
 917         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 918         struct obd_client_handle **och_p;
 919         struct obd_client_handle *old_och = NULL;
 920         __u64 *och_usecount;
 921         int rc = 0;
 922         ENTRY;
 923
 924         mutex_lock(&lli->lli_och_mutex);
 925         if (file->f_mode & FMODE_WRITE) {
 926                 och_p = &lli->lli_mds_write_och;
 927                 och_usecount = &lli->lli_open_fd_write_count;
 928         } else {
 929                 och_p = &lli->lli_mds_read_och;
 930                 och_usecount = &lli->lli_open_fd_read_count;
 931         }
 932
 933         /* The file may have been open by another process (broken lease) so
 934          * *och_p is not NULL. In this case we should simply increase usecount
 935          * and close fd_och.
 936          */
 937         if (*och_p != NULL) {
 938                 old_och = fd->fd_och;
 939                 (*och_usecount)++;
 940         } else {
 941                 *och_p = fd->fd_och;
 942                 *och_usecount = 1;
 943         }
 944         fd->fd_och = NULL;
 945         mutex_unlock(&lli->lli_och_mutex);
 946
 947         if (old_och != NULL)
 948                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 949
 950         RETURN(rc);
 951 }
 952
 953 /**
 954  * Acquire a lease and open the file.
 955  */
 956 static struct obd_client_handle *
 957 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 958               __u64 open_flags)
 959 {
 960         struct lookup_intent it = { .it_op = IT_OPEN };
 961         struct ll_sb_info *sbi = ll_i2sbi(inode);
 962         struct md_op_data *op_data;
 963         struct ptlrpc_request *req = NULL;
 964         struct lustre_handle old_handle = { 0 };
 965         struct obd_client_handle *och = NULL;
 966         int rc;
 967         int rc2;
 968         ENTRY;
 969
 970         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 971                 RETURN(ERR_PTR(-EINVAL));
 972
 973         if (file != NULL) {
 974                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 975                         RETURN(ERR_PTR(-EPERM));
 976
 977                 rc = ll_lease_och_acquire(inode, file, &old_handle);
 978                 if (rc)
 979                         RETURN(ERR_PTR(rc));
 980         }
 981
 982         OBD_ALLOC_PTR(och);
 983         if (och == NULL)
 984                 RETURN(ERR_PTR(-ENOMEM));
 985
 986         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 987                                         LUSTRE_OPC_ANY, NULL);
 988         if (IS_ERR(op_data))
 989                 GOTO(out, rc = PTR_ERR(op_data));
 990
 991         /* To tell the MDT this openhandle is from the same owner */
 992         op_data->op_handle = old_handle;
 993
 994         it.it_flags = fmode | open_flags;
 995         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
 996         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
 997                             &ll_md_blocking_lease_ast,
 998         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
 999          * it can be cancelled which may mislead applications that the lease is
1000          * broken;
1001          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1002          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1003          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1004                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1005         ll_finish_md_op_data(op_data);
1006         ptlrpc_req_finished(req);
1007         if (rc < 0)
1008                 GOTO(out_release_it, rc);
1009
1010         if (it_disposition(&it, DISP_LOOKUP_NEG))
1011                 GOTO(out_release_it, rc = -ENOENT);
1012
1013         rc = it_open_error(DISP_OPEN_OPEN, &it);
1014         if (rc)
1015                 GOTO(out_release_it, rc);
1016
1017         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1018         ll_och_fill(sbi->ll_md_exp, &it, och);
1019
1020         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1021                 GOTO(out_close, rc = -EOPNOTSUPP);
1022
1023         /* already get lease, handle lease lock */
1024         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1025         if (it.it_lock_mode == 0 ||
1026             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1027                 /* open lock must return for lease */
1028                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1029                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1030                         it.it_lock_bits);
1031                 GOTO(out_close, rc = -EPROTO);
1032         }
1033
1034         ll_intent_release(&it);
1035         RETURN(och);
1036
1037 out_close:
1038         /* Cancel open lock */
1039         if (it.it_lock_mode != 0) {
1040                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1041                                             it.it_lock_mode);
1042                 it.it_lock_mode = 0;
1043                 och->och_lease_handle.cookie = 0ULL;
1044         }
1045         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1046         if (rc2 < 0)
1047                 CERROR("%s: error closing file "DFID": %d\n",
1048                        ll_get_fsname(inode->i_sb, NULL, 0),
1049                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1050         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1051 out_release_it:
1052         ll_intent_release(&it);
1053 out:
1054         if (och != NULL)
1055                 OBD_FREE_PTR(och);
1056         RETURN(ERR_PTR(rc));
1057 }
1058
1059 /**
1060  * Check whether a layout swap can be done between two inodes.
1061  *
1062  * \param[in] inode1  First inode to check
1063  * \param[in] inode2  Second inode to check
1064  *
1065  * \retval 0 on success, layout swap can be performed between both inodes
1066  * \retval negative error code if requirements are not met
1067  */
1068 static int ll_check_swap_layouts_validity(struct inode *inode1,
1069                                           struct inode *inode2)
1070 {
1071         if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1072                 return -EINVAL;
1073
1074         if (inode_permission(inode1, MAY_WRITE) ||
1075             inode_permission(inode2, MAY_WRITE))
1076                 return -EPERM;
1077
1078         if (inode1->i_sb != inode2->i_sb)
1079                 return -EXDEV;
1080
1081         return 0;
1082 }
1083
1084 static int ll_swap_layouts_close(struct obd_client_handle *och,
1085                                  struct inode *inode, struct inode *inode2)
1086 {
1087         const struct lu_fid     *fid1 = ll_inode2fid(inode);
1088         const struct lu_fid     *fid2;
1089         int                      rc;
1090         ENTRY;
1091
1092         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1093                ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1094
1095         rc = ll_check_swap_layouts_validity(inode, inode2);
1096         if (rc < 0)
1097                 GOTO(out_free_och, rc);
1098
1099         /* We now know that inode2 is a lustre inode */
1100         fid2 = ll_inode2fid(inode2);
1101
1102         rc = lu_fid_cmp(fid1, fid2);
1103         if (rc == 0)
1104                 GOTO(out_free_och, rc = -EINVAL);
1105
1106         /* Close the file and {swap,merge} layouts between inode & inode2.
1107          * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1108          * because we still need it to pack l_remote_handle to MDT. */
1109         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1110                                        inode2);
1111
1112         och = NULL; /* freed in ll_close_inode_openhandle() */
1113
1114 out_free_och:
1115         if (och != NULL)
1116                 OBD_FREE_PTR(och);
1117
1118         RETURN(rc);
1119 }
1120
1121 /**
1122  * Release lease and close the file.
1123  * It will check if the lease has ever broken.
1124  */
1125 static int ll_lease_close_intent(struct obd_client_handle *och,
1126                                  struct inode *inode,
1127                                  bool *lease_broken, enum mds_op_bias bias,
1128                                  void *data)
1129 {
1130         struct ldlm_lock *lock;
1131         bool cancelled = true;
1132         int rc;
1133         ENTRY;
1134
1135         lock = ldlm_handle2lock(&och->och_lease_handle);
1136         if (lock != NULL) {
1137                 lock_res_and_lock(lock);
1138                 cancelled = ldlm_is_cancel(lock);
1139                 unlock_res_and_lock(lock);
1140                 LDLM_LOCK_PUT(lock);
1141         }
1142
1143         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1144                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1145
1146         if (lease_broken != NULL)
1147                 *lease_broken = cancelled;
1148
1149         if (!cancelled && !bias)
1150                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1151
1152         if (cancelled) { /* no need to excute intent */
1153                 bias = 0;
1154                 data = NULL;
1155         }
1156
1157         rc = ll_close_inode_openhandle(inode, och, bias, data);
1158         RETURN(rc);
1159 }
1160
1161 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1162                           bool *lease_broken)
1163 {
1164         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1165 }
1166
1167 /**
1168  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1169  */
1170 static int ll_lease_file_resync(struct obd_client_handle *och,
1171                                 struct inode *inode)
1172 {
1173         struct ll_sb_info *sbi = ll_i2sbi(inode);
1174         struct md_op_data *op_data;
1175         __u64 data_version_unused;
1176         int rc;
1177         ENTRY;
1178
1179         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1180                                      LUSTRE_OPC_ANY, NULL);
1181         if (IS_ERR(op_data))
1182                 RETURN(PTR_ERR(op_data));
1183
1184         /* before starting file resync, it's necessary to clean up page cache
1185          * in client memory, otherwise once the layout version is increased,
1186          * writing back cached data will be denied the OSTs. */
1187         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1188         if (rc)
1189                 GOTO(out, rc);
1190
1191         op_data->op_handle = och->och_lease_handle;
1192         rc = md_file_resync(sbi->ll_md_exp, op_data);
1193         if (rc)
1194                 GOTO(out, rc);
1195
1196         EXIT;
1197 out:
1198         ll_finish_md_op_data(op_data);
1199         return rc;
1200 }
1201
1202 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1203 {
1204         struct ll_inode_info *lli = ll_i2info(inode);
1205         struct cl_object *obj = lli->lli_clob;
1206         struct cl_attr *attr = vvp_env_thread_attr(env);
1207         s64 atime;
1208         s64 mtime;
1209         s64 ctime;
1210         int rc = 0;
1211
1212         ENTRY;
1213
1214         ll_inode_size_lock(inode);
1215
1216         /* Merge timestamps the most recently obtained from MDS with
1217          * timestamps obtained from OSTs.
1218          *
1219          * Do not overwrite atime of inode because it may be refreshed
1220          * by file_accessed() function. If the read was served by cache
1221          * data, there is no RPC to be sent so that atime may not be
1222          * transferred to OSTs at all. MDT only updates atime at close time
1223          * if it's at least 'mdd.*.atime_diff' older.
1224          * All in all, the atime in Lustre does not strictly comply with
1225          * POSIX. Solving this problem needs to send an RPC to MDT for each
1226          * read, this will hurt performance. */
1227         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1228                 LTIME_S(inode->i_atime) = lli->lli_atime;
1229                 lli->lli_update_atime = 0;
1230         }
1231         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1232         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1233
1234         atime = LTIME_S(inode->i_atime);
1235         mtime = LTIME_S(inode->i_mtime);
1236         ctime = LTIME_S(inode->i_ctime);
1237
1238         cl_object_attr_lock(obj);
1239         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1240                 rc = -EINVAL;
1241         else
1242                 rc = cl_object_attr_get(env, obj, attr);
1243         cl_object_attr_unlock(obj);
1244
1245         if (rc != 0)
1246                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1247
1248         if (atime < attr->cat_atime)
1249                 atime = attr->cat_atime;
1250
1251         if (ctime < attr->cat_ctime)
1252                 ctime = attr->cat_ctime;
1253
1254         if (mtime < attr->cat_mtime)
1255                 mtime = attr->cat_mtime;
1256
1257         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1258                PFID(&lli->lli_fid), attr->cat_size);
1259
1260         i_size_write(inode, attr->cat_size);
1261         inode->i_blocks = attr->cat_blocks;
1262
1263         LTIME_S(inode->i_atime) = atime;
1264         LTIME_S(inode->i_mtime) = mtime;
1265         LTIME_S(inode->i_ctime) = ctime;
1266
1267 out_size_unlock:
1268         ll_inode_size_unlock(inode);
1269
1270         RETURN(rc);
1271 }
1272
1273 /**
1274  * Set designated mirror for I/O.
1275  *
1276  * So far only read, write, and truncated can support to issue I/O to
1277  * designated mirror.
1278  */
1279 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1280 {
1281         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1282
1283         /* clear layout version for generic(non-resync) I/O in case it carries
1284          * stale layout version due to I/O restart */
1285         io->ci_layout_version = 0;
1286
1287         /* FLR: disable non-delay for designated mirror I/O because obviously
1288          * only one mirror is available */
1289         if (fd->fd_designated_mirror > 0) {
1290                 io->ci_ndelay = 0;
1291                 io->ci_designated_mirror = fd->fd_designated_mirror;
1292                 io->ci_layout_version = fd->fd_layout_version;
1293                 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
1294                                  * io to ptasks */
1295         }
1296
1297         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1298                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1299 }
1300
1301 static bool file_is_noatime(const struct file *file)
1302 {
1303         const struct vfsmount *mnt = file->f_path.mnt;
1304         const struct inode *inode = file_inode((struct file *)file);
1305
1306         /* Adapted from file_accessed() and touch_atime().*/
1307         if (file->f_flags & O_NOATIME)
1308                 return true;
1309
1310         if (inode->i_flags & S_NOATIME)
1311                 return true;
1312
1313         if (IS_NOATIME(inode))
1314                 return true;
1315
1316         if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1317                 return true;
1318
1319         if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1320                 return true;
1321
1322         if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1323                 return true;
1324
1325         return false;
1326 }
1327
1328 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1329
1330 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1331 {
1332         struct inode *inode = file_inode(file);
1333         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1334
1335         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1336         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1337         io->u.ci_rw.rw_file = file;
1338         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1339         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1340         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1341
1342         if (iot == CIT_WRITE) {
1343                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1344                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1345                                            file->f_flags & O_DIRECT ||
1346                                            IS_SYNC(inode));
1347         }
1348         io->ci_obj = ll_i2info(inode)->lli_clob;
1349         io->ci_lockreq = CILR_MAYBE;
1350         if (ll_file_nolock(file)) {
1351                 io->ci_lockreq = CILR_NEVER;
1352                 io->ci_no_srvlock = 1;
1353         } else if (file->f_flags & O_APPEND) {
1354                 io->ci_lockreq = CILR_MANDATORY;
1355         }
1356         io->ci_noatime = file_is_noatime(file);
1357         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1358                 io->ci_pio = !io->u.ci_rw.rw_append;
1359         else
1360                 io->ci_pio = 0;
1361
1362         /* FLR: only use non-delay I/O for read as there is only one
1363          * avaliable mirror for write. */
1364         io->ci_ndelay = !(iot == CIT_WRITE);
1365
1366         ll_io_set_mirror(io, file);
1367 }
1368
1369 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1370 {
1371         struct cl_io_pt *pt = ptask->pt_cbdata;
1372         struct file *file = pt->cip_file;
1373         struct lu_env *env;
1374         struct cl_io *io;
1375         loff_t pos = pt->cip_pos;
1376         int rc;
1377         __u16 refcheck;
1378         ENTRY;
1379
1380         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1381                 file_dentry(file)->d_name.name,
1382                 pt->cip_iot == CIT_READ ? "read" : "write",
1383                 pos, pos + pt->cip_count);
1384
1385         env = cl_env_get(&refcheck);
1386         if (IS_ERR(env))
1387                 RETURN(PTR_ERR(env));
1388
1389         io = vvp_env_thread_io(env);
1390         ll_io_init(io, file, pt->cip_iot);
1391         io->u.ci_rw.rw_iter = pt->cip_iter;
1392         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1393         io->ci_pio = 0; /* It's already in parallel task */
1394
1395         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1396                            pt->cip_count - pt->cip_result);
1397         if (!rc) {
1398                 struct vvp_io *vio = vvp_env_io(env);
1399
1400                 vio->vui_io_subtype = IO_NORMAL;
1401                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1402
1403                 ll_cl_add(file, env, io, LCC_RW);
1404                 rc = cl_io_loop(env, io);
1405                 ll_cl_remove(file, env);
1406         } else {
1407                 /* cl_io_rw_init() handled IO */
1408                 rc = io->ci_result;
1409         }
1410
1411         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1412                 if (io->ci_nob > 0)
1413                         io->ci_nob /= 2;
1414                 rc = -EIO;
1415         }
1416
1417         if (io->ci_nob > 0) {
1418                 pt->cip_result += io->ci_nob;
1419                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1420                 pos += io->ci_nob;
1421                 pt->cip_iocb.ki_pos = pos;
1422 #ifdef HAVE_KIOCB_KI_LEFT
1423                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1424 #elif defined(HAVE_KI_NBYTES)
1425                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1426 #endif
1427         }
1428
1429         cl_io_fini(env, io);
1430         cl_env_put(env, &refcheck);
1431
1432         pt->cip_need_restart = io->ci_need_restart;
1433
1434         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1435                 file_dentry(file)->d_name.name,
1436                 pt->cip_iot == CIT_READ ? "read" : "write",
1437                 pt->cip_result, rc);
1438
1439         RETURN(pt->cip_result > 0 ? 0 : rc);
1440 }
1441
1442 static ssize_t
1443 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1444                    struct file *file, enum cl_io_type iot,
1445                    loff_t *ppos, size_t count)
1446 {
1447         struct range_lock       range;
1448         struct vvp_io           *vio = vvp_env_io(env);
1449         struct inode            *inode = file_inode(file);
1450         struct ll_inode_info    *lli = ll_i2info(inode);
1451         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1452         struct cl_io            *io;
1453         loff_t                  pos = *ppos;
1454         ssize_t                 result = 0;
1455         int                     rc = 0;
1456         unsigned                retried = 0;
1457         bool                    restarted = false;
1458
1459         ENTRY;
1460
1461         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1462                 file_dentry(file)->d_name.name,
1463                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1464
1465 restart:
1466         io = vvp_env_thread_io(env);
1467         ll_io_init(io, file, iot);
1468         if (args->via_io_subtype == IO_NORMAL) {
1469                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1470                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1471         }
1472         if (args->via_io_subtype != IO_NORMAL || restarted)
1473                 io->ci_pio = 0;
1474         io->ci_ndelay_tried = retried;
1475
1476         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1477                 bool range_locked = false;
1478
1479                 if (file->f_flags & O_APPEND)
1480                         range_lock_init(&range, 0, LUSTRE_EOF);
1481                 else
1482                         range_lock_init(&range, pos, pos + count - 1);
1483
1484                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1485                 vio->vui_io_subtype = args->via_io_subtype;
1486
1487                 switch (vio->vui_io_subtype) {
1488                 case IO_NORMAL:
1489                         /* Direct IO reads must also take range lock,
1490                          * or multiple reads will try to work on the same pages
1491                          * See LU-6227 for details. */
1492                         if (((iot == CIT_WRITE) ||
1493                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1494                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1495                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1496                                        RL_PARA(&range));
1497                                 rc = range_lock(&lli->lli_write_tree, &range);
1498                                 if (rc < 0)
1499                                         GOTO(out, rc);
1500
1501                                 range_locked = true;
1502                         }
1503                         break;
1504                 case IO_SPLICE:
1505                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1506                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1507                         break;
1508                 default:
1509                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1510                         LBUG();
1511                 }
1512
1513                 ll_cl_add(file, env, io, LCC_RW);
1514                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1515                     !lli->lli_inode_locked) {
1516                         inode_lock(inode);
1517                         lli->lli_inode_locked = 1;
1518                 }
1519                 rc = cl_io_loop(env, io);
1520                 if (lli->lli_inode_locked) {
1521                         lli->lli_inode_locked = 0;
1522                         inode_unlock(inode);
1523                 }
1524                 ll_cl_remove(file, env);
1525
1526                 if (range_locked) {
1527                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1528                                RL_PARA(&range));
1529                         range_unlock(&lli->lli_write_tree, &range);
1530                 }
1531         } else {
1532                 /* cl_io_rw_init() handled IO */
1533                 rc = io->ci_result;
1534         }
1535
1536         if (io->ci_nob > 0) {
1537                 result += io->ci_nob;
1538                 count  -= io->ci_nob;
1539
1540                 if (args->via_io_subtype == IO_NORMAL) {
1541                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1542
1543                         /* CLIO is too complicated. See LU-11069. */
1544                         if (cl_io_is_append(io))
1545                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1546                         else
1547                                 pos += io->ci_nob;
1548
1549                         args->u.normal.via_iocb->ki_pos = pos;
1550 #ifdef HAVE_KIOCB_KI_LEFT
1551                         args->u.normal.via_iocb->ki_left = count;
1552 #elif defined(HAVE_KI_NBYTES)
1553                         args->u.normal.via_iocb->ki_nbytes = count;
1554 #endif
1555                 } else {
1556                         /* for splice */
1557                         pos = io->u.ci_rw.rw_range.cir_pos;
1558                 }
1559         }
1560 out:
1561         cl_io_fini(env, io);
1562
1563         CDEBUG(D_VFSTRACE,
1564                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1565                file->f_path.dentry->d_name.name,
1566                iot, rc, result, io->ci_need_restart);
1567
1568         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1569                 CDEBUG(D_VFSTRACE,
1570                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1571                         file_dentry(file)->d_name.name,
1572                         iot == CIT_READ ? "read" : "write",
1573                         pos, pos + count, result, rc);
1574                 /* preserve the tried count for FLR */
1575                 retried = io->ci_ndelay_tried;
1576                 restarted = true;
1577                 goto restart;
1578         }
1579
1580         if (iot == CIT_READ) {
1581                 if (result > 0)
1582                         ll_stats_ops_tally(ll_i2sbi(inode),
1583                                            LPROC_LL_READ_BYTES, result);
1584         } else if (iot == CIT_WRITE) {
1585                 if (result > 0) {
1586                         ll_stats_ops_tally(ll_i2sbi(inode),
1587                                            LPROC_LL_WRITE_BYTES, result);
1588                         fd->fd_write_failed = false;
1589                 } else if (result == 0 && rc == 0) {
1590                         rc = io->ci_result;
1591                         if (rc < 0)
1592                                 fd->fd_write_failed = true;
1593                         else
1594                                 fd->fd_write_failed = false;
1595                 } else if (rc != -ERESTARTSYS) {
1596                         fd->fd_write_failed = true;
1597                 }
1598         }
1599
1600         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1601                 file_dentry(file)->d_name.name,
1602                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1603
1604         *ppos = pos;
1605
1606         RETURN(result > 0 ? result : rc);
1607 }
1608
1609 /**
1610  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1611  * especially for small I/O.
1612  *
1613  * To serve a read request, CLIO has to create and initialize a cl_io and
1614  * then request DLM lock. This has turned out to have siginificant overhead
1615  * and affects the performance of small I/O dramatically.
1616  *
1617  * It's not necessary to create a cl_io for each I/O. Under the help of read
1618  * ahead, most of the pages being read are already in memory cache and we can
1619  * read those pages directly because if the pages exist, the corresponding DLM
1620  * lock must exist so that page content must be valid.
1621  *
1622  * In fast read implementation, the llite speculatively finds and reads pages
1623  * in memory cache. There are three scenarios for fast read:
1624  *   - If the page exists and is uptodate, kernel VM will provide the data and
1625  *     CLIO won't be intervened;
1626  *   - If the page was brought into memory by read ahead, it will be exported
1627  *     and read ahead parameters will be updated;
1628  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1629  *     it will go back and invoke normal read, i.e., a cl_io will be created
1630  *     and DLM lock will be requested.
1631  *
1632  * POSIX compliance: posix standard states that read is intended to be atomic.
1633  * Lustre read implementation is in line with Linux kernel read implementation
1634  * and neither of them complies with POSIX standard in this matter. Fast read
1635  * doesn't make the situation worse on single node but it may interleave write
1636  * results from multiple nodes due to short read handling in ll_file_aio_read().
1637  *
1638  * \param env - lu_env
1639  * \param iocb - kiocb from kernel
1640  * \param iter - user space buffers where the data will be copied
1641  *
1642  * \retval - number of bytes have been read, or error code if error occurred.
1643  */
1644 static ssize_t
1645 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1646 {
1647         ssize_t result;
1648
1649         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1650                 return 0;
1651
1652         /* NB: we can't do direct IO for fast read because it will need a lock
1653          * to make IO engine happy. */
1654         if (iocb->ki_filp->f_flags & O_DIRECT)
1655                 return 0;
1656
1657         result = generic_file_read_iter(iocb, iter);
1658
1659         /* If the first page is not in cache, generic_file_aio_read() will be
1660          * returned with -ENODATA.
1661          * See corresponding code in ll_readpage(). */
1662         if (result == -ENODATA)
1663                 result = 0;
1664
1665         if (result > 0)
1666                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1667                                 LPROC_LL_READ_BYTES, result);
1668
1669         return result;
1670 }
1671
1672 /*
1673  * Read from a file (through the page cache).
1674  */
1675 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1676 {
1677         struct lu_env *env;
1678         struct vvp_io_args *args;
1679         ssize_t result;
1680         ssize_t rc2;
1681         __u16 refcheck;
1682
1683         result = ll_do_fast_read(iocb, to);
1684         if (result < 0 || iov_iter_count(to) == 0)
1685                 GOTO(out, result);
1686
1687         env = cl_env_get(&refcheck);
1688         if (IS_ERR(env))
1689                 return PTR_ERR(env);
1690
1691         args = ll_env_args(env, IO_NORMAL);
1692         args->u.normal.via_iter = to;
1693         args->u.normal.via_iocb = iocb;
1694
1695         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1696                                  &iocb->ki_pos, iov_iter_count(to));
1697         if (rc2 > 0)
1698                 result += rc2;
1699         else if (result == 0)
1700                 result = rc2;
1701
1702         cl_env_put(env, &refcheck);
1703 out:
1704         return result;
1705 }
1706
1707 /**
1708  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1709  * If a page is already in the page cache and dirty (and some other things -
1710  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1711  * write to it without doing a full I/O, because Lustre already knows about it
1712  * and will write it out.  This saves a lot of processing time.
1713  *
1714  * All writes here are within one page, so exclusion is handled by the page
1715  * lock on the vm page.  We do not do tiny writes for writes which touch
1716  * multiple pages because it's very unlikely multiple sequential pages are
1717  * are already dirty.
1718  *
1719  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1720  * and are unlikely to be to already dirty pages.
1721  *
1722  * Attribute updates are important here, we do them in ll_tiny_write_end.
1723  */
1724 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1725 {
1726         ssize_t count = iov_iter_count(iter);
1727         struct file *file = iocb->ki_filp;
1728         struct inode *inode = file_inode(file);
1729         ssize_t result = 0;
1730
1731         ENTRY;
1732
1733         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1734          * of function for why.
1735          */
1736         if (count >= PAGE_SIZE ||
1737             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1738                 RETURN(0);
1739
1740         result = __generic_file_write_iter(iocb, iter);
1741
1742         /* If the page is not already dirty, ll_tiny_write_begin returns
1743          * -ENODATA.  We continue on to normal write.
1744          */
1745         if (result == -ENODATA)
1746                 result = 0;
1747
1748         if (result > 0) {
1749                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1750                                    result);
1751                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1752         }
1753
1754         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1755
1756         RETURN(result);
1757 }
1758
1759 /*
1760  * Write to a file (through the page cache).
1761  */
1762 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1763 {
1764         struct vvp_io_args *args;
1765         struct lu_env *env;
1766         ssize_t rc_tiny = 0, rc_normal;
1767         __u16 refcheck;
1768
1769         ENTRY;
1770
1771         /* NB: we can't do direct IO for tiny writes because they use the page
1772          * cache, we can't do sync writes because tiny writes can't flush
1773          * pages, and we can't do append writes because we can't guarantee the
1774          * required DLM locks are held to protect file size.
1775          */
1776         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1777             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1778                 rc_tiny = ll_do_tiny_write(iocb, from);
1779
1780         /* In case of error, go on and try normal write - Only stop if tiny
1781          * write completed I/O.
1782          */
1783         if (iov_iter_count(from) == 0)
1784                 GOTO(out, rc_normal = rc_tiny);
1785
1786         env = cl_env_get(&refcheck);
1787         if (IS_ERR(env))
1788                 return PTR_ERR(env);
1789
1790         args = ll_env_args(env, IO_NORMAL);
1791         args->u.normal.via_iter = from;
1792         args->u.normal.via_iocb = iocb;
1793
1794         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1795                                     &iocb->ki_pos, iov_iter_count(from));
1796
1797         /* On success, combine bytes written. */
1798         if (rc_tiny >= 0 && rc_normal > 0)
1799                 rc_normal += rc_tiny;
1800         /* On error, only return error from normal write if tiny write did not
1801          * write any bytes.  Otherwise return bytes written by tiny write.
1802          */
1803         else if (rc_tiny > 0)
1804                 rc_normal = rc_tiny;
1805
1806         cl_env_put(env, &refcheck);
1807 out:
1808         RETURN(rc_normal);
1809 }
1810
1811 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1812 /*
1813  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1814  */
1815 static int ll_file_get_iov_count(const struct iovec *iov,
1816                                  unsigned long *nr_segs, size_t *count)
1817 {
1818         size_t cnt = 0;
1819         unsigned long seg;
1820
1821         for (seg = 0; seg < *nr_segs; seg++) {
1822                 const struct iovec *iv = &iov[seg];
1823
1824                 /*
1825                  * If any segment has a negative length, or the cumulative
1826                  * length ever wraps negative then return -EINVAL.
1827                  */
1828                 cnt += iv->iov_len;
1829                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1830                         return -EINVAL;
1831                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1832                         continue;
1833                 if (seg == 0)
1834                         return -EFAULT;
1835                 *nr_segs = seg;
1836                 cnt -= iv->iov_len;     /* This segment is no good */
1837                 break;
1838         }
1839         *count = cnt;
1840         return 0;
1841 }
1842
1843 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1844                                 unsigned long nr_segs, loff_t pos)
1845 {
1846         struct iov_iter to;
1847         size_t iov_count;
1848         ssize_t result;
1849         ENTRY;
1850
1851         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1852         if (result)
1853                 RETURN(result);
1854
1855 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1856         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1857 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1858         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1859 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1860
1861         result = ll_file_read_iter(iocb, &to);
1862
1863         RETURN(result);
1864 }
1865
1866 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1867                             loff_t *ppos)
1868 {
1869         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1870         struct kiocb   kiocb;
1871         ssize_t        result;
1872         ENTRY;
1873
1874         init_sync_kiocb(&kiocb, file);
1875         kiocb.ki_pos = *ppos;
1876 #ifdef HAVE_KIOCB_KI_LEFT
1877         kiocb.ki_left = count;
1878 #elif defined(HAVE_KI_NBYTES)
1879         kiocb.i_nbytes = count;
1880 #endif
1881
1882         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1883         *ppos = kiocb.ki_pos;
1884
1885         RETURN(result);
1886 }
1887
1888 /*
1889  * Write to a file (through the page cache).
1890  * AIO stuff
1891  */
1892 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1893                                  unsigned long nr_segs, loff_t pos)
1894 {
1895         struct iov_iter from;
1896         size_t iov_count;
1897         ssize_t result;
1898         ENTRY;
1899
1900         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1901         if (result)
1902                 RETURN(result);
1903
1904 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1905         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1906 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1907         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1908 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1909
1910         result = ll_file_write_iter(iocb, &from);
1911
1912         RETURN(result);
1913 }
1914
1915 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1916                              size_t count, loff_t *ppos)
1917 {
1918         struct iovec   iov = { .iov_base = (void __user *)buf,
1919                                .iov_len = count };
1920         struct kiocb   kiocb;
1921         ssize_t        result;
1922
1923         ENTRY;
1924
1925         init_sync_kiocb(&kiocb, file);
1926         kiocb.ki_pos = *ppos;
1927 #ifdef HAVE_KIOCB_KI_LEFT
1928         kiocb.ki_left = count;
1929 #elif defined(HAVE_KI_NBYTES)
1930         kiocb.ki_nbytes = count;
1931 #endif
1932
1933         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1934         *ppos = kiocb.ki_pos;
1935
1936         RETURN(result);
1937 }
1938 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1939
1940 /*
1941  * Send file content (through pagecache) somewhere with helper
1942  */
1943 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1944                                    struct pipe_inode_info *pipe, size_t count,
1945                                    unsigned int flags)
1946 {
1947         struct lu_env      *env;
1948         struct vvp_io_args *args;
1949         ssize_t             result;
1950         __u16               refcheck;
1951         ENTRY;
1952
1953         env = cl_env_get(&refcheck);
1954         if (IS_ERR(env))
1955                 RETURN(PTR_ERR(env));
1956
1957         args = ll_env_args(env, IO_SPLICE);
1958         args->u.splice.via_pipe = pipe;
1959         args->u.splice.via_flags = flags;
1960
1961         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1962         cl_env_put(env, &refcheck);
1963         RETURN(result);
1964 }
1965
1966 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1967                              __u64 flags, struct lov_user_md *lum, int lum_size)
1968 {
1969         struct lookup_intent oit = {
1970                 .it_op = IT_OPEN,
1971                 .it_flags = flags | MDS_OPEN_BY_FID,
1972         };
1973         int rc;
1974         ENTRY;
1975
1976         ll_inode_size_lock(inode);
1977         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1978         if (rc < 0)
1979                 GOTO(out_unlock, rc);
1980
1981         ll_release_openhandle(dentry, &oit);
1982
1983 out_unlock:
1984         ll_inode_size_unlock(inode);
1985         ll_intent_release(&oit);
1986
1987         RETURN(rc);
1988 }
1989
1990 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1991                              struct lov_mds_md **lmmp, int *lmm_size,
1992                              struct ptlrpc_request **request)
1993 {
1994         struct ll_sb_info *sbi = ll_i2sbi(inode);
1995         struct mdt_body  *body;
1996         struct lov_mds_md *lmm = NULL;
1997         struct ptlrpc_request *req = NULL;
1998         struct md_op_data *op_data;
1999         int rc, lmmsize;
2000
2001         rc = ll_get_default_mdsize(sbi, &lmmsize);
2002         if (rc)
2003                 RETURN(rc);
2004
2005         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2006                                      strlen(filename), lmmsize,
2007                                      LUSTRE_OPC_ANY, NULL);
2008         if (IS_ERR(op_data))
2009                 RETURN(PTR_ERR(op_data));
2010
2011         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2012         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2013         ll_finish_md_op_data(op_data);
2014         if (rc < 0) {
2015                 CDEBUG(D_INFO, "md_getattr_name failed "
2016                        "on %s: rc %d\n", filename, rc);
2017                 GOTO(out, rc);
2018         }
2019
2020         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2021         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2022
2023         lmmsize = body->mbo_eadatasize;
2024
2025         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2026                         lmmsize == 0) {
2027                 GOTO(out, rc = -ENODATA);
2028         }
2029
2030         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2031         LASSERT(lmm != NULL);
2032
2033         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2034             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2035             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2036                 GOTO(out, rc = -EPROTO);
2037
2038         /*
2039          * This is coming from the MDS, so is probably in
2040          * little endian.  We convert it to host endian before
2041          * passing it to userspace.
2042          */
2043         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2044                 int stripe_count;
2045
2046                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2047                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2048                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2049                         if (le32_to_cpu(lmm->lmm_pattern) &
2050                             LOV_PATTERN_F_RELEASED)
2051                                 stripe_count = 0;
2052                 }
2053
2054                 /* if function called for directory - we should
2055                  * avoid swab not existent lsm objects */
2056                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2057                         lustre_swab_lov_user_md_v1(
2058                                         (struct lov_user_md_v1 *)lmm);
2059                         if (S_ISREG(body->mbo_mode))
2060                                 lustre_swab_lov_user_md_objects(
2061                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2062                                     stripe_count);
2063                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2064                         lustre_swab_lov_user_md_v3(
2065                                         (struct lov_user_md_v3 *)lmm);
2066                         if (S_ISREG(body->mbo_mode))
2067                                 lustre_swab_lov_user_md_objects(
2068                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2069                                     stripe_count);
2070                 } else if (lmm->lmm_magic ==
2071                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2072                         lustre_swab_lov_comp_md_v1(
2073                                         (struct lov_comp_md_v1 *)lmm);
2074                 }
2075         }
2076
2077 out:
2078         *lmmp = lmm;
2079         *lmm_size = lmmsize;
2080         *request = req;
2081         return rc;
2082 }
2083
2084 static int ll_lov_setea(struct inode *inode, struct file *file,
2085                         void __user *arg)
2086 {
2087         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2088         struct lov_user_md      *lump;
2089         int                      lum_size = sizeof(struct lov_user_md) +
2090                                             sizeof(struct lov_user_ost_data);
2091         int                      rc;
2092         ENTRY;
2093
2094         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2095                 RETURN(-EPERM);
2096
2097         OBD_ALLOC_LARGE(lump, lum_size);
2098         if (lump == NULL)
2099                 RETURN(-ENOMEM);
2100
2101         if (copy_from_user(lump, arg, lum_size))
2102                 GOTO(out_lump, rc = -EFAULT);
2103
2104         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2105                                       lum_size);
2106         cl_lov_delay_create_clear(&file->f_flags);
2107
2108 out_lump:
2109         OBD_FREE_LARGE(lump, lum_size);
2110         RETURN(rc);
2111 }
2112
2113 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2114 {
2115         struct lu_env   *env;
2116         __u16           refcheck;
2117         int             rc;
2118         ENTRY;
2119
2120         env = cl_env_get(&refcheck);
2121         if (IS_ERR(env))
2122                 RETURN(PTR_ERR(env));
2123
2124         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2125         cl_env_put(env, &refcheck);
2126         RETURN(rc);
2127 }
2128
2129 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2130                             void __user *arg)
2131 {
2132         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2133         struct lov_user_md        *klum;
2134         int                        lum_size, rc;
2135         __u64                      flags = FMODE_WRITE;
2136         ENTRY;
2137
2138         rc = ll_copy_user_md(lum, &klum);
2139         if (rc < 0)
2140                 RETURN(rc);
2141
2142         lum_size = rc;
2143         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2144                                       lum_size);
2145         if (!rc) {
2146                 __u32 gen;
2147
2148                 rc = put_user(0, &lum->lmm_stripe_count);
2149                 if (rc)
2150                         GOTO(out, rc);
2151
2152                 rc = ll_layout_refresh(inode, &gen);
2153                 if (rc)
2154                         GOTO(out, rc);
2155
2156                 rc = ll_file_getstripe(inode, arg, lum_size);
2157         }
2158         cl_lov_delay_create_clear(&file->f_flags);
2159
2160 out:
2161         OBD_FREE(klum, lum_size);
2162         RETURN(rc);
2163 }
2164
2165 static int
2166 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2167 {
2168         struct ll_inode_info *lli = ll_i2info(inode);
2169         struct cl_object *obj = lli->lli_clob;
2170         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2171         struct ll_grouplock grouplock;
2172         int rc;
2173         ENTRY;
2174
2175         if (arg == 0) {
2176                 CWARN("group id for group lock must not be 0\n");
2177                 RETURN(-EINVAL);
2178         }
2179
2180         if (ll_file_nolock(file))
2181                 RETURN(-EOPNOTSUPP);
2182
2183         spin_lock(&lli->lli_lock);
2184         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2185                 CWARN("group lock already existed with gid %lu\n",
2186                       fd->fd_grouplock.lg_gid);
2187                 spin_unlock(&lli->lli_lock);
2188                 RETURN(-EINVAL);
2189         }
2190         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2191         spin_unlock(&lli->lli_lock);
2192
2193         /**
2194          * XXX: group lock needs to protect all OST objects while PFL
2195          * can add new OST objects during the IO, so we'd instantiate
2196          * all OST objects before getting its group lock.
2197          */
2198         if (obj) {
2199                 struct lu_env *env;
2200                 __u16 refcheck;
2201                 struct cl_layout cl = {
2202                         .cl_is_composite = false,
2203                 };
2204                 struct lu_extent ext = {
2205                         .e_start = 0,
2206                         .e_end = OBD_OBJECT_EOF,
2207                 };
2208
2209                 env = cl_env_get(&refcheck);
2210                 if (IS_ERR(env))
2211                         RETURN(PTR_ERR(env));
2212
2213                 rc = cl_object_layout_get(env, obj, &cl);
2214                 if (!rc && cl.cl_is_composite)
2215                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2216                                                     &ext);
2217
2218                 cl_env_put(env, &refcheck);
2219                 if (rc)
2220                         RETURN(rc);
2221         }
2222
2223         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2224                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2225         if (rc)
2226                 RETURN(rc);
2227
2228         spin_lock(&lli->lli_lock);
2229         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2230                 spin_unlock(&lli->lli_lock);
2231                 CERROR("another thread just won the race\n");
2232                 cl_put_grouplock(&grouplock);
2233                 RETURN(-EINVAL);
2234         }
2235
2236         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2237         fd->fd_grouplock = grouplock;
2238         spin_unlock(&lli->lli_lock);
2239
2240         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2241         RETURN(0);
2242 }
2243
2244 static int ll_put_grouplock(struct inode *inode, struct file *file,
2245                             unsigned long arg)
2246 {
2247         struct ll_inode_info   *lli = ll_i2info(inode);
2248         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2249         struct ll_grouplock     grouplock;
2250         ENTRY;
2251
2252         spin_lock(&lli->lli_lock);
2253         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2254                 spin_unlock(&lli->lli_lock);
2255                 CWARN("no group lock held\n");
2256                 RETURN(-EINVAL);
2257         }
2258
2259         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2260
2261         if (fd->fd_grouplock.lg_gid != arg) {
2262                 CWARN("group lock %lu doesn't match current id %lu\n",
2263                       arg, fd->fd_grouplock.lg_gid);
2264                 spin_unlock(&lli->lli_lock);
2265                 RETURN(-EINVAL);
2266         }
2267
2268         grouplock = fd->fd_grouplock;
2269         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2270         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2271         spin_unlock(&lli->lli_lock);
2272
2273         cl_put_grouplock(&grouplock);
2274         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2275         RETURN(0);
2276 }
2277
2278 /**
2279  * Close inode open handle
2280  *
2281  * \param dentry [in]     dentry which contains the inode
2282  * \param it     [in,out] intent which contains open info and result
2283  *
2284  * \retval 0     success
2285  * \retval <0    failure
2286  */
2287 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2288 {
2289         struct inode *inode = dentry->d_inode;
2290         struct obd_client_handle *och;
2291         int rc;
2292         ENTRY;
2293
2294         LASSERT(inode);
2295
2296         /* Root ? Do nothing. */
2297         if (dentry->d_inode->i_sb->s_root == dentry)
2298                 RETURN(0);
2299
2300         /* No open handle to close? Move away */
2301         if (!it_disposition(it, DISP_OPEN_OPEN))
2302                 RETURN(0);
2303
2304         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2305
2306         OBD_ALLOC(och, sizeof(*och));
2307         if (!och)
2308                 GOTO(out, rc = -ENOMEM);
2309
2310         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2311
2312         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2313 out:
2314         /* this one is in place of ll_file_open */
2315         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2316                 ptlrpc_req_finished(it->it_request);
2317                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2318         }
2319         RETURN(rc);
2320 }
2321
2322 /**
2323  * Get size for inode for which FIEMAP mapping is requested.
2324  * Make the FIEMAP get_info call and returns the result.
2325  * \param fiemap        kernel buffer to hold extens
2326  * \param num_bytes     kernel buffer size
2327  */
2328 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2329                         size_t num_bytes)
2330 {
2331         struct lu_env                   *env;
2332         __u16                           refcheck;
2333         int                             rc = 0;
2334         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2335         ENTRY;
2336
2337         /* Checks for fiemap flags */
2338         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2339                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2340                 return -EBADR;
2341         }
2342
2343         /* Check for FIEMAP_FLAG_SYNC */
2344         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2345                 rc = filemap_fdatawrite(inode->i_mapping);
2346                 if (rc)
2347                         return rc;
2348         }
2349
2350         env = cl_env_get(&refcheck);
2351         if (IS_ERR(env))
2352                 RETURN(PTR_ERR(env));
2353
2354         if (i_size_read(inode) == 0) {
2355                 rc = ll_glimpse_size(inode);
2356                 if (rc)
2357                         GOTO(out, rc);
2358         }
2359
2360         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2361         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2362         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2363
2364         /* If filesize is 0, then there would be no objects for mapping */
2365         if (fmkey.lfik_oa.o_size == 0) {
2366                 fiemap->fm_mapped_extents = 0;
2367                 GOTO(out, rc = 0);
2368         }
2369
2370         fmkey.lfik_fiemap = *fiemap;
2371
2372         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2373                               &fmkey, fiemap, &num_bytes);
2374 out:
2375         cl_env_put(env, &refcheck);
2376         RETURN(rc);
2377 }
2378
2379 int ll_fid2path(struct inode *inode, void __user *arg)
2380 {
2381         struct obd_export       *exp = ll_i2mdexp(inode);
2382         const struct getinfo_fid2path __user *gfin = arg;
2383         __u32                    pathlen;
2384         struct getinfo_fid2path *gfout;
2385         size_t                   outsize;
2386         int                      rc;
2387
2388         ENTRY;
2389
2390         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2391             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2392                 RETURN(-EPERM);
2393
2394         /* Only need to get the buflen */
2395         if (get_user(pathlen, &gfin->gf_pathlen))
2396                 RETURN(-EFAULT);
2397
2398         if (pathlen > PATH_MAX)
2399                 RETURN(-EINVAL);
2400
2401         outsize = sizeof(*gfout) + pathlen;
2402         OBD_ALLOC(gfout, outsize);
2403         if (gfout == NULL)
2404                 RETURN(-ENOMEM);
2405
2406         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2407                 GOTO(gf_free, rc = -EFAULT);
2408         /* append root FID after gfout to let MDT know the root FID so that it
2409          * can lookup the correct path, this is mainly for fileset.
2410          * old server without fileset mount support will ignore this. */
2411         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2412
2413         /* Call mdc_iocontrol */
2414         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2415         if (rc != 0)
2416                 GOTO(gf_free, rc);
2417
2418         if (copy_to_user(arg, gfout, outsize))
2419                 rc = -EFAULT;
2420
2421 gf_free:
2422         OBD_FREE(gfout, outsize);
2423         RETURN(rc);
2424 }
2425
2426 static int
2427 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2428 {
2429         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2430         struct lu_env *env;
2431         struct cl_io *io;
2432         __u16  refcheck;
2433         int result;
2434
2435         ENTRY;
2436
2437         ioc->idv_version = 0;
2438         ioc->idv_layout_version = UINT_MAX;
2439
2440         /* If no file object initialized, we consider its version is 0. */
2441         if (obj == NULL)
2442                 RETURN(0);
2443
2444         env = cl_env_get(&refcheck);
2445         if (IS_ERR(env))
2446                 RETURN(PTR_ERR(env));
2447
2448         io = vvp_env_thread_io(env);
2449         io->ci_obj = obj;
2450         io->u.ci_data_version.dv_data_version = 0;
2451         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2452         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2453
2454 restart:
2455         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2456                 result = cl_io_loop(env, io);
2457         else
2458                 result = io->ci_result;
2459
2460         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2461         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2462
2463         cl_io_fini(env, io);
2464
2465         if (unlikely(io->ci_need_restart))
2466                 goto restart;
2467
2468         cl_env_put(env, &refcheck);
2469
2470         RETURN(result);
2471 }
2472
2473 /*
2474  * Read the data_version for inode.
2475  *
2476  * This value is computed using stripe object version on OST.
2477  * Version is computed using server side locking.
2478  *
2479  * @param flags if do sync on the OST side;
2480  *              0: no sync
2481  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2482  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2483  */
2484 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2485 {
2486         struct ioc_data_version ioc = { .idv_flags = flags };
2487         int rc;
2488
2489         rc = ll_ioc_data_version(inode, &ioc);
2490         if (!rc)
2491                 *data_version = ioc.idv_version;
2492
2493         return rc;
2494 }
2495
2496 /*
2497  * Trigger a HSM release request for the provided inode.
2498  */
2499 int ll_hsm_release(struct inode *inode)
2500 {
2501         struct lu_env *env;
2502         struct obd_client_handle *och = NULL;
2503         __u64 data_version = 0;
2504         int rc;
2505         __u16 refcheck;
2506         ENTRY;
2507
2508         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2509                ll_get_fsname(inode->i_sb, NULL, 0),
2510                PFID(&ll_i2info(inode)->lli_fid));
2511
2512         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2513         if (IS_ERR(och))
2514                 GOTO(out, rc = PTR_ERR(och));
2515
2516         /* Grab latest data_version and [am]time values */
2517         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2518         if (rc != 0)
2519                 GOTO(out, rc);
2520
2521         env = cl_env_get(&refcheck);
2522         if (IS_ERR(env))
2523                 GOTO(out, rc = PTR_ERR(env));
2524
2525         rc = ll_merge_attr(env, inode);
2526         cl_env_put(env, &refcheck);
2527
2528         /* If error happen, we have the wrong size for a file.
2529          * Don't release it.
2530          */
2531         if (rc != 0)
2532                 GOTO(out, rc);
2533
2534         /* Release the file.
2535          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2536          * we still need it to pack l_remote_handle to MDT. */
2537         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2538                                        &data_version);
2539         och = NULL;
2540
2541         EXIT;
2542 out:
2543         if (och != NULL && !IS_ERR(och)) /* close the file */
2544                 ll_lease_close(och, inode, NULL);
2545
2546         return rc;
2547 }
2548
/*
 * Working state of ll_swap_layouts(): the two inodes together with the data
 * versions supplied by the caller. Everything is kept per-inode so that the
 * pair can be swapped as a whole when the inodes are reordered.
 */
struct ll_swap_stack {
        /* data versions the caller expects for inode1 / inode2 */
        __u64                    dv1;
        __u64                    dv2;
        /* the two inodes taking part in the layout swap */
        struct inode            *inode1;
        struct inode            *inode2;
        /* whether dv1 / dv2 must be compared against the current version */
        bool                     check_dv1;
        bool                     check_dv2;
};
2557
2558 static int ll_swap_layouts(struct file *file1, struct file *file2,
2559                            struct lustre_swap_layouts *lsl)
2560 {
2561         struct mdc_swap_layouts  msl;
2562         struct md_op_data       *op_data;
2563         __u32                    gid;
2564         __u64                    dv;
2565         struct ll_swap_stack    *llss = NULL;
2566         int                      rc;
2567
2568         OBD_ALLOC_PTR(llss);
2569         if (llss == NULL)
2570                 RETURN(-ENOMEM);
2571
2572         llss->inode1 = file_inode(file1);
2573         llss->inode2 = file_inode(file2);
2574
2575         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2576         if (rc < 0)
2577                 GOTO(free, rc);
2578
2579         /* we use 2 bool because it is easier to swap than 2 bits */
2580         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2581                 llss->check_dv1 = true;
2582
2583         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2584                 llss->check_dv2 = true;
2585
2586         /* we cannot use lsl->sl_dvX directly because we may swap them */
2587         llss->dv1 = lsl->sl_dv1;
2588         llss->dv2 = lsl->sl_dv2;
2589
2590         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2591         if (rc == 0) /* same file, done! */
2592                 GOTO(free, rc);
2593
2594         if (rc < 0) { /* sequentialize it */
2595                 swap(llss->inode1, llss->inode2);
2596                 swap(file1, file2);
2597                 swap(llss->dv1, llss->dv2);
2598                 swap(llss->check_dv1, llss->check_dv2);
2599         }
2600
2601         gid = lsl->sl_gid;
2602         if (gid != 0) { /* application asks to flush dirty cache */
2603                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2604                 if (rc < 0)
2605                         GOTO(free, rc);
2606
2607                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2608                 if (rc < 0) {
2609                         ll_put_grouplock(llss->inode1, file1, gid);
2610                         GOTO(free, rc);
2611                 }
2612         }
2613
2614         /* ultimate check, before swaping the layouts we check if
2615          * dataversion has changed (if requested) */
2616         if (llss->check_dv1) {
2617                 rc = ll_data_version(llss->inode1, &dv, 0);
2618                 if (rc)
2619                         GOTO(putgl, rc);
2620                 if (dv != llss->dv1)
2621                         GOTO(putgl, rc = -EAGAIN);
2622         }
2623
2624         if (llss->check_dv2) {
2625                 rc = ll_data_version(llss->inode2, &dv, 0);
2626                 if (rc)
2627                         GOTO(putgl, rc);
2628                 if (dv != llss->dv2)
2629                         GOTO(putgl, rc = -EAGAIN);
2630         }
2631
2632         /* struct md_op_data is used to send the swap args to the mdt
2633          * only flags is missing, so we use struct mdc_swap_layouts
2634          * through the md_op_data->op_data */
2635         /* flags from user space have to be converted before they are send to
2636          * server, no flag is sent today, they are only used on the client */
2637         msl.msl_flags = 0;
2638         rc = -ENOMEM;
2639         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2640                                      0, LUSTRE_OPC_ANY, &msl);
2641         if (IS_ERR(op_data))
2642                 GOTO(free, rc = PTR_ERR(op_data));
2643
2644         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2645                            sizeof(*op_data), op_data, NULL);
2646         ll_finish_md_op_data(op_data);
2647
2648         if (rc < 0)
2649                 GOTO(putgl, rc);
2650
2651 putgl:
2652         if (gid != 0) {
2653                 ll_put_grouplock(llss->inode2, file2, gid);
2654                 ll_put_grouplock(llss->inode1, file1, gid);
2655         }
2656
2657 free:
2658         if (llss != NULL)
2659                 OBD_FREE_PTR(llss);
2660
2661         RETURN(rc);
2662 }
2663
2664 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2665 {
2666         struct md_op_data       *op_data;
2667         int                      rc;
2668         ENTRY;
2669
2670         /* Detect out-of range masks */
2671         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2672                 RETURN(-EINVAL);
2673
2674         /* Non-root users are forbidden to set or clear flags which are
2675          * NOT defined in HSM_USER_MASK. */
2676         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2677             !cfs_capable(CFS_CAP_SYS_ADMIN))
2678                 RETURN(-EPERM);
2679
2680         /* Detect out-of range archive id */
2681         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2682             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2683                 RETURN(-EINVAL);
2684
2685         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2686                                      LUSTRE_OPC_ANY, hss);
2687         if (IS_ERR(op_data))
2688                 RETURN(PTR_ERR(op_data));
2689
2690         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2691                            sizeof(*op_data), op_data, NULL);
2692
2693         ll_finish_md_op_data(op_data);
2694
2695         RETURN(rc);
2696 }
2697
2698 static int ll_hsm_import(struct inode *inode, struct file *file,
2699                          struct hsm_user_import *hui)
2700 {
2701         struct hsm_state_set    *hss = NULL;
2702         struct iattr            *attr = NULL;
2703         int                      rc;
2704         ENTRY;
2705
2706         if (!S_ISREG(inode->i_mode))
2707                 RETURN(-EINVAL);
2708
2709         /* set HSM flags */
2710         OBD_ALLOC_PTR(hss);
2711         if (hss == NULL)
2712                 GOTO(out, rc = -ENOMEM);
2713
2714         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2715         hss->hss_archive_id = hui->hui_archive_id;
2716         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2717         rc = ll_hsm_state_set(inode, hss);
2718         if (rc != 0)
2719                 GOTO(out, rc);
2720
2721         OBD_ALLOC_PTR(attr);
2722         if (attr == NULL)
2723                 GOTO(out, rc = -ENOMEM);
2724
2725         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2726         attr->ia_mode |= S_IFREG;
2727         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2728         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2729         attr->ia_size = hui->hui_size;
2730         attr->ia_mtime.tv_sec = hui->hui_mtime;
2731         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2732         attr->ia_atime.tv_sec = hui->hui_atime;
2733         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2734
2735         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2736                          ATTR_UID | ATTR_GID |
2737                          ATTR_MTIME | ATTR_MTIME_SET |
2738                          ATTR_ATIME | ATTR_ATIME_SET;
2739
2740         inode_lock(inode);
2741
2742         rc = ll_setattr_raw(file_dentry(file), attr, true);
2743         if (rc == -ENODATA)
2744                 rc = 0;
2745
2746         inode_unlock(inode);
2747
2748 out:
2749         if (hss != NULL)
2750                 OBD_FREE_PTR(hss);
2751
2752         if (attr != NULL)
2753                 OBD_FREE_PTR(attr);
2754
2755         RETURN(rc);
2756 }
2757
2758 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2759 {
2760         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2761                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2762 }
2763
2764 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2765 {
2766         struct inode *inode = file_inode(file);
2767         struct iattr ia = {
2768                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2769                             ATTR_MTIME | ATTR_MTIME_SET |
2770                             ATTR_CTIME | ATTR_CTIME_SET,
2771                 .ia_atime = {
2772                         .tv_sec = lfu->lfu_atime_sec,
2773                         .tv_nsec = lfu->lfu_atime_nsec,
2774                 },
2775                 .ia_mtime = {
2776                         .tv_sec = lfu->lfu_mtime_sec,
2777                         .tv_nsec = lfu->lfu_mtime_nsec,
2778                 },
2779                 .ia_ctime = {
2780                         .tv_sec = lfu->lfu_ctime_sec,
2781                         .tv_nsec = lfu->lfu_ctime_nsec,
2782                 },
2783         };
2784         int rc;
2785         ENTRY;
2786
2787         if (!capable(CAP_SYS_ADMIN))
2788                 RETURN(-EPERM);
2789
2790         if (!S_ISREG(inode->i_mode))
2791                 RETURN(-EINVAL);
2792
2793         inode_lock(inode);
2794         rc = ll_setattr_raw(file_dentry(file), &ia, false);
2795         inode_unlock(inode);
2796
2797         RETURN(rc);
2798 }
2799
2800 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2801 {
2802         switch (mode) {
2803         case MODE_READ_USER:
2804                 return CLM_READ;
2805         case MODE_WRITE_USER:
2806                 return CLM_WRITE;
2807         default:
2808                 return -EINVAL;
2809         }
2810 }
2811
2812 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2813
2814 /* Used to allow the upper layers of the client to request an LDLM lock
2815  * without doing an actual read or write.
2816  *
2817  * Used for ladvise lockahead to manually request specific locks.
2818  *
2819  * \param[in] file      file this ladvise lock request is on
2820  * \param[in] ladvise   ladvise struct describing this lock request
2821  *
2822  * \retval 0            success, no detailed result available (sync requests
2823  *                      and requests sent to the server [not handled locally]
2824  *                      cannot return detailed results)
2825  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2826  *                                       see definitions for details.
2827  * \retval negative     negative errno on error
2828  */
2829 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2830 {
2831         struct lu_env *env = NULL;
2832         struct cl_io *io  = NULL;
2833         struct cl_lock *lock = NULL;
2834         struct cl_lock_descr *descr = NULL;
2835         struct dentry *dentry = file->f_path.dentry;
2836         struct inode *inode = dentry->d_inode;
2837         enum cl_lock_mode cl_mode;
2838         off_t start = ladvise->lla_start;
2839         off_t end = ladvise->lla_end;
2840         int result;
2841         __u16 refcheck;
2842
2843         ENTRY;
2844
2845         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2846                "start=%llu, end=%llu\n", dentry->d_name.len,
2847                dentry->d_name.name, dentry->d_inode,
2848                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2849                (__u64) end);
2850
2851         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2852         if (cl_mode < 0)
2853                 GOTO(out, result = cl_mode);
2854
2855         /* Get IO environment */
2856         result = cl_io_get(inode, &env, &io, &refcheck);
2857         if (result <= 0)
2858                 GOTO(out, result);
2859
2860         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2861         if (result > 0) {
2862                 /*
2863                  * nothing to do for this io. This currently happens when
2864                  * stripe sub-object's are not yet created.
2865                  */
2866                 result = io->ci_result;
2867         } else if (result == 0) {
2868                 lock = vvp_env_lock(env);
2869                 descr = &lock->cll_descr;
2870
2871                 descr->cld_obj   = io->ci_obj;
2872                 /* Convert byte offsets to pages */
2873                 descr->cld_start = cl_index(io->ci_obj, start);
2874                 descr->cld_end   = cl_index(io->ci_obj, end);
2875                 descr->cld_mode  = cl_mode;
2876                 /* CEF_MUST is used because we do not want to convert a
2877                  * lockahead request to a lockless lock */
2878                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2879                                        CEF_NONBLOCK;
2880
2881                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2882                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2883
2884                 result = cl_lock_request(env, io, lock);
2885
2886                 /* On success, we need to release the lock */
2887                 if (result >= 0)
2888                         cl_lock_release(env, lock);
2889         }
2890         cl_io_fini(env, io);
2891         cl_env_put(env, &refcheck);
2892
2893         /* -ECANCELED indicates a matching lock with a different extent
2894          * was already present, and -EEXIST indicates a matching lock
2895          * on exactly the same extent was already present.
2896          * We convert them to positive values for userspace to make
2897          * recognizing true errors easier.
2898          * Note we can only return these detailed results on async requests,
2899          * as sync requests look the same as i/o requests for locking. */
2900         if (result == -ECANCELED)
2901                 result = LLA_RESULT_DIFFERENT;
2902         else if (result == -EEXIST)
2903                 result = LLA_RESULT_SAME;
2904
2905 out:
2906         RETURN(result);
2907 }
2908 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2909
2910 static int ll_ladvise_sanity(struct inode *inode,
2911                              struct llapi_lu_ladvise *ladvise)
2912 {
2913         enum lu_ladvise_type advice = ladvise->lla_advice;
2914         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2915          * be in the first 32 bits of enum ladvise_flags */
2916         __u32 flags = ladvise->lla_peradvice_flags;
2917         /* 3 lines at 80 characters per line, should be plenty */
2918         int rc = 0;
2919
2920         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2921                 rc = -EINVAL;
2922                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2923                        "last supported advice is %s (value '%d'): rc = %d\n",
2924                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2925                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2926                 GOTO(out, rc);
2927         }
2928
2929         /* Per-advice checks */
2930         switch (advice) {
2931         case LU_LADVISE_LOCKNOEXPAND:
2932                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2933                         rc = -EINVAL;
2934                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2935                                "rc = %d\n",
2936                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2937                                ladvise_names[advice], rc);
2938                         GOTO(out, rc);
2939                 }
2940                 break;
2941         case LU_LADVISE_LOCKAHEAD:
2942                 /* Currently only READ and WRITE modes can be requested */
2943                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2944                     ladvise->lla_lockahead_mode == 0) {
2945                         rc = -EINVAL;
2946                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2947                                "rc = %d\n",
2948                                ll_get_fsname(inode->i_sb, NULL, 0),
2949                                ladvise->lla_lockahead_mode,
2950                                ladvise_names[advice], rc);
2951                         GOTO(out, rc);
2952                 }
2953         case LU_LADVISE_WILLREAD:
2954         case LU_LADVISE_DONTNEED:
2955         default:
2956                 /* Note fall through above - These checks apply to all advices
2957                  * except LOCKNOEXPAND */
2958                 if (flags & ~LF_DEFAULT_MASK) {
2959                         rc = -EINVAL;
2960                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2961                                "rc = %d\n",
2962                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2963                                ladvise_names[advice], rc);
2964                         GOTO(out, rc);
2965                 }
2966                 if (ladvise->lla_start >= ladvise->lla_end) {
2967                         rc = -EINVAL;
2968                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2969                                "for %s: rc = %d\n",
2970                                ll_get_fsname(inode->i_sb, NULL, 0),
2971                                ladvise->lla_start, ladvise->lla_end,
2972                                ladvise_names[advice], rc);
2973                         GOTO(out, rc);
2974                 }
2975                 break;
2976         }
2977
2978 out:
2979         return rc;
2980 }
2981 #undef ERRSIZE
2982
2983 /*
2984  * Give file access advices
2985  *
2986  * The ladvise interface is similar to Linux fadvise() system call, except it
2987  * forwards the advices directly from Lustre client to server. The server side
2988  * codes will apply appropriate read-ahead and caching techniques for the
2989  * corresponding files.
2990  *
2991  * A typical workload for ladvise is e.g. a bunch of different clients are
2992  * doing small random reads of a file, so prefetching pages into OSS cache
2993  * with big linear reads before the random IO is a net benefit. Fetching
2994  * all that data into each client cache with fadvise() may not be, due to
2995  * much more data being sent to the client.
2996  */
2997 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2998                       struct llapi_lu_ladvise *ladvise)
2999 {
3000         struct lu_env *env;
3001         struct cl_io *io;
3002         struct cl_ladvise_io *lio;
3003         int rc;
3004         __u16 refcheck;
3005         ENTRY;
3006
3007         env = cl_env_get(&refcheck);
3008         if (IS_ERR(env))
3009                 RETURN(PTR_ERR(env));
3010
3011         io = vvp_env_thread_io(env);
3012         io->ci_obj = ll_i2info(inode)->lli_clob;
3013
3014         /* initialize parameters for ladvise */
3015         lio = &io->u.ci_ladvise;
3016         lio->li_start = ladvise->lla_start;
3017         lio->li_end = ladvise->lla_end;
3018         lio->li_fid = ll_inode2fid(inode);
3019         lio->li_advice = ladvise->lla_advice;
3020         lio->li_flags = flags;
3021
3022         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3023                 rc = cl_io_loop(env, io);
3024         else
3025                 rc = io->ci_result;
3026
3027         cl_io_fini(env, io);
3028         cl_env_put(env, &refcheck);
3029         RETURN(rc);
3030 }
3031
3032 static int ll_lock_noexpand(struct file *file, int flags)
3033 {
3034         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3035
3036         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3037
3038         return 0;
3039 }
3040
3041 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3042                         unsigned long arg)
3043 {
3044         struct fsxattr fsxattr;
3045
3046         if (copy_from_user(&fsxattr,
3047                            (const struct fsxattr __user *)arg,
3048                            sizeof(fsxattr)))
3049                 RETURN(-EFAULT);
3050
3051         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3052         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3053                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3054         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3055         if (copy_to_user((struct fsxattr __user *)arg,
3056                          &fsxattr, sizeof(fsxattr)))
3057                 RETURN(-EFAULT);
3058
3059         RETURN(0);
3060 }
3061
3062 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3063                         unsigned long arg)
3064 {
3065
3066         struct md_op_data *op_data;
3067         struct ptlrpc_request *req = NULL;
3068         int rc = 0;
3069         struct fsxattr fsxattr;
3070         struct cl_object *obj;
3071         int flags;
3072
3073         /* only root could change project ID */
3074         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3075                 RETURN(-EPERM);
3076
3077         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3078                                      LUSTRE_OPC_ANY, NULL);
3079         if (IS_ERR(op_data))
3080                 RETURN(PTR_ERR(op_data));
3081
3082         if (copy_from_user(&fsxattr,
3083                            (const struct fsxattr __user *)arg,
3084                            sizeof(fsxattr)))
3085                 GOTO(out_fsxattr1, rc = -EFAULT);
3086
3087         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3088         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3089         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3090                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3091         op_data->op_projid = fsxattr.fsx_projid;
3092         op_data->op_attr.ia_valid |= (MDS_ATTR_PROJID | ATTR_ATTR_FLAG);
3093         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3094                         0, &req);
3095         ptlrpc_req_finished(req);
3096
3097         obj = ll_i2info(inode)->lli_clob;
3098         if (obj) {
3099                 struct iattr *attr;
3100
3101                 ll_update_inode_flags(inode, op_data->op_attr_flags);
3102                 OBD_ALLOC_PTR(attr);
3103                 if (attr == NULL)
3104                         GOTO(out_fsxattr1, rc = -ENOMEM);
3105                 attr->ia_valid = ATTR_ATTR_FLAG;
3106                 rc = cl_setattr_ost(obj, attr, fsxattr.fsx_xflags);
3107
3108                 OBD_FREE_PTR(attr);
3109         }
3110 out_fsxattr1:
3111         ll_finish_md_op_data(op_data);
3112         RETURN(rc);
3113 }
3114
3115 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3116                                  unsigned long arg)
3117 {
3118         struct inode            *inode = file_inode(file);
3119         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3120         struct ll_inode_info    *lli = ll_i2info(inode);
3121         struct obd_client_handle *och = NULL;
3122         struct split_param sp;
3123         bool lease_broken;
3124         fmode_t fmode = 0;
3125         enum mds_op_bias bias = 0;
3126         struct file *layout_file = NULL;
3127         void *data = NULL;
3128         size_t data_size = 0;
3129         long rc;
3130         ENTRY;
3131
3132         mutex_lock(&lli->lli_och_mutex);
3133         if (fd->fd_lease_och != NULL) {
3134                 och = fd->fd_lease_och;
3135                 fd->fd_lease_och = NULL;
3136         }
3137         mutex_unlock(&lli->lli_och_mutex);
3138
3139         if (och == NULL)
3140                 GOTO(out, rc = -ENOLCK);
3141
3142         fmode = och->och_flags;
3143
3144         switch (ioc->lil_flags) {
3145         case LL_LEASE_RESYNC_DONE:
3146                 if (ioc->lil_count > IOC_IDS_MAX)
3147                         GOTO(out, rc = -EINVAL);
3148
3149                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3150                 OBD_ALLOC(data, data_size);
3151                 if (!data)
3152                         GOTO(out, rc = -ENOMEM);
3153
3154                 if (copy_from_user(data, (void __user *)arg, data_size))
3155                         GOTO(out, rc = -EFAULT);
3156
3157                 bias = MDS_CLOSE_RESYNC_DONE;
3158                 break;
3159         case LL_LEASE_LAYOUT_MERGE: {
3160                 int fd;
3161
3162                 if (ioc->lil_count != 1)
3163                         GOTO(out, rc = -EINVAL);
3164
3165                 arg += sizeof(*ioc);
3166                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3167                         GOTO(out, rc = -EFAULT);
3168
3169                 layout_file = fget(fd);
3170                 if (!layout_file)
3171                         GOTO(out, rc = -EBADF);
3172
3173                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3174                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3175                         GOTO(out, rc = -EPERM);
3176
3177                 data = file_inode(layout_file);
3178                 bias = MDS_CLOSE_LAYOUT_MERGE;
3179                 break;
3180         }
3181         case LL_LEASE_LAYOUT_SPLIT: {
3182                 int fdv;
3183                 int mirror_id;
3184
3185                 if (ioc->lil_count != 2)
3186                         GOTO(out, rc = -EINVAL);
3187
3188                 arg += sizeof(*ioc);
3189                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3190                         GOTO(out, rc = -EFAULT);
3191
3192                 arg += sizeof(__u32);
3193                 if (copy_from_user(&mirror_id, (void __user *)arg,
3194                                    sizeof(__u32)))
3195                         GOTO(out, rc = -EFAULT);
3196
3197                 layout_file = fget(fdv);
3198                 if (!layout_file)
3199                         GOTO(out, rc = -EBADF);
3200
3201                 sp.sp_inode = file_inode(layout_file);
3202                 sp.sp_mirror_id = (__u16)mirror_id;
3203                 data = &sp;
3204                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3205                 break;
3206         }
3207         default:
3208                 /* without close intent */
3209                 break;
3210         }
3211
3212         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3213         if (rc < 0)
3214                 GOTO(out, rc);
3215
3216         rc = ll_lease_och_release(inode, file);
3217         if (rc < 0)
3218                 GOTO(out, rc);
3219
3220         if (lease_broken)
3221                 fmode = 0;
3222         EXIT;
3223
3224 out:
3225         switch (ioc->lil_flags) {
3226         case LL_LEASE_RESYNC_DONE:
3227                 if (data)
3228                         OBD_FREE(data, data_size);
3229                 break;
3230         case LL_LEASE_LAYOUT_MERGE:
3231         case LL_LEASE_LAYOUT_SPLIT:
3232                 if (layout_file)
3233                         fput(layout_file);
3234                 break;
3235         }
3236
3237         if (!rc)
3238                 rc = ll_lease_type_from_fmode(fmode);
3239         RETURN(rc);
3240 }
3241
3242 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3243                               unsigned long arg)
3244 {
3245         struct inode *inode = file_inode(file);
3246         struct ll_inode_info *lli = ll_i2info(inode);
3247         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3248         struct obd_client_handle *och = NULL;
3249         __u64 open_flags = 0;
3250         bool lease_broken;
3251         fmode_t fmode;
3252         long rc;
3253         ENTRY;
3254
3255         switch (ioc->lil_mode) {
3256         case LL_LEASE_WRLCK:
3257                 if (!(file->f_mode & FMODE_WRITE))
3258                         RETURN(-EPERM);
3259                 fmode = FMODE_WRITE;
3260                 break;
3261         case LL_LEASE_RDLCK:
3262                 if (!(file->f_mode & FMODE_READ))
3263                         RETURN(-EPERM);
3264                 fmode = FMODE_READ;
3265                 break;
3266         case LL_LEASE_UNLCK:
3267                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3268         default:
3269                 RETURN(-EINVAL);
3270         }
3271
3272         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3273
3274         /* apply for lease */
3275         if (ioc->lil_flags & LL_LEASE_RESYNC)
3276                 open_flags = MDS_OPEN_RESYNC;
3277         och = ll_lease_open(inode, file, fmode, open_flags);
3278         if (IS_ERR(och))
3279                 RETURN(PTR_ERR(och));
3280
3281         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3282                 rc = ll_lease_file_resync(och, inode);
3283                 if (rc) {
3284                         ll_lease_close(och, inode, NULL);
3285                         RETURN(rc);
3286                 }
3287                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3288                 if (rc) {
3289                         ll_lease_close(och, inode, NULL);
3290                         RETURN(rc);
3291                 }
3292         }
3293
3294         rc = 0;
3295         mutex_lock(&lli->lli_och_mutex);
3296         if (fd->fd_lease_och == NULL) {
3297                 fd->fd_lease_och = och;
3298                 och = NULL;
3299         }
3300         mutex_unlock(&lli->lli_och_mutex);
3301         if (och != NULL) {
3302                 /* impossible now that only excl is supported for now */
3303                 ll_lease_close(och, inode, &lease_broken);
3304                 rc = -EBUSY;
3305         }
3306         RETURN(rc);
3307 }
3308
3309 static long
3310 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3311 {
3312         struct inode            *inode = file_inode(file);
3313         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3314         int                      flags, rc;
3315         ENTRY;
3316
3317         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3318                PFID(ll_inode2fid(inode)), inode, cmd);
3319         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3320
3321         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3322         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3323                 RETURN(-ENOTTY);
3324
3325         switch (cmd) {
3326         case LL_IOC_GETFLAGS:
3327                 /* Get the current value of the file flags */
3328                 return put_user(fd->fd_flags, (int __user *)arg);
3329         case LL_IOC_SETFLAGS:
3330         case LL_IOC_CLRFLAGS:
3331                 /* Set or clear specific file flags */
3332                 /* XXX This probably needs checks to ensure the flags are
3333                  *     not abused, and to handle any flag side effects.
3334                  */
3335                 if (get_user(flags, (int __user *) arg))
3336                         RETURN(-EFAULT);
3337
3338                 if (cmd == LL_IOC_SETFLAGS) {
3339                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3340                             !(file->f_flags & O_DIRECT)) {
3341                                 CERROR("%s: unable to disable locking on "
3342                                        "non-O_DIRECT file\n", current->comm);
3343                                 RETURN(-EINVAL);
3344                         }
3345
3346                         fd->fd_flags |= flags;
3347                 } else {
3348                         fd->fd_flags &= ~flags;
3349                 }
3350                 RETURN(0);
3351         case LL_IOC_LOV_SETSTRIPE:
3352         case LL_IOC_LOV_SETSTRIPE_NEW:
3353                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3354         case LL_IOC_LOV_SETEA:
3355                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3356         case LL_IOC_LOV_SWAP_LAYOUTS: {
3357                 struct file *file2;
3358                 struct lustre_swap_layouts lsl;
3359
3360                 if (copy_from_user(&lsl, (char __user *)arg,
3361                                    sizeof(struct lustre_swap_layouts)))
3362                         RETURN(-EFAULT);
3363
3364                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3365                         RETURN(-EPERM);
3366
3367                 file2 = fget(lsl.sl_fd);
3368                 if (file2 == NULL)
3369                         RETURN(-EBADF);
3370
3371                 /* O_WRONLY or O_RDWR */
3372                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3373                         GOTO(out, rc = -EPERM);
3374
3375                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3376                         struct inode                    *inode2;
3377                         struct ll_inode_info            *lli;
3378                         struct obd_client_handle        *och = NULL;
3379
3380                         lli = ll_i2info(inode);
3381                         mutex_lock(&lli->lli_och_mutex);
3382                         if (fd->fd_lease_och != NULL) {
3383                                 och = fd->fd_lease_och;
3384                                 fd->fd_lease_och = NULL;
3385                         }
3386                         mutex_unlock(&lli->lli_och_mutex);
3387                         if (och == NULL)
3388                                 GOTO(out, rc = -ENOLCK);
3389                         inode2 = file_inode(file2);
3390                         rc = ll_swap_layouts_close(och, inode, inode2);
3391                 } else {
3392                         rc = ll_swap_layouts(file, file2, &lsl);
3393                 }
3394 out:
3395                 fput(file2);
3396                 RETURN(rc);
3397         }
3398         case LL_IOC_LOV_GETSTRIPE:
3399         case LL_IOC_LOV_GETSTRIPE_NEW:
3400                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3401         case FS_IOC_GETFLAGS:
3402         case FS_IOC_SETFLAGS:
3403                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3404         case FSFILT_IOC_GETVERSION:
3405         case FS_IOC_GETVERSION:
3406                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3407         /* We need to special case any other ioctls we want to handle,
3408          * to send them to the MDS/OST as appropriate and to properly
3409          * network encode the arg field. */
3410         case FS_IOC_SETVERSION:
3411                 RETURN(-ENOTSUPP);
3412
3413         case LL_IOC_GROUP_LOCK:
3414                 RETURN(ll_get_grouplock(inode, file, arg));
3415         case LL_IOC_GROUP_UNLOCK:
3416                 RETURN(ll_put_grouplock(inode, file, arg));
3417         case IOC_OBD_STATFS:
3418                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3419
3420         case LL_IOC_FLUSHCTX:
3421                 RETURN(ll_flush_ctx(inode));
3422         case LL_IOC_PATH2FID: {
3423                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3424                                  sizeof(struct lu_fid)))
3425                         RETURN(-EFAULT);
3426
3427                 RETURN(0);
3428         }
3429         case LL_IOC_GETPARENT:
3430                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3431
3432         case OBD_IOC_FID2PATH:
3433                 RETURN(ll_fid2path(inode, (void __user *)arg));
3434         case LL_IOC_DATA_VERSION: {
3435                 struct ioc_data_version idv;
3436                 int rc;
3437
3438                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3439                         RETURN(-EFAULT);
3440
3441                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3442                 rc = ll_ioc_data_version(inode, &idv);
3443
3444                 if (rc == 0 &&
3445                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3446                         RETURN(-EFAULT);
3447
3448                 RETURN(rc);
3449         }
3450
3451         case LL_IOC_GET_MDTIDX: {
3452                 int mdtidx;
3453
3454                 mdtidx = ll_get_mdt_idx(inode);
3455                 if (mdtidx < 0)
3456                         RETURN(mdtidx);
3457
3458                 if (put_user((int)mdtidx, (int __user *)arg))
3459                         RETURN(-EFAULT);
3460
3461                 RETURN(0);
3462         }
3463         case OBD_IOC_GETDTNAME:
3464         case OBD_IOC_GETMDNAME:
3465                 RETURN(ll_get_obd_name(inode, cmd, arg));
3466         case LL_IOC_HSM_STATE_GET: {
3467                 struct md_op_data       *op_data;
3468                 struct hsm_user_state   *hus;
3469                 int                      rc;
3470
3471                 OBD_ALLOC_PTR(hus);
3472                 if (hus == NULL)
3473                         RETURN(-ENOMEM);
3474
3475                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3476                                              LUSTRE_OPC_ANY, hus);
3477                 if (IS_ERR(op_data)) {
3478                         OBD_FREE_PTR(hus);
3479                         RETURN(PTR_ERR(op_data));
3480                 }
3481
3482                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3483                                    op_data, NULL);
3484
3485                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3486                         rc = -EFAULT;
3487
3488                 ll_finish_md_op_data(op_data);
3489                 OBD_FREE_PTR(hus);
3490                 RETURN(rc);
3491         }
3492         case LL_IOC_HSM_STATE_SET: {
3493                 struct hsm_state_set    *hss;
3494                 int                      rc;
3495
3496                 OBD_ALLOC_PTR(hss);
3497                 if (hss == NULL)
3498                         RETURN(-ENOMEM);
3499
3500                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3501                         OBD_FREE_PTR(hss);
3502                         RETURN(-EFAULT);
3503                 }
3504
3505                 rc = ll_hsm_state_set(inode, hss);
3506
3507                 OBD_FREE_PTR(hss);
3508                 RETURN(rc);
3509         }
3510         case LL_IOC_HSM_ACTION: {
3511                 struct md_op_data               *op_data;
3512                 struct hsm_current_action       *hca;
3513                 int                              rc;
3514
3515                 OBD_ALLOC_PTR(hca);
3516                 if (hca == NULL)
3517                         RETURN(-ENOMEM);
3518
3519                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3520                                              LUSTRE_OPC_ANY, hca);
3521                 if (IS_ERR(op_data)) {
3522                         OBD_FREE_PTR(hca);
3523                         RETURN(PTR_ERR(op_data));
3524                 }
3525
3526                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3527                                    op_data, NULL);
3528
3529                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3530                         rc = -EFAULT;
3531
3532                 ll_finish_md_op_data(op_data);
3533                 OBD_FREE_PTR(hca);
3534                 RETURN(rc);
3535         }
3536         case LL_IOC_SET_LEASE_OLD: {
3537                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3538
3539                 RETURN(ll_file_set_lease(file, &ioc, 0));
3540         }
3541         case LL_IOC_SET_LEASE: {
3542                 struct ll_ioc_lease ioc;
3543
3544                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3545                         RETURN(-EFAULT);
3546
3547                 RETURN(ll_file_set_lease(file, &ioc, arg));
3548         }
3549         case LL_IOC_GET_LEASE: {
3550                 struct ll_inode_info *lli = ll_i2info(inode);
3551                 struct ldlm_lock *lock = NULL;
3552                 fmode_t fmode = 0;
3553
3554                 mutex_lock(&lli->lli_och_mutex);
3555                 if (fd->fd_lease_och != NULL) {
3556                         struct obd_client_handle *och = fd->fd_lease_och;
3557
3558                         lock = ldlm_handle2lock(&och->och_lease_handle);
3559                         if (lock != NULL) {
3560                                 lock_res_and_lock(lock);
3561                                 if (!ldlm_is_cancel(lock))
3562                                         fmode = och->och_flags;
3563
3564                                 unlock_res_and_lock(lock);
3565                                 LDLM_LOCK_PUT(lock);
3566                         }
3567                 }
3568                 mutex_unlock(&lli->lli_och_mutex);
3569
3570                 RETURN(ll_lease_type_from_fmode(fmode));
3571         }
3572         case LL_IOC_HSM_IMPORT: {
3573                 struct hsm_user_import *hui;
3574
3575                 OBD_ALLOC_PTR(hui);
3576                 if (hui == NULL)
3577                         RETURN(-ENOMEM);
3578
3579                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3580                         OBD_FREE_PTR(hui);
3581                         RETURN(-EFAULT);
3582                 }
3583
3584                 rc = ll_hsm_import(inode, file, hui);
3585
3586                 OBD_FREE_PTR(hui);
3587                 RETURN(rc);
3588         }
3589         case LL_IOC_FUTIMES_3: {
3590                 struct ll_futimes_3 lfu;
3591
3592                 if (copy_from_user(&lfu,
3593                                    (const struct ll_futimes_3 __user *)arg,
3594                                    sizeof(lfu)))
3595                         RETURN(-EFAULT);
3596
3597                 RETURN(ll_file_futimes_3(file, &lfu));
3598         }
3599         case LL_IOC_LADVISE: {
3600                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3601                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3602                 int i;
3603                 int num_advise;
3604                 int alloc_size = sizeof(*k_ladvise_hdr);
3605
3606                 rc = 0;
3607                 u_ladvise_hdr = (void __user *)arg;
3608                 OBD_ALLOC_PTR(k_ladvise_hdr);
3609                 if (k_ladvise_hdr == NULL)
3610                         RETURN(-ENOMEM);
3611
3612                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3613                         GOTO(out_ladvise, rc = -EFAULT);
3614
3615                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3616                     k_ladvise_hdr->lah_count < 1)
3617                         GOTO(out_ladvise, rc = -EINVAL);
3618
3619                 num_advise = k_ladvise_hdr->lah_count;
3620                 if (num_advise >= LAH_COUNT_MAX)
3621                         GOTO(out_ladvise, rc = -EFBIG);
3622
3623                 OBD_FREE_PTR(k_ladvise_hdr);
3624                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3625                                       lah_advise[num_advise]);
3626                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3627                 if (k_ladvise_hdr == NULL)
3628                         RETURN(-ENOMEM);
3629
3630                 /*
3631                  * TODO: submit multiple advices to one server in a single RPC
3632                  */
3633                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3634                         GOTO(out_ladvise, rc = -EFAULT);
3635
3636                 for (i = 0; i < num_advise; i++) {
3637                         struct llapi_lu_ladvise *k_ladvise =
3638                                         &k_ladvise_hdr->lah_advise[i];
3639                         struct llapi_lu_ladvise __user *u_ladvise =
3640                                         &u_ladvise_hdr->lah_advise[i];
3641
3642                         rc = ll_ladvise_sanity(inode, k_ladvise);
3643                         if (rc)
3644                                 GOTO(out_ladvise, rc);
3645
3646                         switch (k_ladvise->lla_advice) {
3647                         case LU_LADVISE_LOCKNOEXPAND:
3648                                 rc = ll_lock_noexpand(file,
3649                                                k_ladvise->lla_peradvice_flags);
3650                                 GOTO(out_ladvise, rc);
3651                         case LU_LADVISE_LOCKAHEAD:
3652
3653                                 rc = ll_file_lock_ahead(file, k_ladvise);
3654
3655                                 if (rc < 0)
3656                                         GOTO(out_ladvise, rc);
3657
3658                                 if (put_user(rc,
3659                                              &u_ladvise->lla_lockahead_result))
3660                                         GOTO(out_ladvise, rc = -EFAULT);
3661                                 break;
3662                         default:
3663                                 rc = ll_ladvise(inode, file,
3664                                                 k_ladvise_hdr->lah_flags,
3665                                                 k_ladvise);
3666                                 if (rc)
3667                                         GOTO(out_ladvise, rc);
3668                                 break;
3669                         }
3670
3671                 }
3672
3673 out_ladvise:
3674                 OBD_FREE(k_ladvise_hdr, alloc_size);
3675                 RETURN(rc);
3676         }
3677         case LL_IOC_FLR_SET_MIRROR: {
3678                 /* mirror I/O must be direct to avoid polluting page cache
3679                  * by stale data. */
3680                 if (!(file->f_flags & O_DIRECT))
3681                         RETURN(-EINVAL);
3682
3683                 fd->fd_designated_mirror = (__u32)arg;
3684                 RETURN(0);
3685         }
3686         case LL_IOC_FSGETXATTR:
3687                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3688         case LL_IOC_FSSETXATTR:
3689                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3690         case BLKSSZGET:
3691                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3692         default:
3693                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3694                                      (void __user *)arg));
3695         }
3696 }
3697
3698 #ifndef HAVE_FILE_LLSEEK_SIZE
3699 static inline loff_t
3700 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3701 {
3702         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3703                 return -EINVAL;
3704         if (offset > maxsize)
3705                 return -EINVAL;
3706
3707         if (offset != file->f_pos) {
3708                 file->f_pos = offset;
3709                 file->f_version = 0;
3710         }
3711         return offset;
3712 }
3713
3714 static loff_t
3715 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3716                 loff_t maxsize, loff_t eof)
3717 {
3718         struct inode *inode = file_inode(file);
3719
3720         switch (origin) {
3721         case SEEK_END:
3722                 offset += eof;
3723                 break;
3724         case SEEK_CUR:
3725                 /*
3726                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3727                  * position-querying operation.  Avoid rewriting the "same"
3728                  * f_pos value back to the file because a concurrent read(),
3729                  * write() or lseek() might have altered it
3730                  */
3731                 if (offset == 0)
3732                         return file->f_pos;
3733                 /*
3734                  * f_lock protects against read/modify/write race with other
3735                  * SEEK_CURs. Note that parallel writes and reads behave
3736                  * like SEEK_SET.
3737                  */
3738                 inode_lock(inode);
3739                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3740                 inode_unlock(inode);
3741                 return offset;
3742         case SEEK_DATA:
3743                 /*
3744                  * In the generic case the entire file is data, so as long as
3745                  * offset isn't at the end of the file then the offset is data.
3746                  */
3747                 if (offset >= eof)
3748                         return -ENXIO;
3749                 break;
3750         case SEEK_HOLE:
3751                 /*
3752                  * There is a virtual hole at the end of the file, so as long as
3753                  * offset isn't i_size or larger, return i_size.
3754                  */
3755                 if (offset >= eof)
3756                         return -ENXIO;
3757                 offset = eof;
3758                 break;
3759         }
3760
3761         return llseek_execute(file, offset, maxsize);
3762 }
3763 #endif
3764
3765 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3766 {
3767         struct inode *inode = file_inode(file);
3768         loff_t retval, eof = 0;
3769
3770         ENTRY;
3771         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3772                            (origin == SEEK_CUR) ? file->f_pos : 0);
3773         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3774                PFID(ll_inode2fid(inode)), inode, retval, retval,
3775                origin);
3776         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3777
3778         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3779                 retval = ll_glimpse_size(inode);
3780                 if (retval != 0)
3781                         RETURN(retval);
3782                 eof = i_size_read(inode);
3783         }
3784
3785         retval = ll_generic_file_llseek_size(file, offset, origin,
3786                                           ll_file_maxbytes(inode), eof);
3787         RETURN(retval);
3788 }
3789
3790 static int ll_flush(struct file *file, fl_owner_t id)
3791 {
3792         struct inode *inode = file_inode(file);
3793         struct ll_inode_info *lli = ll_i2info(inode);
3794         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3795         int rc, err;
3796
3797         LASSERT(!S_ISDIR(inode->i_mode));
3798
3799         /* catch async errors that were recorded back when async writeback
3800          * failed for pages in this mapping. */
3801         rc = lli->lli_async_rc;
3802         lli->lli_async_rc = 0;
3803         if (lli->lli_clob != NULL) {
3804                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3805                 if (rc == 0)
3806                         rc = err;
3807         }
3808
3809         /* The application has been told write failure already.
3810          * Do not report failure again. */
3811         if (fd->fd_write_failed)
3812                 return 0;
3813         return rc ? -EIO : 0;
3814 }
3815
3816 /**
3817  * Called to make sure a portion of file has been written out.
3818  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3819  *
3820  * Return how many pages have been written.
3821  */
3822 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3823                        enum cl_fsync_mode mode, int ignore_layout)
3824 {
3825         struct lu_env *env;
3826         struct cl_io *io;
3827         struct cl_fsync_io *fio;
3828         int result;
3829         __u16 refcheck;
3830         ENTRY;
3831
3832         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3833             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3834                 RETURN(-EINVAL);
3835
3836         env = cl_env_get(&refcheck);
3837         if (IS_ERR(env))
3838                 RETURN(PTR_ERR(env));
3839
3840         io = vvp_env_thread_io(env);
3841         io->ci_obj = ll_i2info(inode)->lli_clob;
3842         io->ci_ignore_layout = ignore_layout;
3843
3844         /* initialize parameters for sync */
3845         fio = &io->u.ci_fsync;
3846         fio->fi_start = start;
3847         fio->fi_end = end;
3848         fio->fi_fid = ll_inode2fid(inode);
3849         fio->fi_mode = mode;
3850         fio->fi_nr_written = 0;
3851
3852         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3853                 result = cl_io_loop(env, io);
3854         else
3855                 result = io->ci_result;
3856         if (result == 0)
3857                 result = fio->fi_nr_written;
3858         cl_io_fini(env, io);
3859         cl_env_put(env, &refcheck);
3860
3861         RETURN(result);
3862 }
3863
3864 /*
3865  * When dentry is provided (the 'else' case), file_dentry() may be
3866  * null and dentry must be used directly rather than pulled from
3867  * file_dentry() as is done otherwise.
3868  */
3869
3870 #ifdef HAVE_FILE_FSYNC_4ARGS
3871 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3872 {
3873         struct dentry *dentry = file_dentry(file);
3874         bool lock_inode;
3875 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3876 int ll_fsync(struct file *file, int datasync)
3877 {
3878         struct dentry *dentry = file_dentry(file);
3879         loff_t start = 0;
3880         loff_t end = LLONG_MAX;
3881 #else
3882 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3883 {
3884         loff_t start = 0;
3885         loff_t end = LLONG_MAX;
3886 #endif
3887         struct inode *inode = dentry->d_inode;
3888         struct ll_inode_info *lli = ll_i2info(inode);
3889         struct ptlrpc_request *req;
3890         int rc, err;
3891         ENTRY;
3892
3893         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3894                PFID(ll_inode2fid(inode)), inode);
3895         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3896
3897 #ifdef HAVE_FILE_FSYNC_4ARGS
3898         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3899         lock_inode = !lli->lli_inode_locked;
3900         if (lock_inode)
3901                 inode_lock(inode);
3902 #else
3903         /* fsync's caller has already called _fdata{sync,write}, we want
3904          * that IO to finish before calling the osc and mdc sync methods */
3905         rc = filemap_fdatawait(inode->i_mapping);
3906 #endif
3907
3908         /* catch async errors that were recorded back when async writeback
3909          * failed for pages in this mapping. */
3910         if (!S_ISDIR(inode->i_mode)) {
3911                 err = lli->lli_async_rc;
3912                 lli->lli_async_rc = 0;
3913                 if (rc == 0)
3914                         rc = err;
3915                 if (lli->lli_clob != NULL) {
3916                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3917                         if (rc == 0)
3918                                 rc = err;
3919                 }
3920         }
3921
3922         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3923         if (!rc)
3924                 rc = err;
3925         if (!err)
3926                 ptlrpc_req_finished(req);
3927
3928         if (S_ISREG(inode->i_mode)) {
3929                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3930
3931                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3932                 if (rc == 0 && err < 0)
3933                         rc = err;
3934                 if (rc < 0)
3935                         fd->fd_write_failed = true;
3936                 else
3937                         fd->fd_write_failed = false;
3938         }
3939
3940 #ifdef HAVE_FILE_FSYNC_4ARGS
3941         if (lock_inode)
3942                 inode_unlock(inode);
3943 #endif
3944         RETURN(rc);
3945 }
3946
3947 static int
3948 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3949 {
3950         struct inode *inode = file_inode(file);
3951         struct ll_sb_info *sbi = ll_i2sbi(inode);
3952         struct ldlm_enqueue_info einfo = {
3953                 .ei_type        = LDLM_FLOCK,
3954                 .ei_cb_cp       = ldlm_flock_completion_ast,
3955                 .ei_cbdata      = file_lock,
3956         };
3957         struct md_op_data *op_data;
3958         struct lustre_handle lockh = { 0 };
3959         union ldlm_policy_data flock = { { 0 } };
3960         int fl_type = file_lock->fl_type;
3961         __u64 flags = 0;
3962         int rc;
3963         int rc2 = 0;
3964         ENTRY;
3965
3966         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3967                PFID(ll_inode2fid(inode)), file_lock);
3968
3969         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3970
3971         if (file_lock->fl_flags & FL_FLOCK) {
3972                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3973                 /* flocks are whole-file locks */
3974                 flock.l_flock.end = OFFSET_MAX;
3975                 /* For flocks owner is determined by the local file desctiptor*/
3976                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3977         } else if (file_lock->fl_flags & FL_POSIX) {
3978                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3979                 flock.l_flock.start = file_lock->fl_start;
3980                 flock.l_flock.end = file_lock->fl_end;
3981         } else {
3982                 RETURN(-EINVAL);
3983         }
3984         flock.l_flock.pid = file_lock->fl_pid;
3985
3986         /* Somewhat ugly workaround for svc lockd.
3987          * lockd installs custom fl_lmops->lm_compare_owner that checks
3988          * for the fl_owner to be the same (which it always is on local node
3989          * I guess between lockd processes) and then compares pid.
3990          * As such we assign pid to the owner field to make it all work,
3991          * conflict with normal locks is unlikely since pid space and
3992          * pointer space for current->files are not intersecting */
3993         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3994                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3995
3996         switch (fl_type) {
3997         case F_RDLCK:
3998                 einfo.ei_mode = LCK_PR;
3999                 break;
4000         case F_UNLCK:
4001                 /* An unlock request may or may not have any relation to
4002                  * existing locks so we may not be able to pass a lock handle
4003                  * via a normal ldlm_lock_cancel() request. The request may even
4004                  * unlock a byte range in the middle of an existing lock. In
4005                  * order to process an unlock request we need all of the same
4006                  * information that is given with a normal read or write record
4007                  * lock request. To avoid creating another ldlm unlock (cancel)
4008                  * message we'll treat a LCK_NL flock request as an unlock. */
4009                 einfo.ei_mode = LCK_NL;
4010                 break;
4011         case F_WRLCK:
4012                 einfo.ei_mode = LCK_PW;
4013                 break;
4014         default:
4015                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4016                 RETURN (-ENOTSUPP);
4017         }
4018
4019         switch (cmd) {
4020         case F_SETLKW:
4021 #ifdef F_SETLKW64
4022         case F_SETLKW64:
4023 #endif
4024                 flags = 0;
4025                 break;
4026         case F_SETLK:
4027 #ifdef F_SETLK64
4028         case F_SETLK64:
4029 #endif
4030                 flags = LDLM_FL_BLOCK_NOWAIT;
4031                 break;
4032         case F_GETLK:
4033 #ifdef F_GETLK64
4034         case F_GETLK64:
4035 #endif
4036                 flags = LDLM_FL_TEST_LOCK;
4037                 break;
4038         default:
4039                 CERROR("unknown fcntl lock command: %d\n", cmd);
4040                 RETURN (-EINVAL);
4041         }
4042
4043         /* Save the old mode so that if the mode in the lock changes we
4044          * can decrement the appropriate reader or writer refcount. */
4045         file_lock->fl_type = einfo.ei_mode;
4046
4047         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4048                                      LUSTRE_OPC_ANY, NULL);
4049         if (IS_ERR(op_data))
4050                 RETURN(PTR_ERR(op_data));
4051
4052         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4053                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4054                flock.l_flock.pid, flags, einfo.ei_mode,
4055                flock.l_flock.start, flock.l_flock.end);
4056
4057         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4058                         flags);
4059
4060         /* Restore the file lock type if not TEST lock. */
4061         if (!(flags & LDLM_FL_TEST_LOCK))
4062                 file_lock->fl_type = fl_type;
4063
4064 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4065         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4066             !(flags & LDLM_FL_TEST_LOCK))
4067                 rc2  = locks_lock_file_wait(file, file_lock);
4068 #else
4069         if ((file_lock->fl_flags & FL_FLOCK) &&
4070             (rc == 0 || file_lock->fl_type == F_UNLCK))
4071                 rc2  = flock_lock_file_wait(file, file_lock);
4072         if ((file_lock->fl_flags & FL_POSIX) &&
4073             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4074             !(flags & LDLM_FL_TEST_LOCK))
4075                 rc2  = posix_lock_file_wait(file, file_lock);
4076 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4077
4078         if (rc2 && file_lock->fl_type != F_UNLCK) {
4079                 einfo.ei_mode = LCK_NL;
4080                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4081                            &lockh, flags);
4082                 rc = rc2;
4083         }
4084
4085         ll_finish_md_op_data(op_data);
4086
4087         RETURN(rc);
4088 }
4089
4090 int ll_get_fid_by_name(struct inode *parent, const char *name,
4091                        int namelen, struct lu_fid *fid,
4092                        struct inode **inode)
4093 {
4094         struct md_op_data       *op_data = NULL;
4095         struct mdt_body         *body;
4096         struct ptlrpc_request   *req;
4097         int                     rc;
4098         ENTRY;
4099
4100         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4101                                      LUSTRE_OPC_ANY, NULL);
4102         if (IS_ERR(op_data))
4103                 RETURN(PTR_ERR(op_data));
4104
4105         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4106         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4107         ll_finish_md_op_data(op_data);
4108         if (rc < 0)
4109                 RETURN(rc);
4110
4111         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4112         if (body == NULL)
4113                 GOTO(out_req, rc = -EFAULT);
4114         if (fid != NULL)
4115                 *fid = body->mbo_fid1;
4116
4117         if (inode != NULL)
4118                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4119 out_req:
4120         ptlrpc_req_finished(req);
4121         RETURN(rc);
4122 }
4123
4124 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4125                const char *name)
4126 {
4127         struct dentry *dchild = NULL;
4128         struct inode *child_inode = NULL;
4129         struct md_op_data *op_data;
4130         struct ptlrpc_request *request = NULL;
4131         struct obd_client_handle *och = NULL;
4132         struct qstr qstr;
4133         struct mdt_body *body;
4134         __u64 data_version = 0;
4135         size_t namelen = strlen(name);
4136         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4137         int rc;
4138         ENTRY;
4139
4140         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4141                PFID(ll_inode2fid(parent)), name,
4142                lum->lum_stripe_offset, lum->lum_stripe_count);
4143
4144         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4145             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4146                 lustre_swab_lmv_user_md(lum);
4147
4148         /* Get child FID first */
4149         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4150         qstr.name = name;
4151         qstr.len = namelen;
4152         dchild = d_lookup(file_dentry(file), &qstr);
4153         if (dchild) {
4154                 if (dchild->d_inode)
4155                         child_inode = igrab(dchild->d_inode);
4156                 dput(dchild);
4157         }
4158
4159         if (!child_inode) {
4160                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4161                                         &child_inode);
4162                 if (rc)
4163                         RETURN(rc);
4164         }
4165
4166         if (!child_inode)
4167                 RETURN(-ENOENT);
4168
4169         /*
4170          * lfs migrate command needs to be blocked on the client
4171          * by checking the migrate FID against the FID of the
4172          * filesystem root.
4173          */
4174         if (child_inode == parent->i_sb->s_root->d_inode)
4175                 GOTO(out_iput, rc = -EINVAL);
4176
4177         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4178                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4179         if (IS_ERR(op_data))
4180                 GOTO(out_iput, rc = PTR_ERR(op_data));
4181
4182         inode_lock(child_inode);
4183         op_data->op_fid3 = *ll_inode2fid(child_inode);
4184         if (!fid_is_sane(&op_data->op_fid3)) {
4185                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4186                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4187                        PFID(&op_data->op_fid3));
4188                 GOTO(out_unlock, rc = -EINVAL);
4189         }
4190
4191         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4192         op_data->op_data = lum;
4193         op_data->op_data_size = lumlen;
4194
4195 again:
4196         if (S_ISREG(child_inode->i_mode)) {
4197                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4198                 if (IS_ERR(och)) {
4199                         rc = PTR_ERR(och);
4200                         och = NULL;
4201                         GOTO(out_unlock, rc);
4202                 }
4203
4204                 rc = ll_data_version(child_inode, &data_version,
4205                                      LL_DV_WR_FLUSH);
4206                 if (rc != 0)
4207                         GOTO(out_close, rc);
4208
4209                 op_data->op_handle = och->och_fh;
4210                 op_data->op_data_version = data_version;
4211                 op_data->op_lease_handle = och->och_lease_handle;
4212                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4213
4214                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4215                 och->och_mod->mod_open_req->rq_replay = 0;
4216                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4217         }
4218
4219         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4220                        name, namelen, &request);
4221         if (rc == 0) {
4222                 LASSERT(request != NULL);
4223                 ll_update_times(request, parent);
4224
4225                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4226                 LASSERT(body != NULL);
4227
4228                 /* If the server does release layout lock, then we cleanup
4229                  * the client och here, otherwise release it in out_close: */
4230                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4231                         obd_mod_put(och->och_mod);
4232                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4233                                                   och);
4234                         och->och_fh.cookie = DEAD_HANDLE_MAGIC;
4235                         OBD_FREE_PTR(och);
4236                         och = NULL;
4237                 }
4238         }
4239
4240         if (request != NULL) {
4241                 ptlrpc_req_finished(request);
4242                 request = NULL;
4243         }
4244
4245         /* Try again if the file layout has changed. */
4246         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4247                 goto again;
4248
4249 out_close:
4250         if (och)
4251                 ll_lease_close(och, child_inode, NULL);
4252         if (!rc)
4253                 clear_nlink(child_inode);
4254 out_unlock:
4255         inode_unlock(child_inode);
4256         ll_finish_md_op_data(op_data);
4257 out_iput:
4258         iput(child_inode);
4259         RETURN(rc);
4260 }
4261
4262 static int
4263 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4264 {
4265         ENTRY;
4266
4267         RETURN(-ENOSYS);
4268 }
4269
4270 /**
4271  * test if some locks matching bits and l_req_mode are acquired
4272  * - bits can be in different locks
4273  * - if found clear the common lock bits in *bits
4274  * - the bits not found, are kept in *bits
4275  * \param inode [IN]
4276  * \param bits [IN] searched lock bits [IN]
4277  * \param l_req_mode [IN] searched lock mode
4278  * \retval boolean, true iff all bits are found
4279  */
4280 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4281 {
4282         struct lustre_handle lockh;
4283         union ldlm_policy_data policy;
4284         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4285                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4286         struct lu_fid *fid;
4287         __u64 flags;
4288         int i;
4289         ENTRY;
4290
4291         if (!inode)
4292                RETURN(0);
4293
4294         fid = &ll_i2info(inode)->lli_fid;
4295         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4296                ldlm_lockname[mode]);
4297
4298         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4299         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4300                 policy.l_inodebits.bits = *bits & (1 << i);
4301                 if (policy.l_inodebits.bits == 0)
4302                         continue;
4303
4304                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4305                                   &policy, mode, &lockh)) {
4306                         struct ldlm_lock *lock;
4307
4308                         lock = ldlm_handle2lock(&lockh);
4309                         if (lock) {
4310                                 *bits &=
4311                                       ~(lock->l_policy_data.l_inodebits.bits);
4312                                 LDLM_LOCK_PUT(lock);
4313                         } else {
4314                                 *bits &= ~policy.l_inodebits.bits;
4315                         }
4316                 }
4317         }
4318         RETURN(*bits == 0);
4319 }
4320
4321 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4322                                struct lustre_handle *lockh, __u64 flags,
4323                                enum ldlm_mode mode)
4324 {
4325         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4326         struct lu_fid *fid;
4327         enum ldlm_mode rc;
4328         ENTRY;
4329
4330         fid = &ll_i2info(inode)->lli_fid;
4331         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4332
4333         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4334                            fid, LDLM_IBITS, &policy, mode, lockh);
4335
4336         RETURN(rc);
4337 }
4338
4339 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4340 {
4341         /* Already unlinked. Just update nlink and return success */
4342         if (rc == -ENOENT) {
4343                 clear_nlink(inode);
4344                 /* If it is striped directory, and there is bad stripe
4345                  * Let's revalidate the dentry again, instead of returning
4346                  * error */
4347                 if (S_ISDIR(inode->i_mode) &&
4348                     ll_i2info(inode)->lli_lsm_md != NULL)
4349                         return 0;
4350
4351                 /* This path cannot be hit for regular files unless in
4352                  * case of obscure races, so no need to to validate
4353                  * size. */
4354                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4355                         return 0;
4356         } else if (rc != 0) {
4357                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4358                              "%s: revalidate FID "DFID" error: rc = %d\n",
4359                              ll_get_fsname(inode->i_sb, NULL, 0),
4360                              PFID(ll_inode2fid(inode)), rc);
4361         }
4362
4363         return rc;
4364 }
4365
4366 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4367 {
4368         struct inode *inode = dentry->d_inode;
4369         struct obd_export *exp = ll_i2mdexp(inode);
4370         struct lookup_intent oit = {
4371                 .it_op = op,
4372         };
4373         struct ptlrpc_request *req = NULL;
4374         struct md_op_data *op_data;
4375         int rc = 0;
4376         ENTRY;
4377
4378         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4379                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4380
4381         /* Call getattr by fid, so do not provide name at all. */
4382         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4383                                      LUSTRE_OPC_ANY, NULL);
4384         if (IS_ERR(op_data))
4385                 RETURN(PTR_ERR(op_data));
4386
4387         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4388         ll_finish_md_op_data(op_data);
4389         if (rc < 0) {
4390                 rc = ll_inode_revalidate_fini(inode, rc);
4391                 GOTO(out, rc);
4392         }
4393
4394         rc = ll_revalidate_it_finish(req, &oit, dentry);
4395         if (rc != 0) {
4396                 ll_intent_release(&oit);
4397                 GOTO(out, rc);
4398         }
4399
4400         /* Unlinked? Unhash dentry, so it is not picked up later by
4401          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4402          * here to preserve get_cwd functionality on 2.6.
4403          * Bug 10503 */
4404         if (!dentry->d_inode->i_nlink) {
4405                 ll_lock_dcache(inode);
4406                 d_lustre_invalidate(dentry, 0);
4407                 ll_unlock_dcache(inode);
4408         }
4409
4410         ll_lookup_finish_locks(&oit, dentry);
4411 out:
4412         ptlrpc_req_finished(req);
4413
4414         return rc;
4415 }
4416
4417 static int ll_merge_md_attr(struct inode *inode)
4418 {
4419         struct cl_attr attr = { 0 };
4420         int rc;
4421
4422         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4423         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4424                            &attr, ll_md_blocking_ast);
4425         if (rc != 0)
4426                 RETURN(rc);
4427
4428         set_nlink(inode, attr.cat_nlink);
4429         inode->i_blocks = attr.cat_blocks;
4430         i_size_write(inode, attr.cat_size);
4431
4432         ll_i2info(inode)->lli_atime = attr.cat_atime;
4433         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4434         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4435
4436         RETURN(0);
4437 }
4438
4439 static inline dev_t ll_compat_encode_dev(dev_t dev)
4440 {
4441         /* The compat_sys_*stat*() syscalls will fail unless the
4442          * device majors and minors are both less than 256. Note that
4443          * the value returned here will be passed through
4444          * old_encode_dev() in cp_compat_stat(). And so we are not
4445          * trying to return a valid compat (u16) device number, just
4446          * one that will pass the old_valid_dev() check. */
4447
4448         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4449 }
4450
4451 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4452 int ll_getattr(const struct path *path, struct kstat *stat,
4453                u32 request_mask, unsigned int flags)
4454 {
4455         struct dentry *de = path->dentry;
4456 #else
4457 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4458 {
4459 #endif
4460         struct inode *inode = de->d_inode;
4461         struct ll_sb_info *sbi = ll_i2sbi(inode);
4462         struct ll_inode_info *lli = ll_i2info(inode);
4463         int rc;
4464
4465         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4466
4467         rc = ll_inode_revalidate(de, IT_GETATTR);
4468         if (rc < 0)
4469                 RETURN(rc);
4470
4471         if (S_ISREG(inode->i_mode)) {
4472                 /* In case of restore, the MDT has the right size and has
4473                  * already send it back without granting the layout lock,
4474                  * inode is up-to-date so glimpse is useless.
4475                  * Also to glimpse we need the layout, in case of a running
4476                  * restore the MDT holds the layout lock so the glimpse will
4477                  * block up to the end of restore (getattr will block)
4478                  */
4479                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4480                         rc = ll_glimpse_size(inode);
4481                         if (rc < 0)
4482                                 RETURN(rc);
4483                 }
4484         } else {
4485                 /* If object isn't regular a file then don't validate size. */
4486                 if (S_ISDIR(inode->i_mode) &&
4487                     lli->lli_lsm_md != NULL) {
4488                         rc = ll_merge_md_attr(inode);
4489                         if (rc < 0)
4490                                 RETURN(rc);
4491                 }
4492
4493                 LTIME_S(inode->i_atime) = lli->lli_atime;
4494                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4495                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4496         }
4497
4498         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4499
4500         if (ll_need_32bit_api(sbi)) {
4501                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4502                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4503                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4504         } else {
4505                 stat->ino = inode->i_ino;
4506                 stat->dev = inode->i_sb->s_dev;
4507                 stat->rdev = inode->i_rdev;
4508         }
4509
4510         stat->mode = inode->i_mode;
4511         stat->uid = inode->i_uid;
4512         stat->gid = inode->i_gid;
4513         stat->atime = inode->i_atime;
4514         stat->mtime = inode->i_mtime;
4515         stat->ctime = inode->i_ctime;
4516         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4517
4518         stat->nlink = inode->i_nlink;
4519         stat->size = i_size_read(inode);
4520         stat->blocks = inode->i_blocks;
4521
4522         return 0;
4523 }
4524
4525 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4526                      __u64 start, __u64 len)
4527 {
4528         int             rc;
4529         size_t          num_bytes;
4530         struct fiemap   *fiemap;
4531         unsigned int    extent_count = fieinfo->fi_extents_max;
4532
4533         num_bytes = sizeof(*fiemap) + (extent_count *
4534                                        sizeof(struct fiemap_extent));
4535         OBD_ALLOC_LARGE(fiemap, num_bytes);
4536
4537         if (fiemap == NULL)
4538                 RETURN(-ENOMEM);
4539
4540         fiemap->fm_flags = fieinfo->fi_flags;
4541         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4542         fiemap->fm_start = start;
4543         fiemap->fm_length = len;
4544         if (extent_count > 0 &&
4545             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4546                            sizeof(struct fiemap_extent)) != 0)
4547                 GOTO(out, rc = -EFAULT);
4548
4549         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4550
4551         fieinfo->fi_flags = fiemap->fm_flags;
4552         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4553         if (extent_count > 0 &&
4554             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4555                          fiemap->fm_mapped_extents *
4556                          sizeof(struct fiemap_extent)) != 0)
4557                 GOTO(out, rc = -EFAULT);
4558 out:
4559         OBD_FREE_LARGE(fiemap, num_bytes);
4560         return rc;
4561 }
4562
4563 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4564 {
4565         struct ll_inode_info *lli = ll_i2info(inode);
4566         struct posix_acl *acl = NULL;
4567         ENTRY;
4568
4569         spin_lock(&lli->lli_lock);
4570         /* VFS' acl_permission_check->check_acl will release the refcount */
4571         acl = posix_acl_dup(lli->lli_posix_acl);
4572         spin_unlock(&lli->lli_lock);
4573
4574         RETURN(acl);
4575 }
4576
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/*
 * set_acl inode operation: store \a acl of the given \a type as an extended
 * attribute through md_setxattr(), or remove that attribute when there is
 * no ACL to store.
 *
 * For ACL_TYPE_ACCESS with a non-NULL \a acl, posix_acl_update_mode() is
 * run first on inode->i_mode and is given the address of \a acl.  A default
 * ACL on a non-directory is rejected with -EACCES, while removing one there
 * is a successful no-op.  On success the ACL is cached on the inode, on
 * failure after the type checks the cached ACL is forgotten.
 *
 * Returns 0 on success, -EINVAL for an unknown \a type, -ENOMEM if the
 * xattr buffer cannot be allocated, or another negative errno.
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req = NULL;
        const char *name = NULL;
        char *value = NULL;
        size_t value_size = 0;
        int rc = 0;
        ENTRY;

        switch (type) {
        case ACL_TYPE_ACCESS:
                name = XATTR_NAME_POSIX_ACL_ACCESS;
                if (acl)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
                break;

        case ACL_TYPE_DEFAULT:
                name = XATTR_NAME_POSIX_ACL_DEFAULT;
                if (!S_ISDIR(inode->i_mode))
                        rc = acl ? -EACCES : 0;
                break;

        default:
                rc = -EINVAL;
                break;
        }
        /* leave through RETURN so the ENTRY above is matched in the trace */
        if (rc)
                RETURN(rc);

        if (acl) {
                value_size = posix_acl_xattr_size(acl->a_count);
                value = kmalloc(value_size, GFP_NOFS);
                if (value == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* no value means there is nothing to store: remove the xattr */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         name, value, value_size, 0, 0, &req);

        ptlrpc_req_finished(req);
out_value:
        kfree(value);
out:
        if (rc)
                forget_cached_acl(inode, type);
        else
                set_cached_acl(inode, type, acl);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4636
4637 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4638 static int
4639 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4640 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4641 # else
4642 ll_check_acl(struct inode *inode, int mask)
4643 # endif
4644 {
4645 # ifdef CONFIG_FS_POSIX_ACL
4646         struct posix_acl *acl;
4647         int rc;
4648         ENTRY;
4649
4650 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4651         if (flags & IPERM_FLAG_RCU)
4652                 return -ECHILD;
4653 #  endif
4654         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4655
4656         if (!acl)
4657                 RETURN(-EAGAIN);
4658
4659         rc = posix_acl_permission(inode, acl, mask);
4660         posix_acl_release(acl);
4661
4662         RETURN(rc);
4663 # else /* !CONFIG_FS_POSIX_ACL */
4664         return -EAGAIN;
4665 # endif /* CONFIG_FS_POSIX_ACL */
4666 }
4667 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4668
4669 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4670 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4671 #else
4672 # ifdef HAVE_INODE_PERMISION_2ARGS
4673 int ll_inode_permission(struct inode *inode, int mask)
4674 # else
4675 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4676 # endif
4677 #endif
4678 {
4679         int rc = 0;
4680         struct ll_sb_info *sbi;
4681         struct root_squash_info *squash;
4682         struct cred *cred = NULL;
4683         const struct cred *old_cred = NULL;
4684         cfs_cap_t cap;
4685         bool squash_id = false;
4686         ENTRY;
4687
4688 #ifdef MAY_NOT_BLOCK
4689         if (mask & MAY_NOT_BLOCK)
4690                 return -ECHILD;
4691 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4692         if (flags & IPERM_FLAG_RCU)
4693                 return -ECHILD;
4694 #endif
4695
4696        /* as root inode are NOT getting validated in lookup operation,
4697         * need to do it before permission check. */
4698
4699         if (inode == inode->i_sb->s_root->d_inode) {
4700                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4701                 if (rc)
4702                         RETURN(rc);
4703         }
4704
4705         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4706                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4707
4708         /* squash fsuid/fsgid if needed */
4709         sbi = ll_i2sbi(inode);
4710         squash = &sbi->ll_squash;
4711         if (unlikely(squash->rsi_uid != 0 &&
4712                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4713                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4714                         squash_id = true;
4715         }
4716         if (squash_id) {
4717                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4718                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4719                        squash->rsi_uid, squash->rsi_gid);
4720
4721                 /* update current process's credentials
4722                  * and FS capability */
4723                 cred = prepare_creds();
4724                 if (cred == NULL)
4725                         RETURN(-ENOMEM);
4726
4727                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4728                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4729                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4730                         if ((1 << cap) & CFS_CAP_FS_MASK)
4731                                 cap_lower(cred->cap_effective, cap);
4732                 }
4733                 old_cred = override_creds(cred);
4734         }
4735
4736         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4737         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4738         /* restore current process's credentials and FS capability */
4739         if (squash_id) {
4740                 revert_creds(old_cred);
4741                 put_cred(cred);
4742         }
4743
4744         RETURN(rc);
4745 }
4746
4747 /* -o localflock - only provides locally consistent flock locks */
4748 struct file_operations ll_file_operations = {
4749 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4750 # ifdef HAVE_SYNC_READ_WRITE
4751         .read           = new_sync_read,
4752         .write          = new_sync_write,
4753 # endif
4754         .read_iter      = ll_file_read_iter,
4755         .write_iter     = ll_file_write_iter,
4756 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4757         .read           = ll_file_read,
4758         .aio_read       = ll_file_aio_read,
4759         .write          = ll_file_write,
4760         .aio_write      = ll_file_aio_write,
4761 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4762         .unlocked_ioctl = ll_file_ioctl,
4763         .open           = ll_file_open,
4764         .release        = ll_file_release,
4765         .mmap           = ll_file_mmap,
4766         .llseek         = ll_file_seek,
4767         .splice_read    = ll_file_splice_read,
4768         .fsync          = ll_fsync,
4769         .flush          = ll_flush
4770 };
4771
4772 struct file_operations ll_file_operations_flock = {
4773 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4774 # ifdef HAVE_SYNC_READ_WRITE
4775         .read           = new_sync_read,
4776         .write          = new_sync_write,
4777 # endif /* HAVE_SYNC_READ_WRITE */
4778         .read_iter      = ll_file_read_iter,
4779         .write_iter     = ll_file_write_iter,
4780 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4781         .read           = ll_file_read,
4782         .aio_read       = ll_file_aio_read,
4783         .write          = ll_file_write,
4784         .aio_write      = ll_file_aio_write,
4785 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4786         .unlocked_ioctl = ll_file_ioctl,
4787         .open           = ll_file_open,
4788         .release        = ll_file_release,
4789         .mmap           = ll_file_mmap,
4790         .llseek         = ll_file_seek,
4791         .splice_read    = ll_file_splice_read,
4792         .fsync          = ll_fsync,
4793         .flush          = ll_flush,
4794         .flock          = ll_file_flock,
4795         .lock           = ll_file_flock
4796 };
4797
4798 /* These are for -o noflock - to return ENOSYS on flock calls */
4799 struct file_operations ll_file_operations_noflock = {
4800 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4801 # ifdef HAVE_SYNC_READ_WRITE
4802         .read           = new_sync_read,
4803         .write          = new_sync_write,
4804 # endif /* HAVE_SYNC_READ_WRITE */
4805         .read_iter      = ll_file_read_iter,
4806         .write_iter     = ll_file_write_iter,
4807 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4808         .read           = ll_file_read,
4809         .aio_read       = ll_file_aio_read,
4810         .write          = ll_file_write,
4811         .aio_write      = ll_file_aio_write,
4812 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4813         .unlocked_ioctl = ll_file_ioctl,
4814         .open           = ll_file_open,
4815         .release        = ll_file_release,
4816         .mmap           = ll_file_mmap,
4817         .llseek         = ll_file_seek,
4818         .splice_read    = ll_file_splice_read,
4819         .fsync          = ll_fsync,
4820         .flush          = ll_flush,
4821         .flock          = ll_file_noflock,
4822         .lock           = ll_file_noflock
4823 };
4824
4825 struct inode_operations ll_file_inode_operations = {
4826         .setattr        = ll_setattr,
4827         .getattr        = ll_getattr,
4828         .permission     = ll_inode_permission,
4829 #ifdef HAVE_IOP_XATTR
4830         .setxattr       = ll_setxattr,
4831         .getxattr       = ll_getxattr,
4832         .removexattr    = ll_removexattr,
4833 #endif
4834         .listxattr      = ll_listxattr,
4835         .fiemap         = ll_fiemap,
4836 #ifdef HAVE_IOP_GET_ACL
4837         .get_acl        = ll_get_acl,
4838 #endif
4839 #ifdef HAVE_IOP_SET_ACL
4840         .set_acl        = ll_set_acl,
4841 #endif
4842 };
4843
4844 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4845 {
4846         struct ll_inode_info *lli = ll_i2info(inode);
4847         struct cl_object *obj = lli->lli_clob;
4848         struct lu_env *env;
4849         int rc;
4850         __u16 refcheck;
4851         ENTRY;
4852
4853         if (obj == NULL)
4854                 RETURN(0);
4855
4856         env = cl_env_get(&refcheck);
4857         if (IS_ERR(env))
4858                 RETURN(PTR_ERR(env));
4859
4860         rc = cl_conf_set(env, lli->lli_clob, conf);
4861         if (rc < 0)
4862                 GOTO(out, rc);
4863
4864         if (conf->coc_opc == OBJECT_CONF_SET) {
4865                 struct ldlm_lock *lock = conf->coc_lock;
4866                 struct cl_layout cl = {
4867                         .cl_layout_gen = 0,
4868                 };
4869
4870                 LASSERT(lock != NULL);
4871                 LASSERT(ldlm_has_layout(lock));
4872
4873                 /* it can only be allowed to match after layout is
4874                  * applied to inode otherwise false layout would be
4875                  * seen. Applying layout shoud happen before dropping
4876                  * the intent lock. */
4877                 ldlm_lock_allow_match(lock);
4878
4879                 rc = cl_object_layout_get(env, obj, &cl);
4880                 if (rc < 0)
4881                         GOTO(out, rc);
4882
4883                 CDEBUG(D_VFSTRACE,
4884                        DFID": layout version change: %u -> %u\n",
4885                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4886                        cl.cl_layout_gen);
4887                 ll_layout_version_set(lli, cl.cl_layout_gen);
4888         }
4889
4890 out:
4891         cl_env_put(env, &refcheck);
4892
4893         RETURN(rc);
4894 }
4895
4896 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4897 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4898
4899 {
4900         struct ll_sb_info *sbi = ll_i2sbi(inode);
4901         struct ptlrpc_request *req;
4902         struct mdt_body *body;
4903         void *lvbdata;
4904         void *lmm;
4905         int lmmsize;
4906         int rc;
4907         ENTRY;
4908
4909         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4910                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4911                lock->l_lvb_data, lock->l_lvb_len);
4912
4913         if (lock->l_lvb_data != NULL)
4914                 RETURN(0);
4915
4916         /* if layout lock was granted right away, the layout is returned
4917          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4918          * blocked and then granted via completion ast, we have to fetch
4919          * layout here. Please note that we can't use the LVB buffer in
4920          * completion AST because it doesn't have a large enough buffer */
4921         rc = ll_get_default_mdsize(sbi, &lmmsize);
4922         if (rc == 0)
4923                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4924                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4925         if (rc < 0)
4926                 RETURN(rc);
4927
4928         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4929         if (body == NULL)
4930                 GOTO(out, rc = -EPROTO);
4931
4932         lmmsize = body->mbo_eadatasize;
4933         if (lmmsize == 0) /* empty layout */
4934                 GOTO(out, rc = 0);
4935
4936         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4937         if (lmm == NULL)
4938                 GOTO(out, rc = -EFAULT);
4939
4940         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4941         if (lvbdata == NULL)
4942                 GOTO(out, rc = -ENOMEM);
4943
4944         memcpy(lvbdata, lmm, lmmsize);
4945         lock_res_and_lock(lock);
4946         if (unlikely(lock->l_lvb_data == NULL)) {
4947                 lock->l_lvb_type = LVB_T_LAYOUT;
4948                 lock->l_lvb_data = lvbdata;
4949                 lock->l_lvb_len = lmmsize;
4950                 lvbdata = NULL;
4951         }
4952         unlock_res_and_lock(lock);
4953
4954         if (lvbdata)
4955                 OBD_FREE_LARGE(lvbdata, lmmsize);
4956
4957         EXIT;
4958
4959 out:
4960         ptlrpc_req_finished(req);
4961         return rc;
4962 }
4963
4964 /**
4965  * Apply the layout to the inode. Layout lock is held and will be released
4966  * in this function.
4967  */
4968 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4969                               struct inode *inode)
4970 {
4971         struct ll_inode_info *lli = ll_i2info(inode);
4972         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4973         struct ldlm_lock *lock;
4974         struct cl_object_conf conf;
4975         int rc = 0;
4976         bool lvb_ready;
4977         bool wait_layout = false;
4978         ENTRY;
4979
4980         LASSERT(lustre_handle_is_used(lockh));
4981
4982         lock = ldlm_handle2lock(lockh);
4983         LASSERT(lock != NULL);
4984         LASSERT(ldlm_has_layout(lock));
4985
4986         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4987                    PFID(&lli->lli_fid), inode);
4988
4989         /* in case this is a caching lock and reinstate with new inode */
4990         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4991
4992         lock_res_and_lock(lock);
4993         lvb_ready = ldlm_is_lvb_ready(lock);
4994         unlock_res_and_lock(lock);
4995
4996         /* checking lvb_ready is racy but this is okay. The worst case is
4997          * that multi processes may configure the file on the same time. */
4998         if (lvb_ready)
4999                 GOTO(out, rc = 0);
5000
5001         rc = ll_layout_fetch(inode, lock);
5002         if (rc < 0)
5003                 GOTO(out, rc);
5004
5005         /* for layout lock, lmm is stored in lock's lvb.
5006          * lvb_data is immutable if the lock is held so it's safe to access it
5007          * without res lock.
5008          *
5009          * set layout to file. Unlikely this will fail as old layout was
5010          * surely eliminated */
5011         memset(&conf, 0, sizeof conf);
5012         conf.coc_opc = OBJECT_CONF_SET;
5013         conf.coc_inode = inode;
5014         conf.coc_lock = lock;
5015         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5016         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5017         rc = ll_layout_conf(inode, &conf);
5018
5019         /* refresh layout failed, need to wait */
5020         wait_layout = rc == -EBUSY;
5021         EXIT;
5022 out:
5023         LDLM_LOCK_PUT(lock);
5024         ldlm_lock_decref(lockh, mode);
5025
5026         /* wait for IO to complete if it's still being used. */
5027         if (wait_layout) {
5028                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5029                        ll_get_fsname(inode->i_sb, NULL, 0),
5030                        PFID(&lli->lli_fid), inode);
5031
5032                 memset(&conf, 0, sizeof conf);
5033                 conf.coc_opc = OBJECT_CONF_WAIT;
5034                 conf.coc_inode = inode;
5035                 rc = ll_layout_conf(inode, &conf);
5036                 if (rc == 0)
5037                         rc = -EAGAIN;
5038
5039                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5040                        ll_get_fsname(inode->i_sb, NULL, 0),
5041                        PFID(&lli->lli_fid), rc);
5042         }
5043         RETURN(rc);
5044 }
5045
5046 /**
5047  * Issue layout intent RPC to MDS.
5048  * \param inode [in]    file inode
5049  * \param intent [in]   layout intent
5050  *
5051  * \retval 0    on success
5052  * \retval < 0  error code
5053  */
5054 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5055 {
5056         struct ll_inode_info  *lli = ll_i2info(inode);
5057         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5058         struct md_op_data     *op_data;
5059         struct lookup_intent it;
5060         struct ptlrpc_request *req;
5061         int rc;
5062         ENTRY;
5063
5064         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5065                                      0, 0, LUSTRE_OPC_ANY, NULL);
5066         if (IS_ERR(op_data))
5067                 RETURN(PTR_ERR(op_data));
5068
5069         op_data->op_data = intent;
5070         op_data->op_data_size = sizeof(*intent);
5071
5072         memset(&it, 0, sizeof(it));
5073         it.it_op = IT_LAYOUT;
5074         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5075             intent->li_opc == LAYOUT_INTENT_TRUNC)
5076                 it.it_flags = FMODE_WRITE;
5077
5078         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5079                           ll_get_fsname(inode->i_sb, NULL, 0),
5080                           PFID(&lli->lli_fid), inode);
5081
5082         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5083                             &ll_md_blocking_ast, 0);
5084         if (it.it_request != NULL)
5085                 ptlrpc_req_finished(it.it_request);
5086         it.it_request = NULL;
5087
5088         ll_finish_md_op_data(op_data);
5089
5090         /* set lock data in case this is a new lock */
5091         if (!rc)
5092                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5093
5094         ll_intent_drop_lock(&it);
5095
5096         RETURN(rc);
5097 }
5098
5099 /**
5100  * This function checks if there exists a LAYOUT lock on the client side,
5101  * or enqueues it if it doesn't have one in cache.
5102  *
5103  * This function will not hold layout lock so it may be revoked any time after
5104  * this function returns. Any operations depend on layout should be redone
5105  * in that case.
5106  *
5107  * This function should be called before lov_io_init() to get an uptodate
5108  * layout version, the caller should save the version number and after IO
5109  * is finished, this function should be called again to verify that layout
5110  * is not changed during IO time.
5111  */
5112 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5113 {
5114         struct ll_inode_info    *lli = ll_i2info(inode);
5115         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5116         struct lustre_handle lockh;
5117         struct layout_intent intent = {
5118                 .li_opc = LAYOUT_INTENT_ACCESS,
5119         };
5120         enum ldlm_mode mode;
5121         int rc;
5122         ENTRY;
5123
5124         *gen = ll_layout_version_get(lli);
5125         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5126                 RETURN(0);
5127
5128         /* sanity checks */
5129         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5130         LASSERT(S_ISREG(inode->i_mode));
5131
5132         /* take layout lock mutex to enqueue layout lock exclusively. */
5133         mutex_lock(&lli->lli_layout_mutex);
5134
5135         while (1) {
5136                 /* mostly layout lock is caching on the local side, so try to
5137                  * match it before grabbing layout lock mutex. */
5138                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5139                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5140                 if (mode != 0) { /* hit cached lock */
5141                         rc = ll_layout_lock_set(&lockh, mode, inode);
5142                         if (rc == -EAGAIN)
5143                                 continue;
5144                         break;
5145                 }
5146
5147                 rc = ll_layout_intent(inode, &intent);
5148                 if (rc != 0)
5149                         break;
5150         }
5151
5152         if (rc == 0)
5153                 *gen = ll_layout_version_get(lli);
5154         mutex_unlock(&lli->lli_layout_mutex);
5155
5156         RETURN(rc);
5157 }
5158
5159 /**
5160  * Issue layout intent RPC indicating where in a file an IO is about to write.
5161  *
5162  * \param[in] inode     file inode.
5163  * \param[in] ext       write range with start offset of fille in bytes where
5164  *                      an IO is about to write, and exclusive end offset in
5165  *                      bytes.
5166  *
5167  * \retval 0    on success
5168  * \retval < 0  error code
5169  */
5170 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5171                            struct lu_extent *ext)
5172 {
5173         struct layout_intent intent = {
5174                 .li_opc = opc,
5175                 .li_extent.e_start = ext->e_start,
5176                 .li_extent.e_end = ext->e_end,
5177         };
5178         int rc;
5179         ENTRY;
5180
5181         rc = ll_layout_intent(inode, &intent);
5182
5183         RETURN(rc);
5184 }
5185
5186 /**
5187  *  This function send a restore request to the MDT
5188  */
5189 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5190 {
5191         struct hsm_user_request *hur;
5192         int                      len, rc;
5193         ENTRY;
5194
5195         len = sizeof(struct hsm_user_request) +
5196               sizeof(struct hsm_user_item);
5197         OBD_ALLOC(hur, len);
5198         if (hur == NULL)
5199                 RETURN(-ENOMEM);
5200
5201         hur->hur_request.hr_action = HUA_RESTORE;
5202         hur->hur_request.hr_archive_id = 0;
5203         hur->hur_request.hr_flags = 0;
5204         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5205                sizeof(hur->hur_user_item[0].hui_fid));
5206         hur->hur_user_item[0].hui_extent.offset = offset;
5207         hur->hur_user_item[0].hui_extent.length = length;
5208         hur->hur_request.hr_itemcount = 1;
5209         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5210                            len, hur, NULL);
5211         OBD_FREE(hur, len);
5212         RETURN(rc);
5213 }