lustre/llite/file.c

   1 /*
   2  * GPL HEADER START
   3  *
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 only,
   8  * as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it will be useful, but
  11  * WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * General Public License version 2 for more details (a copy is included
  14  * in the LICENSE file that accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * version 2 along with this program; If not, see
  18  * http://www.gnu.org/licenses/gpl-2.0.html
  19  *
  20  * GPL HEADER END
  21  */
  22 /*
  23  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Use is subject to license terms.
  25  *
  26  * Copyright (c) 2011, 2017, Intel Corporation.
  27  */
  28 /*
  29  * This file is part of Lustre, http://www.lustre.org/
  30  * Lustre is a trademark of Sun Microsystems, Inc.
  31  *
  32  * lustre/llite/file.c
  33  *
  34  * Author: Peter Braam <braam@clusterfs.com>
  35  * Author: Phil Schwan <phil@clusterfs.com>
  36  * Author: Andreas Dilger <adilger@clusterfs.com>
  37  */
  38
  39 #define DEBUG_SUBSYSTEM S_LLITE
  40 #include <lustre_dlm.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/file.h>
  43 #include <linux/sched.h>
  44 #include <linux/user_namespace.h>
  45 #ifdef HAVE_UIDGID_HEADER
  46 # include <linux/uidgid.h>
  47 #endif
  48
  49 #include <uapi/linux/lustre/lustre_ioctl.h>
  50 #include <lustre_swab.h>
  51
  52 #include "cl_object.h"
  53 #include "llite_internal.h"
  54 #include "vvp_internal.h"
  55
  56 struct split_param {
  57         struct inode    *sp_inode;
  58         __u16           sp_mirror_id;
  59 };
  60
  61 static int
  62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
  63
  64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
  65                           bool *lease_broken);
  66
  67 static struct ll_file_data *ll_file_data_get(void)
  68 {
  69         struct ll_file_data *fd;
  70
  71         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
  72         if (fd == NULL)
  73                 return NULL;
  74
  75         fd->fd_write_failed = false;
  76
  77         return fd;
  78 }
  79
  80 static void ll_file_data_put(struct ll_file_data *fd)
  81 {
  82         if (fd != NULL)
  83                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  84 }
  85
  86 /**
  87  * Packs all the attributes into @op_data for the CLOSE rpc.
  88  */
  89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
  90                              struct obd_client_handle *och)
  91 {
  92         ENTRY;
  93
  94         ll_prep_md_op_data(op_data, inode, NULL, NULL,
  95                            0, 0, LUSTRE_OPC_ANY, NULL);
  96
  97         op_data->op_attr.ia_mode = inode->i_mode;
  98         op_data->op_attr.ia_atime = inode->i_atime;
  99         op_data->op_attr.ia_mtime = inode->i_mtime;
 100         op_data->op_attr.ia_ctime = inode->i_ctime;
 101         op_data->op_attr.ia_size = i_size_read(inode);
 102         op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
 103                                       ATTR_MTIME | ATTR_MTIME_SET |
 104                                       ATTR_CTIME);
 105         op_data->op_xvalid |= OP_XVALID_CTIME_SET;
 106         op_data->op_attr_blocks = inode->i_blocks;
 107         op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
 108         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
 109                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
 110         op_data->op_open_handle = och->och_open_handle;
 111
 112         if (och->och_flags & FMODE_WRITE &&
 113             ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
 114                 /* For HSM: if inode data has been modified, pack it so that
 115                  * MDT can set data dirty flag in the archive. */
 116                 op_data->op_bias |= MDS_DATA_MODIFIED;
 117
 118         EXIT;
 119 }
 120
 121 /**
 122  * Perform a close, possibly with a bias.
 123  * The meaning of "data" depends on the value of "bias".
 124  *
 125  * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 126  * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 127  * swap layouts with.
 128  */
 129 static int ll_close_inode_openhandle(struct inode *inode,
 130                                      struct obd_client_handle *och,
 131                                      enum mds_op_bias bias, void *data)
 132 {
 133         struct obd_export *md_exp = ll_i2mdexp(inode);
 134         const struct ll_inode_info *lli = ll_i2info(inode);
 135         struct md_op_data *op_data;
 136         struct ptlrpc_request *req = NULL;
 137         int rc;
 138         ENTRY;
 139
 140         if (class_exp2obd(md_exp) == NULL) {
 141                 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
 142                        ll_get_fsname(inode->i_sb, NULL, 0),
 143                        PFID(&lli->lli_fid));
 144                 GOTO(out, rc = 0);
 145         }
 146
 147         OBD_ALLOC_PTR(op_data);
 148         /* We leak openhandle and request here on error, but not much to be
 149          * done in OOM case since app won't retry close on error either. */
 150         if (op_data == NULL)
 151                 GOTO(out, rc = -ENOMEM);
 152
 153         ll_prepare_close(inode, op_data, och);
 154         switch (bias) {
 155         case MDS_CLOSE_LAYOUT_MERGE:
 156                 /* merge blocks from the victim inode */
 157                 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
 158                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 159                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 160         case MDS_CLOSE_LAYOUT_SPLIT:
 161         case MDS_CLOSE_LAYOUT_SWAP: {
 162                 struct split_param *sp = data;
 163
 164                 LASSERT(data != NULL);
 165                 op_data->op_bias |= bias;
 166                 op_data->op_data_version = 0;
 167                 op_data->op_lease_handle = och->och_lease_handle;
 168                 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
 169                         op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
 170                         op_data->op_mirror_id = sp->sp_mirror_id;
 171                 } else {
 172                         op_data->op_fid2 = *ll_inode2fid(data);
 173                 }
 174                 break;
 175         }
 176
 177         case MDS_CLOSE_RESYNC_DONE: {
 178                 struct ll_ioc_lease *ioc = data;
 179
 180                 LASSERT(data != NULL);
 181                 op_data->op_attr_blocks +=
 182                         ioc->lil_count * op_data->op_attr_blocks;
 183                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 184                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 185                 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
 186
 187                 op_data->op_lease_handle = och->och_lease_handle;
 188                 op_data->op_data = &ioc->lil_ids[0];
 189                 op_data->op_data_size =
 190                         ioc->lil_count * sizeof(ioc->lil_ids[0]);
 191                 break;
 192         }
 193
 194         case MDS_HSM_RELEASE:
 195                 LASSERT(data != NULL);
 196                 op_data->op_bias |= MDS_HSM_RELEASE;
 197                 op_data->op_data_version = *(__u64 *)data;
 198                 op_data->op_lease_handle = och->och_lease_handle;
 199                 op_data->op_attr.ia_valid |= ATTR_SIZE;
 200                 op_data->op_xvalid |= OP_XVALID_BLOCKS;
 201                 break;
 202
 203         default:
 204                 LASSERT(data == NULL);
 205                 break;
 206         }
 207
 208         if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
 209                 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
 210         if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
 211                 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
 212
 213         rc = md_close(md_exp, op_data, och->och_mod, &req);
 214         if (rc != 0 && rc != -EINTR)
 215                 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
 216                        md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
 217
 218         if (rc == 0 && op_data->op_bias & bias) {
 219                 struct mdt_body *body;
 220
 221                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
 222                 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
 223                         rc = -EBUSY;
 224         }
 225
 226         ll_finish_md_op_data(op_data);
 227         EXIT;
 228 out:
 229
 230         md_clear_open_replay_data(md_exp, och);
 231         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
 232         OBD_FREE_PTR(och);
 233
 234         ptlrpc_req_finished(req);       /* This is close request */
 235         return rc;
 236 }
 237
 238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
 239 {
 240         struct ll_inode_info *lli = ll_i2info(inode);
 241         struct obd_client_handle **och_p;
 242         struct obd_client_handle *och;
 243         __u64 *och_usecount;
 244         int rc = 0;
 245         ENTRY;
 246
 247         if (fmode & FMODE_WRITE) {
 248                 och_p = &lli->lli_mds_write_och;
 249                 och_usecount = &lli->lli_open_fd_write_count;
 250         } else if (fmode & FMODE_EXEC) {
 251                 och_p = &lli->lli_mds_exec_och;
 252                 och_usecount = &lli->lli_open_fd_exec_count;
 253         } else {
 254                 LASSERT(fmode & FMODE_READ);
 255                 och_p = &lli->lli_mds_read_och;
 256                 och_usecount = &lli->lli_open_fd_read_count;
 257         }
 258
 259         mutex_lock(&lli->lli_och_mutex);
 260         if (*och_usecount > 0) {
 261                 /* There are still users of this handle, so skip
 262                  * freeing it. */
 263                 mutex_unlock(&lli->lli_och_mutex);
 264                 RETURN(0);
 265         }
 266
 267         och = *och_p;
 268         *och_p = NULL;
 269         mutex_unlock(&lli->lli_och_mutex);
 270
 271         if (och != NULL) {
 272                 /* There might be a race and this handle may already
 273                  * be closed. */
 274                 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
 275         }
 276
 277         RETURN(rc);
 278 }
 279
 280 static int ll_md_close(struct inode *inode, struct file *file)
 281 {
 282         union ldlm_policy_data policy = {
 283                 .l_inodebits    = { MDS_INODELOCK_OPEN },
 284         };
 285         __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 286         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 287         struct ll_inode_info *lli = ll_i2info(inode);
 288         struct lustre_handle lockh;
 289         enum ldlm_mode lockmode;
 290         int rc = 0;
 291         ENTRY;
 292
 293         /* clear group lock, if present */
 294         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
 295                 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
 296
 297         if (fd->fd_lease_och != NULL) {
 298                 bool lease_broken;
 299
 300                 /* Usually the lease is not released when the
 301                  * application crashed, we need to release here. */
 302                 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
 303                 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
 304                         PFID(&lli->lli_fid), rc, lease_broken);
 305
 306                 fd->fd_lease_och = NULL;
 307         }
 308
 309         if (fd->fd_och != NULL) {
 310                 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
 311                 fd->fd_och = NULL;
 312                 GOTO(out, rc);
 313         }
 314
 315         /* Let's see if we have good enough OPEN lock on the file and if
 316            we can skip talking to MDS */
 317         mutex_lock(&lli->lli_och_mutex);
 318         if (fd->fd_omode & FMODE_WRITE) {
 319                 lockmode = LCK_CW;
 320                 LASSERT(lli->lli_open_fd_write_count);
 321                 lli->lli_open_fd_write_count--;
 322         } else if (fd->fd_omode & FMODE_EXEC) {
 323                 lockmode = LCK_PR;
 324                 LASSERT(lli->lli_open_fd_exec_count);
 325                 lli->lli_open_fd_exec_count--;
 326         } else {
 327                 lockmode = LCK_CR;
 328                 LASSERT(lli->lli_open_fd_read_count);
 329                 lli->lli_open_fd_read_count--;
 330         }
 331         mutex_unlock(&lli->lli_och_mutex);
 332
 333         if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
 334                            LDLM_IBITS, &policy, lockmode, &lockh))
 335                 rc = ll_md_real_close(inode, fd->fd_omode);
 336
 337 out:
 338         LUSTRE_FPRIVATE(file) = NULL;
 339         ll_file_data_put(fd);
 340
 341         RETURN(rc);
 342 }
 343
 344 /* While this returns an error code, fput() the caller does not, so we need
 345  * to make every effort to clean up all of our state here.  Also, applications
 346  * rarely check close errors and even if an error is returned they will not
 347  * re-try the close call.
 348  */
 349 int ll_file_release(struct inode *inode, struct file *file)
 350 {
 351         struct ll_file_data *fd;
 352         struct ll_sb_info *sbi = ll_i2sbi(inode);
 353         struct ll_inode_info *lli = ll_i2info(inode);
 354         int rc;
 355         ENTRY;
 356
 357         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
 358                PFID(ll_inode2fid(inode)), inode);
 359
 360         if (inode->i_sb->s_root != file_dentry(file))
 361                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 362         fd = LUSTRE_FPRIVATE(file);
 363         LASSERT(fd != NULL);
 364
 365         /* The last ref on @file, maybe not the the owner pid of statahead,
 366          * because parent and child process can share the same file handle. */
 367         if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
 368                 ll_deauthorize_statahead(inode, fd);
 369
 370         if (inode->i_sb->s_root == file_dentry(file)) {
 371                 LUSTRE_FPRIVATE(file) = NULL;
 372                 ll_file_data_put(fd);
 373                 RETURN(0);
 374         }
 375
 376         if (!S_ISDIR(inode->i_mode)) {
 377                 if (lli->lli_clob != NULL)
 378                         lov_read_and_clear_async_rc(lli->lli_clob);
 379                 lli->lli_async_rc = 0;
 380         }
 381
 382         rc = ll_md_close(inode, file);
 383
 384         if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
 385                 libcfs_debug_dumplog();
 386
 387         RETURN(rc);
 388 }
 389
 390 static inline int ll_dom_readpage(void *data, struct page *page)
 391 {
 392         struct niobuf_local *lnb = data;
 393         void *kaddr;
 394
 395         kaddr = ll_kmap_atomic(page, KM_USER0);
 396         memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
 397         if (lnb->lnb_len < PAGE_SIZE)
 398                 memset(kaddr + lnb->lnb_len, 0,
 399                        PAGE_SIZE - lnb->lnb_len);
 400         flush_dcache_page(page);
 401         SetPageUptodate(page);
 402         ll_kunmap_atomic(kaddr, KM_USER0);
 403         unlock_page(page);
 404
 405         return 0;
 406 }
 407
 408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
 409                         struct lookup_intent *it)
 410 {
 411         struct ll_inode_info *lli = ll_i2info(inode);
 412         struct cl_object *obj = lli->lli_clob;
 413         struct address_space *mapping = inode->i_mapping;
 414         struct page *vmpage;
 415         struct niobuf_remote *rnb;
 416         char *data;
 417         struct lu_env *env;
 418         struct cl_io *io;
 419         __u16 refcheck;
 420         struct lustre_handle lockh;
 421         struct ldlm_lock *lock;
 422         unsigned long index, start;
 423         struct niobuf_local lnb;
 424         int rc;
 425         bool dom_lock = false;
 426
 427         ENTRY;
 428
 429         if (obj == NULL)
 430                 RETURN_EXIT;
 431
 432         if (it->it_lock_mode != 0) {
 433                 lockh.cookie = it->it_lock_handle;
 434                 lock = ldlm_handle2lock(&lockh);
 435                 if (lock != NULL)
 436                         dom_lock = ldlm_has_dom(lock);
 437                 LDLM_LOCK_PUT(lock);
 438         }
 439
 440         if (!dom_lock)
 441                 RETURN_EXIT;
 442
 443         env = cl_env_get(&refcheck);
 444         if (IS_ERR(env))
 445                 RETURN_EXIT;
 446
 447         if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
 448                                    RCL_SERVER))
 449                 GOTO(out_env, rc = -ENODATA);
 450
 451         rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
 452         data = (char *)rnb + sizeof(*rnb);
 453
 454         if (rnb == NULL || rnb->rnb_len == 0)
 455                 GOTO(out_env, rc = 0);
 456
 457         CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
 458                rnb->rnb_len, i_size_read(inode));
 459
 460         io = vvp_env_thread_io(env);
 461         io->ci_obj = obj;
 462         io->ci_ignore_layout = 1;
 463         rc = cl_io_init(env, io, CIT_MISC, obj);
 464         if (rc)
 465                 GOTO(out_io, rc);
 466
 467         lnb.lnb_file_offset = rnb->rnb_offset;
 468         start = lnb.lnb_file_offset / PAGE_SIZE;
 469         index = 0;
 470         LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
 471         lnb.lnb_page_offset = 0;
 472         do {
 473                 struct cl_page *clp;
 474
 475                 lnb.lnb_data = data + (index << PAGE_SHIFT);
 476                 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
 477                 if (lnb.lnb_len > PAGE_SIZE)
 478                         lnb.lnb_len = PAGE_SIZE;
 479
 480                 vmpage = read_cache_page(mapping, index + start,
 481                                          ll_dom_readpage, &lnb);
 482                 if (IS_ERR(vmpage)) {
 483                         CWARN("%s: cannot fill page %lu for "DFID
 484                               " with data: rc = %li\n",
 485                               ll_get_fsname(inode->i_sb, NULL, 0),
 486                               index + start, PFID(lu_object_fid(&obj->co_lu)),
 487                               PTR_ERR(vmpage));
 488                         break;
 489                 }
 490                 lock_page(vmpage);
 491                 if (vmpage->mapping == NULL) {
 492                         unlock_page(vmpage);
 493                         put_page(vmpage);
 494                         /* page was truncated */
 495                         GOTO(out_io, rc = -ENODATA);
 496                 }
 497                 clp = cl_page_find(env, obj, vmpage->index, vmpage,
 498                                    CPT_CACHEABLE);
 499                 if (IS_ERR(clp)) {
 500                         unlock_page(vmpage);
 501                         put_page(vmpage);
 502                         GOTO(out_io, rc = PTR_ERR(clp));
 503                 }
 504
 505                 /* export page */
 506                 cl_page_export(env, clp, 1);
 507                 cl_page_put(env, clp);
 508                 unlock_page(vmpage);
 509                 put_page(vmpage);
 510                 index++;
 511         } while (rnb->rnb_len > (index << PAGE_SHIFT));
 512         rc = 0;
 513         EXIT;
 514 out_io:
 515         cl_io_fini(env, io);
 516 out_env:
 517         cl_env_put(env, &refcheck);
 518 }
 519
 520 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
 521                                 struct lookup_intent *itp)
 522 {
 523         struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
 524         struct dentry *parent = de->d_parent;
 525         const char *name = NULL;
 526         int len = 0;
 527         struct md_op_data *op_data;
 528         struct ptlrpc_request *req = NULL;
 529         int rc;
 530         ENTRY;
 531
 532         LASSERT(parent != NULL);
 533         LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
 534
 535         /* if server supports open-by-fid, or file name is invalid, don't pack
 536          * name in open request */
 537         if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
 538             lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
 539                 name = de->d_name.name;
 540                 len = de->d_name.len;
 541         }
 542
 543         op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
 544                                      name, len, 0, LUSTRE_OPC_ANY, NULL);
 545         if (IS_ERR(op_data))
 546                 RETURN(PTR_ERR(op_data));
 547         op_data->op_data = lmm;
 548         op_data->op_data_size = lmmsize;
 549
 550         rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
 551                             &ll_md_blocking_ast, 0);
 552         ll_finish_md_op_data(op_data);
 553         if (rc == -ESTALE) {
 554                 /* reason for keep own exit path - don`t flood log
 555                  * with messages with -ESTALE errors.
 556                  */
 557                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 558                      it_open_error(DISP_OPEN_OPEN, itp))
 559                         GOTO(out, rc);
 560                 ll_release_openhandle(de, itp);
 561                 GOTO(out, rc);
 562         }
 563
 564         if (it_disposition(itp, DISP_LOOKUP_NEG))
 565                 GOTO(out, rc = -ENOENT);
 566
 567         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 568                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 569                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 570                 GOTO(out, rc);
 571         }
 572
 573         rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
 574
 575         if (!rc && itp->it_lock_mode) {
 576                 ll_dom_finish_open(de->d_inode, req, itp);
 577                 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
 578         }
 579
 580 out:
 581         ptlrpc_req_finished(req);
 582         ll_intent_drop_lock(itp);
 583
 584         /* We did open by fid, but by the time we got to the server,
 585          * the object disappeared. If this is a create, we cannot really
 586          * tell the userspace that the file it was trying to create
 587          * does not exist. Instead let's return -ESTALE, and the VFS will
 588          * retry the create with LOOKUP_REVAL that we are going to catch
 589          * in ll_revalidate_dentry() and use lookup then.
 590          */
 591         if (rc == -ENOENT && itp->it_op & IT_CREAT)
 592                 rc = -ESTALE;
 593
 594         RETURN(rc);
 595 }
 596
 597 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
 598                        struct obd_client_handle *och)
 599 {
 600         struct mdt_body *body;
 601
 602         body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
 603         och->och_open_handle = body->mbo_open_handle;
 604         och->och_fid = body->mbo_fid1;
 605         och->och_lease_handle.cookie = it->it_lock_handle;
 606         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 607         och->och_flags = it->it_flags;
 608
 609         return md_set_open_replay_data(md_exp, och, it);
 610 }
 611
 612 static int ll_local_open(struct file *file, struct lookup_intent *it,
 613                          struct ll_file_data *fd, struct obd_client_handle *och)
 614 {
 615         struct inode *inode = file_inode(file);
 616         ENTRY;
 617
 618         LASSERT(!LUSTRE_FPRIVATE(file));
 619
 620         LASSERT(fd != NULL);
 621
 622         if (och) {
 623                 int rc;
 624
 625                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
 626                 if (rc != 0)
 627                         RETURN(rc);
 628         }
 629
 630         LUSTRE_FPRIVATE(file) = fd;
 631         ll_readahead_init(inode, &fd->fd_ras);
 632         fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 633
 634         /* ll_cl_context initialize */
 635         rwlock_init(&fd->fd_lock);
 636         INIT_LIST_HEAD(&fd->fd_lccs);
 637
 638         RETURN(0);
 639 }
 640
 641 /* Open a file, and (for the very first open) create objects on the OSTs at
 642  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 643  * creation or open until ll_lov_setstripe() ioctl is called.
 644  *
 645  * If we already have the stripe MD locally then we don't request it in
 646  * md_open(), by passing a lmm_size = 0.
 647  *
 648  * It is up to the application to ensure no other processes open this file
 649  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 650  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 651  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 652  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 653  */
 654 int ll_file_open(struct inode *inode, struct file *file)
 655 {
 656         struct ll_inode_info *lli = ll_i2info(inode);
 657         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 658                                           .it_flags = file->f_flags };
 659         struct obd_client_handle **och_p = NULL;
 660         __u64 *och_usecount = NULL;
 661         struct ll_file_data *fd;
 662         int rc = 0;
 663         ENTRY;
 664
 665         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
 666                PFID(ll_inode2fid(inode)), inode, file->f_flags);
 667
 668         it = file->private_data; /* XXX: compat macro */
 669         file->private_data = NULL; /* prevent ll_local_open assertion */
 670
 671         fd = ll_file_data_get();
 672         if (fd == NULL)
 673                 GOTO(out_nofiledata, rc = -ENOMEM);
 674
 675         fd->fd_file = file;
 676         if (S_ISDIR(inode->i_mode))
 677                 ll_authorize_statahead(inode, fd);
 678
 679         if (inode->i_sb->s_root == file_dentry(file)) {
 680                 LUSTRE_FPRIVATE(file) = fd;
 681                 RETURN(0);
 682         }
 683
 684         if (!it || !it->it_disposition) {
 685                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 686                  * because everything but O_ACCMODE mask was stripped from
 687                  * there */
 688                 if ((oit.it_flags + 1) & O_ACCMODE)
 689                         oit.it_flags++;
 690                 if (file->f_flags & O_TRUNC)
 691                         oit.it_flags |= FMODE_WRITE;
 692
 693                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 694                  * dentry_open after call to open_namei that checks permissions.
 695                  * Only nfsd_open call dentry_open directly without checking
 696                  * permissions and because of that this code below is safe.
 697                  */
 698                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 699                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 700
 701                 /* We do not want O_EXCL here, presumably we opened the file
 702                  * already? XXX - NFS implications? */
 703                 oit.it_flags &= ~O_EXCL;
 704
 705                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 706                  * created if necessary, then "IT_CREAT" should be set to keep
 707                  * consistent with it */
 708                 if (oit.it_flags & O_CREAT)
 709                         oit.it_op |= IT_CREAT;
 710
 711                 it = &oit;
 712         }
 713
 714 restart:
 715         /* Let's see if we have file open on MDS already. */
 716         if (it->it_flags & FMODE_WRITE) {
 717                 och_p = &lli->lli_mds_write_och;
 718                 och_usecount = &lli->lli_open_fd_write_count;
 719         } else if (it->it_flags & FMODE_EXEC) {
 720                 och_p = &lli->lli_mds_exec_och;
 721                 och_usecount = &lli->lli_open_fd_exec_count;
 722          } else {
 723                 och_p = &lli->lli_mds_read_och;
 724                 och_usecount = &lli->lli_open_fd_read_count;
 725         }
 726
 727         mutex_lock(&lli->lli_och_mutex);
 728         if (*och_p) { /* Open handle is present */
 729                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 730                         /* Well, there's extra open request that we do not need,
 731                            let's close it somehow. This will decref request. */
 732                         rc = it_open_error(DISP_OPEN_OPEN, it);
 733                         if (rc) {
 734                                 mutex_unlock(&lli->lli_och_mutex);
 735                                 GOTO(out_openerr, rc);
 736                         }
 737
 738                         ll_release_openhandle(file_dentry(file), it);
 739                 }
 740                 (*och_usecount)++;
 741
 742                 rc = ll_local_open(file, it, fd, NULL);
 743                 if (rc) {
 744                         (*och_usecount)--;
 745                         mutex_unlock(&lli->lli_och_mutex);
 746                         GOTO(out_openerr, rc);
 747                 }
 748         } else {
 749                 LASSERT(*och_usecount == 0);
 750                 if (!it->it_disposition) {
 751                         struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
 752                         /* We cannot just request lock handle now, new ELC code
 753                            means that one of other OPEN locks for this file
 754                            could be cancelled, and since blocking ast handler
 755                            would attempt to grab och_mutex as well, that would
 756                            result in a deadlock */
 757                         mutex_unlock(&lli->lli_och_mutex);
 758                         /*
 759                          * Normally called under two situations:
 760                          * 1. NFS export.
 761                          * 2. A race/condition on MDS resulting in no open
 762                          *    handle to be returned from LOOKUP|OPEN request,
 763                          *    for example if the target entry was a symlink.
 764                          *
 765                          *  Only fetch MDS_OPEN_LOCK if this is in NFS path,
 766                          *  marked by a bit set in ll_iget_for_nfs. Clear the
 767                          *  bit so that it's not confusing later callers.
 768                          *
 769                          *  NB; when ldd is NULL, it must have come via normal
 770                          *  lookup path only, since ll_iget_for_nfs always calls
 771                          *  ll_d_init().
 772                          */
 773                         if (ldd && ldd->lld_nfs_dentry) {
 774                                 ldd->lld_nfs_dentry = 0;
 775                                 it->it_flags |= MDS_OPEN_LOCK;
 776                         }
 777
 778                          /*
 779                          * Always specify MDS_OPEN_BY_FID because we don't want
 780                          * to get file with different fid.
 781                          */
 782                         it->it_flags |= MDS_OPEN_BY_FID;
 783                         rc = ll_intent_file_open(file_dentry(file), NULL, 0,
 784                                                  it);
 785                         if (rc)
 786                                 GOTO(out_openerr, rc);
 787
 788                         goto restart;
 789                 }
 790                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 791                 if (!*och_p)
 792                         GOTO(out_och_free, rc = -ENOMEM);
 793
 794                 (*och_usecount)++;
 795
 796                 /* md_intent_lock() didn't get a request ref if there was an
 797                  * open error, so don't do cleanup on the request here
 798                  * (bug 3430) */
 799                 /* XXX (green): Should not we bail out on any error here, not
 800                  * just open error? */
 801                 rc = it_open_error(DISP_OPEN_OPEN, it);
 802                 if (rc != 0)
 803                         GOTO(out_och_free, rc);
 804
 805                 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
 806                          "inode %p: disposition %x, status %d\n", inode,
 807                          it_disposition(it, ~0), it->it_status);
 808
 809                 rc = ll_local_open(file, it, fd, *och_p);
 810                 if (rc)
 811                         GOTO(out_och_free, rc);
 812         }
 813         mutex_unlock(&lli->lli_och_mutex);
 814         fd = NULL;
 815
 816         /* Must do this outside lli_och_mutex lock to prevent deadlock where
 817            different kind of OPEN lock for this same inode gets cancelled
 818            by ldlm_cancel_lru */
 819         if (!S_ISREG(inode->i_mode))
 820                 GOTO(out_och_free, rc);
 821
 822         cl_lov_delay_create_clear(&file->f_flags);
 823         GOTO(out_och_free, rc);
 824
 825 out_och_free:
 826         if (rc) {
 827                 if (och_p && *och_p) {
 828                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 829                         *och_p = NULL; /* OBD_FREE writes some magic there */
 830                         (*och_usecount)--;
 831                 }
 832                 mutex_unlock(&lli->lli_och_mutex);
 833
 834 out_openerr:
 835                 if (lli->lli_opendir_key == fd)
 836                         ll_deauthorize_statahead(inode, fd);
 837                 if (fd != NULL)
 838                         ll_file_data_put(fd);
 839         } else {
 840                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 841         }
 842
 843 out_nofiledata:
 844         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 845                 ptlrpc_req_finished(it->it_request);
 846                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 847         }
 848
 849         return rc;
 850 }
 851
 852 /*
 853  * Blocking AST for lease locks: LDLM_CB_BLOCKING cancels the lock
 854  * asynchronously (returning a cancel error); other flags are no-ops.
 855  */
 856 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
 857                         struct ldlm_lock_desc *desc, void *data, int flag)
 858 {
 859         struct lustre_handle lease_lockh;
 860         int cancel_rc;
 861         ENTRY;
 862
 863         if (flag != LDLM_CB_BLOCKING)
 864                 RETURN(0);
 865
 866         ldlm_lock2handle(lock, &lease_lockh);
 867         cancel_rc = ldlm_cli_cancel(&lease_lockh, LCF_ASYNC);
 868         if (cancel_rc >= 0)
 869                 RETURN(0);
 870
 871         CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", cancel_rc);
 872         RETURN(cancel_rc);
 873 }
 874
 875 /**
 876  * When setting a lease on a file, we take ownership of the lli_mds_*_och
 877  * and save it as fd->fd_och so as to force client to reopen the file even
 878  * if it has an open lock in cache already.
      *
      * The whole operation runs under lli->lli_och_mutex.
      *
      * \param[in]  inode            inode whose lli_mds_{read,write}_och is taken
      * \param[in]  file             open file; file->f_mode selects the write or
      *                              the read handle, and the file's private data
      *                              (fd->fd_och) receives it
      * \param[out] old_open_handle  set to fd->fd_och->och_open_handle on success
      *
      * \retval 0       fd->fd_och is set and \a old_open_handle is filled in
      * \retval -EBUSY  the file already holds a lease (fd->fd_lease_och is set),
      *                 or the per-inode open handle has more than one user
 879  */
 880 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
 881                                 struct lustre_handle *old_open_handle)
 882 {
 883         struct ll_inode_info *lli = ll_i2info(inode);
 884         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 885         struct obd_client_handle **och_p;
 886         __u64 *och_usecount;
 887         int rc = 0;
 888         ENTRY;
 889
 890         /* Get the openhandle of the file */
 891         mutex_lock(&lli->lli_och_mutex);
 892         if (fd->fd_lease_och != NULL)
 893                 GOTO(out_unlock, rc = -EBUSY);
 894
             /* fd->fd_och != NULL means this fd already owns a private handle,
              * so there is nothing to move over from the inode */
 895         if (fd->fd_och == NULL) {
 896                 if (file->f_mode & FMODE_WRITE) {
 897                         LASSERT(lli->lli_mds_write_och != NULL);
 898                         och_p = &lli->lli_mds_write_och;
 899                         och_usecount = &lli->lli_open_fd_write_count;
 900                 } else {
 901                         LASSERT(lli->lli_mds_read_och != NULL);
 902                         och_p = &lli->lli_mds_read_och;
 903                         och_usecount = &lli->lli_open_fd_read_count;
 904                 }
 905
                     /* the handle is shared with other users; refuse to take it */
 906                 if (*och_usecount > 1)
 907                         GOTO(out_unlock, rc = -EBUSY);
 908
                     /* move the handle from the inode to this file descriptor and
                      * leave the per-inode slot empty with a zero use count */
 909                 fd->fd_och = *och_p;
 910                 *och_usecount = 0;
 911                 *och_p = NULL;
 912         }
 913
 914         *old_open_handle = fd->fd_och->och_open_handle;
 915
 916         EXIT;
 917 out_unlock:
 918         mutex_unlock(&lli->lli_och_mutex);
 919         return rc;
 920 }
 921
 922 /**
 923  * Release ownership on lli_mds_*_och when putting back a file lease.
      *
      * The per-inode slot (write or read, chosen by file->f_mode) is examined
      * under lli->lli_och_mutex. If it is empty, fd->fd_och is put back there
      * with a use count of 1. If it is already populated, its use count is
      * incremented and fd->fd_och is closed with ll_close_inode_openhandle()
      * after the mutex has been dropped. In both cases fd->fd_och is reset to
      * NULL.
      *
      * \param[in] inode  inode the handle belongs to
      * \param[in] file   open file whose private data holds fd->fd_och
      *
      * \retval 0 when the handle was put back, otherwise the return value of
      *         ll_close_inode_openhandle()
 924  */
 925 static int ll_lease_och_release(struct inode *inode, struct file *file)
 926 {
 927         struct ll_inode_info *lli = ll_i2info(inode);
 928         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 929         struct obd_client_handle **och_p;
 930         struct obd_client_handle *old_och = NULL;
 931         __u64 *och_usecount;
 932         int rc = 0;
 933         ENTRY;
 934
 935         mutex_lock(&lli->lli_och_mutex);
 936         if (file->f_mode & FMODE_WRITE) {
 937                 och_p = &lli->lli_mds_write_och;
 938                 och_usecount = &lli->lli_open_fd_write_count;
 939         } else {
 940                 och_p = &lli->lli_mds_read_och;
 941                 och_usecount = &lli->lli_open_fd_read_count;
 942         }
 943
 944         /* The file may have been open by another process (broken lease) so
 945          * *och_p is not NULL. In this case we should simply increase usecount
 946          * and close fd_och.
 947          */
 948         if (*och_p != NULL) {
 949                 old_och = fd->fd_och;
 950                 (*och_usecount)++;
 951         } else {
 952                 *och_p = fd->fd_och;
 953                 *och_usecount = 1;
 954         }
 955         fd->fd_och = NULL;
 956         mutex_unlock(&lli->lli_och_mutex);
 957
             /* the close is issued only after lli_och_mutex has been released */
 958         if (old_och != NULL)
 959                 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 960
 961         RETURN(rc);
 962 }
 963
 964 /**
 965  * Acquire a lease and open the file.
      *
      * An IT_OPEN intent carrying MDS_OPEN_LOCK | MDS_OPEN_BY_FID |
      * MDS_OPEN_LEASE is sent through md_intent_lock(). The reply must carry
      * both the DISP_OPEN_LEASE disposition and an MDS_INODELOCK_OPEN lock,
      * otherwise the freshly opened handle is closed again and an error is
      * returned.
      *
      * \param[in] inode       inode to take the lease on
      * \param[in] file        optional open file; when not NULL its open handle
      *                        is taken with ll_lease_och_acquire() and passed to
      *                        the MDT as op_open_handle
      * \param[in] fmode       exactly FMODE_READ or FMODE_WRITE
      * \param[in] open_flags  extra flags OR-ed into the intent's it_flags
      *
      * \retval the new open handle holding the lease on success
      * \retval ERR_PTR(-EINVAL)      \a fmode is neither FMODE_READ nor FMODE_WRITE
      * \retval ERR_PTR(-EPERM)       \a file lacks \a fmode or has FMODE_EXEC set
      * \retval ERR_PTR(-ENOMEM)      the open handle could not be allocated
      * \retval ERR_PTR(-ENOENT)      the intent reported a negative lookup
      * \retval ERR_PTR(-EOPNOTSUPP)  the reply has no DISP_OPEN_LEASE disposition
      * \retval ERR_PTR(-EPROTO)      lease granted without an open lock
      * \retval ERR_PTR(other)        error propagated from ll_lease_och_acquire(),
      *                               ll_prep_md_op_data(), md_intent_lock() or
      *                               it_open_error()
 966  */
 967 static struct obd_client_handle *
 968 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
 969               __u64 open_flags)
 970 {
 971         struct lookup_intent it = { .it_op = IT_OPEN };
 972         struct ll_sb_info *sbi = ll_i2sbi(inode);
 973         struct md_op_data *op_data;
 974         struct ptlrpc_request *req = NULL;
 975         struct lustre_handle old_open_handle = { 0 };
 976         struct obd_client_handle *och = NULL;
 977         int rc;
 978         int rc2;
 979         ENTRY;
 980
 981         if (fmode != FMODE_WRITE && fmode != FMODE_READ)
 982                 RETURN(ERR_PTR(-EINVAL));
 983
 984         if (file != NULL) {
 985                 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
 986                         RETURN(ERR_PTR(-EPERM));
 987
 988                 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
 989                 if (rc)
 990                         RETURN(ERR_PTR(rc));
 991         }
 992
 993         OBD_ALLOC_PTR(och);
 994         if (och == NULL)
 995                 RETURN(ERR_PTR(-ENOMEM));
 996
 997         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
 998                                         LUSTRE_OPC_ANY, NULL);
 999         if (IS_ERR(op_data))
1000                 GOTO(out, rc = PTR_ERR(op_data));
1001
1002         /* To tell the MDT this openhandle is from the same owner */
1003         op_data->op_open_handle = old_open_handle;
1004
1005         it.it_flags = fmode | open_flags;
1006         it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1007         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1008                             &ll_md_blocking_lease_ast,
1009         /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1010          * it can be cancelled which may mislead applications that the lease is
1011          * broken;
1012          * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1013          * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1014          * doesn't deal with openhandle, so normal openhandle will be leaked. */
1015                             LDLM_FL_NO_LRU | LDLM_FL_EXCL);
             /* op_data and req are released before rc is even looked at */
1016         ll_finish_md_op_data(op_data);
1017         ptlrpc_req_finished(req);
1018         if (rc < 0)
1019                 GOTO(out_release_it, rc);
1020
1021         if (it_disposition(&it, DISP_LOOKUP_NEG))
1022                 GOTO(out_release_it, rc = -ENOENT);
1023
1024         rc = it_open_error(DISP_OPEN_OPEN, &it);
1025         if (rc)
1026                 GOTO(out_release_it, rc);
1027
1028         LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1029         ll_och_fill(sbi->ll_md_exp, &it, och);
1030
             /* from here on och describes a real open, so failures have to go
              * through out_close rather than merely freeing the memory */
1031         if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1032                 GOTO(out_close, rc = -EOPNOTSUPP);
1033
1034         /* lease has been granted, now handle the lease lock */
1035         ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1036         if (it.it_lock_mode == 0 ||
1037             it.it_lock_bits != MDS_INODELOCK_OPEN) {
1038                 /* open lock must return for lease */
1039                 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1040                         PFID(ll_inode2fid(inode)), it.it_lock_mode,
1041                         it.it_lock_bits);
1042                 GOTO(out_close, rc = -EPROTO);
1043         }
1044
1045         ll_intent_release(&it);
1046         RETURN(och);
1047
1048 out_close:
1049         /* Cancel open lock */
1050         if (it.it_lock_mode != 0) {
1051                 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1052                                             it.it_lock_mode);
1053                 it.it_lock_mode = 0;
1054                 och->och_lease_handle.cookie = 0ULL;
1055         }
             /* a close failure is only logged; rc keeps the original error */
1056         rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1057         if (rc2 < 0)
1058                 CERROR("%s: error closing file "DFID": %d\n",
1059                        ll_get_fsname(inode->i_sb, NULL, 0),
1060                        PFID(&ll_i2info(inode)->lli_fid), rc2);
1061         och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1062 out_release_it:
1063         ll_intent_release(&it);
1064 out:
1065         if (och != NULL)
1066                 OBD_FREE_PTR(och);
1067         RETURN(ERR_PTR(rc));
1068 }
1069
1070 /**
1071  * Check whether a layout swap can be done between two inodes: both must
1072  * be regular files, writable by the caller, and on the same super block.
1073  *
1074  * \param[in] inode1  First inode to check
1075  * \param[in] inode2  Second inode to check
1076  *
1077  * \retval 0        layout swap can be performed between both inodes
1078  * \retval -EINVAL  one of the inodes is not a regular file
1079  * \retval -EPERM   inode_permission() refused MAY_WRITE on one of them
1080  * \retval -EXDEV   the inodes live on different super blocks
1081  */
1082 static int ll_check_swap_layouts_validity(struct inode *inode1,
1083                                           struct inode *inode2)
1084 {
1085         if (!(S_ISREG(inode1->i_mode) && S_ISREG(inode2->i_mode)))
1086                 return -EINVAL;
1087         if (inode_permission(inode1, MAY_WRITE) != 0)
1088                 return -EPERM;
1089         if (inode_permission(inode2, MAY_WRITE) != 0)
1090                 return -EPERM;
1091         return inode1->i_sb == inode2->i_sb ? 0 : -EXDEV;
1092 }
1094
1095 /*
1096  * Close @och with MDS_CLOSE_LAYOUT_SWAP so that the MDT swaps the layouts
1097  * of @inode and @inode2. @och is given up on every path.
1098  */
1099 static int ll_swap_layouts_close(struct obd_client_handle *och,
1100                                  struct inode *inode, struct inode *inode2)
1101 {
1102         const struct lu_fid *src_fid = ll_inode2fid(inode);
1103         const struct lu_fid *dst_fid;
1104         int rc;
1105         ENTRY;
1106
1107         CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1108                ll_get_fsname(inode->i_sb, NULL, 0), PFID(src_fid));
1109
1110         rc = ll_check_swap_layouts_validity(inode, inode2);
1111         if (rc < 0)
1112                 GOTO(out_free_och, rc);
1113
1114         /* We now know that inode2 is a lustre inode */
1115         dst_fid = ll_inode2fid(inode2);
1116         if (lu_fid_cmp(src_fid, dst_fid) == 0)
1117                 GOTO(out_free_och, rc = -EINVAL);
1118
1119         /* Close the file and {swap,merge} layouts between inode & inode2.
1120          * NB: the lease lock handle is released in
1121          * mdc_close_layout_swap_pack() because it is still needed to pack
1122          * l_remote_handle to MDT. */
1123         rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1124                                        inode2);
1125         och = NULL; /* freed in ll_close_inode_openhandle() */
1126
1127 out_free_och:
1128         if (och != NULL)
1129                 OBD_FREE_PTR(och);
1130         RETURN(rc);
1131 }
1131
1132 /**
1133  * Release lease and close the file.
1134  * It will check if the lease has ever broken.
      *
      * \param[in]  och           open handle holding the lease; it is handed to
      *                           ll_close_inode_openhandle()
      * \param[in]  inode         inode the lease was taken on
      * \param[out] lease_broken  if not NULL, set to true when the lease lock
      *                           could not be found or is marked cancelled
      * \param[in]  bias          close intent passed on to the close; reset to
      *                           0 when the lease turned out to be broken
      * \param[in]  data          argument accompanying \a bias; reset to NULL
      *                           when the lease turned out to be broken
      *
      * \retval return value of ll_close_inode_openhandle()
1135  */
1136 static int ll_lease_close_intent(struct obd_client_handle *och,
1137                                  struct inode *inode,
1138                                  bool *lease_broken, enum mds_op_bias bias,
1139                                  void *data)
1140 {
1141         struct ldlm_lock *lock;
             /* stays true when the handle no longer resolves to a lock */
1142         bool cancelled = true;
1143         int rc;
1144         ENTRY;
1145
1146         lock = ldlm_handle2lock(&och->och_lease_handle);
1147         if (lock != NULL) {
1148                 lock_res_and_lock(lock);
1149                 cancelled = ldlm_is_cancel(lock);
1150                 unlock_res_and_lock(lock);
1151                 LDLM_LOCK_PUT(lock);
1152         }
1153
1154         CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1155                PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1156
1157         if (lease_broken != NULL)
1158                 *lease_broken = cancelled;
1159
             /* an intact lease closed without any intent is cancelled right
              * here; with a bias the lock is left alone at this point */
1160         if (!cancelled && !bias)
1161                 ldlm_cli_cancel(&och->och_lease_handle, 0);
1162
1163         if (cancelled) { /* no need to execute intent */
1164                 bias = 0;
1165                 data = NULL;
1166         }
1167
1168         rc = ll_close_inode_openhandle(inode, och, bias, data);
1169         RETURN(rc);
1170 }
1171
     /*
      * Release a lease and close the file without any close intent: a thin
      * wrapper around ll_lease_close_intent() with a bias of 0 and no data.
      * *lease_broken, when not NULL, is filled in by the callee.
      */
1172 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1173                           bool *lease_broken)
1174 {
1175         return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1176 }
1177
1178 /**
1179  * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
      *
      * \param[in] och    open handle whose och_lease_handle identifies the lease
      * \param[in] inode  inode being resynced
      *
      * \retval 0 on success
      * \retval negative error from ll_prep_md_op_data(), ll_data_version() or
      *         md_file_resync()
1180  */
1181 static int ll_lease_file_resync(struct obd_client_handle *och,
1182                                 struct inode *inode)
1183 {
1184         struct ll_sb_info *sbi = ll_i2sbi(inode);
1185         struct md_op_data *op_data;
             /* ll_data_version() is called for its LL_DV_WR_FLUSH side effect;
              * the version it reports is not used */
1186         __u64 data_version_unused;
1187         int rc;
1188         ENTRY;
1189
1190         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1191                                      LUSTRE_OPC_ANY, NULL);
1192         if (IS_ERR(op_data))
1193                 RETURN(PTR_ERR(op_data));
1194
1195         /* before starting file resync, it's necessary to clean up page cache
1196          * in client memory, otherwise once the layout version is increased,
1197          * writing back cached data will be denied the OSTs. */
1198         rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1199         if (rc)
1200                 GOTO(out, rc);
1201
1202         op_data->op_lease_handle = och->och_lease_handle;
1203         rc = md_file_resync(sbi->ll_md_exp, op_data);
1204         if (rc)
1205                 GOTO(out, rc);
1206
1207         EXIT;
1208 out:
1209         ll_finish_md_op_data(op_data);
1210         return rc;
1211 }
1212
     /*
      * Refresh the VFS inode's timestamps, size and block count by merging
      * the values cached in ll_inode_info (lli_atime/lli_mtime/lli_ctime)
      * with the attributes cl_object_attr_get() reports for lli->lli_clob.
      *
      * Everything happens between ll_inode_size_lock() and
      * ll_inode_size_unlock(). For each timestamp the later of the two
      * sources wins. When cl_object_attr_get() fails, the inode keeps the
      * timestamps already copied from lli, and size/blocks are not touched.
      *
      * Returns 0 on success, and also when cl_object_attr_get() returned
      * -ENODATA; otherwise a negative errno (-EINVAL when the
      * OBD_FAIL_MDC_MERGE fail point fires).
      */
1213 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1214 {
1215         struct ll_inode_info *lli = ll_i2info(inode);
1216         struct cl_object *obj = lli->lli_clob;
1217         struct cl_attr *attr = vvp_env_thread_attr(env);
1218         s64 atime;
1219         s64 mtime;
1220         s64 ctime;
1221         int rc = 0;
1222
1223         ENTRY;
1224
1225         ll_inode_size_lock(inode);
1226
1227         /* Merge timestamps the most recently obtained from MDS with
1228          * timestamps obtained from OSTs.
1229          *
1230          * Do not overwrite atime of inode because it may be refreshed
1231          * by file_accessed() function. If the read was served by cache
1232          * data, there is no RPC to be sent so that atime may not be
1233          * transferred to OSTs at all. MDT only updates atime at close time
1234          * if it's at least 'mdd.*.atime_diff' older.
1235          * All in all, the atime in Lustre does not strictly comply with
1236          * POSIX. Solving this problem needs to send an RPC to MDT for each
1237          * read, this will hurt performance. */
1238         if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1239                 LTIME_S(inode->i_atime) = lli->lli_atime;
1240                 lli->lli_update_atime = 0;
1241         }
1242         LTIME_S(inode->i_mtime) = lli->lli_mtime;
1243         LTIME_S(inode->i_ctime) = lli->lli_ctime;
1244
1245         atime = LTIME_S(inode->i_atime);
1246         mtime = LTIME_S(inode->i_mtime);
1247         ctime = LTIME_S(inode->i_ctime);
1248
1249         cl_object_attr_lock(obj);
1250         if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1251                 rc = -EINVAL;
1252         else
1253                 rc = cl_object_attr_get(env, obj, attr);
1254         cl_object_attr_unlock(obj);
1255
             /* -ENODATA is not reported as an error to the caller */
1256         if (rc != 0)
1257                 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1258
             /* keep whichever timestamp is later: the inode's or the object's */
1259         if (atime < attr->cat_atime)
1260                 atime = attr->cat_atime;
1261
1262         if (ctime < attr->cat_ctime)
1263                 ctime = attr->cat_ctime;
1264
1265         if (mtime < attr->cat_mtime)
1266                 mtime = attr->cat_mtime;
1267
1268         CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1269                PFID(&lli->lli_fid), attr->cat_size);
1270
1271         i_size_write(inode, attr->cat_size);
1272         inode->i_blocks = attr->cat_blocks;
1273
1274         LTIME_S(inode->i_atime) = atime;
1275         LTIME_S(inode->i_mtime) = mtime;
1276         LTIME_S(inode->i_ctime) = ctime;
1277
1278 out_size_unlock:
1279         ll_inode_size_unlock(inode);
1280
1281         RETURN(rc);
1282 }
1283
1284 /**
1285  * Set designated mirror for I/O.
1286  *
1287  * Only read, write and truncate are currently able to issue I/O to a
1288  * designated mirror.
1289  */
1290 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1291 {
1292         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1293         const bool designated = fd->fd_designated_mirror > 0;
1294
1295         /* generic (non-resync) I/O gets layout version 0 so that it never
1296          * carries a stale version left behind by an I/O restart */
1297         io->ci_layout_version = designated ? fd->fd_layout_version : 0;
1298
1299         if (designated) {
1300                 /* FLR: non-delay is pointless with a single usable mirror,
1301                  * and there is no mechanism to pass mirror io to ptasks */
1302                 io->ci_ndelay = 0;
1303                 io->ci_pio = 0;
1304                 io->ci_designated_mirror = fd->fd_designated_mirror;
1305         }
1306
1307         CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1308                file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1309 }
1311
1312 /*
1313  * Tell whether an access through @file must leave atime alone, judging
1314  * by the open flags, the inode flags, and the mount/super block flags.
1315  */
1316 static bool file_is_noatime(const struct file *file)
1317 {
1318         const struct inode *inode = file_inode((struct file *)file);
1319         const struct vfsmount *mnt = file->f_path.mnt;
1320
1321         /* Adapted from file_accessed() and touch_atime().*/
1322         if ((file->f_flags & O_NOATIME) || (inode->i_flags & S_NOATIME) ||
1323             IS_NOATIME(inode) ||
1324             (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)))
1325                 return true;
1326
1327         /* the remaining switches only suppress atime on directories */
1328         if (!S_ISDIR(inode->i_mode))
1329                 return false;
1330
1331         return (mnt->mnt_flags & MNT_NODIRATIME) ||
1332                (inode->i_sb->s_flags & MS_NODIRATIME);
1333 }
1338
1339 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1340
     /*
      * Fill in the parts of @io that depend only on @file and on the I/O type
      * @iot, before the caller hands it to cl_io_rw_init().
      *
      * Set here: the read/write state in io->u.ci_rw (cleared iterator,
      * kiocb bound to @file, ptask callback, non-blocking/append/sync
      * flags), the cl_object to operate on, the lock requirement, the noatime
      * and parallel-I/O switches, the FLR non-delay switch and, through
      * ll_io_set_mirror(), the designated mirror.
      *
      * NOTE(review): rw_append is assigned only for CIT_WRITE yet read
      * unconditionally for ci_pio below; presumably @io arrives zeroed from
      * the caller - confirm against vvp_env_thread_io().
      */
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1342 {
1343         struct inode *inode = file_inode(file);
1344         struct ll_file_data *fd  = LUSTRE_FPRIVATE(file);
1345
1346         memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1347         init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1348         io->u.ci_rw.rw_file = file;
1349         io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1350         io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1351         io->ci_lock_no_expand = fd->ll_lock_no_expand;
1352
1353         if (iot == CIT_WRITE) {
1354                 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
                     /* O_SYNC, O_DIRECT and a sync inode all make the write
                      * synchronous */
1355                 io->u.ci_rw.rw_sync   = !!(file->f_flags & O_SYNC ||
1356                                            file->f_flags & O_DIRECT ||
1357                                            IS_SYNC(inode));
1358         }
1359         io->ci_obj = ll_i2info(inode)->lli_clob;
             /* lock requirement: CILR_MAYBE unless the file is opened lockless
              * (never lock) or in append mode (locking is mandatory) */
1360         io->ci_lockreq = CILR_MAYBE;
1361         if (ll_file_nolock(file)) {
1362                 io->ci_lockreq = CILR_NEVER;
1363                 io->ci_no_srvlock = 1;
1364         } else if (file->f_flags & O_APPEND) {
1365                 io->ci_lockreq = CILR_MANDATORY;
1366         }
1367         io->ci_noatime = file_is_noatime(file);
             /* parallel I/O needs LL_SBI_PIO and is never used for appends */
1368         if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1369                 io->ci_pio = !io->u.ci_rw.rw_append;
1370         else
1371                 io->ci_pio = 0;
1372
1373         /* FLR: only use non-delay I/O for read as there is only one
1374          * available mirror for write. */
1375         io->ci_ndelay = !(iot == CIT_WRITE);
1376
1377         ll_io_set_mirror(io, file);
1378 }
1379
1380 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1381 {
1382         struct cl_io_pt *pt = ptask->pt_cbdata;
1383         struct file *file = pt->cip_file;
1384         struct lu_env *env;
1385         struct cl_io *io;
1386         loff_t pos = pt->cip_pos;
1387         int rc;
1388         __u16 refcheck;
1389         ENTRY;
1390
1391         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1392                 file_dentry(file)->d_name.name,
1393                 pt->cip_iot == CIT_READ ? "read" : "write",
1394                 pos, pos + pt->cip_count);
1395
1396         env = cl_env_get(&refcheck);
1397         if (IS_ERR(env))
1398                 RETURN(PTR_ERR(env));
1399
1400         io = vvp_env_thread_io(env);
1401         ll_io_init(io, file, pt->cip_iot);
1402         io->u.ci_rw.rw_iter = pt->cip_iter;
1403         io->u.ci_rw.rw_iocb = pt->cip_iocb;
1404         io->ci_pio = 0; /* It's already in parallel task */
1405
1406         rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1407                            pt->cip_count - pt->cip_result);
1408         if (!rc) {
1409                 struct vvp_io *vio = vvp_env_io(env);
1410
1411                 vio->vui_io_subtype = IO_NORMAL;
1412                 vio->vui_fd = LUSTRE_FPRIVATE(file);
1413
1414                 ll_cl_add(file, env, io, LCC_RW);
1415                 rc = cl_io_loop(env, io);
1416                 ll_cl_remove(file, env);
1417         } else {
1418                 /* cl_io_rw_init() handled IO */
1419                 rc = io->ci_result;
1420         }
1421
1422         if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1423                 if (io->ci_nob > 0)
1424                         io->ci_nob /= 2;
1425                 rc = -EIO;
1426         }
1427
1428         if (io->ci_nob > 0) {
1429                 pt->cip_result += io->ci_nob;
1430                 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1431                 pos += io->ci_nob;
1432                 pt->cip_iocb.ki_pos = pos;
1433 #ifdef HAVE_KIOCB_KI_LEFT
1434                 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1435 #elif defined(HAVE_KI_NBYTES)
1436                 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1437 #endif
1438         }
1439
1440         cl_io_fini(env, io);
1441         cl_env_put(env, &refcheck);
1442
1443         pt->cip_need_restart = io->ci_need_restart;
1444
1445         CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1446                 file_dentry(file)->d_name.name,
1447                 pt->cip_iot == CIT_READ ? "read" : "write",
1448                 pt->cip_result, rc);
1449
1450         RETURN(pt->cip_result > 0 ? 0 : rc);
1451 }
1452
     /*
      * Common entry point for reads and writes on a file, for both normal
      * (iov_iter based) and splice I/O.
      *
      * A cl_io is set up with ll_io_init() and run with cl_io_loop(). As long
      * as the I/O asks for a restart (io->ci_need_restart) while bytes remain
      * and no error other than -ENODATA occurred, the remaining range is
      * submitted again from the "restart" label; bytes already transferred
      * are accumulated in the local "result".
      *
      * \param[in]     env    environment the cl_io is taken from
      * \param[in]     args   I/O subtype plus the iterator/kiocb (IO_NORMAL)
      *                       or the pipe/flags (IO_SPLICE)
      * \param[in]     file   file being read or written
      * \param[in]     iot    CIT_READ or CIT_WRITE
      * \param[in,out] ppos   file position; updated to the final position
      * \param[in]     count  number of bytes requested
      *
      * \retval number of bytes transferred when it is positive
      * \retval otherwise the rc of the last I/O attempt (0 or negative errno)
      */
1453 static ssize_t
1454 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1455                    struct file *file, enum cl_io_type iot,
1456                    loff_t *ppos, size_t count)
1457 {
1458         struct range_lock       range;
1459         struct vvp_io           *vio = vvp_env_io(env);
1460         struct inode            *inode = file_inode(file);
1461         struct ll_inode_info    *lli = ll_i2info(inode);
1462         struct ll_file_data     *fd  = LUSTRE_FPRIVATE(file);
1463         struct cl_io            *io;
1464         loff_t                  pos = *ppos;
1465         ssize_t                 result = 0;
1466         int                     rc = 0;
1467         unsigned                retried = 0;
1468         bool                    restarted = false;
1469
1470         ENTRY;
1471
1472         CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1473                 file_dentry(file)->d_name.name,
1474                 iot == CIT_READ ? "read" : "write", pos, pos + count);
1475
1476 restart:
1477         io = vvp_env_thread_io(env);
1478         ll_io_init(io, file, iot);
1479         if (args->via_io_subtype == IO_NORMAL) {
1480                 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1481                 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1482         }
             /* parallel I/O is used neither for splice nor for a restarted I/O */
1483         if (args->via_io_subtype != IO_NORMAL || restarted)
1484                 io->ci_pio = 0;
1485         io->ci_ndelay_tried = retried;
1486
1487         if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1488                 bool range_locked = false;
1489
                     /* an append may land anywhere, so it covers the whole file */
1490                 if (file->f_flags & O_APPEND)
1491                         range_lock_init(&range, 0, LUSTRE_EOF);
1492                 else
1493                         range_lock_init(&range, pos, pos + count - 1);
1494
1495                 vio->vui_fd  = LUSTRE_FPRIVATE(file);
1496                 vio->vui_io_subtype = args->via_io_subtype;
1497
1498                 switch (vio->vui_io_subtype) {
1499                 case IO_NORMAL:
1500                         /* Direct IO reads must also take range lock,
1501                          * or multiple reads will try to work on the same pages
1502                          * See LU-6227 for details. */
1503                         if (((iot == CIT_WRITE) ||
1504                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1505                             !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1506                                 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1507                                        RL_PARA(&range));
1508                                 rc = range_lock(&lli->lli_write_tree, &range);
1509                                 if (rc < 0)
1510                                         GOTO(out, rc);
1511
1512                                 range_locked = true;
1513                         }
1514                         break;
1515                 case IO_SPLICE:
1516                         vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1517                         vio->u.splice.vui_flags = args->u.splice.via_flags;
1518                         break;
1519                 default:
1520                         CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1521                         LBUG();
1522                 }
1523
1524                 ll_cl_add(file, env, io, LCC_RW);
                     /* the inode lock is taken only for parallel writes on an
                      * inode without IS_NOSEC(); lli_inode_locked records that
                      * it is held so that it gets dropped after the loop */
1525                 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1526                     !lli->lli_inode_locked) {
1527                         inode_lock(inode);
1528                         lli->lli_inode_locked = 1;
1529                 }
1530                 rc = cl_io_loop(env, io);
1531                 if (lli->lli_inode_locked) {
1532                         lli->lli_inode_locked = 0;
1533                         inode_unlock(inode);
1534                 }
1535                 ll_cl_remove(file, env);
1536
1537                 if (range_locked) {
1538                         CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1539                                RL_PARA(&range));
1540                         range_unlock(&lli->lli_write_tree, &range);
1541                 }
1542         } else {
1543                 /* cl_io_rw_init() handled IO */
1544                 rc = io->ci_result;
1545         }
1546
             /* account for the bytes of this attempt and move the caller's
              * iterator/kiocb (or the splice position) forward accordingly */
1547         if (io->ci_nob > 0) {
1548                 result += io->ci_nob;
1549                 count  -= io->ci_nob;
1550
1551                 if (args->via_io_subtype == IO_NORMAL) {
1552                         iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1553
1554                         /* CLIO is too complicated. See LU-11069. */
1555                         if (cl_io_is_append(io))
1556                                 pos = io->u.ci_rw.rw_iocb.ki_pos;
1557                         else
1558                                 pos += io->ci_nob;
1559
1560                         args->u.normal.via_iocb->ki_pos = pos;
1561 #ifdef HAVE_KIOCB_KI_LEFT
1562                         args->u.normal.via_iocb->ki_left = count;
1563 #elif defined(HAVE_KI_NBYTES)
1564                         args->u.normal.via_iocb->ki_nbytes = count;
1565 #endif
1566                 } else {
1567                         /* for splice */
1568                         pos = io->u.ci_rw.rw_range.cir_pos;
1569                 }
1570         }
     /* a failed range_lock() jumps straight here, skipping the accounting */
1571 out:
1572         cl_io_fini(env, io);
1573
1574         CDEBUG(D_VFSTRACE,
1575                "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1576                file->f_path.dentry->d_name.name,
1577                iot, rc, result, io->ci_need_restart);
1578
1579         if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1580                 CDEBUG(D_VFSTRACE,
1581                         "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1582                         file_dentry(file)->d_name.name,
1583                         iot == CIT_READ ? "read" : "write",
1584                         pos, pos + count, result, rc);
1585                 /* preserve the tried count for FLR */
1586                 retried = io->ci_ndelay_tried;
1587                 restarted = true;
1588                 goto restart;
1589         }
1590
             /* statistics, plus tracking of the write status in
              * fd->fd_write_failed: cleared when bytes were written, set on a
              * failure other than -ERESTARTSYS */
1591         if (iot == CIT_READ) {
1592                 if (result > 0)
1593                         ll_stats_ops_tally(ll_i2sbi(inode),
1594                                            LPROC_LL_READ_BYTES, result);
1595         } else if (iot == CIT_WRITE) {
1596                 if (result > 0) {
1597                         ll_stats_ops_tally(ll_i2sbi(inode),
1598                                            LPROC_LL_WRITE_BYTES, result);
1599                         fd->fd_write_failed = false;
1600                 } else if (result == 0 && rc == 0) {
1601                         rc = io->ci_result;
1602                         if (rc < 0)
1603                                 fd->fd_write_failed = true;
1604                         else
1605                                 fd->fd_write_failed = false;
1606                 } else if (rc != -ERESTARTSYS) {
1607                         fd->fd_write_failed = true;
1608                 }
1609         }
1610
1611         CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1612                 file_dentry(file)->d_name.name,
1613                 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1614
1615         *ppos = pos;
1616
1617         RETURN(result > 0 ? result : rc);
1618 }
1619
1620 /**
1621  * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1622  * especially for small I/O.
1623  *
1624  * To serve a read request, CLIO has to create and initialize a cl_io and
1625  * then request DLM lock. This has turned out to have siginificant overhead
1626  * and affects the performance of small I/O dramatically.
1627  *
1628  * It's not necessary to create a cl_io for each I/O. Under the help of read
1629  * ahead, most of the pages being read are already in memory cache and we can
1630  * read those pages directly because if the pages exist, the corresponding DLM
1631  * lock must exist so that page content must be valid.
1632  *
1633  * In fast read implementation, the llite speculatively finds and reads pages
1634  * in memory cache. There are three scenarios for fast read:
1635  *   - If the page exists and is uptodate, kernel VM will provide the data and
1636  *     CLIO won't be intervened;
1637  *   - If the page was brought into memory by read ahead, it will be exported
1638  *     and read ahead parameters will be updated;
1639  *   - Otherwise the page is not in memory, we can't do fast read. Therefore,
1640  *     it will go back and invoke normal read, i.e., a cl_io will be created
1641  *     and DLM lock will be requested.
1642  *
1643  * POSIX compliance: posix standard states that read is intended to be atomic.
1644  * Lustre read implementation is in line with Linux kernel read implementation
1645  * and neither of them complies with POSIX standard in this matter. Fast read
1646  * doesn't make the situation worse on single node but it may interleave write
1647  * results from multiple nodes due to short read handling in ll_file_aio_read().
1648  *
1649  * \param env - lu_env
1650  * \param iocb - kiocb from kernel
1651  * \param iter - user space buffers where the data will be copied
1652  *
1653  * \retval - number of bytes have been read, or error code if error occurred.
1654  */
1655 static ssize_t
1656 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1657 {
1658         ssize_t result;
1659
1660         if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1661                 return 0;
1662
1663         /* NB: we can't do direct IO for fast read because it will need a lock
1664          * to make IO engine happy. */
1665         if (iocb->ki_filp->f_flags & O_DIRECT)
1666                 return 0;
1667
1668         result = generic_file_read_iter(iocb, iter);
1669
1670         /* If the first page is not in cache, generic_file_aio_read() will be
1671          * returned with -ENODATA.
1672          * See corresponding code in ll_readpage(). */
1673         if (result == -ENODATA)
1674                 result = 0;
1675
1676         if (result > 0)
1677                 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1678                                 LPROC_LL_READ_BYTES, result);
1679
1680         return result;
1681 }
1682
1683 /*
1684  * Read from a file (through the page cache).
1685  */
1686 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1687 {
1688         struct lu_env *env;
1689         struct vvp_io_args *args;
1690         ssize_t result;
1691         ssize_t rc2;
1692         __u16 refcheck;
1693
1694         result = ll_do_fast_read(iocb, to);
1695         if (result < 0 || iov_iter_count(to) == 0)
1696                 GOTO(out, result);
1697
1698         env = cl_env_get(&refcheck);
1699         if (IS_ERR(env))
1700                 return PTR_ERR(env);
1701
1702         args = ll_env_args(env, IO_NORMAL);
1703         args->u.normal.via_iter = to;
1704         args->u.normal.via_iocb = iocb;
1705
1706         rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1707                                  &iocb->ki_pos, iov_iter_count(to));
1708         if (rc2 > 0)
1709                 result += rc2;
1710         else if (result == 0)
1711                 result = rc2;
1712
1713         cl_env_put(env, &refcheck);
1714 out:
1715         return result;
1716 }
1717
1718 /**
1719  * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1720  * If a page is already in the page cache and dirty (and some other things -
1721  * See ll_tiny_write_begin for the instantiation of these rules), then we can
1722  * write to it without doing a full I/O, because Lustre already knows about it
1723  * and will write it out.  This saves a lot of processing time.
1724  *
1725  * All writes here are within one page, so exclusion is handled by the page
1726  * lock on the vm page.  We do not do tiny writes for writes which touch
1727  * multiple pages because it's very unlikely multiple sequential pages are
1728  * are already dirty.
1729  *
1730  * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1731  * and are unlikely to be to already dirty pages.
1732  *
1733  * Attribute updates are important here, we do them in ll_tiny_write_end.
1734  */
1735 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1736 {
1737         ssize_t count = iov_iter_count(iter);
1738         struct file *file = iocb->ki_filp;
1739         struct inode *inode = file_inode(file);
1740         ssize_t result = 0;
1741
1742         ENTRY;
1743
1744         /* Restrict writes to single page and < PAGE_SIZE.  See comment at top
1745          * of function for why.
1746          */
1747         if (count >= PAGE_SIZE ||
1748             (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1749                 RETURN(0);
1750
1751         result = __generic_file_write_iter(iocb, iter);
1752
1753         /* If the page is not already dirty, ll_tiny_write_begin returns
1754          * -ENODATA.  We continue on to normal write.
1755          */
1756         if (result == -ENODATA)
1757                 result = 0;
1758
1759         if (result > 0) {
1760                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1761                                    result);
1762                 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1763         }
1764
1765         CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1766
1767         RETURN(result);
1768 }
1769
1770 /*
1771  * Write to a file (through the page cache).
1772  */
1773 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1774 {
1775         struct vvp_io_args *args;
1776         struct lu_env *env;
1777         ssize_t rc_tiny = 0, rc_normal;
1778         __u16 refcheck;
1779
1780         ENTRY;
1781
1782         /* NB: we can't do direct IO for tiny writes because they use the page
1783          * cache, we can't do sync writes because tiny writes can't flush
1784          * pages, and we can't do append writes because we can't guarantee the
1785          * required DLM locks are held to protect file size.
1786          */
1787         if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1788             !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1789                 rc_tiny = ll_do_tiny_write(iocb, from);
1790
1791         /* In case of error, go on and try normal write - Only stop if tiny
1792          * write completed I/O.
1793          */
1794         if (iov_iter_count(from) == 0)
1795                 GOTO(out, rc_normal = rc_tiny);
1796
1797         env = cl_env_get(&refcheck);
1798         if (IS_ERR(env))
1799                 return PTR_ERR(env);
1800
1801         args = ll_env_args(env, IO_NORMAL);
1802         args->u.normal.via_iter = from;
1803         args->u.normal.via_iocb = iocb;
1804
1805         rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1806                                     &iocb->ki_pos, iov_iter_count(from));
1807
1808         /* On success, combine bytes written. */
1809         if (rc_tiny >= 0 && rc_normal > 0)
1810                 rc_normal += rc_tiny;
1811         /* On error, only return error from normal write if tiny write did not
1812          * write any bytes.  Otherwise return bytes written by tiny write.
1813          */
1814         else if (rc_tiny > 0)
1815                 rc_normal = rc_tiny;
1816
1817         cl_env_put(env, &refcheck);
1818 out:
1819         RETURN(rc_normal);
1820 }
1821
1822 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1823 /*
1824  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1825  */
1826 static int ll_file_get_iov_count(const struct iovec *iov,
1827                                  unsigned long *nr_segs, size_t *count)
1828 {
1829         size_t cnt = 0;
1830         unsigned long seg;
1831
1832         for (seg = 0; seg < *nr_segs; seg++) {
1833                 const struct iovec *iv = &iov[seg];
1834
1835                 /*
1836                  * If any segment has a negative length, or the cumulative
1837                  * length ever wraps negative then return -EINVAL.
1838                  */
1839                 cnt += iv->iov_len;
1840                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1841                         return -EINVAL;
1842                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1843                         continue;
1844                 if (seg == 0)
1845                         return -EFAULT;
1846                 *nr_segs = seg;
1847                 cnt -= iv->iov_len;     /* This segment is no good */
1848                 break;
1849         }
1850         *count = cnt;
1851         return 0;
1852 }
1853
1854 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1855                                 unsigned long nr_segs, loff_t pos)
1856 {
1857         struct iov_iter to;
1858         size_t iov_count;
1859         ssize_t result;
1860         ENTRY;
1861
1862         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1863         if (result)
1864                 RETURN(result);
1865
1866 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1867         iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1868 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1869         iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1870 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1871
1872         result = ll_file_read_iter(iocb, &to);
1873
1874         RETURN(result);
1875 }
1876
1877 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1878                             loff_t *ppos)
1879 {
1880         struct iovec   iov = { .iov_base = buf, .iov_len = count };
1881         struct kiocb   kiocb;
1882         ssize_t        result;
1883         ENTRY;
1884
1885         init_sync_kiocb(&kiocb, file);
1886         kiocb.ki_pos = *ppos;
1887 #ifdef HAVE_KIOCB_KI_LEFT
1888         kiocb.ki_left = count;
1889 #elif defined(HAVE_KI_NBYTES)
1890         kiocb.i_nbytes = count;
1891 #endif
1892
1893         result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1894         *ppos = kiocb.ki_pos;
1895
1896         RETURN(result);
1897 }
1898
1899 /*
1900  * Write to a file (through the page cache).
1901  * AIO stuff
1902  */
1903 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1904                                  unsigned long nr_segs, loff_t pos)
1905 {
1906         struct iov_iter from;
1907         size_t iov_count;
1908         ssize_t result;
1909         ENTRY;
1910
1911         result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1912         if (result)
1913                 RETURN(result);
1914
1915 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1916         iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1917 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1918         iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1919 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1920
1921         result = ll_file_write_iter(iocb, &from);
1922
1923         RETURN(result);
1924 }
1925
1926 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1927                              size_t count, loff_t *ppos)
1928 {
1929         struct iovec   iov = { .iov_base = (void __user *)buf,
1930                                .iov_len = count };
1931         struct kiocb   kiocb;
1932         ssize_t        result;
1933
1934         ENTRY;
1935
1936         init_sync_kiocb(&kiocb, file);
1937         kiocb.ki_pos = *ppos;
1938 #ifdef HAVE_KIOCB_KI_LEFT
1939         kiocb.ki_left = count;
1940 #elif defined(HAVE_KI_NBYTES)
1941         kiocb.ki_nbytes = count;
1942 #endif
1943
1944         result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1945         *ppos = kiocb.ki_pos;
1946
1947         RETURN(result);
1948 }
1949 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1950
1951 /*
1952  * Send file content (through pagecache) somewhere with helper
1953  */
1954 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1955                                    struct pipe_inode_info *pipe, size_t count,
1956                                    unsigned int flags)
1957 {
1958         struct lu_env      *env;
1959         struct vvp_io_args *args;
1960         ssize_t             result;
1961         __u16               refcheck;
1962         ENTRY;
1963
1964         env = cl_env_get(&refcheck);
1965         if (IS_ERR(env))
1966                 RETURN(PTR_ERR(env));
1967
1968         args = ll_env_args(env, IO_SPLICE);
1969         args->u.splice.via_pipe = pipe;
1970         args->u.splice.via_flags = flags;
1971
1972         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1973         cl_env_put(env, &refcheck);
1974         RETURN(result);
1975 }
1976
1977 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1978                              __u64 flags, struct lov_user_md *lum, int lum_size)
1979 {
1980         struct lookup_intent oit = {
1981                 .it_op = IT_OPEN,
1982                 .it_flags = flags | MDS_OPEN_BY_FID,
1983         };
1984         int rc;
1985         ENTRY;
1986
1987         ll_inode_size_lock(inode);
1988         rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1989         if (rc < 0)
1990                 GOTO(out_unlock, rc);
1991
1992         ll_release_openhandle(dentry, &oit);
1993
1994 out_unlock:
1995         ll_inode_size_unlock(inode);
1996         ll_intent_release(&oit);
1997
1998         RETURN(rc);
1999 }
2000
2001 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2002                              struct lov_mds_md **lmmp, int *lmm_size,
2003                              struct ptlrpc_request **request)
2004 {
2005         struct ll_sb_info *sbi = ll_i2sbi(inode);
2006         struct mdt_body  *body;
2007         struct lov_mds_md *lmm = NULL;
2008         struct ptlrpc_request *req = NULL;
2009         struct md_op_data *op_data;
2010         int rc, lmmsize;
2011
2012         rc = ll_get_default_mdsize(sbi, &lmmsize);
2013         if (rc)
2014                 RETURN(rc);
2015
2016         op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2017                                      strlen(filename), lmmsize,
2018                                      LUSTRE_OPC_ANY, NULL);
2019         if (IS_ERR(op_data))
2020                 RETURN(PTR_ERR(op_data));
2021
2022         op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2023         rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2024         ll_finish_md_op_data(op_data);
2025         if (rc < 0) {
2026                 CDEBUG(D_INFO, "md_getattr_name failed "
2027                        "on %s: rc %d\n", filename, rc);
2028                 GOTO(out, rc);
2029         }
2030
2031         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2032         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2033
2034         lmmsize = body->mbo_eadatasize;
2035
2036         if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2037                         lmmsize == 0) {
2038                 GOTO(out, rc = -ENODATA);
2039         }
2040
2041         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2042         LASSERT(lmm != NULL);
2043
2044         if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2045             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2046             lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2047                 GOTO(out, rc = -EPROTO);
2048
2049         /*
2050          * This is coming from the MDS, so is probably in
2051          * little endian.  We convert it to host endian before
2052          * passing it to userspace.
2053          */
2054         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2055                 int stripe_count;
2056
2057                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2058                     lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2059                         stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2060                         if (le32_to_cpu(lmm->lmm_pattern) &
2061                             LOV_PATTERN_F_RELEASED)
2062                                 stripe_count = 0;
2063                 }
2064
2065                 /* if function called for directory - we should
2066                  * avoid swab not existent lsm objects */
2067                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2068                         lustre_swab_lov_user_md_v1(
2069                                         (struct lov_user_md_v1 *)lmm);
2070                         if (S_ISREG(body->mbo_mode))
2071                                 lustre_swab_lov_user_md_objects(
2072                                     ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2073                                     stripe_count);
2074                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2075                         lustre_swab_lov_user_md_v3(
2076                                         (struct lov_user_md_v3 *)lmm);
2077                         if (S_ISREG(body->mbo_mode))
2078                                 lustre_swab_lov_user_md_objects(
2079                                     ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2080                                     stripe_count);
2081                 } else if (lmm->lmm_magic ==
2082                            cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2083                         lustre_swab_lov_comp_md_v1(
2084                                         (struct lov_comp_md_v1 *)lmm);
2085                 }
2086         }
2087
2088 out:
2089         *lmmp = lmm;
2090         *lmm_size = lmmsize;
2091         *request = req;
2092         return rc;
2093 }
2094
2095 static int ll_lov_setea(struct inode *inode, struct file *file,
2096                         void __user *arg)
2097 {
2098         __u64                    flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2099         struct lov_user_md      *lump;
2100         int                      lum_size = sizeof(struct lov_user_md) +
2101                                             sizeof(struct lov_user_ost_data);
2102         int                      rc;
2103         ENTRY;
2104
2105         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2106                 RETURN(-EPERM);
2107
2108         OBD_ALLOC_LARGE(lump, lum_size);
2109         if (lump == NULL)
2110                 RETURN(-ENOMEM);
2111
2112         if (copy_from_user(lump, arg, lum_size))
2113                 GOTO(out_lump, rc = -EFAULT);
2114
2115         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2116                                       lum_size);
2117         cl_lov_delay_create_clear(&file->f_flags);
2118
2119 out_lump:
2120         OBD_FREE_LARGE(lump, lum_size);
2121         RETURN(rc);
2122 }
2123
2124 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2125 {
2126         struct lu_env   *env;
2127         __u16           refcheck;
2128         int             rc;
2129         ENTRY;
2130
2131         env = cl_env_get(&refcheck);
2132         if (IS_ERR(env))
2133                 RETURN(PTR_ERR(env));
2134
2135         rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2136         cl_env_put(env, &refcheck);
2137         RETURN(rc);
2138 }
2139
2140 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2141                             void __user *arg)
2142 {
2143         struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2144         struct lov_user_md        *klum;
2145         int                        lum_size, rc;
2146         __u64                      flags = FMODE_WRITE;
2147         ENTRY;
2148
2149         rc = ll_copy_user_md(lum, &klum);
2150         if (rc < 0)
2151                 RETURN(rc);
2152
2153         lum_size = rc;
2154         rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2155                                       lum_size);
2156         if (!rc) {
2157                 __u32 gen;
2158
2159                 rc = put_user(0, &lum->lmm_stripe_count);
2160                 if (rc)
2161                         GOTO(out, rc);
2162
2163                 rc = ll_layout_refresh(inode, &gen);
2164                 if (rc)
2165                         GOTO(out, rc);
2166
2167                 rc = ll_file_getstripe(inode, arg, lum_size);
2168         }
2169         cl_lov_delay_create_clear(&file->f_flags);
2170
2171 out:
2172         OBD_FREE(klum, lum_size);
2173         RETURN(rc);
2174 }
2175
2176 static int
2177 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2178 {
2179         struct ll_inode_info *lli = ll_i2info(inode);
2180         struct cl_object *obj = lli->lli_clob;
2181         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2182         struct ll_grouplock grouplock;
2183         int rc;
2184         ENTRY;
2185
2186         if (arg == 0) {
2187                 CWARN("group id for group lock must not be 0\n");
2188                 RETURN(-EINVAL);
2189         }
2190
2191         if (ll_file_nolock(file))
2192                 RETURN(-EOPNOTSUPP);
2193
2194         spin_lock(&lli->lli_lock);
2195         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2196                 CWARN("group lock already existed with gid %lu\n",
2197                       fd->fd_grouplock.lg_gid);
2198                 spin_unlock(&lli->lli_lock);
2199                 RETURN(-EINVAL);
2200         }
2201         LASSERT(fd->fd_grouplock.lg_lock == NULL);
2202         spin_unlock(&lli->lli_lock);
2203
2204         /**
2205          * XXX: group lock needs to protect all OST objects while PFL
2206          * can add new OST objects during the IO, so we'd instantiate
2207          * all OST objects before getting its group lock.
2208          */
2209         if (obj) {
2210                 struct lu_env *env;
2211                 __u16 refcheck;
2212                 struct cl_layout cl = {
2213                         .cl_is_composite = false,
2214                 };
2215                 struct lu_extent ext = {
2216                         .e_start = 0,
2217                         .e_end = OBD_OBJECT_EOF,
2218                 };
2219
2220                 env = cl_env_get(&refcheck);
2221                 if (IS_ERR(env))
2222                         RETURN(PTR_ERR(env));
2223
2224                 rc = cl_object_layout_get(env, obj, &cl);
2225                 if (!rc && cl.cl_is_composite)
2226                         rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2227                                                     &ext);
2228
2229                 cl_env_put(env, &refcheck);
2230                 if (rc)
2231                         RETURN(rc);
2232         }
2233
2234         rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2235                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
2236         if (rc)
2237                 RETURN(rc);
2238
2239         spin_lock(&lli->lli_lock);
2240         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2241                 spin_unlock(&lli->lli_lock);
2242                 CERROR("another thread just won the race\n");
2243                 cl_put_grouplock(&grouplock);
2244                 RETURN(-EINVAL);
2245         }
2246
2247         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2248         fd->fd_grouplock = grouplock;
2249         spin_unlock(&lli->lli_lock);
2250
2251         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2252         RETURN(0);
2253 }
2254
2255 static int ll_put_grouplock(struct inode *inode, struct file *file,
2256                             unsigned long arg)
2257 {
2258         struct ll_inode_info   *lli = ll_i2info(inode);
2259         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
2260         struct ll_grouplock     grouplock;
2261         ENTRY;
2262
2263         spin_lock(&lli->lli_lock);
2264         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2265                 spin_unlock(&lli->lli_lock);
2266                 CWARN("no group lock held\n");
2267                 RETURN(-EINVAL);
2268         }
2269
2270         LASSERT(fd->fd_grouplock.lg_lock != NULL);
2271
2272         if (fd->fd_grouplock.lg_gid != arg) {
2273                 CWARN("group lock %lu doesn't match current id %lu\n",
2274                       arg, fd->fd_grouplock.lg_gid);
2275                 spin_unlock(&lli->lli_lock);
2276                 RETURN(-EINVAL);
2277         }
2278
2279         grouplock = fd->fd_grouplock;
2280         memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2281         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2282         spin_unlock(&lli->lli_lock);
2283
2284         cl_put_grouplock(&grouplock);
2285         CDEBUG(D_INFO, "group lock %lu released\n", arg);
2286         RETURN(0);
2287 }
2288
2289 /**
2290  * Close inode open handle
2291  *
2292  * \param dentry [in]     dentry which contains the inode
2293  * \param it     [in,out] intent which contains open info and result
2294  *
2295  * \retval 0     success
2296  * \retval <0    failure
2297  */
2298 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2299 {
2300         struct inode *inode = dentry->d_inode;
2301         struct obd_client_handle *och;
2302         int rc;
2303         ENTRY;
2304
2305         LASSERT(inode);
2306
2307         /* Root ? Do nothing. */
2308         if (dentry->d_inode->i_sb->s_root == dentry)
2309                 RETURN(0);
2310
2311         /* No open handle to close? Move away */
2312         if (!it_disposition(it, DISP_OPEN_OPEN))
2313                 RETURN(0);
2314
2315         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2316
2317         OBD_ALLOC(och, sizeof(*och));
2318         if (!och)
2319                 GOTO(out, rc = -ENOMEM);
2320
2321         ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2322
2323         rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2324 out:
2325         /* this one is in place of ll_file_open */
2326         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2327                 ptlrpc_req_finished(it->it_request);
2328                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2329         }
2330         RETURN(rc);
2331 }
2332
2333 /**
2334  * Get size for inode for which FIEMAP mapping is requested.
2335  * Make the FIEMAP get_info call and returns the result.
2336  * \param fiemap        kernel buffer to hold extens
2337  * \param num_bytes     kernel buffer size
2338  */
2339 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2340                         size_t num_bytes)
2341 {
2342         struct lu_env                   *env;
2343         __u16                           refcheck;
2344         int                             rc = 0;
2345         struct ll_fiemap_info_key       fmkey = { .lfik_name = KEY_FIEMAP, };
2346         ENTRY;
2347
2348         /* Checks for fiemap flags */
2349         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2350                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2351                 return -EBADR;
2352         }
2353
2354         /* Check for FIEMAP_FLAG_SYNC */
2355         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2356                 rc = filemap_fdatawrite(inode->i_mapping);
2357                 if (rc)
2358                         return rc;
2359         }
2360
2361         env = cl_env_get(&refcheck);
2362         if (IS_ERR(env))
2363                 RETURN(PTR_ERR(env));
2364
2365         if (i_size_read(inode) == 0) {
2366                 rc = ll_glimpse_size(inode);
2367                 if (rc)
2368                         GOTO(out, rc);
2369         }
2370
2371         fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2372         obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2373         obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2374
2375         /* If filesize is 0, then there would be no objects for mapping */
2376         if (fmkey.lfik_oa.o_size == 0) {
2377                 fiemap->fm_mapped_extents = 0;
2378                 GOTO(out, rc = 0);
2379         }
2380
2381         fmkey.lfik_fiemap = *fiemap;
2382
2383         rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2384                               &fmkey, fiemap, &num_bytes);
2385 out:
2386         cl_env_put(env, &refcheck);
2387         RETURN(rc);
2388 }
2389
2390 int ll_fid2path(struct inode *inode, void __user *arg)
2391 {
2392         struct obd_export       *exp = ll_i2mdexp(inode);
2393         const struct getinfo_fid2path __user *gfin = arg;
2394         __u32                    pathlen;
2395         struct getinfo_fid2path *gfout;
2396         size_t                   outsize;
2397         int                      rc;
2398
2399         ENTRY;
2400
2401         if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2402             !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2403                 RETURN(-EPERM);
2404
2405         /* Only need to get the buflen */
2406         if (get_user(pathlen, &gfin->gf_pathlen))
2407                 RETURN(-EFAULT);
2408
2409         if (pathlen > PATH_MAX)
2410                 RETURN(-EINVAL);
2411
2412         outsize = sizeof(*gfout) + pathlen;
2413         OBD_ALLOC(gfout, outsize);
2414         if (gfout == NULL)
2415                 RETURN(-ENOMEM);
2416
2417         if (copy_from_user(gfout, arg, sizeof(*gfout)))
2418                 GOTO(gf_free, rc = -EFAULT);
2419         /* append root FID after gfout to let MDT know the root FID so that it
2420          * can lookup the correct path, this is mainly for fileset.
2421          * old server without fileset mount support will ignore this. */
2422         *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2423
2424         /* Call mdc_iocontrol */
2425         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2426         if (rc != 0)
2427                 GOTO(gf_free, rc);
2428
2429         if (copy_to_user(arg, gfout, outsize))
2430                 rc = -EFAULT;
2431
2432 gf_free:
2433         OBD_FREE(gfout, outsize);
2434         RETURN(rc);
2435 }
2436
2437 static int
2438 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2439 {
2440         struct cl_object *obj = ll_i2info(inode)->lli_clob;
2441         struct lu_env *env;
2442         struct cl_io *io;
2443         __u16  refcheck;
2444         int result;
2445
2446         ENTRY;
2447
2448         ioc->idv_version = 0;
2449         ioc->idv_layout_version = UINT_MAX;
2450
2451         /* If no file object initialized, we consider its version is 0. */
2452         if (obj == NULL)
2453                 RETURN(0);
2454
2455         env = cl_env_get(&refcheck);
2456         if (IS_ERR(env))
2457                 RETURN(PTR_ERR(env));
2458
2459         io = vvp_env_thread_io(env);
2460         io->ci_obj = obj;
2461         io->u.ci_data_version.dv_data_version = 0;
2462         io->u.ci_data_version.dv_layout_version = UINT_MAX;
2463         io->u.ci_data_version.dv_flags = ioc->idv_flags;
2464
2465 restart:
2466         if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2467                 result = cl_io_loop(env, io);
2468         else
2469                 result = io->ci_result;
2470
2471         ioc->idv_version = io->u.ci_data_version.dv_data_version;
2472         ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2473
2474         cl_io_fini(env, io);
2475
2476         if (unlikely(io->ci_need_restart))
2477                 goto restart;
2478
2479         cl_env_put(env, &refcheck);
2480
2481         RETURN(result);
2482 }
2483
2484 /*
2485  * Read the data_version for inode.
2486  *
2487  * This value is computed using stripe object version on OST.
2488  * Version is computed using server side locking.
2489  *
2490  * @param flags if do sync on the OST side;
2491  *              0: no sync
2492  *              LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2493  *              LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2494  */
2495 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2496 {
2497         struct ioc_data_version ioc = { .idv_flags = flags };
2498         int rc;
2499
2500         rc = ll_ioc_data_version(inode, &ioc);
2501         if (!rc)
2502                 *data_version = ioc.idv_version;
2503
2504         return rc;
2505 }
2506
2507 /*
2508  * Trigger a HSM release request for the provided inode.
2509  */
2510 int ll_hsm_release(struct inode *inode)
2511 {
2512         struct lu_env *env;
2513         struct obd_client_handle *och = NULL;
2514         __u64 data_version = 0;
2515         int rc;
2516         __u16 refcheck;
2517         ENTRY;
2518
2519         CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2520                ll_get_fsname(inode->i_sb, NULL, 0),
2521                PFID(&ll_i2info(inode)->lli_fid));
2522
2523         och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2524         if (IS_ERR(och))
2525                 GOTO(out, rc = PTR_ERR(och));
2526
2527         /* Grab latest data_version and [am]time values */
2528         rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2529         if (rc != 0)
2530                 GOTO(out, rc);
2531
2532         env = cl_env_get(&refcheck);
2533         if (IS_ERR(env))
2534                 GOTO(out, rc = PTR_ERR(env));
2535
2536         rc = ll_merge_attr(env, inode);
2537         cl_env_put(env, &refcheck);
2538
2539         /* If error happen, we have the wrong size for a file.
2540          * Don't release it.
2541          */
2542         if (rc != 0)
2543                 GOTO(out, rc);
2544
2545         /* Release the file.
2546          * NB: lease lock handle is released in mdc_hsm_release_pack() because
2547          * we still need it to pack l_remote_handle to MDT. */
2548         rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2549                                        &data_version);
2550         och = NULL;
2551
2552         EXIT;
2553 out:
2554         if (och != NULL && !IS_ERR(och)) /* close the file */
2555                 ll_lease_close(och, inode, NULL);
2556
2557         return rc;
2558 }
2559
2560 struct ll_swap_stack {
2561         __u64                    dv1;
2562         __u64                    dv2;
2563         struct inode            *inode1;
2564         struct inode            *inode2;
2565         bool                     check_dv1;
2566         bool                     check_dv2;
2567 };
2568
2569 static int ll_swap_layouts(struct file *file1, struct file *file2,
2570                            struct lustre_swap_layouts *lsl)
2571 {
2572         struct mdc_swap_layouts  msl;
2573         struct md_op_data       *op_data;
2574         __u32                    gid;
2575         __u64                    dv;
2576         struct ll_swap_stack    *llss = NULL;
2577         int                      rc;
2578
2579         OBD_ALLOC_PTR(llss);
2580         if (llss == NULL)
2581                 RETURN(-ENOMEM);
2582
2583         llss->inode1 = file_inode(file1);
2584         llss->inode2 = file_inode(file2);
2585
2586         rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2587         if (rc < 0)
2588                 GOTO(free, rc);
2589
2590         /* we use 2 bool because it is easier to swap than 2 bits */
2591         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2592                 llss->check_dv1 = true;
2593
2594         if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2595                 llss->check_dv2 = true;
2596
2597         /* we cannot use lsl->sl_dvX directly because we may swap them */
2598         llss->dv1 = lsl->sl_dv1;
2599         llss->dv2 = lsl->sl_dv2;
2600
2601         rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2602         if (rc == 0) /* same file, done! */
2603                 GOTO(free, rc);
2604
2605         if (rc < 0) { /* sequentialize it */
2606                 swap(llss->inode1, llss->inode2);
2607                 swap(file1, file2);
2608                 swap(llss->dv1, llss->dv2);
2609                 swap(llss->check_dv1, llss->check_dv2);
2610         }
2611
2612         gid = lsl->sl_gid;
2613         if (gid != 0) { /* application asks to flush dirty cache */
2614                 rc = ll_get_grouplock(llss->inode1, file1, gid);
2615                 if (rc < 0)
2616                         GOTO(free, rc);
2617
2618                 rc = ll_get_grouplock(llss->inode2, file2, gid);
2619                 if (rc < 0) {
2620                         ll_put_grouplock(llss->inode1, file1, gid);
2621                         GOTO(free, rc);
2622                 }
2623         }
2624
2625         /* ultimate check, before swaping the layouts we check if
2626          * dataversion has changed (if requested) */
2627         if (llss->check_dv1) {
2628                 rc = ll_data_version(llss->inode1, &dv, 0);
2629                 if (rc)
2630                         GOTO(putgl, rc);
2631                 if (dv != llss->dv1)
2632                         GOTO(putgl, rc = -EAGAIN);
2633         }
2634
2635         if (llss->check_dv2) {
2636                 rc = ll_data_version(llss->inode2, &dv, 0);
2637                 if (rc)
2638                         GOTO(putgl, rc);
2639                 if (dv != llss->dv2)
2640                         GOTO(putgl, rc = -EAGAIN);
2641         }
2642
2643         /* struct md_op_data is used to send the swap args to the mdt
2644          * only flags is missing, so we use struct mdc_swap_layouts
2645          * through the md_op_data->op_data */
2646         /* flags from user space have to be converted before they are send to
2647          * server, no flag is sent today, they are only used on the client */
2648         msl.msl_flags = 0;
2649         rc = -ENOMEM;
2650         op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2651                                      0, LUSTRE_OPC_ANY, &msl);
2652         if (IS_ERR(op_data))
2653                 GOTO(free, rc = PTR_ERR(op_data));
2654
2655         rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2656                            sizeof(*op_data), op_data, NULL);
2657         ll_finish_md_op_data(op_data);
2658
2659         if (rc < 0)
2660                 GOTO(putgl, rc);
2661
2662 putgl:
2663         if (gid != 0) {
2664                 ll_put_grouplock(llss->inode2, file2, gid);
2665                 ll_put_grouplock(llss->inode1, file1, gid);
2666         }
2667
2668 free:
2669         if (llss != NULL)
2670                 OBD_FREE_PTR(llss);
2671
2672         RETURN(rc);
2673 }
2674
2675 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2676 {
2677         struct md_op_data       *op_data;
2678         int                      rc;
2679         ENTRY;
2680
2681         /* Detect out-of range masks */
2682         if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2683                 RETURN(-EINVAL);
2684
2685         /* Non-root users are forbidden to set or clear flags which are
2686          * NOT defined in HSM_USER_MASK. */
2687         if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2688             !cfs_capable(CFS_CAP_SYS_ADMIN))
2689                 RETURN(-EPERM);
2690
2691         /* Detect out-of range archive id */
2692         if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2693             (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2694                 RETURN(-EINVAL);
2695
2696         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2697                                      LUSTRE_OPC_ANY, hss);
2698         if (IS_ERR(op_data))
2699                 RETURN(PTR_ERR(op_data));
2700
2701         rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2702                            sizeof(*op_data), op_data, NULL);
2703
2704         ll_finish_md_op_data(op_data);
2705
2706         RETURN(rc);
2707 }
2708
2709 static int ll_hsm_import(struct inode *inode, struct file *file,
2710                          struct hsm_user_import *hui)
2711 {
2712         struct hsm_state_set    *hss = NULL;
2713         struct iattr            *attr = NULL;
2714         int                      rc;
2715         ENTRY;
2716
2717         if (!S_ISREG(inode->i_mode))
2718                 RETURN(-EINVAL);
2719
2720         /* set HSM flags */
2721         OBD_ALLOC_PTR(hss);
2722         if (hss == NULL)
2723                 GOTO(out, rc = -ENOMEM);
2724
2725         hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2726         hss->hss_archive_id = hui->hui_archive_id;
2727         hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2728         rc = ll_hsm_state_set(inode, hss);
2729         if (rc != 0)
2730                 GOTO(out, rc);
2731
2732         OBD_ALLOC_PTR(attr);
2733         if (attr == NULL)
2734                 GOTO(out, rc = -ENOMEM);
2735
2736         attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2737         attr->ia_mode |= S_IFREG;
2738         attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2739         attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2740         attr->ia_size = hui->hui_size;
2741         attr->ia_mtime.tv_sec = hui->hui_mtime;
2742         attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2743         attr->ia_atime.tv_sec = hui->hui_atime;
2744         attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2745
2746         attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2747                          ATTR_UID | ATTR_GID |
2748                          ATTR_MTIME | ATTR_MTIME_SET |
2749                          ATTR_ATIME | ATTR_ATIME_SET;
2750
2751         inode_lock(inode);
2752
2753         rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2754         if (rc == -ENODATA)
2755                 rc = 0;
2756
2757         inode_unlock(inode);
2758
2759 out:
2760         if (hss != NULL)
2761                 OBD_FREE_PTR(hss);
2762
2763         if (attr != NULL)
2764                 OBD_FREE_PTR(attr);
2765
2766         RETURN(rc);
2767 }
2768
2769 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2770 {
2771         return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2772                ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2773 }
2774
2775 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2776 {
2777         struct inode *inode = file_inode(file);
2778         struct iattr ia = {
2779                 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2780                             ATTR_MTIME | ATTR_MTIME_SET |
2781                             ATTR_CTIME,
2782                 .ia_atime = {
2783                         .tv_sec = lfu->lfu_atime_sec,
2784                         .tv_nsec = lfu->lfu_atime_nsec,
2785                 },
2786                 .ia_mtime = {
2787                         .tv_sec = lfu->lfu_mtime_sec,
2788                         .tv_nsec = lfu->lfu_mtime_nsec,
2789                 },
2790                 .ia_ctime = {
2791                         .tv_sec = lfu->lfu_ctime_sec,
2792                         .tv_nsec = lfu->lfu_ctime_nsec,
2793                 },
2794         };
2795         int rc;
2796         ENTRY;
2797
2798         if (!capable(CAP_SYS_ADMIN))
2799                 RETURN(-EPERM);
2800
2801         if (!S_ISREG(inode->i_mode))
2802                 RETURN(-EINVAL);
2803
2804         inode_lock(inode);
2805         rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2806                             false);
2807         inode_unlock(inode);
2808
2809         RETURN(rc);
2810 }
2811
2812 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2813 {
2814         switch (mode) {
2815         case MODE_READ_USER:
2816                 return CLM_READ;
2817         case MODE_WRITE_USER:
2818                 return CLM_WRITE;
2819         default:
2820                 return -EINVAL;
2821         }
2822 }
2823
2824 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2825
2826 /* Used to allow the upper layers of the client to request an LDLM lock
2827  * without doing an actual read or write.
2828  *
2829  * Used for ladvise lockahead to manually request specific locks.
2830  *
2831  * \param[in] file      file this ladvise lock request is on
2832  * \param[in] ladvise   ladvise struct describing this lock request
2833  *
2834  * \retval 0            success, no detailed result available (sync requests
2835  *                      and requests sent to the server [not handled locally]
2836  *                      cannot return detailed results)
2837  * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2838  *                                       see definitions for details.
2839  * \retval negative     negative errno on error
2840  */
2841 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2842 {
2843         struct lu_env *env = NULL;
2844         struct cl_io *io  = NULL;
2845         struct cl_lock *lock = NULL;
2846         struct cl_lock_descr *descr = NULL;
2847         struct dentry *dentry = file->f_path.dentry;
2848         struct inode *inode = dentry->d_inode;
2849         enum cl_lock_mode cl_mode;
2850         off_t start = ladvise->lla_start;
2851         off_t end = ladvise->lla_end;
2852         int result;
2853         __u16 refcheck;
2854
2855         ENTRY;
2856
2857         CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2858                "start=%llu, end=%llu\n", dentry->d_name.len,
2859                dentry->d_name.name, dentry->d_inode,
2860                user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2861                (__u64) end);
2862
2863         cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2864         if (cl_mode < 0)
2865                 GOTO(out, result = cl_mode);
2866
2867         /* Get IO environment */
2868         result = cl_io_get(inode, &env, &io, &refcheck);
2869         if (result <= 0)
2870                 GOTO(out, result);
2871
2872         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2873         if (result > 0) {
2874                 /*
2875                  * nothing to do for this io. This currently happens when
2876                  * stripe sub-object's are not yet created.
2877                  */
2878                 result = io->ci_result;
2879         } else if (result == 0) {
2880                 lock = vvp_env_lock(env);
2881                 descr = &lock->cll_descr;
2882
2883                 descr->cld_obj   = io->ci_obj;
2884                 /* Convert byte offsets to pages */
2885                 descr->cld_start = cl_index(io->ci_obj, start);
2886                 descr->cld_end   = cl_index(io->ci_obj, end);
2887                 descr->cld_mode  = cl_mode;
2888                 /* CEF_MUST is used because we do not want to convert a
2889                  * lockahead request to a lockless lock */
2890                 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2891                                        CEF_NONBLOCK;
2892
2893                 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2894                         descr->cld_enq_flags |= CEF_SPECULATIVE;
2895
2896                 result = cl_lock_request(env, io, lock);
2897
2898                 /* On success, we need to release the lock */
2899                 if (result >= 0)
2900                         cl_lock_release(env, lock);
2901         }
2902         cl_io_fini(env, io);
2903         cl_env_put(env, &refcheck);
2904
2905         /* -ECANCELED indicates a matching lock with a different extent
2906          * was already present, and -EEXIST indicates a matching lock
2907          * on exactly the same extent was already present.
2908          * We convert them to positive values for userspace to make
2909          * recognizing true errors easier.
2910          * Note we can only return these detailed results on async requests,
2911          * as sync requests look the same as i/o requests for locking. */
2912         if (result == -ECANCELED)
2913                 result = LLA_RESULT_DIFFERENT;
2914         else if (result == -EEXIST)
2915                 result = LLA_RESULT_SAME;
2916
2917 out:
2918         RETURN(result);
2919 }
2920 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2921
2922 static int ll_ladvise_sanity(struct inode *inode,
2923                              struct llapi_lu_ladvise *ladvise)
2924 {
2925         enum lu_ladvise_type advice = ladvise->lla_advice;
2926         /* Note the peradvice flags is a 32 bit field, so per advice flags must
2927          * be in the first 32 bits of enum ladvise_flags */
2928         __u32 flags = ladvise->lla_peradvice_flags;
2929         /* 3 lines at 80 characters per line, should be plenty */
2930         int rc = 0;
2931
2932         if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2933                 rc = -EINVAL;
2934                 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2935                        "last supported advice is %s (value '%d'): rc = %d\n",
2936                        ll_get_fsname(inode->i_sb, NULL, 0), advice,
2937                        ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2938                 GOTO(out, rc);
2939         }
2940
2941         /* Per-advice checks */
2942         switch (advice) {
2943         case LU_LADVISE_LOCKNOEXPAND:
2944                 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2945                         rc = -EINVAL;
2946                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2947                                "rc = %d\n",
2948                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2949                                ladvise_names[advice], rc);
2950                         GOTO(out, rc);
2951                 }
2952                 break;
2953         case LU_LADVISE_LOCKAHEAD:
2954                 /* Currently only READ and WRITE modes can be requested */
2955                 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2956                     ladvise->lla_lockahead_mode == 0) {
2957                         rc = -EINVAL;
2958                         CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2959                                "rc = %d\n",
2960                                ll_get_fsname(inode->i_sb, NULL, 0),
2961                                ladvise->lla_lockahead_mode,
2962                                ladvise_names[advice], rc);
2963                         GOTO(out, rc);
2964                 }
2965         case LU_LADVISE_WILLREAD:
2966         case LU_LADVISE_DONTNEED:
2967         default:
2968                 /* Note fall through above - These checks apply to all advices
2969                  * except LOCKNOEXPAND */
2970                 if (flags & ~LF_DEFAULT_MASK) {
2971                         rc = -EINVAL;
2972                         CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2973                                "rc = %d\n",
2974                                ll_get_fsname(inode->i_sb, NULL, 0), flags,
2975                                ladvise_names[advice], rc);
2976                         GOTO(out, rc);
2977                 }
2978                 if (ladvise->lla_start >= ladvise->lla_end) {
2979                         rc = -EINVAL;
2980                         CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2981                                "for %s: rc = %d\n",
2982                                ll_get_fsname(inode->i_sb, NULL, 0),
2983                                ladvise->lla_start, ladvise->lla_end,
2984                                ladvise_names[advice], rc);
2985                         GOTO(out, rc);
2986                 }
2987                 break;
2988         }
2989
2990 out:
2991         return rc;
2992 }
2993 #undef ERRSIZE
2994
2995 /*
2996  * Give file access advices
2997  *
2998  * The ladvise interface is similar to Linux fadvise() system call, except it
2999  * forwards the advices directly from Lustre client to server. The server side
3000  * codes will apply appropriate read-ahead and caching techniques for the
3001  * corresponding files.
3002  *
3003  * A typical workload for ladvise is e.g. a bunch of different clients are
3004  * doing small random reads of a file, so prefetching pages into OSS cache
3005  * with big linear reads before the random IO is a net benefit. Fetching
3006  * all that data into each client cache with fadvise() may not be, due to
3007  * much more data being sent to the client.
3008  */
3009 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3010                       struct llapi_lu_ladvise *ladvise)
3011 {
3012         struct lu_env *env;
3013         struct cl_io *io;
3014         struct cl_ladvise_io *lio;
3015         int rc;
3016         __u16 refcheck;
3017         ENTRY;
3018
3019         env = cl_env_get(&refcheck);
3020         if (IS_ERR(env))
3021                 RETURN(PTR_ERR(env));
3022
3023         io = vvp_env_thread_io(env);
3024         io->ci_obj = ll_i2info(inode)->lli_clob;
3025
3026         /* initialize parameters for ladvise */
3027         lio = &io->u.ci_ladvise;
3028         lio->li_start = ladvise->lla_start;
3029         lio->li_end = ladvise->lla_end;
3030         lio->li_fid = ll_inode2fid(inode);
3031         lio->li_advice = ladvise->lla_advice;
3032         lio->li_flags = flags;
3033
3034         if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3035                 rc = cl_io_loop(env, io);
3036         else
3037                 rc = io->ci_result;
3038
3039         cl_io_fini(env, io);
3040         cl_env_put(env, &refcheck);
3041         RETURN(rc);
3042 }
3043
3044 static int ll_lock_noexpand(struct file *file, int flags)
3045 {
3046         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3047
3048         fd->ll_lock_no_expand = !(flags & LF_UNSET);
3049
3050         return 0;
3051 }
3052
3053 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3054                         unsigned long arg)
3055 {
3056         struct fsxattr fsxattr;
3057
3058         if (copy_from_user(&fsxattr,
3059                            (const struct fsxattr __user *)arg,
3060                            sizeof(fsxattr)))
3061                 RETURN(-EFAULT);
3062
3063         fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3064         if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3065                 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3066         fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3067         if (copy_to_user((struct fsxattr __user *)arg,
3068                          &fsxattr, sizeof(fsxattr)))
3069                 RETURN(-EFAULT);
3070
3071         RETURN(0);
3072 }
3073
3074 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3075                         unsigned long arg)
3076 {
3077
3078         struct md_op_data *op_data;
3079         struct ptlrpc_request *req = NULL;
3080         int rc = 0;
3081         struct fsxattr fsxattr;
3082         struct cl_object *obj;
3083         struct iattr *attr;
3084         int flags;
3085
3086         /* only root could change project ID */
3087         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3088                 RETURN(-EPERM);
3089
3090         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3091                                      LUSTRE_OPC_ANY, NULL);
3092         if (IS_ERR(op_data))
3093                 RETURN(PTR_ERR(op_data));
3094
3095         if (copy_from_user(&fsxattr,
3096                            (const struct fsxattr __user *)arg,
3097                            sizeof(fsxattr)))
3098                 GOTO(out_fsxattr, rc = -EFAULT);
3099
3100         flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3101         op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3102         if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3103                 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3104         op_data->op_projid = fsxattr.fsx_projid;
3105         op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3106         rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3107                         0, &req);
3108         ptlrpc_req_finished(req);
3109         if (rc)
3110                 GOTO(out_fsxattr, rc);
3111         ll_update_inode_flags(inode, op_data->op_attr_flags);
3112         obj = ll_i2info(inode)->lli_clob;
3113         if (obj == NULL)
3114                 GOTO(out_fsxattr, rc);
3115
3116         OBD_ALLOC_PTR(attr);
3117         if (attr == NULL)
3118                 GOTO(out_fsxattr, rc = -ENOMEM);
3119
3120         rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3121                             fsxattr.fsx_xflags);
3122         OBD_FREE_PTR(attr);
3123 out_fsxattr:
3124         ll_finish_md_op_data(op_data);
3125         RETURN(rc);
3126 }
3127
3128 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3129                                  unsigned long arg)
3130 {
3131         struct inode            *inode = file_inode(file);
3132         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3133         struct ll_inode_info    *lli = ll_i2info(inode);
3134         struct obd_client_handle *och = NULL;
3135         struct split_param sp;
3136         bool lease_broken;
3137         fmode_t fmode = 0;
3138         enum mds_op_bias bias = 0;
3139         struct file *layout_file = NULL;
3140         void *data = NULL;
3141         size_t data_size = 0;
3142         long rc;
3143         ENTRY;
3144
3145         mutex_lock(&lli->lli_och_mutex);
3146         if (fd->fd_lease_och != NULL) {
3147                 och = fd->fd_lease_och;
3148                 fd->fd_lease_och = NULL;
3149         }
3150         mutex_unlock(&lli->lli_och_mutex);
3151
3152         if (och == NULL)
3153                 GOTO(out, rc = -ENOLCK);
3154
3155         fmode = och->och_flags;
3156
3157         switch (ioc->lil_flags) {
3158         case LL_LEASE_RESYNC_DONE:
3159                 if (ioc->lil_count > IOC_IDS_MAX)
3160                         GOTO(out, rc = -EINVAL);
3161
3162                 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3163                 OBD_ALLOC(data, data_size);
3164                 if (!data)
3165                         GOTO(out, rc = -ENOMEM);
3166
3167                 if (copy_from_user(data, (void __user *)arg, data_size))
3168                         GOTO(out, rc = -EFAULT);
3169
3170                 bias = MDS_CLOSE_RESYNC_DONE;
3171                 break;
3172         case LL_LEASE_LAYOUT_MERGE: {
3173                 int fd;
3174
3175                 if (ioc->lil_count != 1)
3176                         GOTO(out, rc = -EINVAL);
3177
3178                 arg += sizeof(*ioc);
3179                 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3180                         GOTO(out, rc = -EFAULT);
3181
3182                 layout_file = fget(fd);
3183                 if (!layout_file)
3184                         GOTO(out, rc = -EBADF);
3185
3186                 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3187                                 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3188                         GOTO(out, rc = -EPERM);
3189
3190                 data = file_inode(layout_file);
3191                 bias = MDS_CLOSE_LAYOUT_MERGE;
3192                 break;
3193         }
3194         case LL_LEASE_LAYOUT_SPLIT: {
3195                 int fdv;
3196                 int mirror_id;
3197
3198                 if (ioc->lil_count != 2)
3199                         GOTO(out, rc = -EINVAL);
3200
3201                 arg += sizeof(*ioc);
3202                 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3203                         GOTO(out, rc = -EFAULT);
3204
3205                 arg += sizeof(__u32);
3206                 if (copy_from_user(&mirror_id, (void __user *)arg,
3207                                    sizeof(__u32)))
3208                         GOTO(out, rc = -EFAULT);
3209
3210                 layout_file = fget(fdv);
3211                 if (!layout_file)
3212                         GOTO(out, rc = -EBADF);
3213
3214                 sp.sp_inode = file_inode(layout_file);
3215                 sp.sp_mirror_id = (__u16)mirror_id;
3216                 data = &sp;
3217                 bias = MDS_CLOSE_LAYOUT_SPLIT;
3218                 break;
3219         }
3220         default:
3221                 /* without close intent */
3222                 break;
3223         }
3224
3225         rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3226         if (rc < 0)
3227                 GOTO(out, rc);
3228
3229         rc = ll_lease_och_release(inode, file);
3230         if (rc < 0)
3231                 GOTO(out, rc);
3232
3233         if (lease_broken)
3234                 fmode = 0;
3235         EXIT;
3236
3237 out:
3238         switch (ioc->lil_flags) {
3239         case LL_LEASE_RESYNC_DONE:
3240                 if (data)
3241                         OBD_FREE(data, data_size);
3242                 break;
3243         case LL_LEASE_LAYOUT_MERGE:
3244         case LL_LEASE_LAYOUT_SPLIT:
3245                 if (layout_file)
3246                         fput(layout_file);
3247                 break;
3248         }
3249
3250         if (!rc)
3251                 rc = ll_lease_type_from_fmode(fmode);
3252         RETURN(rc);
3253 }
3254
3255 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3256                               unsigned long arg)
3257 {
3258         struct inode *inode = file_inode(file);
3259         struct ll_inode_info *lli = ll_i2info(inode);
3260         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3261         struct obd_client_handle *och = NULL;
3262         __u64 open_flags = 0;
3263         bool lease_broken;
3264         fmode_t fmode;
3265         long rc;
3266         ENTRY;
3267
3268         switch (ioc->lil_mode) {
3269         case LL_LEASE_WRLCK:
3270                 if (!(file->f_mode & FMODE_WRITE))
3271                         RETURN(-EPERM);
3272                 fmode = FMODE_WRITE;
3273                 break;
3274         case LL_LEASE_RDLCK:
3275                 if (!(file->f_mode & FMODE_READ))
3276                         RETURN(-EPERM);
3277                 fmode = FMODE_READ;
3278                 break;
3279         case LL_LEASE_UNLCK:
3280                 RETURN(ll_file_unlock_lease(file, ioc, arg));
3281         default:
3282                 RETURN(-EINVAL);
3283         }
3284
3285         CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3286
3287         /* apply for lease */
3288         if (ioc->lil_flags & LL_LEASE_RESYNC)
3289                 open_flags = MDS_OPEN_RESYNC;
3290         och = ll_lease_open(inode, file, fmode, open_flags);
3291         if (IS_ERR(och))
3292                 RETURN(PTR_ERR(och));
3293
3294         if (ioc->lil_flags & LL_LEASE_RESYNC) {
3295                 rc = ll_lease_file_resync(och, inode);
3296                 if (rc) {
3297                         ll_lease_close(och, inode, NULL);
3298                         RETURN(rc);
3299                 }
3300                 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3301                 if (rc) {
3302                         ll_lease_close(och, inode, NULL);
3303                         RETURN(rc);
3304                 }
3305         }
3306
3307         rc = 0;
3308         mutex_lock(&lli->lli_och_mutex);
3309         if (fd->fd_lease_och == NULL) {
3310                 fd->fd_lease_och = och;
3311                 och = NULL;
3312         }
3313         mutex_unlock(&lli->lli_och_mutex);
3314         if (och != NULL) {
3315                 /* impossible now that only excl is supported for now */
3316                 ll_lease_close(och, inode, &lease_broken);
3317                 rc = -EBUSY;
3318         }
3319         RETURN(rc);
3320 }
3321
3322 static long
3323 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3324 {
3325         struct inode            *inode = file_inode(file);
3326         struct ll_file_data     *fd = LUSTRE_FPRIVATE(file);
3327         int                      flags, rc;
3328         ENTRY;
3329
3330         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3331                PFID(ll_inode2fid(inode)), inode, cmd);
3332         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3333
3334         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3335         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3336                 RETURN(-ENOTTY);
3337
3338         switch (cmd) {
3339         case LL_IOC_GETFLAGS:
3340                 /* Get the current value of the file flags */
3341                 return put_user(fd->fd_flags, (int __user *)arg);
3342         case LL_IOC_SETFLAGS:
3343         case LL_IOC_CLRFLAGS:
3344                 /* Set or clear specific file flags */
3345                 /* XXX This probably needs checks to ensure the flags are
3346                  *     not abused, and to handle any flag side effects.
3347                  */
3348                 if (get_user(flags, (int __user *) arg))
3349                         RETURN(-EFAULT);
3350
3351                 if (cmd == LL_IOC_SETFLAGS) {
3352                         if ((flags & LL_FILE_IGNORE_LOCK) &&
3353                             !(file->f_flags & O_DIRECT)) {
3354                                 CERROR("%s: unable to disable locking on "
3355                                        "non-O_DIRECT file\n", current->comm);
3356                                 RETURN(-EINVAL);
3357                         }
3358
3359                         fd->fd_flags |= flags;
3360                 } else {
3361                         fd->fd_flags &= ~flags;
3362                 }
3363                 RETURN(0);
3364         case LL_IOC_LOV_SETSTRIPE:
3365         case LL_IOC_LOV_SETSTRIPE_NEW:
3366                 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3367         case LL_IOC_LOV_SETEA:
3368                 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3369         case LL_IOC_LOV_SWAP_LAYOUTS: {
3370                 struct file *file2;
3371                 struct lustre_swap_layouts lsl;
3372
3373                 if (copy_from_user(&lsl, (char __user *)arg,
3374                                    sizeof(struct lustre_swap_layouts)))
3375                         RETURN(-EFAULT);
3376
3377                 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3378                         RETURN(-EPERM);
3379
3380                 file2 = fget(lsl.sl_fd);
3381                 if (file2 == NULL)
3382                         RETURN(-EBADF);
3383
3384                 /* O_WRONLY or O_RDWR */
3385                 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3386                         GOTO(out, rc = -EPERM);
3387
3388                 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3389                         struct inode                    *inode2;
3390                         struct ll_inode_info            *lli;
3391                         struct obd_client_handle        *och = NULL;
3392
3393                         lli = ll_i2info(inode);
3394                         mutex_lock(&lli->lli_och_mutex);
3395                         if (fd->fd_lease_och != NULL) {
3396                                 och = fd->fd_lease_och;
3397                                 fd->fd_lease_och = NULL;
3398                         }
3399                         mutex_unlock(&lli->lli_och_mutex);
3400                         if (och == NULL)
3401                                 GOTO(out, rc = -ENOLCK);
3402                         inode2 = file_inode(file2);
3403                         rc = ll_swap_layouts_close(och, inode, inode2);
3404                 } else {
3405                         rc = ll_swap_layouts(file, file2, &lsl);
3406                 }
3407 out:
3408                 fput(file2);
3409                 RETURN(rc);
3410         }
3411         case LL_IOC_LOV_GETSTRIPE:
3412         case LL_IOC_LOV_GETSTRIPE_NEW:
3413                 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3414         case FS_IOC_GETFLAGS:
3415         case FS_IOC_SETFLAGS:
3416                 RETURN(ll_iocontrol(inode, file, cmd, arg));
3417         case FSFILT_IOC_GETVERSION:
3418         case FS_IOC_GETVERSION:
3419                 RETURN(put_user(inode->i_generation, (int __user *)arg));
3420         /* We need to special case any other ioctls we want to handle,
3421          * to send them to the MDS/OST as appropriate and to properly
3422          * network encode the arg field. */
3423         case FS_IOC_SETVERSION:
3424                 RETURN(-ENOTSUPP);
3425
3426         case LL_IOC_GROUP_LOCK:
3427                 RETURN(ll_get_grouplock(inode, file, arg));
3428         case LL_IOC_GROUP_UNLOCK:
3429                 RETURN(ll_put_grouplock(inode, file, arg));
3430         case IOC_OBD_STATFS:
3431                 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3432
3433         case LL_IOC_FLUSHCTX:
3434                 RETURN(ll_flush_ctx(inode));
3435         case LL_IOC_PATH2FID: {
3436                 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3437                                  sizeof(struct lu_fid)))
3438                         RETURN(-EFAULT);
3439
3440                 RETURN(0);
3441         }
3442         case LL_IOC_GETPARENT:
3443                 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3444
3445         case OBD_IOC_FID2PATH:
3446                 RETURN(ll_fid2path(inode, (void __user *)arg));
3447         case LL_IOC_DATA_VERSION: {
3448                 struct ioc_data_version idv;
3449                 int rc;
3450
3451                 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3452                         RETURN(-EFAULT);
3453
3454                 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3455                 rc = ll_ioc_data_version(inode, &idv);
3456
3457                 if (rc == 0 &&
3458                     copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3459                         RETURN(-EFAULT);
3460
3461                 RETURN(rc);
3462         }
3463
3464         case LL_IOC_GET_MDTIDX: {
3465                 int mdtidx;
3466
3467                 mdtidx = ll_get_mdt_idx(inode);
3468                 if (mdtidx < 0)
3469                         RETURN(mdtidx);
3470
3471                 if (put_user((int)mdtidx, (int __user *)arg))
3472                         RETURN(-EFAULT);
3473
3474                 RETURN(0);
3475         }
3476         case OBD_IOC_GETDTNAME:
3477         case OBD_IOC_GETMDNAME:
3478                 RETURN(ll_get_obd_name(inode, cmd, arg));
3479         case LL_IOC_HSM_STATE_GET: {
3480                 struct md_op_data       *op_data;
3481                 struct hsm_user_state   *hus;
3482                 int                      rc;
3483
3484                 OBD_ALLOC_PTR(hus);
3485                 if (hus == NULL)
3486                         RETURN(-ENOMEM);
3487
3488                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3489                                              LUSTRE_OPC_ANY, hus);
3490                 if (IS_ERR(op_data)) {
3491                         OBD_FREE_PTR(hus);
3492                         RETURN(PTR_ERR(op_data));
3493                 }
3494
3495                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3496                                    op_data, NULL);
3497
3498                 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3499                         rc = -EFAULT;
3500
3501                 ll_finish_md_op_data(op_data);
3502                 OBD_FREE_PTR(hus);
3503                 RETURN(rc);
3504         }
3505         case LL_IOC_HSM_STATE_SET: {
3506                 struct hsm_state_set    *hss;
3507                 int                      rc;
3508
3509                 OBD_ALLOC_PTR(hss);
3510                 if (hss == NULL)
3511                         RETURN(-ENOMEM);
3512
3513                 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3514                         OBD_FREE_PTR(hss);
3515                         RETURN(-EFAULT);
3516                 }
3517
3518                 rc = ll_hsm_state_set(inode, hss);
3519
3520                 OBD_FREE_PTR(hss);
3521                 RETURN(rc);
3522         }
3523         case LL_IOC_HSM_ACTION: {
3524                 struct md_op_data               *op_data;
3525                 struct hsm_current_action       *hca;
3526                 int                              rc;
3527
3528                 OBD_ALLOC_PTR(hca);
3529                 if (hca == NULL)
3530                         RETURN(-ENOMEM);
3531
3532                 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3533                                              LUSTRE_OPC_ANY, hca);
3534                 if (IS_ERR(op_data)) {
3535                         OBD_FREE_PTR(hca);
3536                         RETURN(PTR_ERR(op_data));
3537                 }
3538
3539                 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3540                                    op_data, NULL);
3541
3542                 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3543                         rc = -EFAULT;
3544
3545                 ll_finish_md_op_data(op_data);
3546                 OBD_FREE_PTR(hca);
3547                 RETURN(rc);
3548         }
3549         case LL_IOC_SET_LEASE_OLD: {
3550                 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3551
3552                 RETURN(ll_file_set_lease(file, &ioc, 0));
3553         }
3554         case LL_IOC_SET_LEASE: {
3555                 struct ll_ioc_lease ioc;
3556
3557                 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3558                         RETURN(-EFAULT);
3559
3560                 RETURN(ll_file_set_lease(file, &ioc, arg));
3561         }
3562         case LL_IOC_GET_LEASE: {
3563                 struct ll_inode_info *lli = ll_i2info(inode);
3564                 struct ldlm_lock *lock = NULL;
3565                 fmode_t fmode = 0;
3566
3567                 mutex_lock(&lli->lli_och_mutex);
3568                 if (fd->fd_lease_och != NULL) {
3569                         struct obd_client_handle *och = fd->fd_lease_och;
3570
3571                         lock = ldlm_handle2lock(&och->och_lease_handle);
3572                         if (lock != NULL) {
3573                                 lock_res_and_lock(lock);
3574                                 if (!ldlm_is_cancel(lock))
3575                                         fmode = och->och_flags;
3576
3577                                 unlock_res_and_lock(lock);
3578                                 LDLM_LOCK_PUT(lock);
3579                         }
3580                 }
3581                 mutex_unlock(&lli->lli_och_mutex);
3582
3583                 RETURN(ll_lease_type_from_fmode(fmode));
3584         }
3585         case LL_IOC_HSM_IMPORT: {
3586                 struct hsm_user_import *hui;
3587
3588                 OBD_ALLOC_PTR(hui);
3589                 if (hui == NULL)
3590                         RETURN(-ENOMEM);
3591
3592                 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3593                         OBD_FREE_PTR(hui);
3594                         RETURN(-EFAULT);
3595                 }
3596
3597                 rc = ll_hsm_import(inode, file, hui);
3598
3599                 OBD_FREE_PTR(hui);
3600                 RETURN(rc);
3601         }
3602         case LL_IOC_FUTIMES_3: {
3603                 struct ll_futimes_3 lfu;
3604
3605                 if (copy_from_user(&lfu,
3606                                    (const struct ll_futimes_3 __user *)arg,
3607                                    sizeof(lfu)))
3608                         RETURN(-EFAULT);
3609
3610                 RETURN(ll_file_futimes_3(file, &lfu));
3611         }
3612         case LL_IOC_LADVISE: {
3613                 struct llapi_ladvise_hdr *k_ladvise_hdr;
3614                 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3615                 int i;
3616                 int num_advise;
3617                 int alloc_size = sizeof(*k_ladvise_hdr);
3618
3619                 rc = 0;
3620                 u_ladvise_hdr = (void __user *)arg;
3621                 OBD_ALLOC_PTR(k_ladvise_hdr);
3622                 if (k_ladvise_hdr == NULL)
3623                         RETURN(-ENOMEM);
3624
3625                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3626                         GOTO(out_ladvise, rc = -EFAULT);
3627
3628                 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3629                     k_ladvise_hdr->lah_count < 1)
3630                         GOTO(out_ladvise, rc = -EINVAL);
3631
3632                 num_advise = k_ladvise_hdr->lah_count;
3633                 if (num_advise >= LAH_COUNT_MAX)
3634                         GOTO(out_ladvise, rc = -EFBIG);
3635
3636                 OBD_FREE_PTR(k_ladvise_hdr);
3637                 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3638                                       lah_advise[num_advise]);
3639                 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3640                 if (k_ladvise_hdr == NULL)
3641                         RETURN(-ENOMEM);
3642
3643                 /*
3644                  * TODO: submit multiple advices to one server in a single RPC
3645                  */
3646                 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3647                         GOTO(out_ladvise, rc = -EFAULT);
3648
3649                 for (i = 0; i < num_advise; i++) {
3650                         struct llapi_lu_ladvise *k_ladvise =
3651                                         &k_ladvise_hdr->lah_advise[i];
3652                         struct llapi_lu_ladvise __user *u_ladvise =
3653                                         &u_ladvise_hdr->lah_advise[i];
3654
3655                         rc = ll_ladvise_sanity(inode, k_ladvise);
3656                         if (rc)
3657                                 GOTO(out_ladvise, rc);
3658
3659                         switch (k_ladvise->lla_advice) {
3660                         case LU_LADVISE_LOCKNOEXPAND:
3661                                 rc = ll_lock_noexpand(file,
3662                                                k_ladvise->lla_peradvice_flags);
3663                                 GOTO(out_ladvise, rc);
3664                         case LU_LADVISE_LOCKAHEAD:
3665
3666                                 rc = ll_file_lock_ahead(file, k_ladvise);
3667
3668                                 if (rc < 0)
3669                                         GOTO(out_ladvise, rc);
3670
3671                                 if (put_user(rc,
3672                                              &u_ladvise->lla_lockahead_result))
3673                                         GOTO(out_ladvise, rc = -EFAULT);
3674                                 break;
3675                         default:
3676                                 rc = ll_ladvise(inode, file,
3677                                                 k_ladvise_hdr->lah_flags,
3678                                                 k_ladvise);
3679                                 if (rc)
3680                                         GOTO(out_ladvise, rc);
3681                                 break;
3682                         }
3683
3684                 }
3685
3686 out_ladvise:
3687                 OBD_FREE(k_ladvise_hdr, alloc_size);
3688                 RETURN(rc);
3689         }
3690         case LL_IOC_FLR_SET_MIRROR: {
3691                 /* mirror I/O must be direct to avoid polluting page cache
3692                  * by stale data. */
3693                 if (!(file->f_flags & O_DIRECT))
3694                         RETURN(-EINVAL);
3695
3696                 fd->fd_designated_mirror = (__u32)arg;
3697                 RETURN(0);
3698         }
3699         case LL_IOC_FSGETXATTR:
3700                 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3701         case LL_IOC_FSSETXATTR:
3702                 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3703         case BLKSSZGET:
3704                 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3705         default:
3706                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3707                                      (void __user *)arg));
3708         }
3709 }
3710
3711 #ifndef HAVE_FILE_LLSEEK_SIZE
3712 static inline loff_t
3713 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3714 {
3715         if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3716                 return -EINVAL;
3717         if (offset > maxsize)
3718                 return -EINVAL;
3719
3720         if (offset != file->f_pos) {
3721                 file->f_pos = offset;
3722                 file->f_version = 0;
3723         }
3724         return offset;
3725 }
3726
3727 static loff_t
3728 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3729                 loff_t maxsize, loff_t eof)
3730 {
3731         struct inode *inode = file_inode(file);
3732
3733         switch (origin) {
3734         case SEEK_END:
3735                 offset += eof;
3736                 break;
3737         case SEEK_CUR:
3738                 /*
3739                  * Here we special-case the lseek(fd, 0, SEEK_CUR)
3740                  * position-querying operation.  Avoid rewriting the "same"
3741                  * f_pos value back to the file because a concurrent read(),
3742                  * write() or lseek() might have altered it
3743                  */
3744                 if (offset == 0)
3745                         return file->f_pos;
3746                 /*
3747                  * f_lock protects against read/modify/write race with other
3748                  * SEEK_CURs. Note that parallel writes and reads behave
3749                  * like SEEK_SET.
3750                  */
3751                 inode_lock(inode);
3752                 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3753                 inode_unlock(inode);
3754                 return offset;
3755         case SEEK_DATA:
3756                 /*
3757                  * In the generic case the entire file is data, so as long as
3758                  * offset isn't at the end of the file then the offset is data.
3759                  */
3760                 if (offset >= eof)
3761                         return -ENXIO;
3762                 break;
3763         case SEEK_HOLE:
3764                 /*
3765                  * There is a virtual hole at the end of the file, so as long as
3766                  * offset isn't i_size or larger, return i_size.
3767                  */
3768                 if (offset >= eof)
3769                         return -ENXIO;
3770                 offset = eof;
3771                 break;
3772         }
3773
3774         return llseek_execute(file, offset, maxsize);
3775 }
3776 #endif
3777
3778 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3779 {
3780         struct inode *inode = file_inode(file);
3781         loff_t retval, eof = 0;
3782
3783         ENTRY;
3784         retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3785                            (origin == SEEK_CUR) ? file->f_pos : 0);
3786         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3787                PFID(ll_inode2fid(inode)), inode, retval, retval,
3788                origin);
3789         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3790
3791         if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3792                 retval = ll_glimpse_size(inode);
3793                 if (retval != 0)
3794                         RETURN(retval);
3795                 eof = i_size_read(inode);
3796         }
3797
3798         retval = ll_generic_file_llseek_size(file, offset, origin,
3799                                           ll_file_maxbytes(inode), eof);
3800         RETURN(retval);
3801 }
3802
3803 static int ll_flush(struct file *file, fl_owner_t id)
3804 {
3805         struct inode *inode = file_inode(file);
3806         struct ll_inode_info *lli = ll_i2info(inode);
3807         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3808         int rc, err;
3809
3810         LASSERT(!S_ISDIR(inode->i_mode));
3811
3812         /* catch async errors that were recorded back when async writeback
3813          * failed for pages in this mapping. */
3814         rc = lli->lli_async_rc;
3815         lli->lli_async_rc = 0;
3816         if (lli->lli_clob != NULL) {
3817                 err = lov_read_and_clear_async_rc(lli->lli_clob);
3818                 if (rc == 0)
3819                         rc = err;
3820         }
3821
3822         /* The application has been told write failure already.
3823          * Do not report failure again. */
3824         if (fd->fd_write_failed)
3825                 return 0;
3826         return rc ? -EIO : 0;
3827 }
3828
3829 /**
3830  * Called to make sure a portion of file has been written out.
3831  * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3832  *
3833  * Return how many pages have been written.
3834  */
3835 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3836                        enum cl_fsync_mode mode, int ignore_layout)
3837 {
3838         struct lu_env *env;
3839         struct cl_io *io;
3840         struct cl_fsync_io *fio;
3841         int result;
3842         __u16 refcheck;
3843         ENTRY;
3844
3845         if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3846             mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3847                 RETURN(-EINVAL);
3848
3849         env = cl_env_get(&refcheck);
3850         if (IS_ERR(env))
3851                 RETURN(PTR_ERR(env));
3852
3853         io = vvp_env_thread_io(env);
3854         io->ci_obj = ll_i2info(inode)->lli_clob;
3855         io->ci_ignore_layout = ignore_layout;
3856
3857         /* initialize parameters for sync */
3858         fio = &io->u.ci_fsync;
3859         fio->fi_start = start;
3860         fio->fi_end = end;
3861         fio->fi_fid = ll_inode2fid(inode);
3862         fio->fi_mode = mode;
3863         fio->fi_nr_written = 0;
3864
3865         if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3866                 result = cl_io_loop(env, io);
3867         else
3868                 result = io->ci_result;
3869         if (result == 0)
3870                 result = fio->fi_nr_written;
3871         cl_io_fini(env, io);
3872         cl_env_put(env, &refcheck);
3873
3874         RETURN(result);
3875 }
3876
3877 /*
3878  * When dentry is provided (the 'else' case), file_dentry() may be
3879  * null and dentry must be used directly rather than pulled from
3880  * file_dentry() as is done otherwise.
3881  */
3882
3883 #ifdef HAVE_FILE_FSYNC_4ARGS
3884 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3885 {
3886         struct dentry *dentry = file_dentry(file);
3887         bool lock_inode;
3888 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3889 int ll_fsync(struct file *file, int datasync)
3890 {
3891         struct dentry *dentry = file_dentry(file);
3892         loff_t start = 0;
3893         loff_t end = LLONG_MAX;
3894 #else
3895 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3896 {
3897         loff_t start = 0;
3898         loff_t end = LLONG_MAX;
3899 #endif
3900         struct inode *inode = dentry->d_inode;
3901         struct ll_inode_info *lli = ll_i2info(inode);
3902         struct ptlrpc_request *req;
3903         int rc, err;
3904         ENTRY;
3905
3906         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3907                PFID(ll_inode2fid(inode)), inode);
3908         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3909
3910 #ifdef HAVE_FILE_FSYNC_4ARGS
3911         rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3912         lock_inode = !lli->lli_inode_locked;
3913         if (lock_inode)
3914                 inode_lock(inode);
3915 #else
3916         /* fsync's caller has already called _fdata{sync,write}, we want
3917          * that IO to finish before calling the osc and mdc sync methods */
3918         rc = filemap_fdatawait(inode->i_mapping);
3919 #endif
3920
3921         /* catch async errors that were recorded back when async writeback
3922          * failed for pages in this mapping. */
3923         if (!S_ISDIR(inode->i_mode)) {
3924                 err = lli->lli_async_rc;
3925                 lli->lli_async_rc = 0;
3926                 if (rc == 0)
3927                         rc = err;
3928                 if (lli->lli_clob != NULL) {
3929                         err = lov_read_and_clear_async_rc(lli->lli_clob);
3930                         if (rc == 0)
3931                                 rc = err;
3932                 }
3933         }
3934
3935         err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3936         if (!rc)
3937                 rc = err;
3938         if (!err)
3939                 ptlrpc_req_finished(req);
3940
3941         if (S_ISREG(inode->i_mode)) {
3942                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3943
3944                 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3945                 if (rc == 0 && err < 0)
3946                         rc = err;
3947                 if (rc < 0)
3948                         fd->fd_write_failed = true;
3949                 else
3950                         fd->fd_write_failed = false;
3951         }
3952
3953 #ifdef HAVE_FILE_FSYNC_4ARGS
3954         if (lock_inode)
3955                 inode_unlock(inode);
3956 #endif
3957         RETURN(rc);
3958 }
3959
3960 static int
3961 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3962 {
3963         struct inode *inode = file_inode(file);
3964         struct ll_sb_info *sbi = ll_i2sbi(inode);
3965         struct ldlm_enqueue_info einfo = {
3966                 .ei_type        = LDLM_FLOCK,
3967                 .ei_cb_cp       = ldlm_flock_completion_ast,
3968                 .ei_cbdata      = file_lock,
3969         };
3970         struct md_op_data *op_data;
3971         struct lustre_handle lockh = { 0 };
3972         union ldlm_policy_data flock = { { 0 } };
3973         int fl_type = file_lock->fl_type;
3974         __u64 flags = 0;
3975         int rc;
3976         int rc2 = 0;
3977         ENTRY;
3978
3979         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3980                PFID(ll_inode2fid(inode)), file_lock);
3981
3982         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3983
3984         if (file_lock->fl_flags & FL_FLOCK) {
3985                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3986                 /* flocks are whole-file locks */
3987                 flock.l_flock.end = OFFSET_MAX;
3988                 /* For flocks owner is determined by the local file desctiptor*/
3989                 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3990         } else if (file_lock->fl_flags & FL_POSIX) {
3991                 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3992                 flock.l_flock.start = file_lock->fl_start;
3993                 flock.l_flock.end = file_lock->fl_end;
3994         } else {
3995                 RETURN(-EINVAL);
3996         }
3997         flock.l_flock.pid = file_lock->fl_pid;
3998
3999         /* Somewhat ugly workaround for svc lockd.
4000          * lockd installs custom fl_lmops->lm_compare_owner that checks
4001          * for the fl_owner to be the same (which it always is on local node
4002          * I guess between lockd processes) and then compares pid.
4003          * As such we assign pid to the owner field to make it all work,
4004          * conflict with normal locks is unlikely since pid space and
4005          * pointer space for current->files are not intersecting */
4006         if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4007                 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4008
4009         switch (fl_type) {
4010         case F_RDLCK:
4011                 einfo.ei_mode = LCK_PR;
4012                 break;
4013         case F_UNLCK:
4014                 /* An unlock request may or may not have any relation to
4015                  * existing locks so we may not be able to pass a lock handle
4016                  * via a normal ldlm_lock_cancel() request. The request may even
4017                  * unlock a byte range in the middle of an existing lock. In
4018                  * order to process an unlock request we need all of the same
4019                  * information that is given with a normal read or write record
4020                  * lock request. To avoid creating another ldlm unlock (cancel)
4021                  * message we'll treat a LCK_NL flock request as an unlock. */
4022                 einfo.ei_mode = LCK_NL;
4023                 break;
4024         case F_WRLCK:
4025                 einfo.ei_mode = LCK_PW;
4026                 break;
4027         default:
4028                 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4029                 RETURN (-ENOTSUPP);
4030         }
4031
4032         switch (cmd) {
4033         case F_SETLKW:
4034 #ifdef F_SETLKW64
4035         case F_SETLKW64:
4036 #endif
4037                 flags = 0;
4038                 break;
4039         case F_SETLK:
4040 #ifdef F_SETLK64
4041         case F_SETLK64:
4042 #endif
4043                 flags = LDLM_FL_BLOCK_NOWAIT;
4044                 break;
4045         case F_GETLK:
4046 #ifdef F_GETLK64
4047         case F_GETLK64:
4048 #endif
4049                 flags = LDLM_FL_TEST_LOCK;
4050                 break;
4051         default:
4052                 CERROR("unknown fcntl lock command: %d\n", cmd);
4053                 RETURN (-EINVAL);
4054         }
4055
4056         /* Save the old mode so that if the mode in the lock changes we
4057          * can decrement the appropriate reader or writer refcount. */
4058         file_lock->fl_type = einfo.ei_mode;
4059
4060         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4061                                      LUSTRE_OPC_ANY, NULL);
4062         if (IS_ERR(op_data))
4063                 RETURN(PTR_ERR(op_data));
4064
4065         CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4066                "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4067                flock.l_flock.pid, flags, einfo.ei_mode,
4068                flock.l_flock.start, flock.l_flock.end);
4069
4070         rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4071                         flags);
4072
4073         /* Restore the file lock type if not TEST lock. */
4074         if (!(flags & LDLM_FL_TEST_LOCK))
4075                 file_lock->fl_type = fl_type;
4076
4077 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4078         if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4079             !(flags & LDLM_FL_TEST_LOCK))
4080                 rc2  = locks_lock_file_wait(file, file_lock);
4081 #else
4082         if ((file_lock->fl_flags & FL_FLOCK) &&
4083             (rc == 0 || file_lock->fl_type == F_UNLCK))
4084                 rc2  = flock_lock_file_wait(file, file_lock);
4085         if ((file_lock->fl_flags & FL_POSIX) &&
4086             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4087             !(flags & LDLM_FL_TEST_LOCK))
4088                 rc2  = posix_lock_file_wait(file, file_lock);
4089 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4090
4091         if (rc2 && file_lock->fl_type != F_UNLCK) {
4092                 einfo.ei_mode = LCK_NL;
4093                 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4094                            &lockh, flags);
4095                 rc = rc2;
4096         }
4097
4098         ll_finish_md_op_data(op_data);
4099
4100         RETURN(rc);
4101 }
4102
4103 int ll_get_fid_by_name(struct inode *parent, const char *name,
4104                        int namelen, struct lu_fid *fid,
4105                        struct inode **inode)
4106 {
4107         struct md_op_data       *op_data = NULL;
4108         struct mdt_body         *body;
4109         struct ptlrpc_request   *req;
4110         int                     rc;
4111         ENTRY;
4112
4113         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4114                                      LUSTRE_OPC_ANY, NULL);
4115         if (IS_ERR(op_data))
4116                 RETURN(PTR_ERR(op_data));
4117
4118         op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4119         rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4120         ll_finish_md_op_data(op_data);
4121         if (rc < 0)
4122                 RETURN(rc);
4123
4124         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4125         if (body == NULL)
4126                 GOTO(out_req, rc = -EFAULT);
4127         if (fid != NULL)
4128                 *fid = body->mbo_fid1;
4129
4130         if (inode != NULL)
4131                 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4132 out_req:
4133         ptlrpc_req_finished(req);
4134         RETURN(rc);
4135 }
4136
4137 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4138                const char *name)
4139 {
4140         struct dentry *dchild = NULL;
4141         struct inode *child_inode = NULL;
4142         struct md_op_data *op_data;
4143         struct ptlrpc_request *request = NULL;
4144         struct obd_client_handle *och = NULL;
4145         struct qstr qstr;
4146         struct mdt_body *body;
4147         __u64 data_version = 0;
4148         size_t namelen = strlen(name);
4149         int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4150         int rc;
4151         ENTRY;
4152
4153         CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4154                PFID(ll_inode2fid(parent)), name,
4155                lum->lum_stripe_offset, lum->lum_stripe_count);
4156
4157         if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4158             lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4159                 lustre_swab_lmv_user_md(lum);
4160
4161         /* Get child FID first */
4162         qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4163         qstr.name = name;
4164         qstr.len = namelen;
4165         dchild = d_lookup(file_dentry(file), &qstr);
4166         if (dchild) {
4167                 if (dchild->d_inode)
4168                         child_inode = igrab(dchild->d_inode);
4169                 dput(dchild);
4170         }
4171
4172         if (!child_inode) {
4173                 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4174                                         &child_inode);
4175                 if (rc)
4176                         RETURN(rc);
4177         }
4178
4179         if (!child_inode)
4180                 RETURN(-ENOENT);
4181
4182         if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4183               OBD_CONNECT2_DIR_MIGRATE)) {
4184                 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4185                     ll_i2info(child_inode)->lli_lsm_md) {
4186                         CERROR("%s: MDT doesn't support stripe directory "
4187                                "migration!\n",
4188                                ll_get_fsname(parent->i_sb, NULL, 0));
4189                         GOTO(out_iput, rc = -EOPNOTSUPP);
4190                 }
4191         }
4192
4193         /*
4194          * lfs migrate command needs to be blocked on the client
4195          * by checking the migrate FID against the FID of the
4196          * filesystem root.
4197          */
4198         if (child_inode == parent->i_sb->s_root->d_inode)
4199                 GOTO(out_iput, rc = -EINVAL);
4200
4201         op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4202                                      child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4203         if (IS_ERR(op_data))
4204                 GOTO(out_iput, rc = PTR_ERR(op_data));
4205
4206         inode_lock(child_inode);
4207         op_data->op_fid3 = *ll_inode2fid(child_inode);
4208         if (!fid_is_sane(&op_data->op_fid3)) {
4209                 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4210                        ll_get_fsname(parent->i_sb, NULL, 0), name,
4211                        PFID(&op_data->op_fid3));
4212                 GOTO(out_unlock, rc = -EINVAL);
4213         }
4214
4215         op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4216         op_data->op_data = lum;
4217         op_data->op_data_size = lumlen;
4218
4219 again:
4220         if (S_ISREG(child_inode->i_mode)) {
4221                 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4222                 if (IS_ERR(och)) {
4223                         rc = PTR_ERR(och);
4224                         och = NULL;
4225                         GOTO(out_unlock, rc);
4226                 }
4227
4228                 rc = ll_data_version(child_inode, &data_version,
4229                                      LL_DV_WR_FLUSH);
4230                 if (rc != 0)
4231                         GOTO(out_close, rc);
4232
4233                 op_data->op_open_handle = och->och_open_handle;
4234                 op_data->op_data_version = data_version;
4235                 op_data->op_lease_handle = och->och_lease_handle;
4236                 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4237
4238                 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4239                 och->och_mod->mod_open_req->rq_replay = 0;
4240                 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4241         }
4242
4243         rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4244                        name, namelen, &request);
4245         if (rc == 0) {
4246                 LASSERT(request != NULL);
4247                 ll_update_times(request, parent);
4248
4249                 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4250                 LASSERT(body != NULL);
4251
4252                 /* If the server does release layout lock, then we cleanup
4253                  * the client och here, otherwise release it in out_close: */
4254                 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4255                         obd_mod_put(och->och_mod);
4256                         md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4257                                                   och);
4258                         och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4259                         OBD_FREE_PTR(och);
4260                         och = NULL;
4261                 }
4262         }
4263
4264         if (request != NULL) {
4265                 ptlrpc_req_finished(request);
4266                 request = NULL;
4267         }
4268
4269         /* Try again if the file layout has changed. */
4270         if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4271                 goto again;
4272
4273 out_close:
4274         if (och)
4275                 ll_lease_close(och, child_inode, NULL);
4276         if (!rc)
4277                 clear_nlink(child_inode);
4278 out_unlock:
4279         inode_unlock(child_inode);
4280         ll_finish_md_op_data(op_data);
4281 out_iput:
4282         iput(child_inode);
4283         RETURN(rc);
4284 }
4285
4286 static int
4287 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4288 {
4289         ENTRY;
4290
4291         RETURN(-ENOSYS);
4292 }
4293
4294 /**
4295  * test if some locks matching bits and l_req_mode are acquired
4296  * - bits can be in different locks
4297  * - if found clear the common lock bits in *bits
4298  * - the bits not found, are kept in *bits
4299  * \param inode [IN]
4300  * \param bits [IN] searched lock bits [IN]
4301  * \param l_req_mode [IN] searched lock mode
4302  * \retval boolean, true iff all bits are found
4303  */
4304 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4305 {
4306         struct lustre_handle lockh;
4307         union ldlm_policy_data policy;
4308         enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4309                               (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4310         struct lu_fid *fid;
4311         __u64 flags;
4312         int i;
4313         ENTRY;
4314
4315         if (!inode)
4316                RETURN(0);
4317
4318         fid = &ll_i2info(inode)->lli_fid;
4319         CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4320                ldlm_lockname[mode]);
4321
4322         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4323         for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4324                 policy.l_inodebits.bits = *bits & (1 << i);
4325                 if (policy.l_inodebits.bits == 0)
4326                         continue;
4327
4328                 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4329                                   &policy, mode, &lockh)) {
4330                         struct ldlm_lock *lock;
4331
4332                         lock = ldlm_handle2lock(&lockh);
4333                         if (lock) {
4334                                 *bits &=
4335                                       ~(lock->l_policy_data.l_inodebits.bits);
4336                                 LDLM_LOCK_PUT(lock);
4337                         } else {
4338                                 *bits &= ~policy.l_inodebits.bits;
4339                         }
4340                 }
4341         }
4342         RETURN(*bits == 0);
4343 }
4344
4345 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4346                                struct lustre_handle *lockh, __u64 flags,
4347                                enum ldlm_mode mode)
4348 {
4349         union ldlm_policy_data policy = { .l_inodebits = { bits } };
4350         struct lu_fid *fid;
4351         enum ldlm_mode rc;
4352         ENTRY;
4353
4354         fid = &ll_i2info(inode)->lli_fid;
4355         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4356
4357         rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4358                            fid, LDLM_IBITS, &policy, mode, lockh);
4359
4360         RETURN(rc);
4361 }
4362
4363 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4364 {
4365         /* Already unlinked. Just update nlink and return success */
4366         if (rc == -ENOENT) {
4367                 clear_nlink(inode);
4368                 /* If it is striped directory, and there is bad stripe
4369                  * Let's revalidate the dentry again, instead of returning
4370                  * error */
4371                 if (S_ISDIR(inode->i_mode) &&
4372                     ll_i2info(inode)->lli_lsm_md != NULL)
4373                         return 0;
4374
4375                 /* This path cannot be hit for regular files unless in
4376                  * case of obscure races, so no need to to validate
4377                  * size. */
4378                 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4379                         return 0;
4380         } else if (rc != 0) {
4381                 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4382                              "%s: revalidate FID "DFID" error: rc = %d\n",
4383                              ll_get_fsname(inode->i_sb, NULL, 0),
4384                              PFID(ll_inode2fid(inode)), rc);
4385         }
4386
4387         return rc;
4388 }
4389
4390 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4391 {
4392         struct inode *inode = dentry->d_inode;
4393         struct obd_export *exp = ll_i2mdexp(inode);
4394         struct lookup_intent oit = {
4395                 .it_op = op,
4396         };
4397         struct ptlrpc_request *req = NULL;
4398         struct md_op_data *op_data;
4399         int rc = 0;
4400         ENTRY;
4401
4402         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4403                PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4404
4405         /* Call getattr by fid, so do not provide name at all. */
4406         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4407                                      LUSTRE_OPC_ANY, NULL);
4408         if (IS_ERR(op_data))
4409                 RETURN(PTR_ERR(op_data));
4410
4411         rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4412         ll_finish_md_op_data(op_data);
4413         if (rc < 0) {
4414                 rc = ll_inode_revalidate_fini(inode, rc);
4415                 GOTO(out, rc);
4416         }
4417
4418         rc = ll_revalidate_it_finish(req, &oit, dentry);
4419         if (rc != 0) {
4420                 ll_intent_release(&oit);
4421                 GOTO(out, rc);
4422         }
4423
4424         /* Unlinked? Unhash dentry, so it is not picked up later by
4425          * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4426          * here to preserve get_cwd functionality on 2.6.
4427          * Bug 10503 */
4428         if (!dentry->d_inode->i_nlink) {
4429                 ll_lock_dcache(inode);
4430                 d_lustre_invalidate(dentry, 0);
4431                 ll_unlock_dcache(inode);
4432         }
4433
4434         ll_lookup_finish_locks(&oit, dentry);
4435 out:
4436         ptlrpc_req_finished(req);
4437
4438         return rc;
4439 }
4440
4441 static int ll_merge_md_attr(struct inode *inode)
4442 {
4443         struct cl_attr attr = { 0 };
4444         int rc;
4445
4446         LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4447         rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4448                            &attr, ll_md_blocking_ast);
4449         if (rc != 0)
4450                 RETURN(rc);
4451
4452         set_nlink(inode, attr.cat_nlink);
4453         inode->i_blocks = attr.cat_blocks;
4454         i_size_write(inode, attr.cat_size);
4455
4456         ll_i2info(inode)->lli_atime = attr.cat_atime;
4457         ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4458         ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4459
4460         RETURN(0);
4461 }
4462
4463 static inline dev_t ll_compat_encode_dev(dev_t dev)
4464 {
4465         /* The compat_sys_*stat*() syscalls will fail unless the
4466          * device majors and minors are both less than 256. Note that
4467          * the value returned here will be passed through
4468          * old_encode_dev() in cp_compat_stat(). And so we are not
4469          * trying to return a valid compat (u16) device number, just
4470          * one that will pass the old_valid_dev() check. */
4471
4472         return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4473 }
4474
4475 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4476 int ll_getattr(const struct path *path, struct kstat *stat,
4477                u32 request_mask, unsigned int flags)
4478 {
4479         struct dentry *de = path->dentry;
4480 #else
4481 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4482 {
4483 #endif
4484         struct inode *inode = de->d_inode;
4485         struct ll_sb_info *sbi = ll_i2sbi(inode);
4486         struct ll_inode_info *lli = ll_i2info(inode);
4487         int rc;
4488
4489         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4490
4491         rc = ll_inode_revalidate(de, IT_GETATTR);
4492         if (rc < 0)
4493                 RETURN(rc);
4494
4495         if (S_ISREG(inode->i_mode)) {
4496                 /* In case of restore, the MDT has the right size and has
4497                  * already send it back without granting the layout lock,
4498                  * inode is up-to-date so glimpse is useless.
4499                  * Also to glimpse we need the layout, in case of a running
4500                  * restore the MDT holds the layout lock so the glimpse will
4501                  * block up to the end of restore (getattr will block)
4502                  */
4503                 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4504                         rc = ll_glimpse_size(inode);
4505                         if (rc < 0)
4506                                 RETURN(rc);
4507                 }
4508         } else {
4509                 /* If object isn't regular a file then don't validate size. */
4510                 if (S_ISDIR(inode->i_mode) &&
4511                     lli->lli_lsm_md != NULL) {
4512                         rc = ll_merge_md_attr(inode);
4513                         if (rc < 0)
4514                                 RETURN(rc);
4515                 }
4516
4517                 LTIME_S(inode->i_atime) = lli->lli_atime;
4518                 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4519                 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4520         }
4521
4522         OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4523
4524         if (ll_need_32bit_api(sbi)) {
4525                 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4526                 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4527                 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4528         } else {
4529                 stat->ino = inode->i_ino;
4530                 stat->dev = inode->i_sb->s_dev;
4531                 stat->rdev = inode->i_rdev;
4532         }
4533
4534         stat->mode = inode->i_mode;
4535         stat->uid = inode->i_uid;
4536         stat->gid = inode->i_gid;
4537         stat->atime = inode->i_atime;
4538         stat->mtime = inode->i_mtime;
4539         stat->ctime = inode->i_ctime;
4540         stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4541
4542         stat->nlink = inode->i_nlink;
4543         stat->size = i_size_read(inode);
4544         stat->blocks = inode->i_blocks;
4545
4546         return 0;
4547 }
4548
4549 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4550                      __u64 start, __u64 len)
4551 {
4552         int             rc;
4553         size_t          num_bytes;
4554         struct fiemap   *fiemap;
4555         unsigned int    extent_count = fieinfo->fi_extents_max;
4556
4557         num_bytes = sizeof(*fiemap) + (extent_count *
4558                                        sizeof(struct fiemap_extent));
4559         OBD_ALLOC_LARGE(fiemap, num_bytes);
4560
4561         if (fiemap == NULL)
4562                 RETURN(-ENOMEM);
4563
4564         fiemap->fm_flags = fieinfo->fi_flags;
4565         fiemap->fm_extent_count = fieinfo->fi_extents_max;
4566         fiemap->fm_start = start;
4567         fiemap->fm_length = len;
4568         if (extent_count > 0 &&
4569             copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4570                            sizeof(struct fiemap_extent)) != 0)
4571                 GOTO(out, rc = -EFAULT);
4572
4573         rc = ll_do_fiemap(inode, fiemap, num_bytes);
4574
4575         fieinfo->fi_flags = fiemap->fm_flags;
4576         fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4577         if (extent_count > 0 &&
4578             copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4579                          fiemap->fm_mapped_extents *
4580                          sizeof(struct fiemap_extent)) != 0)
4581                 GOTO(out, rc = -EFAULT);
4582 out:
4583         OBD_FREE_LARGE(fiemap, num_bytes);
4584         return rc;
4585 }
4586
4587 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4588 {
4589         struct ll_inode_info *lli = ll_i2info(inode);
4590         struct posix_acl *acl = NULL;
4591         ENTRY;
4592
4593         spin_lock(&lli->lli_lock);
4594         /* VFS' acl_permission_check->check_acl will release the refcount */
4595         acl = posix_acl_dup(lli->lli_posix_acl);
4596         spin_unlock(&lli->lli_lock);
4597
4598         RETURN(acl);
4599 }
4600
#ifdef HAVE_IOP_SET_ACL
#ifdef CONFIG_FS_POSIX_ACL
/**
 * set_acl inode operation: store or remove a POSIX ACL as an xattr.
 *
 * The ACL is converted with posix_acl_to_xattr() and sent with
 * md_setxattr(); without an ACL the xattr is removed (OBD_MD_FLXATTRRM).
 * The cached ACL is replaced on success and forgotten on failure.
 *
 * \param[in] inode  inode the ACL belongs to
 * \param[in] acl    ACL to set, or NULL to remove it
 * \param[in] type   ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
 *
 * \retval 0 on success
 * \retval -EINVAL for any other \a type
 * \retval -EACCES for a default ACL on something that is not a directory
 * \retval -ENOMEM if the xattr buffer cannot be allocated
 * \retval other negative errno from the helpers called
 */
int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req = NULL;
        const char *xattr_name = NULL;
        size_t xattr_len = 0;
        char *xattr = NULL;
        int rc = 0;
        ENTRY;

        switch (type) {
        case ACL_TYPE_ACCESS:
                xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
                /* acl is passed by address and may be changed here */
                if (acl != NULL)
                        rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
                break;

        case ACL_TYPE_DEFAULT:
                xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
                /* only removal is accepted on a non-directory */
                if (!S_ISDIR(inode->i_mode) && acl != NULL)
                        rc = -EACCES;
                break;

        default:
                rc = -EINVAL;
                break;
        }
        if (rc != 0)
                return rc;

        if (acl != NULL) {
                xattr_len = posix_acl_xattr_size(acl->a_count);
                xattr = kmalloc(xattr_len, GFP_NOFS);
                if (xattr == NULL)
                        GOTO(out, rc = -ENOMEM);

                rc = posix_acl_to_xattr(&init_user_ns, acl, xattr, xattr_len);
                if (rc < 0)
                        GOTO(out_value, rc);
        }

        /* no value means the xattr is to be removed */
        rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
                         xattr != NULL ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
                         xattr_name, xattr, xattr_len, 0, 0, &req);

        ptlrpc_req_finished(req);
out_value:
        kfree(xattr);
out:
        if (rc == 0)
                set_cached_acl(inode, type, acl);
        else
                forget_cached_acl(inode, type);
        RETURN(rc);
}
#endif /* CONFIG_FS_POSIX_ACL */
#endif /* HAVE_IOP_SET_ACL */
4660
4661 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4662 static int
4663 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4664 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4665 # else
4666 ll_check_acl(struct inode *inode, int mask)
4667 # endif
4668 {
4669 # ifdef CONFIG_FS_POSIX_ACL
4670         struct posix_acl *acl;
4671         int rc;
4672         ENTRY;
4673
4674 #  ifdef HAVE_GENERIC_PERMISSION_4ARGS
4675         if (flags & IPERM_FLAG_RCU)
4676                 return -ECHILD;
4677 #  endif
4678         acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4679
4680         if (!acl)
4681                 RETURN(-EAGAIN);
4682
4683         rc = posix_acl_permission(inode, acl, mask);
4684         posix_acl_release(acl);
4685
4686         RETURN(rc);
4687 # else /* !CONFIG_FS_POSIX_ACL */
4688         return -EAGAIN;
4689 # endif /* CONFIG_FS_POSIX_ACL */
4690 }
4691 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4692
4693 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4694 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4695 #else
4696 # ifdef HAVE_INODE_PERMISION_2ARGS
4697 int ll_inode_permission(struct inode *inode, int mask)
4698 # else
4699 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4700 # endif
4701 #endif
4702 {
4703         int rc = 0;
4704         struct ll_sb_info *sbi;
4705         struct root_squash_info *squash;
4706         struct cred *cred = NULL;
4707         const struct cred *old_cred = NULL;
4708         cfs_cap_t cap;
4709         bool squash_id = false;
4710         ENTRY;
4711
4712 #ifdef MAY_NOT_BLOCK
4713         if (mask & MAY_NOT_BLOCK)
4714                 return -ECHILD;
4715 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4716         if (flags & IPERM_FLAG_RCU)
4717                 return -ECHILD;
4718 #endif
4719
4720        /* as root inode are NOT getting validated in lookup operation,
4721         * need to do it before permission check. */
4722
4723         if (inode == inode->i_sb->s_root->d_inode) {
4724                 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4725                 if (rc)
4726                         RETURN(rc);
4727         }
4728
4729         CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4730                PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4731
4732         /* squash fsuid/fsgid if needed */
4733         sbi = ll_i2sbi(inode);
4734         squash = &sbi->ll_squash;
4735         if (unlikely(squash->rsi_uid != 0 &&
4736                      uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4737                      !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4738                         squash_id = true;
4739         }
4740         if (squash_id) {
4741                 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4742                        __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4743                        squash->rsi_uid, squash->rsi_gid);
4744
4745                 /* update current process's credentials
4746                  * and FS capability */
4747                 cred = prepare_creds();
4748                 if (cred == NULL)
4749                         RETURN(-ENOMEM);
4750
4751                 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4752                 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4753                 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4754                         if ((1 << cap) & CFS_CAP_FS_MASK)
4755                                 cap_lower(cred->cap_effective, cap);
4756                 }
4757                 old_cred = override_creds(cred);
4758         }
4759
4760         ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4761         rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4762         /* restore current process's credentials and FS capability */
4763         if (squash_id) {
4764                 revert_creds(old_cred);
4765                 put_cred(cred);
4766         }
4767
4768         RETURN(rc);
4769 }
4770
4771 /* -o localflock - only provides locally consistent flock locks */
4772 struct file_operations ll_file_operations = {
4773 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4774 # ifdef HAVE_SYNC_READ_WRITE
4775         .read           = new_sync_read,
4776         .write          = new_sync_write,
4777 # endif
4778         .read_iter      = ll_file_read_iter,
4779         .write_iter     = ll_file_write_iter,
4780 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4781         .read           = ll_file_read,
4782         .aio_read       = ll_file_aio_read,
4783         .write          = ll_file_write,
4784         .aio_write      = ll_file_aio_write,
4785 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4786         .unlocked_ioctl = ll_file_ioctl,
4787         .open           = ll_file_open,
4788         .release        = ll_file_release,
4789         .mmap           = ll_file_mmap,
4790         .llseek         = ll_file_seek,
4791         .splice_read    = ll_file_splice_read,
4792         .fsync          = ll_fsync,
4793         .flush          = ll_flush
4794 };
4795
4796 struct file_operations ll_file_operations_flock = {
4797 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4798 # ifdef HAVE_SYNC_READ_WRITE
4799         .read           = new_sync_read,
4800         .write          = new_sync_write,
4801 # endif /* HAVE_SYNC_READ_WRITE */
4802         .read_iter      = ll_file_read_iter,
4803         .write_iter     = ll_file_write_iter,
4804 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4805         .read           = ll_file_read,
4806         .aio_read       = ll_file_aio_read,
4807         .write          = ll_file_write,
4808         .aio_write      = ll_file_aio_write,
4809 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4810         .unlocked_ioctl = ll_file_ioctl,
4811         .open           = ll_file_open,
4812         .release        = ll_file_release,
4813         .mmap           = ll_file_mmap,
4814         .llseek         = ll_file_seek,
4815         .splice_read    = ll_file_splice_read,
4816         .fsync          = ll_fsync,
4817         .flush          = ll_flush,
4818         .flock          = ll_file_flock,
4819         .lock           = ll_file_flock
4820 };
4821
4822 /* These are for -o noflock - to return ENOSYS on flock calls */
4823 struct file_operations ll_file_operations_noflock = {
4824 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4825 # ifdef HAVE_SYNC_READ_WRITE
4826         .read           = new_sync_read,
4827         .write          = new_sync_write,
4828 # endif /* HAVE_SYNC_READ_WRITE */
4829         .read_iter      = ll_file_read_iter,
4830         .write_iter     = ll_file_write_iter,
4831 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4832         .read           = ll_file_read,
4833         .aio_read       = ll_file_aio_read,
4834         .write          = ll_file_write,
4835         .aio_write      = ll_file_aio_write,
4836 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4837         .unlocked_ioctl = ll_file_ioctl,
4838         .open           = ll_file_open,
4839         .release        = ll_file_release,
4840         .mmap           = ll_file_mmap,
4841         .llseek         = ll_file_seek,
4842         .splice_read    = ll_file_splice_read,
4843         .fsync          = ll_fsync,
4844         .flush          = ll_flush,
4845         .flock          = ll_file_noflock,
4846         .lock           = ll_file_noflock
4847 };
4848
4849 struct inode_operations ll_file_inode_operations = {
4850         .setattr        = ll_setattr,
4851         .getattr        = ll_getattr,
4852         .permission     = ll_inode_permission,
4853 #ifdef HAVE_IOP_XATTR
4854         .setxattr       = ll_setxattr,
4855         .getxattr       = ll_getxattr,
4856         .removexattr    = ll_removexattr,
4857 #endif
4858         .listxattr      = ll_listxattr,
4859         .fiemap         = ll_fiemap,
4860 #ifdef HAVE_IOP_GET_ACL
4861         .get_acl        = ll_get_acl,
4862 #endif
4863 #ifdef HAVE_IOP_SET_ACL
4864         .set_acl        = ll_set_acl,
4865 #endif
4866 };
4867
4868 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4869 {
4870         struct ll_inode_info *lli = ll_i2info(inode);
4871         struct cl_object *obj = lli->lli_clob;
4872         struct lu_env *env;
4873         int rc;
4874         __u16 refcheck;
4875         ENTRY;
4876
4877         if (obj == NULL)
4878                 RETURN(0);
4879
4880         env = cl_env_get(&refcheck);
4881         if (IS_ERR(env))
4882                 RETURN(PTR_ERR(env));
4883
4884         rc = cl_conf_set(env, lli->lli_clob, conf);
4885         if (rc < 0)
4886                 GOTO(out, rc);
4887
4888         if (conf->coc_opc == OBJECT_CONF_SET) {
4889                 struct ldlm_lock *lock = conf->coc_lock;
4890                 struct cl_layout cl = {
4891                         .cl_layout_gen = 0,
4892                 };
4893
4894                 LASSERT(lock != NULL);
4895                 LASSERT(ldlm_has_layout(lock));
4896
4897                 /* it can only be allowed to match after layout is
4898                  * applied to inode otherwise false layout would be
4899                  * seen. Applying layout shoud happen before dropping
4900                  * the intent lock. */
4901                 ldlm_lock_allow_match(lock);
4902
4903                 rc = cl_object_layout_get(env, obj, &cl);
4904                 if (rc < 0)
4905                         GOTO(out, rc);
4906
4907                 CDEBUG(D_VFSTRACE,
4908                        DFID": layout version change: %u -> %u\n",
4909                        PFID(&lli->lli_fid), ll_layout_version_get(lli),
4910                        cl.cl_layout_gen);
4911                 ll_layout_version_set(lli, cl.cl_layout_gen);
4912         }
4913
4914 out:
4915         cl_env_put(env, &refcheck);
4916
4917         RETURN(rc);
4918 }
4919
4920 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4921 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4922
4923 {
4924         struct ll_sb_info *sbi = ll_i2sbi(inode);
4925         struct ptlrpc_request *req;
4926         struct mdt_body *body;
4927         void *lvbdata;
4928         void *lmm;
4929         int lmmsize;
4930         int rc;
4931         ENTRY;
4932
4933         CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4934                PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4935                lock->l_lvb_data, lock->l_lvb_len);
4936
4937         if (lock->l_lvb_data != NULL)
4938                 RETURN(0);
4939
4940         /* if layout lock was granted right away, the layout is returned
4941          * within DLM_LVB of dlm reply; otherwise if the lock was ever
4942          * blocked and then granted via completion ast, we have to fetch
4943          * layout here. Please note that we can't use the LVB buffer in
4944          * completion AST because it doesn't have a large enough buffer */
4945         rc = ll_get_default_mdsize(sbi, &lmmsize);
4946         if (rc == 0)
4947                 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4948                                 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4949         if (rc < 0)
4950                 RETURN(rc);
4951
4952         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4953         if (body == NULL)
4954                 GOTO(out, rc = -EPROTO);
4955
4956         lmmsize = body->mbo_eadatasize;
4957         if (lmmsize == 0) /* empty layout */
4958                 GOTO(out, rc = 0);
4959
4960         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4961         if (lmm == NULL)
4962                 GOTO(out, rc = -EFAULT);
4963
4964         OBD_ALLOC_LARGE(lvbdata, lmmsize);
4965         if (lvbdata == NULL)
4966                 GOTO(out, rc = -ENOMEM);
4967
4968         memcpy(lvbdata, lmm, lmmsize);
4969         lock_res_and_lock(lock);
4970         if (unlikely(lock->l_lvb_data == NULL)) {
4971                 lock->l_lvb_type = LVB_T_LAYOUT;
4972                 lock->l_lvb_data = lvbdata;
4973                 lock->l_lvb_len = lmmsize;
4974                 lvbdata = NULL;
4975         }
4976         unlock_res_and_lock(lock);
4977
4978         if (lvbdata)
4979                 OBD_FREE_LARGE(lvbdata, lmmsize);
4980
4981         EXIT;
4982
4983 out:
4984         ptlrpc_req_finished(req);
4985         return rc;
4986 }
4987
4988 /**
4989  * Apply the layout to the inode. Layout lock is held and will be released
4990  * in this function.
4991  */
4992 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4993                               struct inode *inode)
4994 {
4995         struct ll_inode_info *lli = ll_i2info(inode);
4996         struct ll_sb_info    *sbi = ll_i2sbi(inode);
4997         struct ldlm_lock *lock;
4998         struct cl_object_conf conf;
4999         int rc = 0;
5000         bool lvb_ready;
5001         bool wait_layout = false;
5002         ENTRY;
5003
5004         LASSERT(lustre_handle_is_used(lockh));
5005
5006         lock = ldlm_handle2lock(lockh);
5007         LASSERT(lock != NULL);
5008         LASSERT(ldlm_has_layout(lock));
5009
5010         LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5011                    PFID(&lli->lli_fid), inode);
5012
5013         /* in case this is a caching lock and reinstate with new inode */
5014         md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5015
5016         lock_res_and_lock(lock);
5017         lvb_ready = ldlm_is_lvb_ready(lock);
5018         unlock_res_and_lock(lock);
5019
5020         /* checking lvb_ready is racy but this is okay. The worst case is
5021          * that multi processes may configure the file on the same time. */
5022         if (lvb_ready)
5023                 GOTO(out, rc = 0);
5024
5025         rc = ll_layout_fetch(inode, lock);
5026         if (rc < 0)
5027                 GOTO(out, rc);
5028
5029         /* for layout lock, lmm is stored in lock's lvb.
5030          * lvb_data is immutable if the lock is held so it's safe to access it
5031          * without res lock.
5032          *
5033          * set layout to file. Unlikely this will fail as old layout was
5034          * surely eliminated */
5035         memset(&conf, 0, sizeof conf);
5036         conf.coc_opc = OBJECT_CONF_SET;
5037         conf.coc_inode = inode;
5038         conf.coc_lock = lock;
5039         conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5040         conf.u.coc_layout.lb_len = lock->l_lvb_len;
5041         rc = ll_layout_conf(inode, &conf);
5042
5043         /* refresh layout failed, need to wait */
5044         wait_layout = rc == -EBUSY;
5045         EXIT;
5046 out:
5047         LDLM_LOCK_PUT(lock);
5048         ldlm_lock_decref(lockh, mode);
5049
5050         /* wait for IO to complete if it's still being used. */
5051         if (wait_layout) {
5052                 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5053                        ll_get_fsname(inode->i_sb, NULL, 0),
5054                        PFID(&lli->lli_fid), inode);
5055
5056                 memset(&conf, 0, sizeof conf);
5057                 conf.coc_opc = OBJECT_CONF_WAIT;
5058                 conf.coc_inode = inode;
5059                 rc = ll_layout_conf(inode, &conf);
5060                 if (rc == 0)
5061                         rc = -EAGAIN;
5062
5063                 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5064                        ll_get_fsname(inode->i_sb, NULL, 0),
5065                        PFID(&lli->lli_fid), rc);
5066         }
5067         RETURN(rc);
5068 }
5069
5070 /**
5071  * Issue layout intent RPC to MDS.
5072  * \param inode [in]    file inode
5073  * \param intent [in]   layout intent
5074  *
5075  * \retval 0    on success
5076  * \retval < 0  error code
5077  */
5078 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5079 {
5080         struct ll_inode_info  *lli = ll_i2info(inode);
5081         struct ll_sb_info     *sbi = ll_i2sbi(inode);
5082         struct md_op_data     *op_data;
5083         struct lookup_intent it;
5084         struct ptlrpc_request *req;
5085         int rc;
5086         ENTRY;
5087
5088         op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5089                                      0, 0, LUSTRE_OPC_ANY, NULL);
5090         if (IS_ERR(op_data))
5091                 RETURN(PTR_ERR(op_data));
5092
5093         op_data->op_data = intent;
5094         op_data->op_data_size = sizeof(*intent);
5095
5096         memset(&it, 0, sizeof(it));
5097         it.it_op = IT_LAYOUT;
5098         if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5099             intent->li_opc == LAYOUT_INTENT_TRUNC)
5100                 it.it_flags = FMODE_WRITE;
5101
5102         LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5103                           ll_get_fsname(inode->i_sb, NULL, 0),
5104                           PFID(&lli->lli_fid), inode);
5105
5106         rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5107                             &ll_md_blocking_ast, 0);
5108         if (it.it_request != NULL)
5109                 ptlrpc_req_finished(it.it_request);
5110         it.it_request = NULL;
5111
5112         ll_finish_md_op_data(op_data);
5113
5114         /* set lock data in case this is a new lock */
5115         if (!rc)
5116                 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5117
5118         ll_intent_drop_lock(&it);
5119
5120         RETURN(rc);
5121 }
5122
5123 /**
5124  * This function checks if there exists a LAYOUT lock on the client side,
5125  * or enqueues it if it doesn't have one in cache.
5126  *
5127  * This function will not hold layout lock so it may be revoked any time after
5128  * this function returns. Any operations depend on layout should be redone
5129  * in that case.
5130  *
5131  * This function should be called before lov_io_init() to get an uptodate
5132  * layout version, the caller should save the version number and after IO
5133  * is finished, this function should be called again to verify that layout
5134  * is not changed during IO time.
5135  */
5136 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5137 {
5138         struct ll_inode_info    *lli = ll_i2info(inode);
5139         struct ll_sb_info       *sbi = ll_i2sbi(inode);
5140         struct lustre_handle lockh;
5141         struct layout_intent intent = {
5142                 .li_opc = LAYOUT_INTENT_ACCESS,
5143         };
5144         enum ldlm_mode mode;
5145         int rc;
5146         ENTRY;
5147
5148         *gen = ll_layout_version_get(lli);
5149         if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5150                 RETURN(0);
5151
5152         /* sanity checks */
5153         LASSERT(fid_is_sane(ll_inode2fid(inode)));
5154         LASSERT(S_ISREG(inode->i_mode));
5155
5156         /* take layout lock mutex to enqueue layout lock exclusively. */
5157         mutex_lock(&lli->lli_layout_mutex);
5158
5159         while (1) {
5160                 /* mostly layout lock is caching on the local side, so try to
5161                  * match it before grabbing layout lock mutex. */
5162                 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5163                                        LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5164                 if (mode != 0) { /* hit cached lock */
5165                         rc = ll_layout_lock_set(&lockh, mode, inode);
5166                         if (rc == -EAGAIN)
5167                                 continue;
5168                         break;
5169                 }
5170
5171                 rc = ll_layout_intent(inode, &intent);
5172                 if (rc != 0)
5173                         break;
5174         }
5175
5176         if (rc == 0)
5177                 *gen = ll_layout_version_get(lli);
5178         mutex_unlock(&lli->lli_layout_mutex);
5179
5180         RETURN(rc);
5181 }
5182
5183 /**
5184  * Issue layout intent RPC indicating where in a file an IO is about to write.
5185  *
5186  * \param[in] inode     file inode.
5187  * \param[in] ext       write range with start offset of fille in bytes where
5188  *                      an IO is about to write, and exclusive end offset in
5189  *                      bytes.
5190  *
5191  * \retval 0    on success
5192  * \retval < 0  error code
5193  */
5194 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5195                            struct lu_extent *ext)
5196 {
5197         struct layout_intent intent = {
5198                 .li_opc = opc,
5199                 .li_extent.e_start = ext->e_start,
5200                 .li_extent.e_end = ext->e_end,
5201         };
5202         int rc;
5203         ENTRY;
5204
5205         rc = ll_layout_intent(inode, &intent);
5206
5207         RETURN(rc);
5208 }
5209
5210 /**
5211  *  This function send a restore request to the MDT
5212  */
5213 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5214 {
5215         struct hsm_user_request *hur;
5216         int                      len, rc;
5217         ENTRY;
5218
5219         len = sizeof(struct hsm_user_request) +
5220               sizeof(struct hsm_user_item);
5221         OBD_ALLOC(hur, len);
5222         if (hur == NULL)
5223                 RETURN(-ENOMEM);
5224
5225         hur->hur_request.hr_action = HUA_RESTORE;
5226         hur->hur_request.hr_archive_id = 0;
5227         hur->hur_request.hr_flags = 0;
5228         memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5229                sizeof(hur->hur_user_item[0].hui_fid));
5230         hur->hur_user_item[0].hui_extent.offset = offset;
5231         hur->hur_user_item[0].hui_extent.length = length;
5232         hur->hur_request.hr_itemcount = 1;
5233         rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
5234                            len, hur, NULL);
5235         OBD_FREE(hur, len);
5236         RETURN(rc);
5237 }