4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
146 OBD_ALLOC_PTR(op_data);
/* We leak the openhandle and the request here on error, but there is not
 * much to be done in the OOM case since the app won't retry the close on
 * error either. */
150 GOTO(out, rc = -ENOMEM);
152 ll_prepare_close(inode, op_data, och);
154 case MDS_CLOSE_LAYOUT_MERGE:
155 /* merge blocks from the victim inode */
156 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
157 op_data->op_attr.ia_valid |= ATTR_SIZE;
158 op_data->op_xvalid |= OP_XVALID_BLOCKS;
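/* fall through: MERGE shares the lease/FID handling of SPLIT/SWAP below */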
159 case MDS_CLOSE_LAYOUT_SPLIT:
160 case MDS_CLOSE_LAYOUT_SWAP: {
161 struct split_param *sp = data;
163 LASSERT(data != NULL);
164 op_data->op_bias |= bias;
165 op_data->op_data_version = 0;
166 op_data->op_lease_handle = och->och_lease_handle;
167 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
168 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
169 op_data->op_mirror_id = sp->sp_mirror_id;
171 op_data->op_fid2 = *ll_inode2fid(data);
176 case MDS_CLOSE_RESYNC_DONE: {
177 struct ll_ioc_lease *ioc = data;
179 LASSERT(data != NULL);
180 op_data->op_attr_blocks +=
181 ioc->lil_count * op_data->op_attr_blocks;
182 op_data->op_attr.ia_valid |= ATTR_SIZE;
183 op_data->op_xvalid |= OP_XVALID_BLOCKS;
184 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
186 op_data->op_lease_handle = och->och_lease_handle;
187 op_data->op_data = &ioc->lil_ids[0];
188 op_data->op_data_size =
189 ioc->lil_count * sizeof(ioc->lil_ids[0]);
193 case MDS_HSM_RELEASE:
194 LASSERT(data != NULL);
195 op_data->op_bias |= MDS_HSM_RELEASE;
196 op_data->op_data_version = *(__u64 *)data;
197 op_data->op_lease_handle = och->och_lease_handle;
198 op_data->op_attr.ia_valid |= ATTR_SIZE;
199 op_data->op_xvalid |= OP_XVALID_BLOCKS;
203 LASSERT(data == NULL);
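/* Size/blocks that were not explicitly packed by a biased close above are
 * only passed along as lazy hints, so the MDT does not treat them as
 * authoritative. */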
207 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
208 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
209 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
210 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
212 rc = md_close(md_exp, op_data, och->och_mod, &req);
213 if (rc != 0 && rc != -EINTR)
214 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
215 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
217 if (rc == 0 && op_data->op_bias & bias) {
218 struct mdt_body *body;
220 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
221 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
225 ll_finish_md_op_data(op_data);
229 md_clear_open_replay_data(md_exp, och);
230 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
233 ptlrpc_req_finished(req); /* This is close request */
237 int ll_md_real_close(struct inode *inode, fmode_t fmode)
239 struct ll_inode_info *lli = ll_i2info(inode);
240 struct obd_client_handle **och_p;
241 struct obd_client_handle *och;
246 if (fmode & FMODE_WRITE) {
247 och_p = &lli->lli_mds_write_och;
248 och_usecount = &lli->lli_open_fd_write_count;
249 } else if (fmode & FMODE_EXEC) {
250 och_p = &lli->lli_mds_exec_och;
251 och_usecount = &lli->lli_open_fd_exec_count;
253 LASSERT(fmode & FMODE_READ);
254 och_p = &lli->lli_mds_read_och;
255 och_usecount = &lli->lli_open_fd_read_count;
258 mutex_lock(&lli->lli_och_mutex);
259 if (*och_usecount > 0) {
/* There are still users of this handle, so skip
 * freeing it. */
262 mutex_unlock(&lli->lli_och_mutex);
268 mutex_unlock(&lli->lli_och_mutex);
/* There might be a race and this handle may already
 * be closed. */
273 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
279 static int ll_md_close(struct inode *inode, struct file *file)
281 union ldlm_policy_data policy = {
282 .l_inodebits = { MDS_INODELOCK_OPEN },
284 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
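/* LDLM_FL_TEST_LOCK: only probe for an existing OPEN ibits lock,
 * without taking a new reference on it */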
285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
286 struct ll_inode_info *lli = ll_i2info(inode);
287 struct lustre_handle lockh;
288 enum ldlm_mode lockmode;
292 /* clear group lock, if present */
293 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
294 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
296 if (fd->fd_lease_och != NULL) {
/* Usually the lease is not released when the
 * application crashes, so we need to release it here. */
301 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
302 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
303 PFID(&lli->lli_fid), rc, lease_broken);
305 fd->fd_lease_och = NULL;
308 if (fd->fd_och != NULL) {
309 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
/* Let's see if we have a good enough OPEN lock on the file and
 * whether we can skip talking to the MDS */
316 mutex_lock(&lli->lli_och_mutex);
317 if (fd->fd_omode & FMODE_WRITE) {
319 LASSERT(lli->lli_open_fd_write_count);
320 lli->lli_open_fd_write_count--;
321 } else if (fd->fd_omode & FMODE_EXEC) {
323 LASSERT(lli->lli_open_fd_exec_count);
324 lli->lli_open_fd_exec_count--;
327 LASSERT(lli->lli_open_fd_read_count);
328 lli->lli_open_fd_read_count--;
330 mutex_unlock(&lli->lli_och_mutex);
332 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
333 LDLM_IBITS, &policy, lockmode, &lockh))
334 rc = ll_md_real_close(inode, fd->fd_omode);
337 LUSTRE_FPRIVATE(file) = NULL;
338 ll_file_data_put(fd);
/* While this returns an error code, the caller (fput()) ignores it, so we
 * need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors, and even if an error is returned
 * they will not retry the close call. */
348 int ll_file_release(struct inode *inode, struct file *file)
350 struct ll_file_data *fd;
351 struct ll_sb_info *sbi = ll_i2sbi(inode);
352 struct ll_inode_info *lli = ll_i2info(inode);
356 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
357 PFID(ll_inode2fid(inode)), inode);
359 if (inode->i_sb->s_root != file_dentry(file))
360 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
361 fd = LUSTRE_FPRIVATE(file);
/* The last ref on @file, maybe not the owner pid of statahead,
 * because parent and child processes can share the same file handle. */
366 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
367 ll_deauthorize_statahead(inode, fd);
369 if (inode->i_sb->s_root == file_dentry(file)) {
370 LUSTRE_FPRIVATE(file) = NULL;
371 ll_file_data_put(fd);
375 if (!S_ISDIR(inode->i_mode)) {
376 if (lli->lli_clob != NULL)
377 lov_read_and_clear_async_rc(lli->lli_clob);
378 lli->lli_async_rc = 0;
381 rc = ll_md_close(inode, file);
383 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
384 libcfs_debug_dumplog();
389 static inline int ll_dom_readpage(void *data, struct page *page)
391 struct niobuf_local *lnb = data;
394 kaddr = ll_kmap_atomic(page, KM_USER0);
395 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
396 if (lnb->lnb_len < PAGE_SIZE)
397 memset(kaddr + lnb->lnb_len, 0,
398 PAGE_SIZE - lnb->lnb_len);
399 flush_dcache_page(page);
400 SetPageUptodate(page);
401 ll_kunmap_atomic(kaddr, KM_USER0);
407 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
408 struct lookup_intent *it)
410 struct ll_inode_info *lli = ll_i2info(inode);
411 struct cl_object *obj = lli->lli_clob;
412 struct address_space *mapping = inode->i_mapping;
414 struct niobuf_remote *rnb;
416 struct lustre_handle lockh;
417 struct ldlm_lock *lock;
418 unsigned long index, start;
419 struct niobuf_local lnb;
420 bool dom_lock = false;
427 if (it->it_lock_mode != 0) {
428 lockh.cookie = it->it_lock_handle;
429 lock = ldlm_handle2lock(&lockh);
431 dom_lock = ldlm_has_dom(lock);
437 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
441 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
442 if (rnb == NULL || rnb->rnb_len == 0)
/* LU-11595: the server may return the whole file (which is always OK) or
 * just the file tail, whose offset must be aligned with the client
 * PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE is
 * smaller, the offset may be unaligned and that data is simply ignored. */
450 if (rnb->rnb_offset % PAGE_SIZE)
/* The server returns the whole file or just the file tail if it fits in the
 * reply buffer; in both cases the returned range should extend to the inode size. */
456 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
457 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
458 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
459 rnb->rnb_len, i_size_read(inode));
463 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
464 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
466 data = (char *)rnb + sizeof(*rnb);
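/* the inline file data follows the niobuf_remote descriptor in the reply */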
468 lnb.lnb_file_offset = rnb->rnb_offset;
469 start = lnb.lnb_file_offset / PAGE_SIZE;
471 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
472 lnb.lnb_page_offset = 0;
474 lnb.lnb_data = data + (index << PAGE_SHIFT);
475 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
476 if (lnb.lnb_len > PAGE_SIZE)
477 lnb.lnb_len = PAGE_SIZE;
479 vmpage = read_cache_page(mapping, index + start,
480 ll_dom_readpage, &lnb);
481 if (IS_ERR(vmpage)) {
482 CWARN("%s: cannot fill page %lu for "DFID
483 " with data: rc = %li\n",
484 ll_i2sbi(inode)->ll_fsname, index + start,
485 PFID(lu_object_fid(&obj->co_lu)),
491 } while (rnb->rnb_len > (index << PAGE_SHIFT));
495 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
496 struct lookup_intent *itp)
498 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
499 struct dentry *parent = de->d_parent;
502 struct md_op_data *op_data;
503 struct ptlrpc_request *req = NULL;
507 LASSERT(parent != NULL);
508 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
510 /* if server supports open-by-fid, or file name is invalid, don't pack
511 * name in open request */
512 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
514 len = de->d_name.len;
515 name = kmalloc(len, GFP_NOFS);
519 spin_lock(&de->d_lock);
520 if (len != de->d_name.len) {
521 spin_unlock(&de->d_lock);
525 memcpy(name, de->d_name.name, len);
526 spin_unlock(&de->d_lock);
528 if (!lu_name_is_valid_2(name, len)) {
535 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
536 name, len, 0, LUSTRE_OPC_ANY, NULL);
537 if (IS_ERR(op_data)) {
539 RETURN(PTR_ERR(op_data));
541 op_data->op_data = lmm;
542 op_data->op_data_size = lmmsize;
544 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
545 &ll_md_blocking_ast, 0);
547 ll_finish_md_op_data(op_data);
/* reason for keeping our own exit path: don't flood the log
 * with -ESTALE error messages. */
552 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
553 it_open_error(DISP_OPEN_OPEN, itp))
555 ll_release_openhandle(de, itp);
559 if (it_disposition(itp, DISP_LOOKUP_NEG))
560 GOTO(out, rc = -ENOENT);
562 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
563 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
564 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
568 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
570 if (!rc && itp->it_lock_mode) {
571 ll_dom_finish_open(de->d_inode, req, itp);
572 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
576 ptlrpc_req_finished(req);
577 ll_intent_drop_lock(itp);
579 /* We did open by fid, but by the time we got to the server,
580 * the object disappeared. If this is a create, we cannot really
581 * tell the userspace that the file it was trying to create
582 * does not exist. Instead let's return -ESTALE, and the VFS will
583 * retry the create with LOOKUP_REVAL that we are going to catch
584 * in ll_revalidate_dentry() and use lookup then.
586 if (rc == -ENOENT && itp->it_op & IT_CREAT)
592 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
593 struct obd_client_handle *och)
595 struct mdt_body *body;
597 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
598 och->och_open_handle = body->mbo_open_handle;
599 och->och_fid = body->mbo_fid1;
600 och->och_lease_handle.cookie = it->it_lock_handle;
601 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
602 och->och_flags = it->it_flags;
604 return md_set_open_replay_data(md_exp, och, it);
607 static int ll_local_open(struct file *file, struct lookup_intent *it,
608 struct ll_file_data *fd, struct obd_client_handle *och)
610 struct inode *inode = file_inode(file);
613 LASSERT(!LUSTRE_FPRIVATE(file));
620 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
625 LUSTRE_FPRIVATE(file) = fd;
626 ll_readahead_init(inode, &fd->fd_ras);
627 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
629 /* ll_cl_context initialize */
630 rwlock_init(&fd->fd_lock);
631 INIT_LIST_HEAD(&fd->fd_lccs);
636 /* Open a file, and (for the very first open) create objects on the OSTs at
637 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
638 * creation or open until ll_lov_setstripe() ioctl is called.
640 * If we already have the stripe MD locally then we don't request it in
641 * md_open(), by passing a lmm_size = 0.
643 * It is up to the application to ensure no other processes open this file
644 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
645 * used. We might be able to avoid races of that sort by getting lli_open_sem
646 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
647 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
649 int ll_file_open(struct inode *inode, struct file *file)
651 struct ll_inode_info *lli = ll_i2info(inode);
652 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
653 .it_flags = file->f_flags };
654 struct obd_client_handle **och_p = NULL;
655 __u64 *och_usecount = NULL;
656 struct ll_file_data *fd;
660 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
661 PFID(ll_inode2fid(inode)), inode, file->f_flags);
663 it = file->private_data; /* XXX: compat macro */
664 file->private_data = NULL; /* prevent ll_local_open assertion */
666 fd = ll_file_data_get();
668 GOTO(out_nofiledata, rc = -ENOMEM);
671 if (S_ISDIR(inode->i_mode))
672 ll_authorize_statahead(inode, fd);
674 if (inode->i_sb->s_root == file_dentry(file)) {
675 LUSTRE_FPRIVATE(file) = fd;
679 if (!it || !it->it_disposition) {
/* Convert f_flags into access mode. We cannot use file->f_mode,
 * because everything but the O_ACCMODE mask was stripped from it. */
if ((oit.it_flags + 1) & O_ACCMODE)
	oit.it_flags++;
685 if (file->f_flags & O_TRUNC)
686 oit.it_flags |= FMODE_WRITE;
/* The kernel only calls f_op->open() from dentry_open(); filp_open() calls
 * dentry_open() after open_namei() has checked permissions. Only nfsd_open()
 * calls dentry_open() directly without checking permissions, which is why
 * the code below is safe. */
693 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
694 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
696 /* We do not want O_EXCL here, presumably we opened the file
697 * already? XXX - NFS implications? */
698 oit.it_flags &= ~O_EXCL;
/* bug 20584: if "it_flags" contains O_CREAT, the file will be
 * created if necessary, so "IT_CREAT" should be set to stay
 * consistent with it */
703 if (oit.it_flags & O_CREAT)
704 oit.it_op |= IT_CREAT;
710 /* Let's see if we have file open on MDS already. */
711 if (it->it_flags & FMODE_WRITE) {
712 och_p = &lli->lli_mds_write_och;
713 och_usecount = &lli->lli_open_fd_write_count;
714 } else if (it->it_flags & FMODE_EXEC) {
715 och_p = &lli->lli_mds_exec_och;
716 och_usecount = &lli->lli_open_fd_exec_count;
718 och_p = &lli->lli_mds_read_och;
719 och_usecount = &lli->lli_open_fd_read_count;
722 mutex_lock(&lli->lli_och_mutex);
723 if (*och_p) { /* Open handle is present */
724 if (it_disposition(it, DISP_OPEN_OPEN)) {
/* Well, there's an extra open request that we do not need;
 * let's close it somehow. This will decref the request. */
727 rc = it_open_error(DISP_OPEN_OPEN, it);
729 mutex_unlock(&lli->lli_och_mutex);
730 GOTO(out_openerr, rc);
733 ll_release_openhandle(file_dentry(file), it);
737 rc = ll_local_open(file, it, fd, NULL);
740 mutex_unlock(&lli->lli_och_mutex);
741 GOTO(out_openerr, rc);
744 LASSERT(*och_usecount == 0);
745 if (!it->it_disposition) {
746 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
/* We cannot just request a lock handle now; the new ELC code
 * means that one of the other OPEN locks for this file could be
 * cancelled, and since the blocking AST handler would attempt to
 * grab och_mutex as well, that would result in a deadlock */
752 mutex_unlock(&lli->lli_och_mutex);
754 * Normally called under two situations:
756 * 2. A race/condition on MDS resulting in no open
757 * handle to be returned from LOOKUP|OPEN request,
758 * for example if the target entry was a symlink.
760 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
761 * marked by a bit set in ll_iget_for_nfs. Clear the
762 * bit so that it's not confusing later callers.
* NB: when ldd is NULL, it must have come via the normal
765 * lookup path only, since ll_iget_for_nfs always calls
768 if (ldd && ldd->lld_nfs_dentry) {
769 ldd->lld_nfs_dentry = 0;
770 it->it_flags |= MDS_OPEN_LOCK;
774 * Always specify MDS_OPEN_BY_FID because we don't want
775 * to get file with different fid.
777 it->it_flags |= MDS_OPEN_BY_FID;
778 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
781 GOTO(out_openerr, rc);
785 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
787 GOTO(out_och_free, rc = -ENOMEM);
791 /* md_intent_lock() didn't get a request ref if there was an
792 * open error, so don't do cleanup on the request here
/* XXX (green): Shouldn't we bail out on any error here, not
 * just on open error? */
796 rc = it_open_error(DISP_OPEN_OPEN, it);
798 GOTO(out_och_free, rc);
800 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
801 "inode %p: disposition %x, status %d\n", inode,
802 it_disposition(it, ~0), it->it_status);
804 rc = ll_local_open(file, it, fd, *och_p);
806 GOTO(out_och_free, rc);
808 mutex_unlock(&lli->lli_och_mutex);
/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
 * a different kind of OPEN lock for this same inode gets cancelled by
 * ldlm_cancel_lru */
814 if (!S_ISREG(inode->i_mode))
815 GOTO(out_och_free, rc);
817 cl_lov_delay_create_clear(&file->f_flags);
818 GOTO(out_och_free, rc);
822 if (och_p && *och_p) {
823 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
824 *och_p = NULL; /* OBD_FREE writes some magic there */
827 mutex_unlock(&lli->lli_och_mutex);
830 if (lli->lli_opendir_key == fd)
831 ll_deauthorize_statahead(inode, fd);
833 ll_file_data_put(fd);
835 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
839 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
840 ptlrpc_req_finished(it->it_request);
841 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
847 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
848 struct ldlm_lock_desc *desc, void *data, int flag)
851 struct lustre_handle lockh;
855 case LDLM_CB_BLOCKING:
856 ldlm_lock2handle(lock, &lockh);
857 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
859 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
863 case LDLM_CB_CANCELING:
* When setting a lease on a file, we take ownership of the lli_mds_*_och
* and save it as fd->fd_och so as to force the client to reopen the file
* even if it already has an open lock cached.
875 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
876 struct lustre_handle *old_open_handle)
878 struct ll_inode_info *lli = ll_i2info(inode);
879 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
880 struct obd_client_handle **och_p;
885 /* Get the openhandle of the file */
886 mutex_lock(&lli->lli_och_mutex);
887 if (fd->fd_lease_och != NULL)
888 GOTO(out_unlock, rc = -EBUSY);
890 if (fd->fd_och == NULL) {
891 if (file->f_mode & FMODE_WRITE) {
892 LASSERT(lli->lli_mds_write_och != NULL);
893 och_p = &lli->lli_mds_write_och;
894 och_usecount = &lli->lli_open_fd_write_count;
896 LASSERT(lli->lli_mds_read_och != NULL);
897 och_p = &lli->lli_mds_read_och;
898 och_usecount = &lli->lli_open_fd_read_count;
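/* the lease steals the MDS open handle, so it must not be shared
 * with any other open file descriptor */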
901 if (*och_usecount > 1)
902 GOTO(out_unlock, rc = -EBUSY);
909 *old_open_handle = fd->fd_och->och_open_handle;
913 mutex_unlock(&lli->lli_och_mutex);
918 * Release ownership on lli_mds_*_och when putting back a file lease.
920 static int ll_lease_och_release(struct inode *inode, struct file *file)
922 struct ll_inode_info *lli = ll_i2info(inode);
923 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
924 struct obd_client_handle **och_p;
925 struct obd_client_handle *old_och = NULL;
930 mutex_lock(&lli->lli_och_mutex);
931 if (file->f_mode & FMODE_WRITE) {
932 och_p = &lli->lli_mds_write_och;
933 och_usecount = &lli->lli_open_fd_write_count;
935 och_p = &lli->lli_mds_read_och;
936 och_usecount = &lli->lli_open_fd_read_count;
/* The file may have been opened by another process (broken lease), so
 * *och_p is not NULL. In this case we should simply increase the usecount
943 if (*och_p != NULL) {
944 old_och = fd->fd_och;
951 mutex_unlock(&lli->lli_och_mutex);
954 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
960 * Acquire a lease and open the file.
962 static struct obd_client_handle *
963 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
966 struct lookup_intent it = { .it_op = IT_OPEN };
967 struct ll_sb_info *sbi = ll_i2sbi(inode);
968 struct md_op_data *op_data;
969 struct ptlrpc_request *req = NULL;
970 struct lustre_handle old_open_handle = { 0 };
971 struct obd_client_handle *och = NULL;
976 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
977 RETURN(ERR_PTR(-EINVAL));
980 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
981 RETURN(ERR_PTR(-EPERM));
983 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
990 RETURN(ERR_PTR(-ENOMEM));
992 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
993 LUSTRE_OPC_ANY, NULL);
995 GOTO(out, rc = PTR_ERR(op_data));
997 /* To tell the MDT this openhandle is from the same owner */
998 op_data->op_open_handle = old_open_handle;
1000 it.it_flags = fmode | open_flags;
1001 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1002 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1003 &ll_md_blocking_lease_ast,
/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list, otherwise
 * it can be cancelled, which may mislead applications into thinking the
 * lease is broken;
 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
 * open in ll_md_blocking_ast(); otherwise, since ll_md_blocking_lease_ast()
 * does not deal with the openhandle, a normal openhandle would be leaked. */
1010 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1011 ll_finish_md_op_data(op_data);
1012 ptlrpc_req_finished(req);
1014 GOTO(out_release_it, rc);
1016 if (it_disposition(&it, DISP_LOOKUP_NEG))
1017 GOTO(out_release_it, rc = -ENOENT);
1019 rc = it_open_error(DISP_OPEN_OPEN, &it);
1021 GOTO(out_release_it, rc);
1023 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1024 ll_och_fill(sbi->ll_md_exp, &it, och);
1026 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1027 GOTO(out_close, rc = -EOPNOTSUPP);
/* lease already acquired, handle the lease lock */
1030 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1031 if (it.it_lock_mode == 0 ||
1032 it.it_lock_bits != MDS_INODELOCK_OPEN) {
/* an open lock must be returned for a lease */
1034 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1035 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1037 GOTO(out_close, rc = -EPROTO);
1040 ll_intent_release(&it);
1044 /* Cancel open lock */
1045 if (it.it_lock_mode != 0) {
1046 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1048 it.it_lock_mode = 0;
1049 och->och_lease_handle.cookie = 0ULL;
1051 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1053 CERROR("%s: error closing file "DFID": %d\n",
1054 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1055 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1057 ll_intent_release(&it);
1061 RETURN(ERR_PTR(rc));
1065 * Check whether a layout swap can be done between two inodes.
1067 * \param[in] inode1 First inode to check
1068 * \param[in] inode2 Second inode to check
1070 * \retval 0 on success, layout swap can be performed between both inodes
1071 * \retval negative error code if requirements are not met
1073 static int ll_check_swap_layouts_validity(struct inode *inode1,
1074 struct inode *inode2)
1076 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1079 if (inode_permission(inode1, MAY_WRITE) ||
1080 inode_permission(inode2, MAY_WRITE))
1083 if (inode1->i_sb != inode2->i_sb)
1089 static int ll_swap_layouts_close(struct obd_client_handle *och,
1090 struct inode *inode, struct inode *inode2)
1092 const struct lu_fid *fid1 = ll_inode2fid(inode);
1093 const struct lu_fid *fid2;
1097 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1098 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1100 rc = ll_check_swap_layouts_validity(inode, inode2);
1102 GOTO(out_free_och, rc);
1104 /* We now know that inode2 is a lustre inode */
1105 fid2 = ll_inode2fid(inode2);
1107 rc = lu_fid_cmp(fid1, fid2);
1109 GOTO(out_free_och, rc = -EINVAL);
1111 /* Close the file and {swap,merge} layouts between inode & inode2.
1112 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1113 * because we still need it to pack l_remote_handle to MDT. */
1114 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1117 och = NULL; /* freed in ll_close_inode_openhandle() */
* Release the lease and close the file.
* It will check whether the lease has ever been broken.
1130 static int ll_lease_close_intent(struct obd_client_handle *och,
1131 struct inode *inode,
1132 bool *lease_broken, enum mds_op_bias bias,
1135 struct ldlm_lock *lock;
1136 bool cancelled = true;
1140 lock = ldlm_handle2lock(&och->och_lease_handle);
1142 lock_res_and_lock(lock);
1143 cancelled = ldlm_is_cancel(lock);
1144 unlock_res_and_lock(lock);
1145 LDLM_LOCK_PUT(lock);
1148 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1149 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1151 if (lease_broken != NULL)
1152 *lease_broken = cancelled;
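/* lease still intact and no close intent to pack: just drop the lease lock */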
1154 if (!cancelled && !bias)
1155 ldlm_cli_cancel(&och->och_lease_handle, 0);
if (cancelled) { /* no need to execute intent */
1162 rc = ll_close_inode_openhandle(inode, och, bias, data);
1166 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1169 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
* After a lease is taken, send an MDS_REINT_RESYNC RPC to the MDT
1175 static int ll_lease_file_resync(struct obd_client_handle *och,
1176 struct inode *inode, unsigned long arg)
1178 struct ll_sb_info *sbi = ll_i2sbi(inode);
1179 struct md_op_data *op_data;
1180 struct ll_ioc_lease_id ioc;
1181 __u64 data_version_unused;
1185 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1186 LUSTRE_OPC_ANY, NULL);
1187 if (IS_ERR(op_data))
1188 RETURN(PTR_ERR(op_data));
1190 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
/* before starting a file resync, it's necessary to clean up the page cache
 * in client memory, otherwise once the layout version is increased,
 * writing back cached data will be denied by the OSTs. */
1197 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1201 op_data->op_lease_handle = och->och_lease_handle;
1202 op_data->op_mirror_id = ioc.lil_mirror_id;
1203 rc = md_file_resync(sbi->ll_md_exp, op_data);
1209 ll_finish_md_op_data(op_data);
1213 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1215 struct ll_inode_info *lli = ll_i2info(inode);
1216 struct cl_object *obj = lli->lli_clob;
1217 struct cl_attr *attr = vvp_env_thread_attr(env);
1225 ll_inode_size_lock(inode);
/* Merge the timestamps most recently obtained from the MDS with the
 * timestamps obtained from the OSTs.
 *
 * Do not overwrite the atime of the inode, because it may be refreshed
 * by file_accessed(). If the read was served from cached
 * data, there is no RPC to be sent, so the atime may not be
 * transferred to the OSTs at all. The MDT only updates atime at close time
 * if it is at least 'mdd.*.atime_diff' older.
 * All in all, atime in Lustre does not strictly comply with
 * POSIX. Solving this would require sending an RPC to the MDT for each
 * read, which would hurt performance. */
1239 if (inode->i_atime.tv_sec < lli->lli_atime ||
1240 lli->lli_update_atime) {
1241 inode->i_atime.tv_sec = lli->lli_atime;
1242 lli->lli_update_atime = 0;
1244 inode->i_mtime.tv_sec = lli->lli_mtime;
1245 inode->i_ctime.tv_sec = lli->lli_ctime;
1247 mtime = inode->i_mtime.tv_sec;
1248 atime = inode->i_atime.tv_sec;
1249 ctime = inode->i_ctime.tv_sec;
1251 cl_object_attr_lock(obj);
1252 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1255 rc = cl_object_attr_get(env, obj, attr);
1256 cl_object_attr_unlock(obj);
1259 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1261 if (atime < attr->cat_atime)
1262 atime = attr->cat_atime;
1264 if (ctime < attr->cat_ctime)
1265 ctime = attr->cat_ctime;
1267 if (mtime < attr->cat_mtime)
1268 mtime = attr->cat_mtime;
1270 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1271 PFID(&lli->lli_fid), attr->cat_size);
1273 i_size_write(inode, attr->cat_size);
1274 inode->i_blocks = attr->cat_blocks;
1276 inode->i_mtime.tv_sec = mtime;
1277 inode->i_atime.tv_sec = atime;
1278 inode->i_ctime.tv_sec = ctime;
1281 ll_inode_size_unlock(inode);
* Set the designated mirror for I/O.
*
* So far only read, write, and truncate can issue I/O to a
* designated mirror.
1292 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1294 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1296 /* clear layout version for generic(non-resync) I/O in case it carries
1297 * stale layout version due to I/O restart */
1298 io->ci_layout_version = 0;
1300 /* FLR: disable non-delay for designated mirror I/O because obviously
1301 * only one mirror is available */
1302 if (fd->fd_designated_mirror > 0) {
1304 io->ci_designated_mirror = fd->fd_designated_mirror;
1305 io->ci_layout_version = fd->fd_layout_version;
CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1309 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1312 static bool file_is_noatime(const struct file *file)
1314 const struct vfsmount *mnt = file->f_path.mnt;
1315 const struct inode *inode = file_inode((struct file *)file);
1317 /* Adapted from file_accessed() and touch_atime().*/
1318 if (file->f_flags & O_NOATIME)
1321 if (inode->i_flags & S_NOATIME)
1324 if (IS_NOATIME(inode))
1327 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1330 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1333 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1339 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1341 struct inode *inode = file_inode(file);
1342 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1344 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1345 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1347 if (iot == CIT_WRITE) {
1348 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1349 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1350 file->f_flags & O_DIRECT ||
1353 io->ci_obj = ll_i2info(inode)->lli_clob;
1354 io->ci_lockreq = CILR_MAYBE;
1355 if (ll_file_nolock(file)) {
1356 io->ci_lockreq = CILR_NEVER;
1357 io->ci_no_srvlock = 1;
1358 } else if (file->f_flags & O_APPEND) {
1359 io->ci_lockreq = CILR_MANDATORY;
1361 io->ci_noatime = file_is_noatime(file);
/* FLR: only use non-delay I/O for reads, as there is only one
 * available mirror for writes. */
1365 io->ci_ndelay = !(iot == CIT_WRITE);
1367 ll_io_set_mirror(io, file);
1370 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1373 struct ll_inode_info *lli = ll_i2info(inode);
1374 struct ll_sb_info *sbi = ll_i2sbi(inode);
1375 enum obd_heat_type sample_type;
1376 enum obd_heat_type iobyte_type;
1377 __u64 now = ktime_get_real_seconds();
1379 if (!ll_sbi_has_file_heat(sbi) ||
1380 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1383 if (iot == CIT_READ) {
1384 sample_type = OBD_HEAT_READSAMPLE;
1385 iobyte_type = OBD_HEAT_READBYTE;
1386 } else if (iot == CIT_WRITE) {
1387 sample_type = OBD_HEAT_WRITESAMPLE;
1388 iobyte_type = OBD_HEAT_WRITEBYTE;
1393 spin_lock(&lli->lli_heat_lock);
1394 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1395 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1396 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1397 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1398 spin_unlock(&lli->lli_heat_lock);
1402 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1403 struct file *file, enum cl_io_type iot,
1404 loff_t *ppos, size_t count)
1406 struct vvp_io *vio = vvp_env_io(env);
1407 struct inode *inode = file_inode(file);
1408 struct ll_inode_info *lli = ll_i2info(inode);
1409 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1410 struct range_lock range;
1414 unsigned retried = 0;
1415 bool restarted = false;
1419 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1420 file_dentry(file)->d_name.name,
1421 iot == CIT_READ ? "read" : "write", *ppos, count);
1424 io = vvp_env_thread_io(env);
1425 ll_io_init(io, file, iot);
1426 io->ci_ndelay_tried = retried;
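/* carry the FLR non-delay retry count over from any previous restart */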
1428 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1429 bool range_locked = false;
1431 if (file->f_flags & O_APPEND)
range_lock_init(&range, 0, LUSTRE_EOF);
else
range_lock_init(&range, *ppos, *ppos + count - 1);
1436 vio->vui_fd = LUSTRE_FPRIVATE(file);
1437 vio->vui_io_subtype = args->via_io_subtype;
1439 switch (vio->vui_io_subtype) {
1441 vio->vui_iter = args->u.normal.via_iter;
1442 vio->vui_iocb = args->u.normal.via_iocb;
/* Direct IO reads must also take the range lock, or multiple reads
 * will try to work on the same pages.
 * See LU-6227 for details. */
1446 if (((iot == CIT_WRITE) ||
1447 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1448 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1449 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1451 rc = range_lock(&lli->lli_write_tree, &range);
1455 range_locked = true;
1459 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1460 vio->u.splice.vui_flags = args->u.splice.via_flags;
1463 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1467 ll_cl_add(file, env, io, LCC_RW);
1468 rc = cl_io_loop(env, io);
1469 ll_cl_remove(file, env);
1472 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1474 range_unlock(&lli->lli_write_tree, &range);
1477 /* cl_io_rw_init() handled IO */
1481 if (io->ci_nob > 0) {
1482 result += io->ci_nob;
1483 count -= io->ci_nob;
1484 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1486 /* prepare IO restart */
1487 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1488 args->u.normal.via_iter = vio->vui_iter;
1491 cl_io_fini(env, io);
1494 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1495 file->f_path.dentry->d_name.name,
1496 iot, rc, result, io->ci_need_restart);
1498 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1500 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1501 file_dentry(file)->d_name.name,
1502 iot == CIT_READ ? "read" : "write",
1503 *ppos, count, result, rc);
1504 /* preserve the tried count for FLR */
1505 retried = io->ci_ndelay_tried;
1510 if (iot == CIT_READ) {
1512 ll_stats_ops_tally(ll_i2sbi(inode),
1513 LPROC_LL_READ_BYTES, result);
1514 } else if (iot == CIT_WRITE) {
1516 ll_stats_ops_tally(ll_i2sbi(inode),
1517 LPROC_LL_WRITE_BYTES, result);
1518 fd->fd_write_failed = false;
1519 } else if (result == 0 && rc == 0) {
1522 fd->fd_write_failed = true;
1524 fd->fd_write_failed = false;
1525 } else if (rc != -ERESTARTSYS) {
1526 fd->fd_write_failed = true;
1530 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1532 ll_heat_add(inode, iot, result);
1534 RETURN(result > 0 ? result : rc);
1538 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1539 * especially for small I/O.
1541 * To serve a read request, CLIO has to create and initialize a cl_io and
* then request a DLM lock. This has turned out to have significant overhead
1543 * and affects the performance of small I/O dramatically.
1545 * It's not necessary to create a cl_io for each I/O. Under the help of read
1546 * ahead, most of the pages being read are already in memory cache and we can
1547 * read those pages directly because if the pages exist, the corresponding DLM
1548 * lock must exist so that page content must be valid.
* In the fast read implementation, llite speculatively finds and reads pages
1551 * in memory cache. There are three scenarios for fast read:
1552 * - If the page exists and is uptodate, kernel VM will provide the data and
1553 * CLIO won't be intervened;
1554 * - If the page was brought into memory by read ahead, it will be exported
1555 * and read ahead parameters will be updated;
1556 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1557 * it will go back and invoke normal read, i.e., a cl_io will be created
1558 * and DLM lock will be requested.
1560 * POSIX compliance: posix standard states that read is intended to be atomic.
1561 * Lustre read implementation is in line with Linux kernel read implementation
1562 * and neither of them complies with POSIX standard in this matter. Fast read
1563 * doesn't make the situation worse on single node but it may interleave write
1564 * results from multiple nodes due to short read handling in ll_file_aio_read().
1566 * \param env - lu_env
1567 * \param iocb - kiocb from kernel
1568 * \param iter - user space buffers where the data will be copied
* \retval - number of bytes read, or error code if an error occurred.
1573 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1577 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1580 /* NB: we can't do direct IO for fast read because it will need a lock
1581 * to make IO engine happy. */
1582 if (iocb->ki_filp->f_flags & O_DIRECT)
1585 result = generic_file_read_iter(iocb, iter);
/* If the first page is not in cache, generic_file_read_iter() will
 * return -ENODATA.
 * See the corresponding code in ll_readpage(). */
1590 if (result == -ENODATA)
1594 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1595 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1596 LPROC_LL_READ_BYTES, result);
1603 * Read from a file (through the page cache).
1605 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1608 struct vvp_io_args *args;
1613 result = ll_do_fast_read(iocb, to);
1614 if (result < 0 || iov_iter_count(to) == 0)
1617 env = cl_env_get(&refcheck);
1619 return PTR_ERR(env);
1621 args = ll_env_args(env, IO_NORMAL);
1622 args->u.normal.via_iter = to;
1623 args->u.normal.via_iocb = iocb;
1625 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1626 &iocb->ki_pos, iov_iter_count(to));
1629 else if (result == 0)
1632 cl_env_put(env, &refcheck);
1638 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1639 * If a page is already in the page cache and dirty (and some other things -
1640 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1641 * write to it without doing a full I/O, because Lustre already knows about it
1642 * and will write it out. This saves a lot of processing time.
1644 * All writes here are within one page, so exclusion is handled by the page
1645 * lock on the vm page. We do not do tiny writes for writes which touch
* multiple pages, because it's very unlikely that multiple sequential pages
* are already dirty.
1649 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1650 * and are unlikely to be to already dirty pages.
1652 * Attribute updates are important here, we do them in ll_tiny_write_end.
1654 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1656 ssize_t count = iov_iter_count(iter);
1657 struct file *file = iocb->ki_filp;
1658 struct inode *inode = file_inode(file);
1659 bool lock_inode = !IS_NOSEC(inode);
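/* lock the inode for __generic_file_write_iter() unless IS_NOSEC() indicates
 * there are no security attributes (e.g. SUID) to strip during the write */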
1664 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1665 * of function for why.
1667 if (count >= PAGE_SIZE ||
1668 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
if (unlikely(lock_inode))
	inode_lock(inode);
1673 result = __generic_file_write_iter(iocb, iter);
1675 if (unlikely(lock_inode))
1676 inode_unlock(inode);
1678 /* If the page is not already dirty, ll_tiny_write_begin returns
1679 * -ENODATA. We continue on to normal write.
1681 if (result == -ENODATA)
1685 ll_heat_add(inode, CIT_WRITE, result);
1686 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1688 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1691 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1697 * Write to a file (through the page cache).
1699 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1701 struct vvp_io_args *args;
1703 ssize_t rc_tiny = 0, rc_normal;
1708 /* NB: we can't do direct IO for tiny writes because they use the page
1709 * cache, we can't do sync writes because tiny writes can't flush
1710 * pages, and we can't do append writes because we can't guarantee the
1711 * required DLM locks are held to protect file size.
1713 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1714 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1715 rc_tiny = ll_do_tiny_write(iocb, from);
/* In case of error, go on and try a normal write; only stop if the
 * tiny write completed the I/O. */
1720 if (iov_iter_count(from) == 0)
1721 GOTO(out, rc_normal = rc_tiny);
1723 env = cl_env_get(&refcheck);
1725 return PTR_ERR(env);
1727 args = ll_env_args(env, IO_NORMAL);
1728 args->u.normal.via_iter = from;
1729 args->u.normal.via_iocb = iocb;
1731 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1732 &iocb->ki_pos, iov_iter_count(from));
1734 /* On success, combine bytes written. */
1735 if (rc_tiny >= 0 && rc_normal > 0)
1736 rc_normal += rc_tiny;
1737 /* On error, only return error from normal write if tiny write did not
1738 * write any bytes. Otherwise return bytes written by tiny write.
1740 else if (rc_tiny > 0)
1741 rc_normal = rc_tiny;
1743 cl_env_put(env, &refcheck);
1748 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1750 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1752 static int ll_file_get_iov_count(const struct iovec *iov,
1753 unsigned long *nr_segs, size_t *count)
1758 for (seg = 0; seg < *nr_segs; seg++) {
1759 const struct iovec *iv = &iov[seg];
1762 * If any segment has a negative length, or the cumulative
1763 * length ever wraps negative then return -EINVAL.
1766 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1768 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1773 cnt -= iv->iov_len; /* This segment is no good */
1780 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1781 unsigned long nr_segs, loff_t pos)
1788 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1792 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1793 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1794 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1795 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1796 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1798 result = ll_file_read_iter(iocb, &to);
1803 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1806 struct iovec iov = { .iov_base = buf, .iov_len = count };
1811 init_sync_kiocb(&kiocb, file);
1812 kiocb.ki_pos = *ppos;
1813 #ifdef HAVE_KIOCB_KI_LEFT
1814 kiocb.ki_left = count;
1815 #elif defined(HAVE_KI_NBYTES)
kiocb.ki_nbytes = count;
1819 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1820 *ppos = kiocb.ki_pos;
1826 * Write to a file (through the page cache).
1829 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1830 unsigned long nr_segs, loff_t pos)
1832 struct iov_iter from;
1837 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1841 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1842 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1843 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1844 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1845 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1847 result = ll_file_write_iter(iocb, &from);
1852 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1853 size_t count, loff_t *ppos)
1855 struct iovec iov = { .iov_base = (void __user *)buf,
1862 init_sync_kiocb(&kiocb, file);
1863 kiocb.ki_pos = *ppos;
1864 #ifdef HAVE_KIOCB_KI_LEFT
1865 kiocb.ki_left = count;
1866 #elif defined(HAVE_KI_NBYTES)
1867 kiocb.ki_nbytes = count;
1870 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1871 *ppos = kiocb.ki_pos;
1875 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1878 * Send file content (through pagecache) somewhere with helper
1880 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1881 struct pipe_inode_info *pipe, size_t count,
1885 struct vvp_io_args *args;
1890 env = cl_env_get(&refcheck);
1892 RETURN(PTR_ERR(env));
1894 args = ll_env_args(env, IO_SPLICE);
1895 args->u.splice.via_pipe = pipe;
1896 args->u.splice.via_flags = flags;
1898 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1899 cl_env_put(env, &refcheck);
1903 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1904 __u64 flags, struct lov_user_md *lum, int lum_size)
1906 struct lookup_intent oit = {
1908 .it_flags = flags | MDS_OPEN_BY_FID,
1913 ll_inode_size_lock(inode);
1914 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1916 GOTO(out_unlock, rc);
1918 ll_release_openhandle(dentry, &oit);
1921 ll_inode_size_unlock(inode);
1922 ll_intent_release(&oit);
1927 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1928 struct lov_mds_md **lmmp, int *lmm_size,
1929 struct ptlrpc_request **request)
1931 struct ll_sb_info *sbi = ll_i2sbi(inode);
1932 struct mdt_body *body;
1933 struct lov_mds_md *lmm = NULL;
1934 struct ptlrpc_request *req = NULL;
1935 struct md_op_data *op_data;
1938 rc = ll_get_default_mdsize(sbi, &lmmsize);
1942 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1943 strlen(filename), lmmsize,
1944 LUSTRE_OPC_ANY, NULL);
1945 if (IS_ERR(op_data))
1946 RETURN(PTR_ERR(op_data));
1948 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1949 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1950 ll_finish_md_op_data(op_data);
CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
       filename, rc);
1957 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1958 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1960 lmmsize = body->mbo_eadatasize;
1962 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1964 GOTO(out, rc = -ENODATA);
1967 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1968 LASSERT(lmm != NULL);
1970 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1971 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1972 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1973 GOTO(out, rc = -EPROTO);
1976 * This is coming from the MDS, so is probably in
1977 * little endian. We convert it to host endian before
1978 * passing it to userspace.
1980 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1983 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1984 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1985 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1986 if (le32_to_cpu(lmm->lmm_pattern) &
1987 LOV_PATTERN_F_RELEASED)
/* if the function is called for a directory, we should
 * avoid swabbing nonexistent lsm objects */
1993 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1994 lustre_swab_lov_user_md_v1(
1995 (struct lov_user_md_v1 *)lmm);
1996 if (S_ISREG(body->mbo_mode))
1997 lustre_swab_lov_user_md_objects(
1998 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2000 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2001 lustre_swab_lov_user_md_v3(
2002 (struct lov_user_md_v3 *)lmm);
2003 if (S_ISREG(body->mbo_mode))
2004 lustre_swab_lov_user_md_objects(
2005 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2007 } else if (lmm->lmm_magic ==
2008 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2009 lustre_swab_lov_comp_md_v1(
2010 (struct lov_comp_md_v1 *)lmm);
2016 *lmm_size = lmmsize;
2021 static int ll_lov_setea(struct inode *inode, struct file *file,
2024 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2025 struct lov_user_md *lump;
2026 int lum_size = sizeof(struct lov_user_md) +
2027 sizeof(struct lov_user_ost_data);
2031 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2034 OBD_ALLOC_LARGE(lump, lum_size);
2038 if (copy_from_user(lump, arg, lum_size))
2039 GOTO(out_lump, rc = -EFAULT);
2041 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2043 cl_lov_delay_create_clear(&file->f_flags);
2046 OBD_FREE_LARGE(lump, lum_size);
2050 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2057 env = cl_env_get(&refcheck);
2059 RETURN(PTR_ERR(env));
2061 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2062 cl_env_put(env, &refcheck);
2066 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2069 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2070 struct lov_user_md *klum;
2072 __u64 flags = FMODE_WRITE;
2075 rc = ll_copy_user_md(lum, &klum);
2080 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2085 rc = put_user(0, &lum->lmm_stripe_count);
2089 rc = ll_layout_refresh(inode, &gen);
2093 rc = ll_file_getstripe(inode, arg, lum_size);
2095 cl_lov_delay_create_clear(&file->f_flags);
2098 OBD_FREE(klum, lum_size);
2103 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2105 struct ll_inode_info *lli = ll_i2info(inode);
2106 struct cl_object *obj = lli->lli_clob;
2107 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2108 struct ll_grouplock grouplock;
2113 CWARN("group id for group lock must not be 0\n");
2117 if (ll_file_nolock(file))
2118 RETURN(-EOPNOTSUPP);
2120 spin_lock(&lli->lli_lock);
2121 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2122 CWARN("group lock already existed with gid %lu\n",
2123 fd->fd_grouplock.lg_gid);
2124 spin_unlock(&lli->lli_lock);
2127 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2128 spin_unlock(&lli->lli_lock);
* XXX: a group lock needs to protect all OST objects, while PFL
* can add new OST objects during the IO, so we instantiate
* all OST objects before taking the group lock.
2138 struct cl_layout cl = {
2139 .cl_is_composite = false,
2141 struct lu_extent ext = {
2143 .e_end = OBD_OBJECT_EOF,
2146 env = cl_env_get(&refcheck);
2148 RETURN(PTR_ERR(env));
2150 rc = cl_object_layout_get(env, obj, &cl);
2151 if (!rc && cl.cl_is_composite)
2152 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2155 cl_env_put(env, &refcheck);
2160 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2161 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2165 spin_lock(&lli->lli_lock);
2166 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2167 spin_unlock(&lli->lli_lock);
2168 CERROR("another thread just won the race\n");
2169 cl_put_grouplock(&grouplock);
2173 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2174 fd->fd_grouplock = grouplock;
2175 spin_unlock(&lli->lli_lock);
2177 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2181 static int ll_put_grouplock(struct inode *inode, struct file *file,
2184 struct ll_inode_info *lli = ll_i2info(inode);
2185 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2186 struct ll_grouplock grouplock;
2189 spin_lock(&lli->lli_lock);
2190 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2191 spin_unlock(&lli->lli_lock);
2192 CWARN("no group lock held\n");
2196 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2198 if (fd->fd_grouplock.lg_gid != arg) {
2199 CWARN("group lock %lu doesn't match current id %lu\n",
2200 arg, fd->fd_grouplock.lg_gid);
2201 spin_unlock(&lli->lli_lock);
2205 grouplock = fd->fd_grouplock;
2206 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2207 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2208 spin_unlock(&lli->lli_lock);
2210 cl_put_grouplock(&grouplock);
2211 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2216 * Close inode open handle
2218 * \param dentry [in] dentry which contains the inode
2219 * \param it [in,out] intent which contains open info and result
2222 * \retval <0 failure
2224 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2226 struct inode *inode = dentry->d_inode;
2227 struct obd_client_handle *och;
2233 /* Root ? Do nothing. */
2234 if (dentry->d_inode->i_sb->s_root == dentry)
2237 /* No open handle to close? Move away */
2238 if (!it_disposition(it, DISP_OPEN_OPEN))
2241 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2243 OBD_ALLOC(och, sizeof(*och));
2245 GOTO(out, rc = -ENOMEM);
2247 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2249 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2251 /* this one is in place of ll_file_open */
2252 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2253 ptlrpc_req_finished(it->it_request);
2254 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2260 * Get the size of the inode for which the FIEMAP mapping is requested.
2261 * Make the FIEMAP get_info call and return the result.
2262 * \param fiemap kernel buffer to hold the extents
2263 * \param num_bytes kernel buffer size
2265 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2271 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2274 /* Checks for fiemap flags */
2275 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2276 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2280 /* Check for FIEMAP_FLAG_SYNC */
2281 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2282 rc = filemap_fdatawrite(inode->i_mapping);
2287 env = cl_env_get(&refcheck);
2289 RETURN(PTR_ERR(env));
2291 if (i_size_read(inode) == 0) {
2292 rc = ll_glimpse_size(inode);
2297 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2298 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2299 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2301 /* If the file size is 0, then there are no objects to map */
2302 if (fmkey.lfik_oa.o_size == 0) {
2303 fiemap->fm_mapped_extents = 0;
2307 fmkey.lfik_fiemap = *fiemap;
2309 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2310 &fmkey, fiemap, &num_bytes);
2312 cl_env_put(env, &refcheck);
2316 int ll_fid2path(struct inode *inode, void __user *arg)
2318 struct obd_export *exp = ll_i2mdexp(inode);
2319 const struct getinfo_fid2path __user *gfin = arg;
2321 struct getinfo_fid2path *gfout;
2327 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2328 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2331 /* Only need to get the buflen */
2332 if (get_user(pathlen, &gfin->gf_pathlen))
2335 if (pathlen > PATH_MAX)
2338 outsize = sizeof(*gfout) + pathlen;
2339 OBD_ALLOC(gfout, outsize);
2343 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2344 GOTO(gf_free, rc = -EFAULT);
2345 /* Append the root FID after gfout to let the MDT know the root FID so
2346 * that it can look up the correct path; this is mainly for filesets.
2347 * An old server without fileset mount support will ignore it. */
2348 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2350 /* Call mdc_iocontrol */
2351 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2355 if (copy_to_user(arg, gfout, outsize))
2359 OBD_FREE(gfout, outsize);
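/*
 * Usage sketch (illustration only): OBD_IOC_FID2PATH is driven from user
 * space with a struct getinfo_fid2path followed by a path buffer of
 * gf_pathlen bytes; gf_fid and gf_u.gf_path are assumptions based on the
 * copy_from_user()/copy_to_user() handling above.
 *
 *	struct getinfo_fid2path *gf;
 *	size_t plen = PATH_MAX;
 *
 *	gf = calloc(1, sizeof(*gf) + plen);
 *	gf->gf_fid = fid;		// e.g. obtained via LL_IOC_PATH2FID
 *	gf->gf_pathlen = plen;
 *	if (ioctl(fd, OBD_IOC_FID2PATH, gf) == 0)
 *		printf("path: %s\n", gf->gf_u.gf_path);
 *	free(gf);
 */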
2364 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2366 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2374 ioc->idv_version = 0;
2375 ioc->idv_layout_version = UINT_MAX;
2377 /* If no file object has been initialized, consider its version to be 0. */
2381 env = cl_env_get(&refcheck);
2383 RETURN(PTR_ERR(env));
2385 io = vvp_env_thread_io(env);
2387 io->u.ci_data_version.dv_data_version = 0;
2388 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2389 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2392 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2393 result = cl_io_loop(env, io);
2395 result = io->ci_result;
2397 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2398 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2400 cl_io_fini(env, io);
2402 if (unlikely(io->ci_need_restart))
2405 cl_env_put(env, &refcheck);
2411 * Read the data_version for the inode.
2413 * This value is computed using the stripe object versions on the OSTs.
2414 * The version is computed using server-side locking.
2416 * @param flags whether to sync on the OST side:
2418 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2419 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2421 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2423 struct ioc_data_version ioc = { .idv_flags = flags };
2426 rc = ll_ioc_data_version(inode, &ioc);
2428 *data_version = ioc.idv_version;
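/*
 * Usage sketch (illustration only): LL_IOC_DATA_VERSION, handled in
 * ll_file_ioctl() below, fills struct ioc_data_version for an open file.
 * Setting LL_DV_WR_FLUSH asks the OSTs to flush and drop cached pages first
 * so that the returned version is stable, e.g. for HSM archiving.
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	if (ioctl(fd, LL_IOC_DATA_VERSION, &idv) == 0)
 *		printf("data version %llu\n",
 *		       (unsigned long long)idv.idv_version);
 */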
2434 * Trigger a HSM release request for the provided inode.
2436 int ll_hsm_release(struct inode *inode)
2439 struct obd_client_handle *och = NULL;
2440 __u64 data_version = 0;
2445 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2446 ll_i2sbi(inode)->ll_fsname,
2447 PFID(&ll_i2info(inode)->lli_fid));
2449 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2451 GOTO(out, rc = PTR_ERR(och));
2453 /* Grab latest data_version and [am]time values */
2454 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2458 env = cl_env_get(&refcheck);
2460 GOTO(out, rc = PTR_ERR(env));
2462 rc = ll_merge_attr(env, inode);
2463 cl_env_put(env, &refcheck);
2465 /* If an error happens, we have the wrong size for the file.
2471 /* Release the file.
2472 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2473 * we still need it to pack l_remote_handle to MDT. */
2474 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2480 if (och != NULL && !IS_ERR(och)) /* close the file */
2481 ll_lease_close(och, inode, NULL);
2486 struct ll_swap_stack {
2489 struct inode *inode1;
2490 struct inode *inode2;
2495 static int ll_swap_layouts(struct file *file1, struct file *file2,
2496 struct lustre_swap_layouts *lsl)
2498 struct mdc_swap_layouts msl;
2499 struct md_op_data *op_data;
2502 struct ll_swap_stack *llss = NULL;
2505 OBD_ALLOC_PTR(llss);
2509 llss->inode1 = file_inode(file1);
2510 llss->inode2 = file_inode(file2);
2512 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2516 /* we use two bools because they are easier to swap than two bits */
2517 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2518 llss->check_dv1 = true;
2520 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2521 llss->check_dv2 = true;
2523 /* we cannot use lsl->sl_dvX directly because we may swap them */
2524 llss->dv1 = lsl->sl_dv1;
2525 llss->dv2 = lsl->sl_dv2;
2527 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2528 if (rc == 0) /* same file, done! */
2531 if (rc < 0) { /* sequentialize it */
2532 swap(llss->inode1, llss->inode2);
2534 swap(llss->dv1, llss->dv2);
2535 swap(llss->check_dv1, llss->check_dv2);
2539 if (gid != 0) { /* application asks to flush dirty cache */
2540 rc = ll_get_grouplock(llss->inode1, file1, gid);
2544 rc = ll_get_grouplock(llss->inode2, file2, gid);
2546 ll_put_grouplock(llss->inode1, file1, gid);
2551 /* ultimate check: before swapping the layouts we check whether the
2552 * data version has changed (if requested) */
2553 if (llss->check_dv1) {
2554 rc = ll_data_version(llss->inode1, &dv, 0);
2557 if (dv != llss->dv1)
2558 GOTO(putgl, rc = -EAGAIN);
2561 if (llss->check_dv2) {
2562 rc = ll_data_version(llss->inode2, &dv, 0);
2565 if (dv != llss->dv2)
2566 GOTO(putgl, rc = -EAGAIN);
2569 /* struct md_op_data is used to send the swap args to the MDT;
2570 * only the flags are missing, so we pass struct mdc_swap_layouts
2571 * through md_op_data->op_data */
2572 /* flags from user space have to be converted before they are sent to
2573 * the server; no flag is sent today, they are only used on the client */
2576 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2577 0, LUSTRE_OPC_ANY, &msl);
2578 if (IS_ERR(op_data))
2579 GOTO(free, rc = PTR_ERR(op_data));
2581 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2582 sizeof(*op_data), op_data, NULL);
2583 ll_finish_md_op_data(op_data);
2590 ll_put_grouplock(llss->inode2, file2, gid);
2591 ll_put_grouplock(llss->inode1, file1, gid);
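/*
 * Usage sketch (illustration only): LL_IOC_LOV_SWAP_LAYOUTS, dispatched from
 * ll_file_ioctl() below, atomically swaps the layouts of two open files.
 * Only the fields visible in the handling above are used here; any field
 * carrying the group-lock id would be an additional assumption.
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd    = fd2,			// second file descriptor
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1,	// fail if dv1 changed
 *		.sl_dv1   = dv1,			// from LL_IOC_DATA_VERSION
 *	};
 *
 *	rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */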
2601 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2603 struct obd_export *exp = ll_i2mdexp(inode);
2604 struct md_op_data *op_data;
2608 /* Detect out-of-range masks */
2609 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2612 /* Non-root users are forbidden to set or clear flags which are
2613 * NOT defined in HSM_USER_MASK. */
2614 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2615 !cfs_capable(CFS_CAP_SYS_ADMIN))
2618 if (!exp_connect_archive_id_array(exp)) {
2619 /* Detect out-of-range archive id */
2620 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2621 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2625 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2626 LUSTRE_OPC_ANY, hss);
2627 if (IS_ERR(op_data))
2628 RETURN(PTR_ERR(op_data));
2630 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2633 ll_finish_md_op_data(op_data);
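/*
 * Usage sketch (illustration only): user space changes the HSM flags through
 * LL_IOC_HSM_STATE_SET in ll_file_ioctl() below, which lands here.  For
 * example, marking a file dirty so a copytool re-archives it (HS_DIRTY is
 * assumed to be one of the HSM_USER_MASK flags):
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid   = HSS_SETMASK,
 *		.hss_setmask = HS_DIRTY,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */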
2638 static int ll_hsm_import(struct inode *inode, struct file *file,
2639 struct hsm_user_import *hui)
2641 struct hsm_state_set *hss = NULL;
2642 struct iattr *attr = NULL;
2646 if (!S_ISREG(inode->i_mode))
2652 GOTO(out, rc = -ENOMEM);
2654 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2655 hss->hss_archive_id = hui->hui_archive_id;
2656 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2657 rc = ll_hsm_state_set(inode, hss);
2661 OBD_ALLOC_PTR(attr);
2663 GOTO(out, rc = -ENOMEM);
2665 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2666 attr->ia_mode |= S_IFREG;
2667 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2668 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2669 attr->ia_size = hui->hui_size;
2670 attr->ia_mtime.tv_sec = hui->hui_mtime;
2671 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2672 attr->ia_atime.tv_sec = hui->hui_atime;
2673 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2675 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2676 ATTR_UID | ATTR_GID |
2677 ATTR_MTIME | ATTR_MTIME_SET |
2678 ATTR_ATIME | ATTR_ATIME_SET;
2682 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2686 inode_unlock(inode);
2698 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2700 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2701 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2704 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2706 struct inode *inode = file_inode(file);
2708 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2709 ATTR_MTIME | ATTR_MTIME_SET |
2712 .tv_sec = lfu->lfu_atime_sec,
2713 .tv_nsec = lfu->lfu_atime_nsec,
2716 .tv_sec = lfu->lfu_mtime_sec,
2717 .tv_nsec = lfu->lfu_mtime_nsec,
2720 .tv_sec = lfu->lfu_ctime_sec,
2721 .tv_nsec = lfu->lfu_ctime_nsec,
2727 if (!capable(CAP_SYS_ADMIN))
2730 if (!S_ISREG(inode->i_mode))
2734 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2736 inode_unlock(inode);
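/*
 * Usage sketch (illustration only): LL_IOC_FUTIMES_3 in ll_file_ioctl()
 * below lets a privileged tool (CAP_SYS_ADMIN, checked above) restore all
 * three timestamps, including ctime, e.g. when replaying a backup:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = st.st_atime,
 *		.lfu_mtime_sec = st.st_mtime,
 *		.lfu_ctime_sec = st.st_ctime,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */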
2741 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2744 case MODE_READ_USER:
2746 case MODE_WRITE_USER:
2753 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2755 /* Used to allow the upper layers of the client to request an LDLM lock
2756 * without doing an actual read or write.
2758 * Used for ladvise lockahead to manually request specific locks.
2760 * \param[in] file file this ladvise lock request is on
2761 * \param[in] ladvise ladvise struct describing this lock request
2763 * \retval 0 success, no detailed result available (sync requests
2764 * and requests sent to the server [not handled locally]
2765 * cannot return detailed results)
2766 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2767 * see definitions for details.
2768 * \retval negative negative errno on error
2770 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2772 struct lu_env *env = NULL;
2773 struct cl_io *io = NULL;
2774 struct cl_lock *lock = NULL;
2775 struct cl_lock_descr *descr = NULL;
2776 struct dentry *dentry = file->f_path.dentry;
2777 struct inode *inode = dentry->d_inode;
2778 enum cl_lock_mode cl_mode;
2779 off_t start = ladvise->lla_start;
2780 off_t end = ladvise->lla_end;
2786 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2787 "start=%llu, end=%llu\n", dentry->d_name.len,
2788 dentry->d_name.name, dentry->d_inode,
2789 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2792 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2794 GOTO(out, result = cl_mode);
2796 /* Get IO environment */
2797 result = cl_io_get(inode, &env, &io, &refcheck);
2801 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2804 * nothing to do for this io. This currently happens when
2805 * stripe sub-objects are not yet created.
2807 result = io->ci_result;
2808 } else if (result == 0) {
2809 lock = vvp_env_lock(env);
2810 descr = &lock->cll_descr;
2812 descr->cld_obj = io->ci_obj;
2813 /* Convert byte offsets to pages */
2814 descr->cld_start = cl_index(io->ci_obj, start);
2815 descr->cld_end = cl_index(io->ci_obj, end);
2816 descr->cld_mode = cl_mode;
2817 /* CEF_MUST is used because we do not want to convert a
2818 * lockahead request to a lockless lock */
2819 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2822 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2823 descr->cld_enq_flags |= CEF_SPECULATIVE;
2825 result = cl_lock_request(env, io, lock);
2827 /* On success, we need to release the lock */
2829 cl_lock_release(env, lock);
2831 cl_io_fini(env, io);
2832 cl_env_put(env, &refcheck);
2834 /* -ECANCELED indicates a matching lock with a different extent
2835 * was already present, and -EEXIST indicates a matching lock
2836 * on exactly the same extent was already present.
2837 * We convert them to positive values for userspace to make
2838 * recognizing true errors easier.
2839 * Note we can only return these detailed results on async requests,
2840 * as sync requests look the same as i/o requests for locking. */
2841 if (result == -ECANCELED)
2842 result = LLA_RESULT_DIFFERENT;
2843 else if (result == -EEXIST)
2844 result = LLA_RESULT_SAME;
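/*
 * Usage sketch (illustration only): a lockahead request is one
 * llapi_lu_ladvise entry submitted through LL_IOC_LADVISE like any other
 * advice (see the header sketch after ll_ladvise() below).  With LF_ASYNC
 * set, the LLA_RESULT_{SAME,DIFFERENT} codes above are copied back into
 * lla_lockahead_result.
 *
 *	struct llapi_lu_ladvise la = {
 *		.lla_advice          = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode  = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,
 *		.lla_start           = 0,
 *		.lla_end             = 1 << 20,		// first 1 MiB
 *	};
 */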
2849 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2851 static int ll_ladvise_sanity(struct inode *inode,
2852 struct llapi_lu_ladvise *ladvise)
2854 struct ll_sb_info *sbi = ll_i2sbi(inode);
2855 enum lu_ladvise_type advice = ladvise->lla_advice;
2856 /* Note the per-advice flags are a 32-bit field, so per-advice flags must
2857 * be in the first 32 bits of enum ladvise_flags */
2858 __u32 flags = ladvise->lla_peradvice_flags;
2859 /* 3 lines at 80 characters per line, should be plenty */
2862 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2864 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2865 "last supported advice is %s (value '%d'): rc = %d\n",
2866 sbi->ll_fsname, advice,
2867 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2871 /* Per-advice checks */
2873 case LU_LADVISE_LOCKNOEXPAND:
2874 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2876 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2877 "rc = %d\n", sbi->ll_fsname, flags,
2878 ladvise_names[advice], rc);
2882 case LU_LADVISE_LOCKAHEAD:
2883 /* Currently only READ and WRITE modes can be requested */
2884 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2885 ladvise->lla_lockahead_mode == 0) {
2887 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2888 "rc = %d\n", sbi->ll_fsname,
2889 ladvise->lla_lockahead_mode,
2890 ladvise_names[advice], rc);
2893 case LU_LADVISE_WILLREAD:
2894 case LU_LADVISE_DONTNEED:
2896 /* Note the fall through above - these checks apply to all advice types
2897 * except LOCKNOEXPAND */
2898 if (flags & ~LF_DEFAULT_MASK) {
2900 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2901 "rc = %d\n", sbi->ll_fsname, flags,
2902 ladvise_names[advice], rc);
2905 if (ladvise->lla_start >= ladvise->lla_end) {
2907 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2908 "for %s: rc = %d\n", sbi->ll_fsname,
2909 ladvise->lla_start, ladvise->lla_end,
2910 ladvise_names[advice], rc);
2922 * Give file access advice
2924 * The ladvise interface is similar to the Linux fadvise() system call,
2925 * except it forwards the advice directly from the Lustre client to the
2926 * server. The server-side code will apply appropriate read-ahead and
2927 * caching techniques for the corresponding files.
2929 * A typical workload for ladvise is e.g. a bunch of different clients
2930 * doing small random reads of a file, so prefetching pages into OSS cache
2931 * with big linear reads before the random IO is a net benefit. Fetching
2932 * all that data into each client cache with fadvise() may not be, due to
2933 * much more data being sent to the client.
2935 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2936 struct llapi_lu_ladvise *ladvise)
2940 struct cl_ladvise_io *lio;
2945 env = cl_env_get(&refcheck);
2947 RETURN(PTR_ERR(env));
2949 io = vvp_env_thread_io(env);
2950 io->ci_obj = ll_i2info(inode)->lli_clob;
2952 /* initialize parameters for ladvise */
2953 lio = &io->u.ci_ladvise;
2954 lio->li_start = ladvise->lla_start;
2955 lio->li_end = ladvise->lla_end;
2956 lio->li_fid = ll_inode2fid(inode);
2957 lio->li_advice = ladvise->lla_advice;
2958 lio->li_flags = flags;
2960 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2961 rc = cl_io_loop(env, io);
2965 cl_io_fini(env, io);
2966 cl_env_put(env, &refcheck);
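/*
 * Usage sketch (illustration only): the ioctl path (LL_IOC_LADVISE in
 * ll_file_ioctl() below) wraps one or more llapi_lu_ladvise entries in a
 * llapi_ladvise_hdr; the layout follows the copy_from_user() handling below.
 * A minimal WILLREAD request over the first 16 MiB:
 *
 *	struct llapi_ladvise_hdr *hdr;
 *
 *	hdr = calloc(1, sizeof(*hdr) + sizeof(struct llapi_lu_ladvise));
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start  = 0;
 *	hdr->lah_advise[0].lla_end    = 16 << 20;
 *	rc = ioctl(fd, LL_IOC_LADVISE, hdr);
 *	free(hdr);
 */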
2970 static int ll_lock_noexpand(struct file *file, int flags)
2972 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2974 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2979 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2982 struct fsxattr fsxattr;
2984 if (copy_from_user(&fsxattr,
2985 (const struct fsxattr __user *)arg,
2989 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2990 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2991 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2992 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2993 if (copy_to_user((struct fsxattr __user *)arg,
2994 &fsxattr, sizeof(fsxattr)))
3000 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3003 * Project Quota ID state is only allowed to change from within the init
3004 * namespace. Enforce that restriction only if we are trying to change
3005 * the quota ID state. Everything else is allowed in user namespaces.
3007 if (current_user_ns() == &init_user_ns)
3010 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3013 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3014 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3017 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3024 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3028 struct md_op_data *op_data;
3029 struct ptlrpc_request *req = NULL;
3031 struct fsxattr fsxattr;
3032 struct cl_object *obj;
3036 if (copy_from_user(&fsxattr,
3037 (const struct fsxattr __user *)arg,
3041 rc = ll_ioctl_check_project(inode, &fsxattr);
3045 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3046 LUSTRE_OPC_ANY, NULL);
3047 if (IS_ERR(op_data))
3048 RETURN(PTR_ERR(op_data));
3050 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3051 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3052 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3053 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3054 op_data->op_projid = fsxattr.fsx_projid;
3055 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3056 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3058 ptlrpc_req_finished(req);
3060 GOTO(out_fsxattr, rc);
3061 ll_update_inode_flags(inode, op_data->op_attr_flags);
3062 obj = ll_i2info(inode)->lli_clob;
3064 GOTO(out_fsxattr, rc);
3066 OBD_ALLOC_PTR(attr);
3068 GOTO(out_fsxattr, rc = -ENOMEM);
3070 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3071 fsxattr.fsx_xflags);
3074 ll_finish_md_op_data(op_data);
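/*
 * Usage sketch (illustration only): the project quota id and the
 * PROJINHERIT flag are changed through the FSSETXATTR path above,
 * dispatched as LL_IOC_FSGETXATTR/LL_IOC_FSSETXATTR from ll_file_ioctl()
 * below and mirroring the generic FS_IOC_FSSETXATTR interface:
 *
 *	struct fsxattr fsx;
 *
 *	ioctl(fd, LL_IOC_FSGETXATTR, &fsx);		// read current state
 *	fsx.fsx_projid  = 1000;				// new project id
 *	fsx.fsx_xflags |= FS_XFLAG_PROJINHERIT;		// inherit on children
 *	rc = ioctl(fd, LL_IOC_FSSETXATTR, &fsx);
 */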
3078 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3081 struct inode *inode = file_inode(file);
3082 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3083 struct ll_inode_info *lli = ll_i2info(inode);
3084 struct obd_client_handle *och = NULL;
3085 struct split_param sp;
3088 enum mds_op_bias bias = 0;
3089 struct file *layout_file = NULL;
3091 size_t data_size = 0;
3095 mutex_lock(&lli->lli_och_mutex);
3096 if (fd->fd_lease_och != NULL) {
3097 och = fd->fd_lease_och;
3098 fd->fd_lease_och = NULL;
3100 mutex_unlock(&lli->lli_och_mutex);
3103 GOTO(out, rc = -ENOLCK);
3105 fmode = och->och_flags;
3107 switch (ioc->lil_flags) {
3108 case LL_LEASE_RESYNC_DONE:
3109 if (ioc->lil_count > IOC_IDS_MAX)
3110 GOTO(out, rc = -EINVAL);
3112 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3113 OBD_ALLOC(data, data_size);
3115 GOTO(out, rc = -ENOMEM);
3117 if (copy_from_user(data, (void __user *)arg, data_size))
3118 GOTO(out, rc = -EFAULT);
3120 bias = MDS_CLOSE_RESYNC_DONE;
3122 case LL_LEASE_LAYOUT_MERGE: {
3125 if (ioc->lil_count != 1)
3126 GOTO(out, rc = -EINVAL);
3128 arg += sizeof(*ioc);
3129 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3130 GOTO(out, rc = -EFAULT);
3132 layout_file = fget(fd);
3134 GOTO(out, rc = -EBADF);
3136 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3137 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3138 GOTO(out, rc = -EPERM);
3140 data = file_inode(layout_file);
3141 bias = MDS_CLOSE_LAYOUT_MERGE;
3144 case LL_LEASE_LAYOUT_SPLIT: {
3148 if (ioc->lil_count != 2)
3149 GOTO(out, rc = -EINVAL);
3151 arg += sizeof(*ioc);
3152 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3153 GOTO(out, rc = -EFAULT);
3155 arg += sizeof(__u32);
3156 if (copy_from_user(&mirror_id, (void __user *)arg,
3158 GOTO(out, rc = -EFAULT);
3160 layout_file = fget(fdv);
3162 GOTO(out, rc = -EBADF);
3164 sp.sp_inode = file_inode(layout_file);
3165 sp.sp_mirror_id = (__u16)mirror_id;
3167 bias = MDS_CLOSE_LAYOUT_SPLIT;
3171 /* without close intent */
3175 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3179 rc = ll_lease_och_release(inode, file);
3188 switch (ioc->lil_flags) {
3189 case LL_LEASE_RESYNC_DONE:
3191 OBD_FREE(data, data_size);
3193 case LL_LEASE_LAYOUT_MERGE:
3194 case LL_LEASE_LAYOUT_SPLIT:
3201 rc = ll_lease_type_from_fmode(fmode);
3205 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3208 struct inode *inode = file_inode(file);
3209 struct ll_inode_info *lli = ll_i2info(inode);
3210 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3211 struct obd_client_handle *och = NULL;
3212 __u64 open_flags = 0;
3218 switch (ioc->lil_mode) {
3219 case LL_LEASE_WRLCK:
3220 if (!(file->f_mode & FMODE_WRITE))
3222 fmode = FMODE_WRITE;
3224 case LL_LEASE_RDLCK:
3225 if (!(file->f_mode & FMODE_READ))
3229 case LL_LEASE_UNLCK:
3230 RETURN(ll_file_unlock_lease(file, ioc, arg));
3235 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3237 /* apply for lease */
3238 if (ioc->lil_flags & LL_LEASE_RESYNC)
3239 open_flags = MDS_OPEN_RESYNC;
3240 och = ll_lease_open(inode, file, fmode, open_flags);
3242 RETURN(PTR_ERR(och));
3244 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3245 rc = ll_lease_file_resync(och, inode, arg);
3247 ll_lease_close(och, inode, NULL);
3250 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3252 ll_lease_close(och, inode, NULL);
3258 mutex_lock(&lli->lli_och_mutex);
3259 if (fd->fd_lease_och == NULL) {
3260 fd->fd_lease_och = och;
3263 mutex_unlock(&lli->lli_och_mutex);
3265 /* impossible now, since only exclusive leases are supported */
3266 ll_lease_close(och, inode, &lease_broken);
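/*
 * Usage sketch (illustration only): leases are requested with
 * LL_IOC_SET_LEASE and queried with LL_IOC_GET_LEASE in ll_file_ioctl()
 * below; LL_IOC_GET_LEASE returns the value of ll_lease_type_from_fmode(),
 * and the return value of a successful set is assumed to be the granted
 * lease type as well.
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 *	...
 *	rc = ioctl(fd, LL_IOC_GET_LEASE, NULL);	// LL_LEASE_WRLCK while unbroken
 */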
3272 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3274 struct ll_inode_info *lli = ll_i2info(inode);
3275 struct ll_sb_info *sbi = ll_i2sbi(inode);
3276 __u64 now = ktime_get_real_seconds();
3279 spin_lock(&lli->lli_heat_lock);
3280 heat->lh_flags = lli->lli_heat_flags;
3281 for (i = 0; i < heat->lh_count; i++)
3282 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3283 now, sbi->ll_heat_decay_weight,
3284 sbi->ll_heat_period_second);
3285 spin_unlock(&lli->lli_heat_lock);
3288 static int ll_heat_set(struct inode *inode, __u64 flags)
3290 struct ll_inode_info *lli = ll_i2info(inode);
3293 spin_lock(&lli->lli_heat_lock);
3294 if (flags & LU_HEAT_FLAG_CLEAR)
3295 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3297 if (flags & LU_HEAT_FLAG_OFF)
3298 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3300 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3302 spin_unlock(&lli->lli_heat_lock);
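/*
 * Usage sketch (illustration only): the per-file heat counters above are
 * read with LL_IOC_HEAT_GET and cleared with LL_IOC_HEAT_SET in
 * ll_file_ioctl() below.  The caller sizes lh_heat[] via lh_count; 64-bit
 * heat entries are an assumption.
 *
 *	struct lu_heat *heat;
 *
 *	heat = calloc(1, sizeof(*heat) + OBD_HEAT_COUNT * sizeof(__u64));
 *	heat->lh_count = OBD_HEAT_COUNT;
 *	if (ioctl(fd, LL_IOC_HEAT_GET, heat) == 0)
 *		printf("heat[0] = %llu\n", (unsigned long long)heat->lh_heat[0]);
 *
 *	__u64 flags = LU_HEAT_FLAG_CLEAR;	// reset the counters
 *	ioctl(fd, LL_IOC_HEAT_SET, &flags);
 */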
3308 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3310 struct inode *inode = file_inode(file);
3311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3315 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3316 PFID(ll_inode2fid(inode)), inode, cmd);
3317 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3319 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3320 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3324 case LL_IOC_GETFLAGS:
3325 /* Get the current value of the file flags */
3326 return put_user(fd->fd_flags, (int __user *)arg);
3327 case LL_IOC_SETFLAGS:
3328 case LL_IOC_CLRFLAGS:
3329 /* Set or clear specific file flags */
3330 /* XXX This probably needs checks to ensure the flags are
3331 * not abused, and to handle any flag side effects.
3333 if (get_user(flags, (int __user *) arg))
3336 if (cmd == LL_IOC_SETFLAGS) {
3337 if ((flags & LL_FILE_IGNORE_LOCK) &&
3338 !(file->f_flags & O_DIRECT)) {
3339 CERROR("%s: unable to disable locking on "
3340 "non-O_DIRECT file\n", current->comm);
3344 fd->fd_flags |= flags;
3346 fd->fd_flags &= ~flags;
3349 case LL_IOC_LOV_SETSTRIPE:
3350 case LL_IOC_LOV_SETSTRIPE_NEW:
3351 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3352 case LL_IOC_LOV_SETEA:
3353 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3354 case LL_IOC_LOV_SWAP_LAYOUTS: {
3356 struct lustre_swap_layouts lsl;
3358 if (copy_from_user(&lsl, (char __user *)arg,
3359 sizeof(struct lustre_swap_layouts)))
3362 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3365 file2 = fget(lsl.sl_fd);
3369 /* O_WRONLY or O_RDWR */
3370 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3371 GOTO(out, rc = -EPERM);
3373 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3374 struct inode *inode2;
3375 struct ll_inode_info *lli;
3376 struct obd_client_handle *och = NULL;
3378 lli = ll_i2info(inode);
3379 mutex_lock(&lli->lli_och_mutex);
3380 if (fd->fd_lease_och != NULL) {
3381 och = fd->fd_lease_och;
3382 fd->fd_lease_och = NULL;
3384 mutex_unlock(&lli->lli_och_mutex);
3386 GOTO(out, rc = -ENOLCK);
3387 inode2 = file_inode(file2);
3388 rc = ll_swap_layouts_close(och, inode, inode2);
3390 rc = ll_swap_layouts(file, file2, &lsl);
3396 case LL_IOC_LOV_GETSTRIPE:
3397 case LL_IOC_LOV_GETSTRIPE_NEW:
3398 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3399 case FS_IOC_GETFLAGS:
3400 case FS_IOC_SETFLAGS:
3401 RETURN(ll_iocontrol(inode, file, cmd, arg));
3402 case FSFILT_IOC_GETVERSION:
3403 case FS_IOC_GETVERSION:
3404 RETURN(put_user(inode->i_generation, (int __user *)arg));
3405 /* We need to special case any other ioctls we want to handle,
3406 * to send them to the MDS/OST as appropriate and to properly
3407 * network encode the arg field. */
3408 case FS_IOC_SETVERSION:
3411 case LL_IOC_GROUP_LOCK:
3412 RETURN(ll_get_grouplock(inode, file, arg));
3413 case LL_IOC_GROUP_UNLOCK:
3414 RETURN(ll_put_grouplock(inode, file, arg));
3415 case IOC_OBD_STATFS:
3416 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3418 case LL_IOC_FLUSHCTX:
3419 RETURN(ll_flush_ctx(inode));
3420 case LL_IOC_PATH2FID: {
3421 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3422 sizeof(struct lu_fid)))
3427 case LL_IOC_GETPARENT:
3428 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3430 case OBD_IOC_FID2PATH:
3431 RETURN(ll_fid2path(inode, (void __user *)arg));
3432 case LL_IOC_DATA_VERSION: {
3433 struct ioc_data_version idv;
3436 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3439 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3440 rc = ll_ioc_data_version(inode, &idv);
3443 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3449 case LL_IOC_GET_MDTIDX: {
3452 mdtidx = ll_get_mdt_idx(inode);
3456 if (put_user((int)mdtidx, (int __user *)arg))
3461 case OBD_IOC_GETDTNAME:
3462 case OBD_IOC_GETMDNAME:
3463 RETURN(ll_get_obd_name(inode, cmd, arg));
3464 case LL_IOC_HSM_STATE_GET: {
3465 struct md_op_data *op_data;
3466 struct hsm_user_state *hus;
3473 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3474 LUSTRE_OPC_ANY, hus);
3475 if (IS_ERR(op_data)) {
3477 RETURN(PTR_ERR(op_data));
3480 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3483 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3486 ll_finish_md_op_data(op_data);
3490 case LL_IOC_HSM_STATE_SET: {
3491 struct hsm_state_set *hss;
3498 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3503 rc = ll_hsm_state_set(inode, hss);
3508 case LL_IOC_HSM_ACTION: {
3509 struct md_op_data *op_data;
3510 struct hsm_current_action *hca;
3517 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3518 LUSTRE_OPC_ANY, hca);
3519 if (IS_ERR(op_data)) {
3521 RETURN(PTR_ERR(op_data));
3524 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3527 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3530 ll_finish_md_op_data(op_data);
3534 case LL_IOC_SET_LEASE_OLD: {
3535 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3537 RETURN(ll_file_set_lease(file, &ioc, 0));
3539 case LL_IOC_SET_LEASE: {
3540 struct ll_ioc_lease ioc;
3542 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3545 RETURN(ll_file_set_lease(file, &ioc, arg));
3547 case LL_IOC_GET_LEASE: {
3548 struct ll_inode_info *lli = ll_i2info(inode);
3549 struct ldlm_lock *lock = NULL;
3552 mutex_lock(&lli->lli_och_mutex);
3553 if (fd->fd_lease_och != NULL) {
3554 struct obd_client_handle *och = fd->fd_lease_och;
3556 lock = ldlm_handle2lock(&och->och_lease_handle);
3558 lock_res_and_lock(lock);
3559 if (!ldlm_is_cancel(lock))
3560 fmode = och->och_flags;
3562 unlock_res_and_lock(lock);
3563 LDLM_LOCK_PUT(lock);
3566 mutex_unlock(&lli->lli_och_mutex);
3568 RETURN(ll_lease_type_from_fmode(fmode));
3570 case LL_IOC_HSM_IMPORT: {
3571 struct hsm_user_import *hui;
3577 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3582 rc = ll_hsm_import(inode, file, hui);
3587 case LL_IOC_FUTIMES_3: {
3588 struct ll_futimes_3 lfu;
3590 if (copy_from_user(&lfu,
3591 (const struct ll_futimes_3 __user *)arg,
3595 RETURN(ll_file_futimes_3(file, &lfu));
3597 case LL_IOC_LADVISE: {
3598 struct llapi_ladvise_hdr *k_ladvise_hdr;
3599 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3602 int alloc_size = sizeof(*k_ladvise_hdr);
3605 u_ladvise_hdr = (void __user *)arg;
3606 OBD_ALLOC_PTR(k_ladvise_hdr);
3607 if (k_ladvise_hdr == NULL)
3610 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3611 GOTO(out_ladvise, rc = -EFAULT);
3613 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3614 k_ladvise_hdr->lah_count < 1)
3615 GOTO(out_ladvise, rc = -EINVAL);
3617 num_advise = k_ladvise_hdr->lah_count;
3618 if (num_advise >= LAH_COUNT_MAX)
3619 GOTO(out_ladvise, rc = -EFBIG);
3621 OBD_FREE_PTR(k_ladvise_hdr);
3622 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3623 lah_advise[num_advise]);
3624 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3625 if (k_ladvise_hdr == NULL)
3629 * TODO: submit multiple advices to one server in a single RPC
3631 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3632 GOTO(out_ladvise, rc = -EFAULT);
3634 for (i = 0; i < num_advise; i++) {
3635 struct llapi_lu_ladvise *k_ladvise =
3636 &k_ladvise_hdr->lah_advise[i];
3637 struct llapi_lu_ladvise __user *u_ladvise =
3638 &u_ladvise_hdr->lah_advise[i];
3640 rc = ll_ladvise_sanity(inode, k_ladvise);
3642 GOTO(out_ladvise, rc);
3644 switch (k_ladvise->lla_advice) {
3645 case LU_LADVISE_LOCKNOEXPAND:
3646 rc = ll_lock_noexpand(file,
3647 k_ladvise->lla_peradvice_flags);
3648 GOTO(out_ladvise, rc);
3649 case LU_LADVISE_LOCKAHEAD:
3651 rc = ll_file_lock_ahead(file, k_ladvise);
3654 GOTO(out_ladvise, rc);
3657 &u_ladvise->lla_lockahead_result))
3658 GOTO(out_ladvise, rc = -EFAULT);
3661 rc = ll_ladvise(inode, file,
3662 k_ladvise_hdr->lah_flags,
3665 GOTO(out_ladvise, rc);
3672 OBD_FREE(k_ladvise_hdr, alloc_size);
3675 case LL_IOC_FLR_SET_MIRROR: {
3676 /* mirror I/O must be direct to avoid polluting page cache
3678 if (!(file->f_flags & O_DIRECT))
3681 fd->fd_designated_mirror = (__u32)arg;
3684 case LL_IOC_FSGETXATTR:
3685 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3686 case LL_IOC_FSSETXATTR:
3687 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3689 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3690 case LL_IOC_HEAT_GET: {
3691 struct lu_heat uheat;
3692 struct lu_heat *heat;
3695 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3698 if (uheat.lh_count > OBD_HEAT_COUNT)
3699 uheat.lh_count = OBD_HEAT_COUNT;
3701 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3702 OBD_ALLOC(heat, size);
3706 heat->lh_count = uheat.lh_count;
3707 ll_heat_get(inode, heat);
3708 rc = copy_to_user((char __user *)arg, heat, size);
3709 OBD_FREE(heat, size);
3710 RETURN(rc ? -EFAULT : 0);
3712 case LL_IOC_HEAT_SET: {
3715 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3718 rc = ll_heat_set(inode, flags);
3722 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3723 (void __user *)arg));
3727 #ifndef HAVE_FILE_LLSEEK_SIZE
3728 static inline loff_t
3729 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3731 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3733 if (offset > maxsize)
3736 if (offset != file->f_pos) {
3737 file->f_pos = offset;
3738 file->f_version = 0;
3744 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3745 loff_t maxsize, loff_t eof)
3747 struct inode *inode = file_inode(file);
3755 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3756 * position-querying operation. Avoid rewriting the "same"
3757 * f_pos value back to the file because a concurrent read(),
3758 * write() or lseek() might have altered it
3763 * f_lock protects against read/modify/write race with other
3764 * SEEK_CURs. Note that parallel writes and reads behave
3768 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3769 inode_unlock(inode);
3773 * In the generic case the entire file is data, so as long as
3774 * offset isn't at the end of the file then the offset is data.
3781 * There is a virtual hole at the end of the file, so as long as
3782 * offset isn't i_size or larger, return i_size.
3790 return llseek_execute(file, offset, maxsize);
3794 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3796 struct inode *inode = file_inode(file);
3797 loff_t retval, eof = 0;
3800 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3801 (origin == SEEK_CUR) ? file->f_pos : 0);
3802 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3803 PFID(ll_inode2fid(inode)), inode, retval, retval,
3805 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3807 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3808 retval = ll_glimpse_size(inode);
3811 eof = i_size_read(inode);
3814 retval = ll_generic_file_llseek_size(file, offset, origin,
3815 ll_file_maxbytes(inode), eof);
3819 static int ll_flush(struct file *file, fl_owner_t id)
3821 struct inode *inode = file_inode(file);
3822 struct ll_inode_info *lli = ll_i2info(inode);
3823 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3826 LASSERT(!S_ISDIR(inode->i_mode));
3828 /* catch async errors that were recorded back when async writeback
3829 * failed for pages in this mapping. */
3830 rc = lli->lli_async_rc;
3831 lli->lli_async_rc = 0;
3832 if (lli->lli_clob != NULL) {
3833 err = lov_read_and_clear_async_rc(lli->lli_clob);
3838 /* The application has already been told about the write failure.
3839 * Do not report the failure again. */
3840 if (fd->fd_write_failed)
3842 return rc ? -EIO : 0;
3846 * Called to make sure a portion of the file has been written out.
3847 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3849 * Return how many pages have been written.
3851 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3852 enum cl_fsync_mode mode, int ignore_layout)
3856 struct cl_fsync_io *fio;
3861 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3862 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3865 env = cl_env_get(&refcheck);
3867 RETURN(PTR_ERR(env));
3869 io = vvp_env_thread_io(env);
3870 io->ci_obj = ll_i2info(inode)->lli_clob;
3871 io->ci_ignore_layout = ignore_layout;
3873 /* initialize parameters for sync */
3874 fio = &io->u.ci_fsync;
3875 fio->fi_start = start;
3877 fio->fi_fid = ll_inode2fid(inode);
3878 fio->fi_mode = mode;
3879 fio->fi_nr_written = 0;
3881 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3882 result = cl_io_loop(env, io);
3884 result = io->ci_result;
3886 result = fio->fi_nr_written;
3887 cl_io_fini(env, io);
3888 cl_env_put(env, &refcheck);
3894 * When dentry is provided (the 'else' case), file_dentry() may be
3895 * null and dentry must be used directly rather than pulled from
3896 * file_dentry() as is done otherwise.
3899 #ifdef HAVE_FILE_FSYNC_4ARGS
3900 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3902 struct dentry *dentry = file_dentry(file);
3903 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3904 int ll_fsync(struct file *file, int datasync)
3906 struct dentry *dentry = file_dentry(file);
3908 loff_t end = LLONG_MAX;
3910 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3913 loff_t end = LLONG_MAX;
3915 struct inode *inode = dentry->d_inode;
3916 struct ll_inode_info *lli = ll_i2info(inode);
3917 struct ptlrpc_request *req;
3921 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3922 PFID(ll_inode2fid(inode)), inode);
3923 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3925 #ifdef HAVE_FILE_FSYNC_4ARGS
3926 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3929 /* fsync's caller has already called _fdata{sync,write}; we want
3930 * that IO to finish before calling the osc and mdc sync methods */
3931 rc = filemap_fdatawait(inode->i_mapping);
3934 /* catch async errors that were recorded back when async writeback
3935 * failed for pages in this mapping. */
3936 if (!S_ISDIR(inode->i_mode)) {
3937 err = lli->lli_async_rc;
3938 lli->lli_async_rc = 0;
3941 if (lli->lli_clob != NULL) {
3942 err = lov_read_and_clear_async_rc(lli->lli_clob);
3948 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3952 ptlrpc_req_finished(req);
3954 if (S_ISREG(inode->i_mode)) {
3955 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3957 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3958 if (rc == 0 && err < 0)
3961 fd->fd_write_failed = true;
3963 fd->fd_write_failed = false;
3966 #ifdef HAVE_FILE_FSYNC_4ARGS
3967 inode_unlock(inode);
3973 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3975 struct inode *inode = file_inode(file);
3976 struct ll_sb_info *sbi = ll_i2sbi(inode);
3977 struct ldlm_enqueue_info einfo = {
3978 .ei_type = LDLM_FLOCK,
3979 .ei_cb_cp = ldlm_flock_completion_ast,
3980 .ei_cbdata = file_lock,
3982 struct md_op_data *op_data;
3983 struct lustre_handle lockh = { 0 };
3984 union ldlm_policy_data flock = { { 0 } };
3985 int fl_type = file_lock->fl_type;
3991 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3992 PFID(ll_inode2fid(inode)), file_lock);
3994 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3996 if (file_lock->fl_flags & FL_FLOCK) {
3997 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3998 /* flocks are whole-file locks */
3999 flock.l_flock.end = OFFSET_MAX;
4000 /* For flocks the owner is determined by the local file descriptor */
4001 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4002 } else if (file_lock->fl_flags & FL_POSIX) {
4003 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4004 flock.l_flock.start = file_lock->fl_start;
4005 flock.l_flock.end = file_lock->fl_end;
4009 flock.l_flock.pid = file_lock->fl_pid;
4011 /* Somewhat ugly workaround for svc lockd.
4012 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4013 * that the fl_owner is the same (which it always is between lockd
4014 * processes on the local node) and then compares the pid.
4015 * As such we assign the pid to the owner field to make it all work;
4016 * a conflict with normal locks is unlikely since the pid space and
4017 * the pointer space for current->files do not intersect */
4018 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4019 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4023 einfo.ei_mode = LCK_PR;
4026 /* An unlock request may or may not have any relation to
4027 * existing locks so we may not be able to pass a lock handle
4028 * via a normal ldlm_lock_cancel() request. The request may even
4029 * unlock a byte range in the middle of an existing lock. In
4030 * order to process an unlock request we need all of the same
4031 * information that is given with a normal read or write record
4032 * lock request. To avoid creating another ldlm unlock (cancel)
4033 * message we'll treat a LCK_NL flock request as an unlock. */
4034 einfo.ei_mode = LCK_NL;
4037 einfo.ei_mode = LCK_PW;
4040 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4055 flags = LDLM_FL_BLOCK_NOWAIT;
4061 flags = LDLM_FL_TEST_LOCK;
4064 CERROR("unknown fcntl lock command: %d\n", cmd);
4068 /* Save the old mode so that if the mode in the lock changes we
4069 * can decrement the appropriate reader or writer refcount. */
4070 file_lock->fl_type = einfo.ei_mode;
4072 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4073 LUSTRE_OPC_ANY, NULL);
4074 if (IS_ERR(op_data))
4075 RETURN(PTR_ERR(op_data));
4077 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4078 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4079 flock.l_flock.pid, flags, einfo.ei_mode,
4080 flock.l_flock.start, flock.l_flock.end);
4082 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4085 /* Restore the file lock type if not TEST lock. */
4086 if (!(flags & LDLM_FL_TEST_LOCK))
4087 file_lock->fl_type = fl_type;
4089 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4090 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4091 !(flags & LDLM_FL_TEST_LOCK))
4092 rc2 = locks_lock_file_wait(file, file_lock);
4094 if ((file_lock->fl_flags & FL_FLOCK) &&
4095 (rc == 0 || file_lock->fl_type == F_UNLCK))
4096 rc2 = flock_lock_file_wait(file, file_lock);
4097 if ((file_lock->fl_flags & FL_POSIX) &&
4098 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4099 !(flags & LDLM_FL_TEST_LOCK))
4100 rc2 = posix_lock_file_wait(file, file_lock);
4101 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4103 if (rc2 && file_lock->fl_type != F_UNLCK) {
4104 einfo.ei_mode = LCK_NL;
4105 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4110 ll_finish_md_op_data(op_data);
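/*
 * Usage sketch (illustration only): this handler is reached through the
 * normal VFS ->lock/->flock paths, so ordinary POSIX byte-range locks map
 * onto LDLM flock locks when the client is mounted with -o flock:
 *
 *	struct flock fl = {
 *		.l_type   = F_WRLCK,
 *		.l_whence = SEEK_SET,
 *		.l_start  = 0,
 *		.l_len    = 4096,		// lock the first 4 KiB
 *	};
 *
 *	rc = fcntl(fd, F_SETLKW, &fl);		// blocks until granted
 */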
4115 int ll_get_fid_by_name(struct inode *parent, const char *name,
4116 int namelen, struct lu_fid *fid,
4117 struct inode **inode)
4119 struct md_op_data *op_data = NULL;
4120 struct mdt_body *body;
4121 struct ptlrpc_request *req;
4125 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4126 LUSTRE_OPC_ANY, NULL);
4127 if (IS_ERR(op_data))
4128 RETURN(PTR_ERR(op_data));
4130 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4131 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4132 ll_finish_md_op_data(op_data);
4136 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4138 GOTO(out_req, rc = -EFAULT);
4140 *fid = body->mbo_fid1;
4143 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4145 ptlrpc_req_finished(req);
4149 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4152 struct dentry *dchild = NULL;
4153 struct inode *child_inode = NULL;
4154 struct md_op_data *op_data;
4155 struct ptlrpc_request *request = NULL;
4156 struct obd_client_handle *och = NULL;
4158 struct mdt_body *body;
4159 __u64 data_version = 0;
4160 size_t namelen = strlen(name);
4161 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4165 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4166 PFID(ll_inode2fid(parent)), name,
4167 lum->lum_stripe_offset, lum->lum_stripe_count);
4169 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4170 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4171 lustre_swab_lmv_user_md(lum);
4173 /* Get child FID first */
4174 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4177 dchild = d_lookup(file_dentry(file), &qstr);
4179 if (dchild->d_inode)
4180 child_inode = igrab(dchild->d_inode);
4185 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4194 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4195 OBD_CONNECT2_DIR_MIGRATE)) {
4196 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4197 ll_i2info(child_inode)->lli_lsm_md) {
4198 CERROR("%s: MDT doesn't support stripe directory "
4199 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4200 GOTO(out_iput, rc = -EOPNOTSUPP);
4205 * The lfs migrate command needs to be blocked on the client
4206 * by checking the migrate FID against the FID of the filesystem root.
4209 if (child_inode == parent->i_sb->s_root->d_inode)
4210 GOTO(out_iput, rc = -EINVAL);
4212 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4213 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4214 if (IS_ERR(op_data))
4215 GOTO(out_iput, rc = PTR_ERR(op_data));
4217 inode_lock(child_inode);
4218 op_data->op_fid3 = *ll_inode2fid(child_inode);
4219 if (!fid_is_sane(&op_data->op_fid3)) {
4220 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4221 ll_i2sbi(parent)->ll_fsname, name,
4222 PFID(&op_data->op_fid3));
4223 GOTO(out_unlock, rc = -EINVAL);
4226 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4227 op_data->op_data = lum;
4228 op_data->op_data_size = lumlen;
4231 if (S_ISREG(child_inode->i_mode)) {
4232 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4236 GOTO(out_unlock, rc);
4239 rc = ll_data_version(child_inode, &data_version,
4242 GOTO(out_close, rc);
4244 op_data->op_open_handle = och->och_open_handle;
4245 op_data->op_data_version = data_version;
4246 op_data->op_lease_handle = och->och_lease_handle;
4247 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4249 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4250 och->och_mod->mod_open_req->rq_replay = 0;
4251 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4254 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4255 name, namelen, &request);
4257 LASSERT(request != NULL);
4258 ll_update_times(request, parent);
4261 if (rc == 0 || rc == -EAGAIN) {
4262 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4263 LASSERT(body != NULL);
4265 /* If the server does release the layout lock, then we clean up
4266 * the client och here; otherwise release it in out_close: */
4267 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4268 obd_mod_put(och->och_mod);
4269 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4271 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4277 if (request != NULL) {
4278 ptlrpc_req_finished(request);
4282 /* Try again if the lease has been cancelled. */
4283 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4288 ll_lease_close(och, child_inode, NULL);
4290 clear_nlink(child_inode);
4292 inode_unlock(child_inode);
4293 ll_finish_md_op_data(op_data);
4300 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4308 * test if some locks matching bits and l_req_mode are acquired
4309 * - bits can be in different locks
4310 * - if found, clear the common lock bits in *bits
4311 * - the bits not found are kept in *bits
4313 * \param bits [IN] searched lock bits
4314 * \param l_req_mode [IN] searched lock mode
4315 * \retval boolean, true iff all bits are found
4317 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4319 struct lustre_handle lockh;
4320 union ldlm_policy_data policy;
4321 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4322 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4331 fid = &ll_i2info(inode)->lli_fid;
4332 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4333 ldlm_lockname[mode]);
4335 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4336 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4337 policy.l_inodebits.bits = *bits & (1 << i);
4338 if (policy.l_inodebits.bits == 0)
4341 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4342 &policy, mode, &lockh)) {
4343 struct ldlm_lock *lock;
4345 lock = ldlm_handle2lock(&lockh);
4348 ~(lock->l_policy_data.l_inodebits.bits);
4349 LDLM_LOCK_PUT(lock);
4351 *bits &= ~policy.l_inodebits.bits;
4358 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4359 struct lustre_handle *lockh, __u64 flags,
4360 enum ldlm_mode mode)
4362 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4367 fid = &ll_i2info(inode)->lli_fid;
4368 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4370 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4371 fid, LDLM_IBITS, &policy, mode, lockh);
4376 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4378 /* Already unlinked. Just update nlink and return success */
4379 if (rc == -ENOENT) {
4381 /* If it is a striped directory and there is a bad stripe,
4382 * let's revalidate the dentry again, instead of returning
4384 if (S_ISDIR(inode->i_mode) &&
4385 ll_i2info(inode)->lli_lsm_md != NULL)
4388 /* This path cannot be hit for regular files unless in
4389 * case of obscure races, so no need to validate
4391 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4393 } else if (rc != 0) {
4394 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4395 "%s: revalidate FID "DFID" error: rc = %d\n",
4396 ll_i2sbi(inode)->ll_fsname,
4397 PFID(ll_inode2fid(inode)), rc);
4403 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4405 struct inode *inode = dentry->d_inode;
4406 struct obd_export *exp = ll_i2mdexp(inode);
4407 struct lookup_intent oit = {
4410 struct ptlrpc_request *req = NULL;
4411 struct md_op_data *op_data;
4415 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4416 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4418 /* Call getattr by fid, so do not provide name at all. */
4419 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4420 LUSTRE_OPC_ANY, NULL);
4421 if (IS_ERR(op_data))
4422 RETURN(PTR_ERR(op_data));
4424 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4425 ll_finish_md_op_data(op_data);
4427 rc = ll_inode_revalidate_fini(inode, rc);
4431 rc = ll_revalidate_it_finish(req, &oit, dentry);
4433 ll_intent_release(&oit);
4437 /* Unlinked? Unhash dentry, so it is not picked up later by
4438 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4439 * here to preserve get_cwd functionality on 2.6.
4441 if (!dentry->d_inode->i_nlink) {
4442 ll_lock_dcache(inode);
4443 d_lustre_invalidate(dentry, 0);
4444 ll_unlock_dcache(inode);
4447 ll_lookup_finish_locks(&oit, dentry);
4449 ptlrpc_req_finished(req);
4454 static int ll_merge_md_attr(struct inode *inode)
4456 struct ll_inode_info *lli = ll_i2info(inode);
4457 struct cl_attr attr = { 0 };
4460 LASSERT(lli->lli_lsm_md != NULL);
4461 down_read(&lli->lli_lsm_sem);
4462 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4463 &attr, ll_md_blocking_ast);
4464 up_read(&lli->lli_lsm_sem);
4468 set_nlink(inode, attr.cat_nlink);
4469 inode->i_blocks = attr.cat_blocks;
4470 i_size_write(inode, attr.cat_size);
4472 ll_i2info(inode)->lli_atime = attr.cat_atime;
4473 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4474 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4479 static inline dev_t ll_compat_encode_dev(dev_t dev)
4481 /* The compat_sys_*stat*() syscalls will fail unless the
4482 * device majors and minors are both less than 256. Note that
4483 * the value returned here will be passed through
4484 * old_encode_dev() in cp_compat_stat(). And so we are not
4485 * trying to return a valid compat (u16) device number, just
4486 * one that will pass the old_valid_dev() check. */
4488 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4491 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4492 int ll_getattr(const struct path *path, struct kstat *stat,
4493 u32 request_mask, unsigned int flags)
4495 struct dentry *de = path->dentry;
4497 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4500 struct inode *inode = de->d_inode;
4501 struct ll_sb_info *sbi = ll_i2sbi(inode);
4502 struct ll_inode_info *lli = ll_i2info(inode);
4505 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4507 rc = ll_inode_revalidate(de, IT_GETATTR);
4511 if (S_ISREG(inode->i_mode)) {
4512 /* In case of restore, the MDT has the right size and has
4513 * already sent it back without granting the layout lock;
4514 * the inode is up-to-date so a glimpse is useless.
4515 * Also, to glimpse we need the layout; in case of a running
4516 * restore the MDT holds the layout lock so the glimpse will
4517 * block until the end of restore (getattr will block)
4519 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4520 rc = ll_glimpse_size(inode);
4525 /* If the object isn't a regular file then don't validate its size. */
4526 if (S_ISDIR(inode->i_mode) &&
4527 lli->lli_lsm_md != NULL) {
4528 rc = ll_merge_md_attr(inode);
4533 inode->i_atime.tv_sec = lli->lli_atime;
4534 inode->i_mtime.tv_sec = lli->lli_mtime;
4535 inode->i_ctime.tv_sec = lli->lli_ctime;
4538 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4540 if (ll_need_32bit_api(sbi)) {
4541 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4542 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4543 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4545 stat->ino = inode->i_ino;
4546 stat->dev = inode->i_sb->s_dev;
4547 stat->rdev = inode->i_rdev;
4550 stat->mode = inode->i_mode;
4551 stat->uid = inode->i_uid;
4552 stat->gid = inode->i_gid;
4553 stat->atime = inode->i_atime;
4554 stat->mtime = inode->i_mtime;
4555 stat->ctime = inode->i_ctime;
4556 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4558 stat->nlink = inode->i_nlink;
4559 stat->size = i_size_read(inode);
4560 stat->blocks = inode->i_blocks;
4565 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4566 __u64 start, __u64 len)
4570 struct fiemap *fiemap;
4571 unsigned int extent_count = fieinfo->fi_extents_max;
4573 num_bytes = sizeof(*fiemap) + (extent_count *
4574 sizeof(struct fiemap_extent));
4575 OBD_ALLOC_LARGE(fiemap, num_bytes);
4580 fiemap->fm_flags = fieinfo->fi_flags;
4581 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4582 fiemap->fm_start = start;
4583 fiemap->fm_length = len;
4584 if (extent_count > 0 &&
4585 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4586 sizeof(struct fiemap_extent)) != 0)
4587 GOTO(out, rc = -EFAULT);
4589 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4591 fieinfo->fi_flags = fiemap->fm_flags;
4592 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4593 if (extent_count > 0 &&
4594 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4595 fiemap->fm_mapped_extents *
4596 sizeof(struct fiemap_extent)) != 0)
4597 GOTO(out, rc = -EFAULT);
4599 OBD_FREE_LARGE(fiemap, num_bytes);
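/*
 * Usage sketch (illustration only): ll_fiemap() is reached through the
 * standard FS_IOC_FIEMAP ioctl, so the usual linux/fiemap.h interface
 * applies:
 *
 *	struct fiemap *fm;
 *	unsigned int n = 32;
 *
 *	fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
 *	fm->fm_length       = FIEMAP_MAX_OFFSET;	// map the whole file
 *	fm->fm_extent_count = n;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		printf("%u extents mapped\n", fm->fm_mapped_extents);
 *	free(fm);
 */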
4603 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4605 struct ll_inode_info *lli = ll_i2info(inode);
4606 struct posix_acl *acl = NULL;
4609 spin_lock(&lli->lli_lock);
4610 /* VFS' acl_permission_check->check_acl will release the refcount */
4611 acl = posix_acl_dup(lli->lli_posix_acl);
4612 spin_unlock(&lli->lli_lock);
4617 #ifdef HAVE_IOP_SET_ACL
4618 #ifdef CONFIG_FS_POSIX_ACL
4619 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4621 struct ll_sb_info *sbi = ll_i2sbi(inode);
4622 struct ptlrpc_request *req = NULL;
4623 const char *name = NULL;
4625 size_t value_size = 0;
4630 case ACL_TYPE_ACCESS:
4631 name = XATTR_NAME_POSIX_ACL_ACCESS;
4633 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4636 case ACL_TYPE_DEFAULT:
4637 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4638 if (!S_ISDIR(inode->i_mode))
4639 rc = acl ? -EACCES : 0;
4650 value_size = posix_acl_xattr_size(acl->a_count);
4651 value = kmalloc(value_size, GFP_NOFS);
4653 GOTO(out, rc = -ENOMEM);
4655 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4657 GOTO(out_value, rc);
4660 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4661 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4662 name, value, value_size, 0, 0, &req);
4664 ptlrpc_req_finished(req);
4669 forget_cached_acl(inode, type);
4671 set_cached_acl(inode, type, acl);
4674 #endif /* CONFIG_FS_POSIX_ACL */
4675 #endif /* HAVE_IOP_SET_ACL */
4677 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4679 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4680 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4682 ll_check_acl(struct inode *inode, int mask)
4685 # ifdef CONFIG_FS_POSIX_ACL
4686 struct posix_acl *acl;
4690 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4691 if (flags & IPERM_FLAG_RCU)
4694 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4699 rc = posix_acl_permission(inode, acl, mask);
4700 posix_acl_release(acl);
4703 # else /* !CONFIG_FS_POSIX_ACL */
4705 # endif /* CONFIG_FS_POSIX_ACL */
4707 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4709 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4710 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4712 # ifdef HAVE_INODE_PERMISION_2ARGS
4713 int ll_inode_permission(struct inode *inode, int mask)
4715 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4720 struct ll_sb_info *sbi;
4721 struct root_squash_info *squash;
4722 struct cred *cred = NULL;
4723 const struct cred *old_cred = NULL;
4725 bool squash_id = false;
4728 #ifdef MAY_NOT_BLOCK
4729 if (mask & MAY_NOT_BLOCK)
4731 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4732 if (flags & IPERM_FLAG_RCU)
4736 /* as the root inode is NOT validated during lookup, we
4737 * need to do it before the permission check. */
4739 if (inode == inode->i_sb->s_root->d_inode) {
4740 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4745 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4746 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4748 /* squash fsuid/fsgid if needed */
4749 sbi = ll_i2sbi(inode);
4750 squash = &sbi->ll_squash;
4751 if (unlikely(squash->rsi_uid != 0 &&
4752 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4753 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4757 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4758 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4759 squash->rsi_uid, squash->rsi_gid);
4761 /* update current process's credentials
4762 * and FS capability */
4763 cred = prepare_creds();
4767 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4768 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4769 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4770 if ((1 << cap) & CFS_CAP_FS_MASK)
4771 cap_lower(cred->cap_effective, cap);
4773 old_cred = override_creds(cred);
4776 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4777 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4778 /* restore current process's credentials and FS capability */
4780 revert_creds(old_cred);
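/*
 * Root squash is configured on the server side; the exact tunable names below
 * are an assumption and may vary by release, but are typically something like:
 *
 *	lctl conf_param <fsname>.mdt.root_squash="1000:1000"
 *	lctl conf_param <fsname>.mdt.nosquash_nids="10.0.0.2@tcp"
 *
 * With squashing active, a root process on a non-exempt client runs the
 * permission check above with the squashed fsuid/fsgid, so e.g. open(2) on a
 * root-only file fails with EACCES.
 */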
4787 /* -o localflock - only provides locally consistent flock locks */
4788 struct file_operations ll_file_operations = {
4789 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4790 # ifdef HAVE_SYNC_READ_WRITE
4791 .read = new_sync_read,
4792 .write = new_sync_write,
4793 # endif /* HAVE_SYNC_READ_WRITE */
4794 .read_iter = ll_file_read_iter,
4795 .write_iter = ll_file_write_iter,
4796 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4797 .read = ll_file_read,
4798 .aio_read = ll_file_aio_read,
4799 .write = ll_file_write,
4800 .aio_write = ll_file_aio_write,
4801 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4802 .unlocked_ioctl = ll_file_ioctl,
4803 .open = ll_file_open,
4804 .release = ll_file_release,
4805 .mmap = ll_file_mmap,
4806 .llseek = ll_file_seek,
4807 .splice_read = ll_file_splice_read,
4812 struct file_operations ll_file_operations_flock = {
4813 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4814 # ifdef HAVE_SYNC_READ_WRITE
4815 .read = new_sync_read,
4816 .write = new_sync_write,
4817 # endif /* HAVE_SYNC_READ_WRITE */
4818 .read_iter = ll_file_read_iter,
4819 .write_iter = ll_file_write_iter,
4820 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4821 .read = ll_file_read,
4822 .aio_read = ll_file_aio_read,
4823 .write = ll_file_write,
4824 .aio_write = ll_file_aio_write,
4825 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4826 .unlocked_ioctl = ll_file_ioctl,
4827 .open = ll_file_open,
4828 .release = ll_file_release,
4829 .mmap = ll_file_mmap,
4830 .llseek = ll_file_seek,
4831 .splice_read = ll_file_splice_read,
4834 .flock = ll_file_flock,
4835 .lock = ll_file_flock
4838 /* These are for -o noflock - to return ENOSYS on flock calls */
4839 struct file_operations ll_file_operations_noflock = {
4840 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4841 # ifdef HAVE_SYNC_READ_WRITE
4842 .read = new_sync_read,
4843 .write = new_sync_write,
4844 # endif /* HAVE_SYNC_READ_WRITE */
4845 .read_iter = ll_file_read_iter,
4846 .write_iter = ll_file_write_iter,
4847 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4848 .read = ll_file_read,
4849 .aio_read = ll_file_aio_read,
4850 .write = ll_file_write,
4851 .aio_write = ll_file_aio_write,
4852 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4853 .unlocked_ioctl = ll_file_ioctl,
4854 .open = ll_file_open,
4855 .release = ll_file_release,
4856 .mmap = ll_file_mmap,
4857 .llseek = ll_file_seek,
4858 .splice_read = ll_file_splice_read,
4861 .flock = ll_file_noflock,
4862 .lock = ll_file_noflock
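/*
 * With "-o noflock" the methods above fail user-space file locking with
 * ENOSYS, e.g. (sketch; the path is hypothetical):
 *
 *	int fd = open("/mnt/lustre/somefile", O_RDWR);
 *
 *	if (flock(fd, LOCK_EX) < 0 && errno == ENOSYS)
 *		fprintf(stderr, "mount with -o flock or -o localflock "
 *			"to enable flock()\n");
 */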
4865 struct inode_operations ll_file_inode_operations = {
4866 .setattr = ll_setattr,
4867 .getattr = ll_getattr,
4868 .permission = ll_inode_permission,
4869 #ifdef HAVE_IOP_XATTR
4870 .setxattr = ll_setxattr,
4871 .getxattr = ll_getxattr,
4872 .removexattr = ll_removexattr,
4874 .listxattr = ll_listxattr,
4875 .fiemap = ll_fiemap,
4876 #ifdef HAVE_IOP_GET_ACL
4877 .get_acl = ll_get_acl,
4879 #ifdef HAVE_IOP_SET_ACL
4880 .set_acl = ll_set_acl,
4884 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4886 struct ll_inode_info *lli = ll_i2info(inode);
4887 struct cl_object *obj = lli->lli_clob;
4896 env = cl_env_get(&refcheck);
4898 RETURN(PTR_ERR(env));
4900 rc = cl_conf_set(env, lli->lli_clob, conf);
4904 if (conf->coc_opc == OBJECT_CONF_SET) {
4905 struct ldlm_lock *lock = conf->coc_lock;
4906 struct cl_layout cl = {
4910 LASSERT(lock != NULL);
4911 LASSERT(ldlm_has_layout(lock));
4913 /* it can only be allowed to match after the layout has been
4914 * applied to the inode, otherwise a wrong layout would be
4915 * seen. Applying the layout should happen before dropping
4916 * the intent lock. */
4917 ldlm_lock_allow_match(lock);
4919 rc = cl_object_layout_get(env, obj, &cl);
4924 DFID": layout version change: %u -> %u\n",
4925 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4927 ll_layout_version_set(lli, cl.cl_layout_gen);
4931 cl_env_put(env, &refcheck);
4936 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4937 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4940 struct ll_sb_info *sbi = ll_i2sbi(inode);
4941 struct ptlrpc_request *req;
4948 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4949 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4950 lock->l_lvb_data, lock->l_lvb_len);
4952 if (lock->l_lvb_data != NULL)
4955 /* if the layout lock was granted right away, the layout is returned
4956 * within the DLM LVB of the reply; otherwise, if the lock was ever
4957 * blocked and then granted via a completion AST, we have to fetch the
4958 * layout here. Note that we can't use the LVB buffer in the
4959 * completion AST because it may not be large enough. */
4960 rc = ll_get_default_mdsize(sbi, &lmmsize);
4964 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4965 XATTR_NAME_LOV, lmmsize, &req);
4968 GOTO(out, rc = 0); /* empty layout */
4975 if (lmmsize == 0) /* empty layout */
4978 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4980 GOTO(out, rc = -EFAULT);
4982 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4983 if (lvbdata == NULL)
4984 GOTO(out, rc = -ENOMEM);
4986 memcpy(lvbdata, lmm, lmmsize);
4987 lock_res_and_lock(lock);
4988 if (unlikely(lock->l_lvb_data == NULL)) {
4989 lock->l_lvb_type = LVB_T_LAYOUT;
4990 lock->l_lvb_data = lvbdata;
4991 lock->l_lvb_len = lmmsize;
4994 unlock_res_and_lock(lock);
4997 OBD_FREE_LARGE(lvbdata, lmmsize);
5002 ptlrpc_req_finished(req);
5007 * Apply the layout to the inode. Layout lock is held and will be released
5010 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5011 struct inode *inode)
5013 struct ll_inode_info *lli = ll_i2info(inode);
5014 struct ll_sb_info *sbi = ll_i2sbi(inode);
5015 struct ldlm_lock *lock;
5016 struct cl_object_conf conf;
5019 bool wait_layout = false;
5022 LASSERT(lustre_handle_is_used(lockh));
5024 lock = ldlm_handle2lock(lockh);
5025 LASSERT(lock != NULL);
5026 LASSERT(ldlm_has_layout(lock));
5028 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5029 PFID(&lli->lli_fid), inode);
5031 /* in case this is a cached lock, reinstate it with the new inode */
5032 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5034 lock_res_and_lock(lock);
5035 lvb_ready = ldlm_is_lvb_ready(lock);
5036 unlock_res_and_lock(lock);
5038 /* checking lvb_ready is racy, but that is okay. The worst case is
5039 * that multiple processes configure the file at the same time. */
5043 rc = ll_layout_fetch(inode, lock);
5047 /* for layout lock, lmm is stored in lock's lvb.
5048 * lvb_data is immutable if the lock is held so it's safe to access it
5051 * set the layout on the file. This is unlikely to fail, as the old
5052 * layout has surely been eliminated. */
5053 memset(&conf, 0, sizeof conf);
5054 conf.coc_opc = OBJECT_CONF_SET;
5055 conf.coc_inode = inode;
5056 conf.coc_lock = lock;
5057 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5058 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5059 rc = ll_layout_conf(inode, &conf);
5061 /* refresh layout failed, need to wait */
5062 wait_layout = rc == -EBUSY;
5065 LDLM_LOCK_PUT(lock);
5066 ldlm_lock_decref(lockh, mode);
5068 /* wait for IO to complete if the layout is still being used. */
5070 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5071 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5073 memset(&conf, 0, sizeof conf);
5074 conf.coc_opc = OBJECT_CONF_WAIT;
5075 conf.coc_inode = inode;
5076 rc = ll_layout_conf(inode, &conf);
5080 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5081 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
5087 * Issue layout intent RPC to MDS.
5088 * \param inode [in] file inode
5089 * \param intent [in] layout intent
5091 * \retval 0 on success
5092 * \retval < 0 error code
5094 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5096 struct ll_inode_info *lli = ll_i2info(inode);
5097 struct ll_sb_info *sbi = ll_i2sbi(inode);
5098 struct md_op_data *op_data;
5099 struct lookup_intent it;
5100 struct ptlrpc_request *req;
5104 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5105 0, 0, LUSTRE_OPC_ANY, NULL);
5106 if (IS_ERR(op_data))
5107 RETURN(PTR_ERR(op_data));
5109 op_data->op_data = intent;
5110 op_data->op_data_size = sizeof(*intent);
5112 memset(&it, 0, sizeof(it));
5113 it.it_op = IT_LAYOUT;
5114 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5115 intent->li_opc == LAYOUT_INTENT_TRUNC)
5116 it.it_flags = FMODE_WRITE;
5118 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5119 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5121 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5122 &ll_md_blocking_ast, 0);
5123 if (it.it_request != NULL)
5124 ptlrpc_req_finished(it.it_request);
5125 it.it_request = NULL;
5127 ll_finish_md_op_data(op_data);
5129 /* set lock data in case this is a new lock */
5131 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5133 ll_intent_drop_lock(&it);
5139 * This function checks whether a LAYOUT lock exists on the client side,
5140 * and enqueues one if it is not already cached.
5142 * This function does not hold the layout lock, so it may be revoked at any time
5143 * after this function returns. Any operation that depends on the layout should be redone
5146 * This function should be called before lov_io_init() to get an up-to-date
5147 * layout version; the caller should save the version number, and after the IO
5148 * is finished call this function again to verify that the layout
5149 * has not changed during the IO.
5151 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5153 struct ll_inode_info *lli = ll_i2info(inode);
5154 struct ll_sb_info *sbi = ll_i2sbi(inode);
5155 struct lustre_handle lockh;
5156 struct layout_intent intent = {
5157 .li_opc = LAYOUT_INTENT_ACCESS,
5159 enum ldlm_mode mode;
5163 *gen = ll_layout_version_get(lli);
5164 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5168 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5169 LASSERT(S_ISREG(inode->i_mode));
5171 /* take layout lock mutex to enqueue layout lock exclusively. */
5172 mutex_lock(&lli->lli_layout_mutex);
5175 /* the layout lock is usually cached on the local side, so try to
5176 * match it before grabbing the layout lock mutex. */
5177 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5178 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5179 if (mode != 0) { /* hit cached lock */
5180 rc = ll_layout_lock_set(&lockh, mode, inode);
5186 rc = ll_layout_intent(inode, &intent);
5192 *gen = ll_layout_version_get(lli);
5193 mutex_unlock(&lli->lli_layout_mutex);
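/*
 * Sketch of the calling convention described above (a caller verifying that
 * the layout generation did not change across its IO):
 *
 *	__u32 gen_before, gen_after;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	... submit IO against the layout tagged with gen_before ...
 *	rc = ll_layout_refresh(inode, &gen_after);
 *	if (rc == 0 && gen_before != gen_after)
 *		... the layout changed while the IO was in flight; redo it ...
 */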
5199 * Issue layout intent RPC indicating where in a file an IO is about to write.
5201 * \param[in] inode file inode.
5202 * \param[in] ext write range with start offset of the file in bytes where
5203 * an IO is about to write, and exclusive end offset in
5206 * \retval 0 on success
5207 * \retval < 0 error code
5209 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5210 struct lu_extent *ext)
5212 struct layout_intent intent = {
5214 .li_extent.e_start = ext->e_start,
5215 .li_extent.e_end = ext->e_end,
5220 rc = ll_layout_intent(inode, &intent);
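/*
 * A caller about to write the byte range [pos, pos + count) would typically
 * build the extent as below ("pos" and "count" are illustrative):
 *
 *	struct lu_extent ext = {
 *		.e_start = pos,
 *		.e_end	 = pos + count,
 *	};
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */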
5226 * This function sends a restore request to the MDT
5228 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5230 struct hsm_user_request *hur;
5234 len = sizeof(struct hsm_user_request) +
5235 sizeof(struct hsm_user_item);
5236 OBD_ALLOC(hur, len);
5240 hur->hur_request.hr_action = HUA_RESTORE;
5241 hur->hur_request.hr_archive_id = 0;
5242 hur->hur_request.hr_flags = 0;
5243 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5244 sizeof(hur->hur_user_item[0].hui_fid));
5245 hur->hur_user_item[0].hui_extent.offset = offset;
5246 hur->hur_user_item[0].hui_extent.length = length;
5247 hur->hur_request.hr_itemcount = 1;
5248 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,