1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
38 * Author: Peter Braam <braam@clusterfs.com>
39 * Author: Phil Schwan <phil@clusterfs.com>
40 * Author: Andreas Dilger <adilger@clusterfs.com>
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
52 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * NOTE(review): the allocation can fail; callers must check for NULL. */
53 struct ll_file_data *ll_file_data_get(void)
55 struct ll_file_data *fd;
57 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Return a ll_file_data obtained from ll_file_data_get() to the slab. */
61 static void ll_file_data_put(struct ll_file_data *fd)
64 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/* Pack the inode attributes required for an MDS close/DONE_WRITING RPC
 * into @op_data: fid, mode, a/m/ctime, size, blocks, flags, I/O epoch,
 * the open handle @fh, and an MDS capability reference. */
67 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
68 struct lustre_handle *fh)
70 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
71 op_data->op_attr.ia_mode = inode->i_mode;
72 op_data->op_attr.ia_atime = inode->i_atime;
73 op_data->op_attr.ia_mtime = inode->i_mtime;
74 op_data->op_attr.ia_ctime = inode->i_ctime;
75 op_data->op_attr.ia_size = i_size_read(inode);
76 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr; cast to reach it */
77 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
78 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
79 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* takes a capa reference; presumably released by the close path — verify */
80 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for closing open handle @och on @inode.
 * Size/blocks are included in the setattr only when Size-on-MDS (SOM) is
 * not in effect for a regular file; otherwise the epoch-close machinery
 * is responsible for propagating size. */
83 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
84 struct obd_client_handle *och)
88 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
89 ATTR_MTIME_SET | ATTR_CTIME_SET;
91 if (!(och->och_flags & FMODE_WRITE))
94 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
95 !S_ISREG(inode->i_mode))
96 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* note: &och — ll_epoch_close() may manipulate the handle pointer itself */
98 ll_epoch_close(inode, op_data, &och, 0);
101 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Send an MDS close for open handle @och and clean up its replay state.
 * On an epoch close under SOM the MDS may ask us to fetch size from the
 * OSTs and send it back via ll_sizeonmds_update(); otherwise a pending
 * DONE_WRITING is queued for regular files opened for write. */
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107 struct obd_client_handle *och)
109 struct obd_export *exp = ll_i2mdexp(inode);
110 struct md_op_data *op_data;
111 struct ptlrpc_request *req = NULL;
112 struct obd_device *obd = class_exp2obd(exp);
119 * XXX: in case of LMV, is this correct to access
122 CERROR("Invalid MDC connection handle "LPX64"\n",
123 ll_i2mdexp(inode)->exp_handle.h_cookie);
128 * here we check if this is forced umount. If so this is called on
129 * canceling "open lock" and we do not call md_close() in this case, as
130 * it will not be successful, as import is already deactivated.
135 OBD_ALLOC_PTR(op_data);
137 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139 ll_prepare_close(inode, op_data, och);
140 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141 rc = md_close(md_exp, op_data, och->och_mod, &req);
146 /* This close must have the epoch closed. */
147 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
148 LASSERT(epoch_close);
149 /* MDS has instructed us to obtain Size-on-MDS attribute from
150 * OSTs and send setattr to back to MDS. */
151 rc = ll_sizeonmds_update(inode, och->och_mod,
152 &och->och_fh, op_data->op_ioepoch);
154 CERROR("inode %lu mdc Size-on-MDS update failed: "
155 "rc = %d\n", inode->i_ino, rc);
159 CERROR("inode %lu mdc close failed: rc = %d\n",
162 ll_finish_md_op_data(op_data);
165 rc = ll_objects_destroy(req, inode);
167 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM write close that did not close the epoch: defer via DONE_WRITING */
174 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
175 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
176 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
179 ptlrpc_close_replay_seq(req);
180 md_clear_open_replay_data(md_exp, och);
181 /* Free @och if it is not waiting for DONE_WRITING. */
182 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
185 if (req) /* This is close request */
186 ptlrpc_req_finished(req);
/* Close the per-mode (write/exec/read) cached MDS open handle on @inode
 * if no other local users remain. @flags selects which handle via
 * FMODE_WRITE/FMODE_EXEC/FMODE_READ. Serialized by lli_och_sem. */
190 int ll_md_real_close(struct inode *inode, int flags)
192 struct ll_inode_info *lli = ll_i2info(inode);
193 struct obd_client_handle **och_p;
194 struct obd_client_handle *och;
199 if (flags & FMODE_WRITE) {
200 och_p = &lli->lli_mds_write_och;
201 och_usecount = &lli->lli_open_fd_write_count;
202 } else if (flags & FMODE_EXEC) {
203 och_p = &lli->lli_mds_exec_och;
204 och_usecount = &lli->lli_open_fd_exec_count;
206 LASSERT(flags & FMODE_READ);
207 och_p = &lli->lli_mds_read_och;
208 och_usecount = &lli->lli_open_fd_read_count;
211 down(&lli->lli_och_sem);
212 if (*och_usecount) { /* There are still users of this handle, so
214 up(&lli->lli_och_sem);
219 up(&lli->lli_och_sem);
221 if (och) { /* There might be a race and somebody have freed this och
223 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Per-file-descriptor close: drop any group lock, decrement the per-mode
 * open count under lli_och_sem, and only talk to the MDS (via
 * ll_md_real_close()) when no cached OPEN DLM lock still covers the
 * handle. Frees the ll_file_data attached to @file. */
230 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
233 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
234 struct ll_inode_info *lli = ll_i2info(inode);
238 /* clear group lock, if present */
239 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
240 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
241 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
242 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
246 /* Let's see if we have good enough OPEN lock on the file and if
247 we can skip talking to MDS */
248 if (file->f_dentry->d_inode) { /* Can this ever be false? */
250 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
251 struct lustre_handle lockh;
252 struct inode *inode = file->f_dentry->d_inode;
253 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
255 down(&lli->lli_och_sem);
256 if (fd->fd_omode & FMODE_WRITE) {
258 LASSERT(lli->lli_open_fd_write_count);
259 lli->lli_open_fd_write_count--;
260 } else if (fd->fd_omode & FMODE_EXEC) {
262 LASSERT(lli->lli_open_fd_exec_count);
263 lli->lli_open_fd_exec_count--;
266 LASSERT(lli->lli_open_fd_read_count);
267 lli->lli_open_fd_read_count--;
269 up(&lli->lli_och_sem);
/* LDLM_FL_TEST_LOCK: only probe for a matching OPEN lock, don't take a ref */
271 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
272 LDLM_IBITS, &policy, lockmode,
274 rc = ll_md_real_close(file->f_dentry->d_inode,
278 CERROR("Releasing a file %p with negative dentry %p. Name %s",
279 file, file->f_dentry, file->f_dentry->d_name.name);
282 LUSTRE_FPRIVATE(file) = NULL;
283 ll_file_data_put(fd);
284 ll_capa_close(inode);
289 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
291 /* While this returns an error code, fput() the caller does not, so we need
292 * to make every effort to clean up all of our state here. Also, applications
293 * rarely check close errors and even if an error is returned they will not
294 * re-try the close call.
/* VFS ->release() for Lustre files. Tears down remote-ACL state for the
 * root inode, stops statahead if this fd owned it, clears any async
 * write errors recorded on the stripes, and finally closes the MDS
 * handle via ll_md_close(). */
296 int ll_file_release(struct inode *inode, struct file *file)
298 struct ll_file_data *fd;
299 struct ll_sb_info *sbi = ll_i2sbi(inode);
300 struct ll_inode_info *lli = ll_i2info(inode);
301 struct lov_stripe_md *lsm = lli->lli_smd;
305 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
306 inode->i_generation, inode);
308 #ifdef CONFIG_FS_POSIX_ACL
309 if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
310 inode == inode->i_sb->s_root->d_inode) {
311 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
314 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
315 fd->fd_flags &= ~LL_FILE_RMTACL;
316 rct_del(&sbi->ll_rct, cfs_curproc_pid());
317 et_search_free(&sbi->ll_et, cfs_curproc_pid());
322 if (inode->i_sb->s_root != file->f_dentry)
323 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
324 fd = LUSTRE_FPRIVATE(file);
327 /* The last ref on @file, maybe not the the owner pid of statahead.
328 * Different processes can open the same dir, "ll_opendir_key" means:
329 * it is me that should stop the statahead thread. */
330 if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
331 ll_stop_statahead(inode, fd);
/* root dentry never had an MDS open handle; just drop the fd */
333 if (inode->i_sb->s_root == file->f_dentry) {
334 LUSTRE_FPRIVATE(file) = NULL;
335 ll_file_data_put(fd);
340 lov_test_and_clear_async_rc(lsm);
341 lli->lli_async_rc = 0;
343 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an IT_OPEN intent RPC to the MDS for @file, optionally passing
 * striping info (@lmm/@lmmsize) when this open is really a setstripe.
 * On success stores the granted lock on the inode and refreshes the
 * inode from the reply. */
347 static int ll_intent_file_open(struct file *file, void *lmm,
348 int lmmsize, struct lookup_intent *itp)
350 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
351 struct dentry *parent = file->f_dentry->d_parent;
352 const char *name = file->f_dentry->d_name.name;
353 const int len = file->f_dentry->d_name.len;
354 struct md_op_data *op_data;
355 struct ptlrpc_request *req;
362 /* Usually we come here only for NFSD, and we want open lock.
363 But we can also get here with pre 2.6.15 patchless kernels, and in
364 that case that lock is also ok */
365 /* We can also get here if there was cached open handle in revalidate_it
366 * but it disappeared while we were getting from there to ll_file_open.
367 * But this means this file was closed and immediatelly opened which
368 * makes a good candidate for using OPEN lock */
369 /* If lmmsize & lmm are not 0, we are just setting stripe info
370 * parameters. No need for the open lock */
371 if (!lmm && !lmmsize)
372 itp->it_flags |= MDS_OPEN_LOCK;
374 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
375 file->f_dentry->d_inode, name, len,
376 O_RDWR, LUSTRE_OPC_ANY, NULL);
378 RETURN(PTR_ERR(op_data));
380 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
381 0 /*unused */, &req, ll_md_blocking_ast, 0);
382 ll_finish_md_op_data(op_data);
384 /* reason for keep own exit path - don`t flood log
385 * with messages with -ESTALE errors.
/* server granted the open but the intent carries an error: release it */
387 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
388 it_open_error(DISP_OPEN_OPEN, itp))
390 ll_release_openhandle(file->f_dentry, itp);
394 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
395 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
396 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
400 if (itp->d.lustre.it_lock_mode)
401 md_set_lock_data(sbi->ll_md_exp,
402 &itp->d.lustre.it_lock_handle,
403 file->f_dentry->d_inode);
405 rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
407 ptlrpc_req_finished(itp->d.lustre.it_data);
408 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
409 ll_intent_drop_lock(itp);
/* Populate @och from the MDT reply carried by intent @it: copy the open
 * file handle and fid, record the open flags and I/O epoch, then
 * register the request for open replay. Returns md_set_open_replay_data()
 * result. */
414 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
415 struct lookup_intent *it, struct obd_client_handle *och)
417 struct ptlrpc_request *req = it->d.lustre.it_data;
418 struct mdt_body *body;
422 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
423 LASSERT(body != NULL); /* reply already checked out */
425 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
426 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
427 och->och_fid = lli->lli_fid;
428 och->och_flags = it->it_flags;
429 lli->lli_ioepoch = body->ioepoch;
431 return md_set_open_replay_data(md_exp, och, req);
/* Finish the client-local part of an open: fill @och from the intent (if
 * supplied), attach @fd as the file's private data, and initialize
 * readahead state. Asserts the file had no private data yet. */
434 int ll_local_open(struct file *file, struct lookup_intent *it,
435 struct ll_file_data *fd, struct obd_client_handle *och)
437 struct inode *inode = file->f_dentry->d_inode;
438 struct ll_inode_info *lli = ll_i2info(inode);
441 LASSERT(!LUSTRE_FPRIVATE(file));
446 struct ptlrpc_request *req = it->d.lustre.it_data;
447 struct mdt_body *body;
450 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
454 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
455 if ((it->it_flags & FMODE_WRITE) &&
456 (body->valid & OBD_MD_FLSIZE))
457 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
458 lli->lli_ioepoch, PFID(&lli->lli_fid));
461 LUSTRE_FPRIVATE(file) = fd;
462 ll_readahead_init(inode, &fd->fd_ras);
/* remember open mode for the per-mode usecount bookkeeping at close */
463 fd->fd_omode = it->it_flags;
467 /* Open a file, and (for the very first open) create objects on the OSTs at
468 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
469 * creation or open until ll_lov_setstripe() ioctl is called. We grab
470 * lli_open_sem to ensure no other process will create objects, send the
471 * stripe MD to the MDS, or try to destroy the objects if that fails.
473 * If we already have the stripe MD locally then we don't request it in
474 * md_open(), by passing a lmm_size = 0.
476 * It is up to the application to ensure no other processes open this file
477 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
478 * used. We might be able to avoid races of that sort by getting lli_open_sem
479 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
480 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() for Lustre files. Sets up statahead ownership for
 * directories, reuses a cached per-mode MDS open handle when one exists
 * (under lli_och_sem), or issues a fresh IT_OPEN intent via
 * ll_intent_file_open() and stores the resulting handle for reuse. */
482 int ll_file_open(struct inode *inode, struct file *file)
484 struct ll_inode_info *lli = ll_i2info(inode);
485 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
486 .it_flags = file->f_flags };
487 struct lov_stripe_md *lsm;
488 struct ptlrpc_request *req = NULL;
489 struct obd_client_handle **och_p;
491 struct ll_file_data *fd;
492 int rc = 0, opendir_set = 0;
495 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
496 inode->i_generation, inode, file->f_flags);
498 #ifdef HAVE_VFS_INTENT_PATCHES
501 it = file->private_data; /* XXX: compat macro */
502 file->private_data = NULL; /* prevent ll_local_open assertion */
505 fd = ll_file_data_get();
/* directory open: claim or reset statahead ownership for this fd */
509 if (S_ISDIR(inode->i_mode)) {
511 spin_lock(&lli->lli_lock);
512 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
513 LASSERT(lli->lli_sai == NULL);
514 lli->lli_opendir_key = fd;
515 lli->lli_opendir_pid = cfs_curproc_pid();
517 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
518 lli->lli_opendir_key != NULL)) {
519 /* Two cases for this:
520 * (1) The same process open such directory many times.
521 * (2) The old process opened the directory, and exited
522 * before its children processes. Then new process
523 * with the same pid opens such directory before the
524 * old process's children processes exit.
525 * reset stat ahead for such cases. */
526 spin_unlock(&lli->lli_lock);
527 CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
528 " reset it.\n", file->f_dentry->d_name.len,
529 file->f_dentry->d_name.name,
530 PFID(&lli->lli_fid));
531 ll_stop_statahead(inode, lli->lli_opendir_key);
534 spin_unlock(&lli->lli_lock);
/* root dentry: no MDS open handle needed, just attach the fd */
537 if (inode->i_sb->s_root == file->f_dentry) {
538 LUSTRE_FPRIVATE(file) = fd;
542 if (!it || !it->d.lustre.it_disposition) {
543 /* Convert f_flags into access mode. We cannot use file->f_mode,
544 * because everything but O_ACCMODE mask was stripped from
546 if ((oit.it_flags + 1) & O_ACCMODE)
548 if (file->f_flags & O_TRUNC)
549 oit.it_flags |= FMODE_WRITE;
551 /* kernel only call f_op->open in dentry_open. filp_open calls
552 * dentry_open after call to open_namei that checks permissions.
553 * Only nfsd_open call dentry_open directly without checking
554 * permissions and because of that this code below is safe. */
555 if (oit.it_flags & FMODE_WRITE)
556 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
558 /* We do not want O_EXCL here, presumably we opened the file
559 * already? XXX - NFS implications? */
560 oit.it_flags &= ~O_EXCL;
566 /* Let's see if we have file open on MDS already. */
567 if (it->it_flags & FMODE_WRITE) {
568 och_p = &lli->lli_mds_write_och;
569 och_usecount = &lli->lli_open_fd_write_count;
570 } else if (it->it_flags & FMODE_EXEC) {
571 och_p = &lli->lli_mds_exec_och;
572 och_usecount = &lli->lli_open_fd_exec_count;
574 och_p = &lli->lli_mds_read_och;
575 och_usecount = &lli->lli_open_fd_read_count;
578 down(&lli->lli_och_sem);
579 if (*och_p) { /* Open handle is present */
580 if (it_disposition(it, DISP_OPEN_OPEN)) {
581 /* Well, there's extra open request that we do not need,
582 let's close it somehow. This will decref request. */
583 rc = it_open_error(DISP_OPEN_OPEN, it);
585 up(&lli->lli_och_sem);
586 ll_file_data_put(fd);
587 GOTO(out_openerr, rc);
589 ll_release_openhandle(file->f_dentry, it);
590 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
595 rc = ll_local_open(file, it, fd, NULL);
598 up(&lli->lli_och_sem);
599 ll_file_data_put(fd);
600 GOTO(out_openerr, rc);
603 LASSERT(*och_usecount == 0);
604 if (!it->d.lustre.it_disposition) {
605 /* We cannot just request lock handle now, new ELC code
606 means that one of other OPEN locks for this file
607 could be cancelled, and since blocking ast handler
608 would attempt to grab och_sem as well, that would
609 result in a deadlock */
610 up(&lli->lli_och_sem);
611 it->it_flags |= O_CHECK_STALE;
612 rc = ll_intent_file_open(file, NULL, 0, it);
613 it->it_flags &= ~O_CHECK_STALE;
615 ll_file_data_put(fd);
616 GOTO(out_openerr, rc);
619 /* Got some error? Release the request */
620 if (it->d.lustre.it_status < 0) {
621 req = it->d.lustre.it_data;
622 ptlrpc_req_finished(req);
624 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
625 &it->d.lustre.it_lock_handle,
626 file->f_dentry->d_inode);
629 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
631 ll_file_data_put(fd);
632 GOTO(out_och_free, rc = -ENOMEM);
635 req = it->d.lustre.it_data;
637 /* md_intent_lock() didn't get a request ref if there was an
638 * open error, so don't do cleanup on the request here
640 /* XXX (green): Should not we bail out on any error here, not
641 * just open error? */
642 rc = it_open_error(DISP_OPEN_OPEN, it);
644 ll_file_data_put(fd);
645 GOTO(out_och_free, rc);
648 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
649 rc = ll_local_open(file, it, fd, *och_p);
651 ll_file_data_put(fd);
652 GOTO(out_och_free, rc);
655 up(&lli->lli_och_sem);
657 /* Must do this outside lli_och_sem lock to prevent deadlock where
658 different kind of OPEN lock for this same inode gets cancelled
659 by ldlm_cancel_lru */
660 if (!S_ISREG(inode->i_mode))
/* OST objects may be created lazily; skip when delayed or read-only */
667 if (file->f_flags & O_LOV_DELAY_CREATE ||
668 !(file->f_mode & FMODE_WRITE)) {
669 CDEBUG(D_INODE, "object creation was delayed\n");
673 file->f_flags &= ~O_LOV_DELAY_CREATE;
676 ptlrpc_req_finished(req);
678 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
682 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
683 *och_p = NULL; /* OBD_FREE writes some magic there */
686 up(&lli->lli_och_sem);
688 if (opendir_set != 0)
689 ll_stop_statahead(inode, fd);
695 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Fetch OST attributes (size, blocks, times) for @inode's stripes via an
 * async getattr against the data export, then refresh the inode with the
 * fields the OSTs actually returned. Requires the inode to have a stripe
 * MD (lli_smd != NULL). */
696 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
698 struct ptlrpc_request_set *set;
699 struct ll_inode_info *lli = ll_i2info(inode);
700 struct lov_stripe_md *lsm = lli->lli_smd;
702 struct obd_info oinfo = { { { 0 } } };
706 LASSERT(lsm != NULL);
710 oinfo.oi_oa->o_id = lsm->lsm_object_id;
711 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
712 oinfo.oi_oa->o_mode = S_IFREG;
713 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
714 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
715 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
716 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
718 oinfo.oi_capa = ll_mdscapa_get(inode);
720 set = ptlrpc_prep_set();
722 CERROR("can't allocate ptlrpc set\n");
725 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
727 rc = ptlrpc_set_wait(set);
728 ptlrpc_set_destroy(set);
730 capa_put(oinfo.oi_capa);
/* trust only the fields the OST reply is allowed to carry */
734 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
735 OBD_MD_FLATIME | OBD_MD_FLMTIME |
736 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
738 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
739 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
740 lli->lli_smd->lsm_object_id, i_size_read(inode),
741 (unsigned long long)inode->i_blocks,
742 (unsigned long)ll_inode_blksize(inode));
/* Map an extent DLM @lock back to the stripe index it covers within
 * @inode's LOV layout, via obd_get_info(KEY_LOCK_TO_STRIPE), and verify
 * the lock's resource really matches that stripe's object.
 * Returns the stripe index, or -ELDLM_NO_LOCK_DATA on mismatch. */
746 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
748 struct ll_inode_info *lli = ll_i2info(inode);
749 struct lov_stripe_md *lsm = lli->lli_smd;
750 struct obd_export *exp = ll_i2dtexp(inode);
753 struct ldlm_lock *lock;
754 } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
755 __u32 stripe, vallen = sizeof(stripe);
756 struct lov_oinfo *loinfo;
/* single-stripe files trivially map to stripe 0 */
760 if (lsm->lsm_stripe_count == 1)
761 GOTO(check, stripe = 0);
763 /* get our offset in the lov */
764 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
766 CERROR("obd_get_info: rc = %d\n", rc);
769 LASSERT(stripe < lsm->lsm_stripe_count);
772 loinfo = lsm->lsm_oinfo[stripe];
773 if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
774 &lock->l_resource->lr_name)){
775 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
776 loinfo->loi_id, loinfo->loi_gr);
777 RETURN(-ELDLM_NO_LOCK_DATA);
783 /* Get extra page reference to ensure it is not going away */
/* Extent-lock pin callback: take an extra page-cache reference on the
 * page passed as @data so it cannot be freed while the lock code works
 * on it (released later in ll_page_removal_cb()). */
784 void ll_pin_extent_cb(void *data)
786 struct page *page = data;
788 page_cache_get(page);
793 /* Flush the page from page cache for an extent as its canceled.
794 * Page to remove is delivered as @data.
796 * No one can dirty the extent until we've finished our work and they cannot
797 * enqueue another lock. The DLM protects us from ll_file_read/write here,
798 * but other kernel actors could have pages locked.
800 * If @discard is set, there is no need to write the page if it is dirty.
802 * Called with the DLM lock held. */
/* Evict one page (@data) from the page cache because its covering extent
 * lock is being cancelled: tear down mmaps over it, write it back unless
 * @discard, record write errors on the mapping, then truncate it out and
 * drop the reference taken in ll_pin_extent_cb(). */
803 int ll_page_removal_cb(void *data, int discard)
806 struct page *page = data;
807 struct address_space *mapping;
811 /* We have page reference already from ll_pin_page */
814 /* Already truncated by somebody */
817 mapping = page->mapping;
819 ll_teardown_mmaps(mapping,
820 (__u64)page->index << PAGE_CACHE_SHIFT,
821 ((__u64)page->index<<PAGE_CACHE_SHIFT)|
823 LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
825 if (!discard && clear_page_dirty_for_io(page)) {
826 LASSERT(page->mapping);
827 rc = ll_call_writepage(page->mapping->host, page);
828 /* either waiting for io to complete or reacquiring
829 * the lock that the failed writepage released */
831 wait_on_page_writeback(page);
833 CERROR("writepage inode %lu(%p) of page %p "
834 "failed: %d\n", mapping->host->i_ino,
835 mapping->host, page, rc);
/* propagate the writeback failure to a later fsync/close caller */
837 set_bit(AS_ENOSPC, &mapping->flags);
839 set_bit(AS_EIO, &mapping->flags);
841 set_bit(AS_EIO, &mapping->flags);
843 if (page->mapping != NULL) {
844 struct ll_async_page *llap = llap_cast_private(page);
845 /* checking again to account for writeback's lock_page() */
846 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
848 ll_ra_accounting(llap, page->mapping);
849 ll_truncate_complete_page(page);
853 LASSERT(!PageWriteback(page));
/* drop the pin taken in ll_pin_extent_cb() */
855 page_cache_release(page);
/* Blocking/cancel callback for a client extent lock: when the lock is
 * cancelled, shrink the known-minimum-size (KMS) of the affected stripe
 * under the stripe and resource locks, and kick DONE_WRITING handling. */
860 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
861 void *data, int flag)
864 struct ll_inode_info *lli;
865 struct lov_stripe_md *lsm;
/* small non-NULL values indicate a corrupted cbdata pointer */
871 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
872 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
876 inode = ll_inode_from_lock(lock);
879 lli = ll_i2info(inode);
882 if (lli->lli_smd == NULL)
886 stripe = ll_lock_to_stripe_offset(inode, lock);
890 lov_stripe_lock(lsm);
891 lock_res_and_lock(lock);
892 kms = ldlm_extent_shift_kms(lock,
893 lsm->lsm_oinfo[stripe]->loi_kms);
895 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
896 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
897 lsm->lsm_oinfo[stripe]->loi_kms, kms);
898 lsm->lsm_oinfo[stripe]->loi_kms = kms;
899 unlock_res_and_lock(lock);
900 lov_stripe_unlock(lsm);
901 ll_queue_done_writing(inode, 0);
/* Completion AST for client-side async extent lock enqueues: on grant,
 * update the stripe's RSS/KMS from the lock value block (LVB), wake any
 * waiters, and drop the enqueue's PR reference.
 * NOTE(review): lsm_oinfo is indexed with '.' here but '->' elsewhere in
 * this file — likely from a different tree vintage; confirm against the
 * lov_stripe_md definition in use. */
910 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
912 /* XXX ALLOCATE - 160 bytes */
913 struct inode *inode = ll_inode_from_lock(lock);
914 struct ll_inode_info *lli = ll_i2info(inode);
915 struct lustre_handle lockh = { 0 };
920 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
921 LDLM_FL_BLOCK_CONV)) {
922 LBUG(); /* not expecting any blocked async locks yet */
923 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
925 ldlm_lock_dump(D_OTHER, lock, 0);
926 ldlm_reprocess_all(lock->l_resource);
930 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
932 stripe = ll_lock_to_stripe_offset(inode, lock);
936 if (lock->l_lvb_len) {
937 struct lov_stripe_md *lsm = lli->lli_smd;
939 lvb = lock->l_lvb_data;
940 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
942 lock_res_and_lock(lock);
943 ll_inode_size_lock(inode, 1);
944 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
945 kms = ldlm_extent_shift_kms(NULL, kms);
946 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
947 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
948 lsm->lsm_oinfo[stripe].loi_kms, kms);
949 lsm->lsm_oinfo[stripe].loi_kms = kms;
950 ll_inode_size_unlock(inode, 1);
951 unlock_res_and_lock(lock);
956 wake_up(&lock->l_waitq);
958 ldlm_lock2handle(lock, &lockh);
959 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: a remote client wants our view of the file size. Find the
 * stripe this lock covers and reply with an LVB carrying that stripe's
 * KMS plus the inode times. -ELDLM_NO_LOCK_DATA cases are normal races
 * and get an empty reply rather than console noise. */
964 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
966 struct ptlrpc_request *req = reqp;
967 struct inode *inode = ll_inode_from_lock(lock);
968 struct ll_inode_info *lli;
969 struct lov_stripe_md *lsm;
975 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
976 lli = ll_i2info(inode);
978 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
981 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
983 /* First, find out which stripe index this lock corresponds to. */
984 stripe = ll_lock_to_stripe_offset(inode, lock);
986 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
988 req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
989 req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
991 rc = req_capsule_server_pack(&req->rq_pill);
993 CERROR("lustre_pack_reply: %d\n", rc);
997 lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
998 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
999 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1000 lvb->lvb_atime = LTIME_S(inode->i_atime);
1001 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1003 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1004 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1005 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1006 lvb->lvb_atime, lvb->lvb_ctime);
1011 /* These errors are normal races, so we don't want to fill the console
1012 * with messages by calling ptlrpc_error() */
1013 if (rc == -ELDLM_NO_LOCK_DATA)
1014 lustre_pack_reply(req, 1, NULL, NULL);
1016 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into the inode under the size
 * lock: update i_size, i_blocks, and the a/m/ctime from the combined LVB
 * produced by obd_merge_lvb(). */
1020 static int ll_merge_lvb(struct inode *inode)
1022 struct ll_inode_info *lli = ll_i2info(inode);
1023 struct ll_sb_info *sbi = ll_i2sbi(inode);
1029 ll_inode_size_lock(inode, 1);
1030 inode_init_lvb(inode, &lvb);
1031 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1032 i_size_write(inode, lvb.lvb_size);
1033 inode->i_blocks = lvb.lvb_blocks;
1035 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1036 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1037 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1038 ll_inode_size_unlock(inode, 1);
/* Try to refresh the inode size purely from locally cached PR extent
 * locks: match an existing [0, EOF] lock without a new enqueue, merge
 * the LVBs, and drop the match reference. */
1043 int ll_local_size(struct inode *inode)
1045 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1046 struct ll_inode_info *lli = ll_i2info(inode);
1047 struct ll_sb_info *sbi = ll_i2sbi(inode);
1048 struct lustre_handle lockh = { 0 };
1053 if (lli->lli_smd->lsm_stripe_count == 0)
1056 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1057 &policy, LCK_PR, &flags, inode, &lockh);
1063 rc = ll_merge_lvb(inode);
1064 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/* Glimpse the size/times of an arbitrary @lsm (not necessarily attached
 * to an inode) on behalf of an ioctl: enqueue an intent-only PR extent
 * lock over [*, EOF], then merge the stripe LVBs into the caller's stat
 * buffer under the stripe lock. */
1068 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1071 struct lustre_handle lockh = { 0 };
1072 struct ldlm_enqueue_info einfo = { 0 };
1073 struct obd_info oinfo = { { { 0 } } };
1079 einfo.ei_type = LDLM_EXTENT;
1080 einfo.ei_mode = LCK_PR;
1081 einfo.ei_cb_bl = osc_extent_blocking_cb;
1082 einfo.ei_cb_cp = ldlm_completion_ast;
1083 einfo.ei_cb_gl = ll_glimpse_callback;
1084 einfo.ei_cbdata = NULL;
1086 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1087 oinfo.oi_lockh = &lockh;
/* LDLM_FL_HAS_INTENT: glimpse only, no conflicting locks are revoked */
1089 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1091 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1095 CERROR("obd_enqueue returned rc %d, "
1096 "returning -EIO\n", rc);
1097 RETURN(rc > 0 ? -EIO : rc);
1100 lov_stripe_lock(lsm);
1101 memset(&lvb, 0, sizeof(lvb));
1102 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1103 st->st_size = lvb.lvb_size;
1104 st->st_blocks = lvb.lvb_blocks;
1105 st->st_mtime = lvb.lvb_mtime;
1106 st->st_atime = lvb.lvb_atime;
1107 st->st_ctime = lvb.lvb_ctime;
1108 lov_stripe_unlock(lsm);
1113 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1114 * file (because it prefers KMS over RSS when larger) */
/* Refresh @inode's size via a glimpse enqueue: each client holding a
 * conflicting extent lock answers through ll_glimpse_callback(), and the
 * merged result is folded into the inode by ll_merge_lvb(). Skipped when
 * the MDS already holds the authoritative size (LLIF_MDS_SIZE_LOCK). */
1115 int ll_glimpse_size(struct inode *inode, int ast_flags)
1117 struct ll_inode_info *lli = ll_i2info(inode);
1118 struct ll_sb_info *sbi = ll_i2sbi(inode);
1119 struct lustre_handle lockh = { 0 };
1120 struct ldlm_enqueue_info einfo = { 0 };
1121 struct obd_info oinfo = { { { 0 } } };
1125 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1128 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1130 if (!lli->lli_smd) {
1131 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1135 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1136 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1137 * won't revoke any conflicting DLM locks held. Instead,
1138 * ll_glimpse_callback() will be called on each client
1139 * holding a DLM lock against this file, and resulting size
1140 * will be returned for each stripe. DLM lock on [0, EOF] is
1141 * acquired only if there were no conflicting locks. */
1142 einfo.ei_type = LDLM_EXTENT;
1143 einfo.ei_mode = LCK_PR;
1144 einfo.ei_cb_bl = osc_extent_blocking_cb;
1145 einfo.ei_cb_cp = ldlm_completion_ast;
1146 einfo.ei_cb_gl = ll_glimpse_callback;
1147 einfo.ei_cbdata = inode;
1149 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1150 oinfo.oi_lockh = &lockh;
1151 oinfo.oi_md = lli->lli_smd;
1152 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1154 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1158 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1159 RETURN(rc > 0 ? -EIO : rc);
1162 rc = ll_merge_lvb(inode);
1164 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1165 i_size_read(inode), (unsigned long long)inode->i_blocks);
/* Take a DLM extent lock of @mode over @policy's range on @inode's
 * stripes, then refresh the inode from the merged LVB under the size
 * lock. Skipped entirely when locking is disabled for this fd or mount
 * (LL_FILE_IGNORE_LOCK / LL_SBI_NOLCK). @policy is updated to the range
 * actually granted. */
1170 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1171 struct lov_stripe_md *lsm, int mode,
1172 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1175 struct ll_sb_info *sbi = ll_i2sbi(inode);
1177 struct ldlm_enqueue_info einfo = { 0 };
1178 struct obd_info oinfo = { { { 0 } } };
1182 LASSERT(!lustre_handle_is_used(lockh));
1183 LASSERT(lsm != NULL);
1185 /* XXX phil: can we do this? won't it screw the file size up? */
1186 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1187 (sbi->ll_flags & LL_SBI_NOLCK))
1190 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1191 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1193 einfo.ei_type = LDLM_EXTENT;
1194 einfo.ei_mode = mode;
1195 einfo.ei_cb_bl = osc_extent_blocking_cb;
1196 einfo.ei_cb_cp = ldlm_completion_ast;
1197 einfo.ei_cb_gl = ll_glimpse_callback;
1198 einfo.ei_cbdata = inode;
1200 oinfo.oi_policy = *policy;
1201 oinfo.oi_lockh = lockh;
1203 oinfo.oi_flags = ast_flags;
1205 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
1206 *policy = oinfo.oi_policy;
1210 ll_inode_size_lock(inode, 1);
1211 inode_init_lvb(inode, &lvb);
1212 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
/* only a whole-file lock may safely overwrite i_size from the kms */
1214 if (policy->l_extent.start == 0 &&
1215 policy->l_extent.end == OBD_OBJECT_EOF) {
1216 /* vmtruncate()->ll_truncate() first sets the i_size and then
1217 * the kms under both a DLM lock and the
1218 * ll_inode_size_lock(). If we don't get the
1219 * ll_inode_size_lock() here we can match the DLM lock and
1220 * reset i_size from the kms before the truncating path has
1221 * updated the kms. generic_file_write can then trust the
1222 * stale i_size when doing appending writes and effectively
1223 * cancel the result of the truncate. Getting the
1224 * ll_inode_size_lock() after the enqueue maintains the DLM
1225 * -> ll_inode_size_lock() acquiring order. */
1226 i_size_write(inode, lvb.lvb_size);
1227 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1228 inode->i_ino, i_size_read(inode));
1232 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1233 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1234 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1236 ll_inode_size_unlock(inode, 1);
/*
 * Release an extent lock previously taken by ll_extent_lock().
 * Mirrors the same IGNORE_LOCK / NOLCK bypass as the lock path so that
 * lock and unlock decisions stay symmetric for a given fd/superblock.
 */
1241 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1242 struct lov_stripe_md *lsm, int mode,
1243 struct lustre_handle *lockh)
1245 struct ll_sb_info *sbi = ll_i2sbi(inode);
1249 /* XXX phil: can we do this? won't it screw the file size up? */
1250 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1251 (sbi->ll_flags & LL_SBI_NOLCK))
/* Cancel the lock on the data (OST) export. */
1254 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * Mark the inode as contended and record when contention was observed.
 * ll_is_file_contended() later uses lli_contention_time to age the flag
 * out. lli_lock serializes updates to lli_flags/lli_contention_time.
 */
1259 static void ll_set_file_contended(struct inode *inode)
1261 struct ll_inode_info *lli = ll_i2info(inode);
1262 cfs_time_t now = cfs_time_current();
1264 spin_lock(&lli->lli_lock);
1265 lli->lli_contention_time = now;
1266 lli->lli_flags |= LLIF_CONTENDED;
1267 spin_unlock(&lli->lli_lock);
/*
 * Clear the contended state on the inode (counterpart of
 * ll_set_file_contended()); protected by the same lli_lock spinlock.
 */
1270 void ll_clear_file_contended(struct inode *inode)
1272 struct ll_inode_info *lli = ll_i2info(inode);
1274 spin_lock(&lli->lli_lock);
1275 lli->lli_flags &= ~LLIF_CONTENDED;
1276 spin_unlock(&lli->lli_lock);
/*
 * Decide whether I/O on this file should go lockless because the file is
 * contended.  Requires server OBD_CONNECT_SRVLOCK support; LLIF_CONTENDED
 * expires after sbi->ll_contention_time seconds.
 * NOTE(review): the actual return statements fall in lines missing from
 * this listing — confirm the 0/1 polarity against the full file.
 */
1279 static int ll_is_file_contended(struct file *file)
1281 struct inode *inode = file->f_dentry->d_inode;
1282 struct ll_inode_info *lli = ll_i2info(inode);
1283 struct ll_sb_info *sbi = ll_i2sbi(inode);
1284 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Lockless (server-side lock) I/O needs the SRVLOCK connect flag. */
1287 if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1288 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1289 " osc connect flags = 0x"LPX64"\n",
1290 sbi->ll_lco.lco_flags);
1293 if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1295 if (lli->lli_flags & LLIF_CONTENDED) {
1296 cfs_time_t cur_time = cfs_time_current();
1297 cfs_time_t retry_time;
/* Contention ages out after ll_contention_time seconds. */
1299 retry_time = cfs_time_add(
1300 lli->lli_contention_time,
1301 cfs_time_seconds(sbi->ll_contention_time));
1302 if (cfs_time_after(cur_time, retry_time)) {
1303 ll_clear_file_contended(inode);
/*
 * Try to take a tree (DLM extent) lock for the I/O range [start, end].
 * Appending writes always take the lock; otherwise a contended file is
 * left unlocked so the caller can fall back to lockless I/O.
 * Returns via RETURN(tree_locked): 1 locked, 0 not locked; -EUSERS from
 * ll_tree_lock() marks the file contended (error paths partly elided in
 * this listing).
 */
1311 static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
1312 const char *buf, size_t count,
1313 loff_t start, loff_t end, int rw)
1316 int tree_locked = 0;
1318 struct inode * inode = file->f_dentry->d_inode;
1321 append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1323 if (append || !ll_is_file_contended(file)) {
1324 struct ll_lock_tree_node *node;
/* DENY_ON_CONTENTION lets the server refuse the lock when the extent is
 * being fought over; O_APPEND must always get the lock. */
1327 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1328 if (file->f_flags & O_NONBLOCK)
1329 ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1330 node = ll_node_from_inode(inode, start, end,
1331 (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1336 tree->lt_fd = LUSTRE_FPRIVATE(file);
1337 rc = ll_tree_lock(tree, node, buf, count, ast_flags);
1340 else if (rc == -EUSERS)
/* -EUSERS: server says the extent is contended; remember that. */
1341 ll_set_file_contended(inode);
1345 RETURN(tree_locked);
1351 * Checks if requested extent lock is compatible with a lock under a page.
1353 * Checks if the lock under \a page is compatible with a read or write lock
1354 * (specified by \a rw) for an extent [\a start , \a end].
1356 * \param page the page under which lock is considered
1357 * \param rw OBD_BRW_READ if requested for reading,
1358 * OBD_BRW_WRITE if requested for writing
1359 * \param start start of the requested extent
1360 * \param end end of the requested extent
1361 * \param cookie transparent parameter for passing locking context
1363 * \post result == 1, *cookie == context, appropriate lock is referenced or
1366 * \retval 1 owned lock is reused for the request
1367 * \retval 0 no lock reused for the request
1369 * \see ll_release_short_lock
1371 static int ll_reget_short_lock(struct page *page, int rw,
1372 obd_off start, obd_off end,
1375 struct ll_async_page *llap;
1376 struct obd_export *exp;
1377 struct inode *inode = page->mapping->host;
1381 exp = ll_i2dtexp(inode);
/* llap_cast_private() recovers the llite async-page state attached to
 * the page; its cookie identifies the lock held under that page. */
1385 llap = llap_cast_private(page);
1389 RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
1390 &llap->llap_cookie, rw, start, end,
1395 * Releases a reference to a lock taken in a "fast" way.
1397 * Releases a read or a write (specified by \a rw) lock
1398 * referenced by \a cookie.
1400 * \param inode inode to which data belong
1401 * \param end end of the locked extent
1402 * \param rw OBD_BRW_READ if requested for reading,
1403 * OBD_BRW_WRITE if requested for writing
1404 * \param cookie transparent parameter for passing locking context
1406 * \post appropriate lock is dereferenced
1408 * \see ll_reget_short_lock
1410 static void ll_release_short_lock(struct inode *inode, obd_off end,
1411 void *cookie, int rw)
1413 struct obd_export *exp;
1416 exp = ll_i2dtexp(inode);
1420 rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
/* Unlock failure is only logged; there is nothing the caller can do. */
1423 CERROR("unlock failed (%d)\n", rc);
1427 * Checks if requested extent lock is compatible
1428 * with a lock under a page in page cache.
1430 * Checks if a lock under some \a page is compatible with a read or write lock
1431 * (specified by \a rw) for an extent [\a start , \a end].
1433 * \param file the file under which lock is considered
1434 * \param rw OBD_BRW_READ if requested for reading,
1435 * OBD_BRW_WRITE if requested for writing
1436 * \param ppos start of the requested extent
1437 * \param end end of the requested extent
1438 * \param cookie transparent parameter for passing locking context
1439 * \param buf userspace buffer for the data
1441 * \post result == 1, *cookie == context, appropriate lock is referenced
1444 * \retval 1 owned lock is reused for the request
1445 * \retval 0 no lock reused for the request
1447 * \see ll_file_put_fast_lock
1449 static inline int ll_file_get_fast_lock(struct file *file,
1450 obd_off ppos, obd_off end,
1451 char *buf, void **cookie, int rw)
/* Only safe when the user buffer does not overlap a mapping of this
 * region (would deadlock on the page lock otherwise). */
1458 if (!ll_region_mapped((unsigned long)buf, end - ppos)) {
1459 page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1460 ppos >> CFS_PAGE_SHIFT);
1462 if (ll_reget_short_lock(page, rw, ppos, end, cookie))
/* find_lock_page() took a page reference; drop it here. */
1466 page_cache_release(page);
1474 * Releases a reference to a lock taken in a "fast" way.
1476 * Releases a read or a write (specified by \a rw) lock
1477 * referenced by \a cookie.
1479 * \param inode inode to which data belong
1480 * \param end end of the locked extent
1481 * \param rw OBD_BRW_READ if requested for reading,
1482 * OBD_BRW_WRITE if requested for writing
1483 * \param cookie transparent parameter for passing locking context
1485 * \post appropriate lock is dereferenced
1487 * \see ll_file_get_fast_lock
1489 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1490 void *cookie, int rw)
/* Thin wrapper; the real work is in ll_release_short_lock(). */
1492 ll_release_short_lock(inode, end, cookie, rw);
/* How a read/write path obtained (or skipped) its extent lock; returned
 * by ll_file_get_lock() and consumed by ll_file_put_lock(). */
1495 enum ll_lock_style {
1496 LL_LOCK_STYLE_NOLOCK = 0,
1497 LL_LOCK_STYLE_FASTLOCK = 1,
1498 LL_LOCK_STYLE_TREELOCK = 2
1502 * Checks if requested extent lock is compatible with a lock
1503 * under a page cache page.
1505 * Checks if the lock under \a page is compatible with a read or write lock
1506 * (specified by \a rw) for an extent [\a start , \a end].
1508 * \param file file under which I/O is processed
1509 * \param rw OBD_BRW_READ if requested for reading,
1510 * OBD_BRW_WRITE if requested for writing
1511 * \param ppos start of the requested extent
1512 * \param end end of the requested extent
1513 * \param cookie transparent parameter for passing locking context
1514 * (only used with LL_LOCK_STYLE_FASTLOCK)
1515 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1516 * \param buf userspace buffer for the data
1518 * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock
1519 * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock
1520 * \retval LL_LOCK_STYLE_NOLOCK got no lock
1522 * \see ll_file_put_lock
1524 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1525 obd_off end, char *buf, void **cookie,
1526 struct ll_lock_tree *tree, int rw)
/* Fast path first: reuse a lock already held under a cached page. */
1532 if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw))
1533 RETURN(LL_LOCK_STYLE_FASTLOCK);
/* NOTE(review): "ppos - end" as the count argument looks inverted
 * (end - ppos would be the length) — confirm against the full file. */
1535 rc = ll_file_get_tree_lock(tree, file, buf, ppos - end, ppos, end, rw);
1536 /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1539 RETURN(LL_LOCK_STYLE_TREELOCK);
1541 RETURN(LL_LOCK_STYLE_NOLOCK);
1544 /* an error happened if we reached this point, rc = -errno here */
1549 * Drops the lock taken by ll_file_get_lock.
1551 * Releases a read or a write (specified by \a rw) lock
1552 * referenced by \a tree or \a cookie.
1554 * \param inode inode to which data belong
1555 * \param end end of the locked extent
1556 * \param lockstyle facility through which the lock was taken
1557 * \param rw OBD_BRW_READ if requested for reading,
1558 * OBD_BRW_WRITE if requested for writing
1559 * \param cookie transparent parameter for passing locking context
1560 * (only used with LL_LOCK_STYLE_FASTLOCK)
1561 * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
1563 * \post appropriate lock is dereferenced
1565 * \see ll_file_get_lock
1567 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1568 enum ll_lock_style lock_style,
1569 void *cookie, struct ll_lock_tree *tree,
/* Dispatch on how the lock was obtained; break statements between the
 * cases are in lines elided from this listing. */
1573 switch (lock_style) {
1574 case LL_LOCK_STYLE_TREELOCK:
1575 ll_tree_unlock(tree);
1577 case LL_LOCK_STYLE_FASTLOCK:
1578 ll_file_put_fast_lock(inode, end, cookie, rw);
1581 CERROR("invalid locking style (%d)\n", lock_style);
/*
 * read(2) entry point for Lustre regular files.
 *
 * Outline (several connective lines are elided in this listing):
 *  - zero-length reads return immediately;
 *  - files with no OST objects are served as zero-filled data up to
 *    i_size (clear_user path);
 *  - otherwise the request is split into chunks of at most
 *    sbi->ll_max_rw_chunk, each chunk locked via ll_file_get_lock()
 *    (fast lock / tree lock / lockless), i_size made >= kms under
 *    ll_inode_size_lock(), then handed to generic_file_read() or
 *    ll_file_lockless_io().
 */
1585 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1588 struct inode *inode = file->f_dentry->d_inode;
1589 struct ll_inode_info *lli = ll_i2info(inode);
1590 struct lov_stripe_md *lsm = lli->lli_smd;
1591 struct ll_sb_info *sbi = ll_i2sbi(inode);
1592 struct ll_lock_tree tree;
1594 struct ll_ra_read bead;
1597 ssize_t retval, chunk, sum = 0;
1603 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1604 inode->i_ino, inode->i_generation, inode, count, *ppos);
1605 /* "If nbyte is 0, read() will return 0 and have no other results."
1606 * -- Single Unix Spec */
1610 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1613 /* Read on file with no objects should return zero-filled
1614 * buffers up to file size (we can get non-zero sizes with
1615 * mknod + truncate, then opening file for read. This is a
1616 * common pattern in NFS case, it seems). Bug 6243 */
1618 /* Since there are no objects on OSTs, we have nothing to get
1619 * lock on and so we are forced to access inode->i_size
1622 /* Read beyond end of file */
1623 if (*ppos >= i_size_read(inode))
1626 if (count > i_size_read(inode) - *ppos)
1627 count = i_size_read(inode) - *ppos;
1628 /* Make sure to correctly adjust the file pos pointer for
1630 notzeroed = clear_user(buf, count);
/* Chunked path: bound each iteration's extent to the current stripe and
 * to the configured maximum chunk size. */
1638 if (sbi->ll_max_rw_chunk != 0) {
1639 /* first, let's know the end of the current stripe */
1641 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1643 /* correct, the end is beyond the request */
1644 if (end > *ppos + count - 1)
1645 end = *ppos + count - 1;
1647 /* and chunk shouldn't be too large even if striping is wide */
1648 if (end - *ppos > sbi->ll_max_rw_chunk)
1649 end = *ppos + sbi->ll_max_rw_chunk - 1;
1651 end = *ppos + count - 1;
1654 lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1655 buf, &cookie, &tree, OBD_BRW_READ);
1657 GOTO(out, retval = lock_style);
1659 ll_inode_size_lock(inode, 1);
1661 * Consistency guarantees: following possibilities exist for the
1662 * relation between region being read and real file size at this
1665 * (A): the region is completely inside of the file;
1667 * (B-x): x bytes of region are inside of the file, the rest is
1670 * (C): the region is completely outside of the file.
1672 * This classification is stable under DLM lock acquired by
1673 * ll_tree_lock() above, because to change class, other client has to
1674 * take DLM lock conflicting with our lock. Also, any updates to
1675 * ->i_size by other threads on this client are serialized by
1676 * ll_inode_size_lock(). This guarantees that short reads are handled
1677 * correctly in the face of concurrent writes and truncates.
1679 inode_init_lvb(inode, &lvb);
1680 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1682 if (*ppos + count - 1 > kms) {
1683 /* A glimpse is necessary to determine whether we return a
1684 * short read (B) or some zeroes at the end of the buffer (C) */
1685 ll_inode_size_unlock(inode, 1);
1686 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1688 if (lock_style != LL_LOCK_STYLE_NOLOCK)
1689 ll_file_put_lock(inode, end, lock_style,
1690 cookie, &tree, OBD_BRW_READ);
1694 /* region is within kms and, hence, within real file size (A).
1695 * We need to increase i_size to cover the read region so that
1696 * generic_file_read() will do its job, but that doesn't mean
1697 * the kms size is _correct_, it is only the _minimum_ size.
1698 * If someone does a stat they will get the correct size which
1699 * will always be >= the kms value here. b=11081 */
1700 if (i_size_read(inode) < kms)
1701 i_size_write(inode, kms);
1702 ll_inode_size_unlock(inode, 1);
1705 chunk = end - *ppos + 1;
1706 CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1707 inode->i_ino, chunk, *ppos, i_size_read(inode));
1709 if (lock_style != LL_LOCK_STYLE_NOLOCK) {
1710 /* turn off the kernel's read-ahead */
1711 file->f_ra.ra_pages = 0;
1713 /* initialize read-ahead window once per syscall */
1716 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1717 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1718 ll_ra_read_in(file, &bead);
1722 file_accessed(file);
1723 retval = generic_file_read(file, buf, chunk, ppos);
1724 ll_file_put_lock(inode, end, lock_style, cookie, &tree,
/* Lockless fallback when no DLM lock was taken for this chunk. */
1727 retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
1730 ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
/* Full chunk consumed and bytes remain: loop for the next chunk
 * (loop construct itself is in elided lines). */
1736 if (retval == chunk && count > 0)
1742 ll_ra_read_ex(file, &bead);
1743 retval = (sum > 0) ? sum : retval;
1748 * Write to a file (through the page cache).
/*
 * write(2) entry point.  Serialized per-inode via lli_write_sem; O_APPEND
 * locks [pos, EOF] so *ppos can be set from the authoritative i_size,
 * other writes are chunked like ll_file_read().  Enforces maxbytes with
 * SIGXFSZ/-EFBIG.  Connective lines (loop, labels, RETURNs) are elided
 * from this listing.
 */
1750 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1753 struct inode *inode = file->f_dentry->d_inode;
1754 struct ll_sb_info *sbi = ll_i2sbi(inode);
1755 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1756 struct ll_lock_tree tree;
1757 loff_t maxbytes = ll_file_maxbytes(inode);
1758 loff_t lock_start, lock_end, end;
1759 ssize_t retval, chunk, sum = 0;
1763 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1764 inode->i_ino, inode->i_generation, inode, count, *ppos);
1766 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1768 /* POSIX, but surprised the VFS doesn't check this already */
1772 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1773 * called on the file, don't fail the below assertion (bug 2388). */
1774 if (file->f_flags & O_LOV_DELAY_CREATE &&
1775 ll_i2info(inode)->lli_smd == NULL)
1778 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1780 down(&ll_i2info(inode)->lli_write_sem);
1783 chunk = 0; /* just to fix gcc's warning */
1784 end = *ppos + count - 1;
1786 if (file->f_flags & O_APPEND) {
/* Append: must lock to EOF since the final offset is unknown yet. */
1788 lock_end = OBD_OBJECT_EOF;
1789 } else if (sbi->ll_max_rw_chunk != 0) {
1790 /* first, let's know the end of the current stripe */
1792 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1795 /* correct, the end is beyond the request */
1796 if (end > *ppos + count - 1)
1797 end = *ppos + count - 1;
1799 /* and chunk shouldn't be too large even if striping is wide */
1800 if (end - *ppos > sbi->ll_max_rw_chunk)
1801 end = *ppos + sbi->ll_max_rw_chunk - 1;
1806 lock_end = *ppos + count - 1;
1809 tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
1810 lock_start, lock_end, OBD_BRW_WRITE);
1811 if (tree_locked < 0)
1812 GOTO(out, retval = tree_locked);
1814 /* This is ok, g_f_w will overwrite this under i_sem if it races
1815 * with a local truncate, it just makes our maxbyte checking easier.
1816 * The i_size value gets updated in ll_extent_lock() as a consequence
1817 * of the [0,EOF] extent lock we requested above. */
1818 if (file->f_flags & O_APPEND) {
1819 *ppos = i_size_read(inode);
1820 end = *ppos + count - 1;
1823 if (*ppos >= maxbytes) {
/* POSIX: writing past the limit raises SIGXFSZ and fails with EFBIG. */
1824 send_sig(SIGXFSZ, current, 0);
1825 GOTO(out_unlock, retval = -EFBIG);
1827 if (end > maxbytes - 1)
1830 /* generic_file_write handles O_APPEND after getting i_mutex */
1831 chunk = end - *ppos + 1;
1832 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1833 inode->i_ino, chunk, *ppos);
1835 retval = generic_file_write(file, buf, chunk, ppos);
/* Lockless fallback when no tree lock was obtained. */
1837 retval = ll_file_lockless_io(file, (char*)buf, chunk,
1839 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1843 ll_tree_unlock(&tree);
1850 if (retval == chunk && count > 0)
1854 up(&ll_i2info(inode)->lli_write_sem);
1856 retval = (sum > 0) ? sum : retval;
1857 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1858 retval > 0 ? retval : 0);
1863 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile(2) path: takes a PR tree lock over the whole request, makes
 * i_size cover the region (same kms/glimpse logic as ll_file_read()),
 * primes the llite read-ahead window, then delegates to
 * generic_file_sendfile().  Some early-return lines are elided here.
 */
1865 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1866 read_actor_t actor, void *target)
1868 struct inode *inode = in_file->f_dentry->d_inode;
1869 struct ll_inode_info *lli = ll_i2info(inode);
1870 struct lov_stripe_md *lsm = lli->lli_smd;
1871 struct ll_lock_tree tree;
1872 struct ll_lock_tree_node *node;
1874 struct ll_ra_read bead;
1879 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1880 inode->i_ino, inode->i_generation, inode, count, *ppos);
1882 /* "If nbyte is 0, read() will return 0 and have no other results."
1883 * -- Single Unix Spec */
1887 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1888 /* turn off the kernel's read-ahead */
1889 in_file->f_ra.ra_pages = 0;
1891 /* File with no objects, nothing to lock */
1893 RETURN(generic_file_sendfile(in_file, ppos,count,actor,target));
1895 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1897 RETURN(PTR_ERR(node));
1899 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1900 rc = ll_tree_lock(&tree, node, NULL, count,
1901 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1905 ll_clear_file_contended(inode);
1906 ll_inode_size_lock(inode, 1);
1908 * Consistency guarantees: following possibilities exist for the
1909 * relation between region being read and real file size at this
1912 * (A): the region is completely inside of the file;
1914 * (B-x): x bytes of region are inside of the file, the rest is
1917 * (C): the region is completely outside of the file.
1919 * This classification is stable under DLM lock acquired by
1920 * ll_tree_lock() above, because to change class, other client has to
1921 * take DLM lock conflicting with our lock. Also, any updates to
1922 * ->i_size by other threads on this client are serialized by
1923 * ll_inode_size_lock(). This guarantees that short reads are handled
1924 * correctly in the face of concurrent writes and truncates.
1926 inode_init_lvb(inode, &lvb);
1927 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1929 if (*ppos + count - 1 > kms) {
1930 /* A glimpse is necessary to determine whether we return a
1931 * short read (B) or some zeroes at the end of the buffer (C) */
1932 ll_inode_size_unlock(inode, 1);
1933 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1937 /* region is within kms and, hence, within real file size (A) */
1938 i_size_write(inode, kms);
1939 ll_inode_size_unlock(inode, 1);
1942 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1943 inode->i_ino, count, *ppos, i_size_read(inode));
/* Prime llite's own read-ahead window for the whole request. */
1945 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1946 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1947 ll_ra_read_in(in_file, &bead);
1949 file_accessed(in_file);
1950 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1951 ll_ra_read_ex(in_file, &bead);
1954 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl helper: ask the LOV layer to recreate a lost
 * OST object (id/group/ost index copied in from userspace).  Root-only
 * (CFS_CAP_SYS_ADMIN).  Allocation of 'oa' and some error branches are in
 * lines elided from this listing.
 */
1958 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1961 struct ll_inode_info *lli = ll_i2info(inode);
1962 struct obd_export *exp = ll_i2dtexp(inode);
1963 struct ll_recreate_obj ucreatp;
1964 struct obd_trans_info oti = { 0 };
1965 struct obdo *oa = NULL;
1968 struct lov_stripe_md *lsm, *lsm2;
1971 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1974 if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1975 sizeof(struct ll_recreate_obj)))
/* lli_size_sem pins lli_smd while we copy and use it. */
1982 down(&lli->lli_size_sem);
1985 GOTO(out, rc = -ENOENT);
1986 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1987 (lsm->lsm_stripe_count));
1989 OBD_ALLOC(lsm2, lsm_size);
1991 GOTO(out, rc = -ENOMEM);
/* OBD_FL_RECREATE_OBJS tells obd_create() to recreate this exact
 * object rather than allocate a fresh one. */
1993 oa->o_id = ucreatp.lrc_id;
1994 oa->o_gr = ucreatp.lrc_group;
1995 oa->o_nlink = ucreatp.lrc_ost_idx;
1996 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1997 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1998 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1999 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2001 memcpy(lsm2, lsm, lsm_size);
2002 rc = obd_create(exp, oa, &lsm2, &oti);
2004 OBD_FREE(lsm2, lsm_size);
2007 up(&lli->lli_size_sem);
/*
 * Set the striping EA on a file by replaying an IT_OPEN intent carrying
 * the lov_user_md.  Fails if a stripe already exists (the -EEXIST-style
 * early return is in elided lines).  lli_size_sem guards the
 * stripe-exists check against concurrent setstripe.
 */
2012 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2013 int flags, struct lov_user_md *lum, int lum_size)
2015 struct ll_inode_info *lli = ll_i2info(inode);
2016 struct lov_stripe_md *lsm;
2017 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2021 down(&lli->lli_size_sem);
2024 up(&lli->lli_size_sem);
2025 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2030 rc = ll_intent_file_open(file, lum, lum_size, &oit);
2033 if (it_disposition(&oit, DISP_LOOKUP_NEG))
2034 GOTO(out_req_free, rc = -ENOENT);
2035 rc = oit.d.lustre.it_status;
2037 GOTO(out_req_free, rc);
/* The intent opened an MDS handle we don't need; close it again. */
2039 ll_release_openhandle(file->f_dentry, &oit);
2042 up(&lli->lli_size_sem);
2043 ll_intent_release(&oit);
2046 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA for \a filename from the MDS and return it in host
 * endianness (*lmmp, *lmm_size); caller must finish *request.  For
 * LOV_MAGIC_JOIN files the on-disk EA is additionally expanded into a
 * lov_user_md_join with per-stripe extents.  Several GOTO targets and
 * RETURNs are in lines elided from this listing.
 */
2050 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2051 struct lov_mds_md **lmmp, int *lmm_size,
2052 struct ptlrpc_request **request)
2054 struct ll_sb_info *sbi = ll_i2sbi(inode);
2055 struct mdt_body *body;
2056 struct lov_mds_md *lmm = NULL;
2057 struct ptlrpc_request *req = NULL;
2058 struct obd_capa *oc;
2061 rc = ll_get_max_mdsize(sbi, &lmmsize);
/* Ask the MDS for the named entry's EA (capability-protected). */
2065 oc = ll_mdscapa_get(inode);
2066 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
2067 oc, filename, strlen(filename) + 1,
2068 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
2069 ll_i2suppgid(inode), &req);
2072 CDEBUG(D_INFO, "md_getattr_name failed "
2073 "on %s: rc %d\n", filename, rc);
2077 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2078 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2080 lmmsize = body->eadatasize;
2082 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2084 GOTO(out, rc = -ENODATA);
2087 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2088 LASSERT(lmm != NULL);
/* Only V1, V3 and JOIN magics are understood here. */
2090 if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
2091 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
2092 (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2093 GOTO(out, rc = -EPROTO);
2097 * This is coming from the MDS, so is probably in
2098 * little endian. We convert it to host endian before
2099 * passing it to userspace.
2101 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2102 /* if function called for directory - we should
2103 * avoid swab not existent lsm objects */
2104 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2105 lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
2106 if (S_ISREG(body->mode))
2107 lustre_swab_lov_user_md_objects(
2108 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2109 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
2110 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2111 lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
2112 if (S_ISREG(body->mode))
2113 lustre_swab_lov_user_md_objects(
2114 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2115 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
2116 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2117 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack the md, then rebuild a user-visible
 * lov_user_md_join with per-stripe extent/object info. */
2121 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2122 struct lov_stripe_md *lsm;
2123 struct lov_user_md_join *lmj;
2124 int lmj_size, i, aindex = 0;
2126 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
2128 GOTO(out, rc = -ENOMEM);
2129 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
2131 GOTO(out_free_memmd, rc);
2133 lmj_size = sizeof(struct lov_user_md_join) +
2134 lsm->lsm_stripe_count *
2135 sizeof(struct lov_user_ost_data_join);
2136 OBD_ALLOC(lmj, lmj_size);
2138 GOTO(out_free_memmd, rc = -ENOMEM);
2140 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2141 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2142 struct lov_extent *lex =
2143 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent that covers stripe i (aindex++
 * presumably happens in an elided line). */
2145 if (lex->le_loi_idx + lex->le_stripe_count <= i)
2147 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2148 LPU64" len %d\n", aindex, i,
2149 lex->le_start, (int)lex->le_len);
2150 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 marks an extent running to EOF. */
2153 if ((int)lex->le_len == -1)
2154 lmj->lmm_objects[i].l_extent_end = -1;
2156 lmj->lmm_objects[i].l_extent_end =
2157 lex->le_start + lex->le_len;
2158 lmj->lmm_objects[i].l_object_id =
2159 lsm->lsm_oinfo[i]->loi_id;
2160 lmj->lmm_objects[i].l_object_gr =
2161 lsm->lsm_oinfo[i]->loi_gr;
2162 lmj->lmm_objects[i].l_ost_gen =
2163 lsm->lsm_oinfo[i]->loi_ost_gen;
2164 lmj->lmm_objects[i].l_ost_idx =
2165 lsm->lsm_oinfo[i]->loi_ost_idx;
2167 lmm = (struct lov_mds_md *)lmj;
2170 obd_free_memmd(sbi->ll_dt_exp, &lsm);
2174 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA: root-only setstripe variant that accepts a full EA
 * (including object info) from userspace and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 */
2179 static int ll_lov_setea(struct inode *inode, struct file *file,
2182 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2183 struct lov_user_md *lump;
2184 int lum_size = sizeof(struct lov_user_md) +
2185 sizeof(struct lov_user_ost_data);
2189 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2192 OBD_ALLOC(lump, lum_size);
2196 if (copy_from_user(lump, (struct lov_user_md *)arg, lum_size)) {
2197 OBD_FREE(lump, lum_size);
2201 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2203 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE: copy a v1 lov_user_md from userspace, upgrade to
 * the larger v3 copy if the magic says so, apply it, then (on the path
 * visible here) report the resulting stripe info back to userspace.
 */
2207 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2210 struct lov_user_md_v3 lumv3;
2211 struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
2212 struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
2213 struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
2216 int flags = FMODE_WRITE;
2219 /* first try with v1 which is smaller than v3 */
2220 lum_size = sizeof(struct lov_user_md_v1);
2221 if (copy_from_user(lumv1, lumv1p, lum_size))
2224 if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
/* V3 magic: re-copy the full (larger) structure. */
2225 lum_size = sizeof(struct lov_user_md_v3);
2226 if (copy_from_user(&lumv3, lumv3p, lum_size))
2230 rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
/* Echo resulting striping back to the caller's buffer. */
2232 put_user(0, &lumv1p->lmm_stripe_count);
2233 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
2234 0, ll_i2info(inode)->lli_smd,
/*
 * LL_IOC_LOV_GETSTRIPE: hand the cached stripe md to the LOV layer to
 * format for userspace (the no-stripe early return is in elided lines).
 */
2240 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2242 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2247 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * Take a [0, EOF] LCK_GROUP extent lock with group id \a arg and stash
 * the handle in the fd.  While held, the fd also ignores normal extent
 * locking (LL_FILE_IGNORE_LOCK).  Already-locked error path is elided.
 */
2251 static int ll_get_grouplock(struct inode *inode, struct file *file,
2254 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2255 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2256 .end = OBD_OBJECT_EOF}};
2257 struct lustre_handle lockh = { 0 };
2258 struct ll_inode_info *lli = ll_i2info(inode);
2259 struct lov_stripe_md *lsm = lli->lli_smd;
2263 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2267 policy.l_extent.gid = arg;
2268 if (file->f_flags & O_NONBLOCK)
2269 flags = LDLM_FL_BLOCK_NOWAIT;
2271 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2275 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
/* Remember the handle so ll_put_grouplock() can cancel it. */
2277 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * Drop the group lock taken by ll_get_grouplock(); validates that a
 * group lock is held and that \a arg matches the stored gid before
 * cancelling and clearing the saved handle.
 */
2282 static int ll_put_grouplock(struct inode *inode, struct file *file,
2285 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2286 struct ll_inode_info *lli = ll_i2info(inode);
2287 struct lov_stripe_md *lsm = lli->lli_smd;
2291 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2292 /* Ugh, it's already unlocked. */
2296 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2299 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2301 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2306 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2311 #if LUSTRE_FIX >= 50
/*
 * Validate a file-join request: server must advertise LL_SBI_JOIN, both
 * inodes must be regular, distinct, and head's size must be a multiple
 * of JOIN_FILE_ALIGN (64K).  Error-return lines are elided here.
 */
2312 static int join_sanity_check(struct inode *head, struct inode *tail)
2315 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2316 CERROR("server do not support join \n");
2319 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2320 CERROR("tail ino %lu and ino head %lu must be regular\n",
2321 head->i_ino, tail->i_ino);
2324 if (head->i_ino == tail->i_ino) {
2325 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2328 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2329 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Issue the MDS-side join: an IT_OPEN intent with O_JOIN_FILE on the
 * head, carrying head's size and the tail's name in the op_data.  Any
 * lock granted with the intent is dropped immediately, then the open
 * handle is released.
 */
2335 static int join_file(struct inode *head_inode, struct file *head_filp,
2336 struct file *tail_filp)
2338 struct dentry *tail_dentry = tail_filp->f_dentry;
2339 struct lookup_intent oit = {.it_op = IT_OPEN,
2340 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2341 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2342 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
2344 struct lustre_handle lockh;
2345 struct md_op_data *op_data;
2350 tail_dentry = tail_filp->f_dentry;
/* 'data' carries head's current size to the MDS via op_data. */
2352 data = i_size_read(head_inode);
2353 op_data = ll_prep_md_op_data(NULL, head_inode,
2354 tail_dentry->d_parent->d_inode,
2355 tail_dentry->d_name.name,
2356 tail_dentry->d_name.len, 0,
2357 LUSTRE_OPC_ANY, &data);
2358 if (IS_ERR(op_data))
2359 RETURN(PTR_ERR(op_data));
2361 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2362 op_data, &lockh, NULL, 0, NULL, 0);
2364 ll_finish_md_op_data(op_data);
2368 rc = oit.d.lustre.it_status;
2370 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2371 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2372 ptlrpc_req_finished((struct ptlrpc_request *)
2373 oit.d.lustre.it_data);
2377 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2379 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2380 oit.d.lustre.it_lock_mode = 0;
2382 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2383 it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
2384 ll_release_openhandle(head_filp->f_dentry, &oit);
2386 ll_intent_release(&oit);
/*
 * Join the file named \a filename_tail onto \a head.  Opens the tail,
 * takes [0, EOF] LCK_EX tree locks on both inodes in ascending-ino order
 * (deadlock avoidance), sanity-checks, performs the join, then unwinds
 * through cleanup_phase.  On success head's cached stripe md is
 * invalidated so it is refetched.  Cleanup_phase increments and some
 * GOTOs are in lines elided from this listing.
 */
2390 static int ll_file_join(struct inode *head, struct file *filp,
2391 char *filename_tail)
2393 struct inode *tail = NULL, *first = NULL, *second = NULL;
2394 struct dentry *tail_dentry;
2395 struct file *tail_filp, *first_filp, *second_filp;
2396 struct ll_lock_tree first_tree, second_tree;
2397 struct ll_lock_tree_node *first_node, *second_node;
2398 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2399 int rc = 0, cleanup_phase = 0;
2402 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2403 head->i_ino, head->i_generation, head, filename_tail);
2405 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2406 if (IS_ERR(tail_filp)) {
2407 CERROR("Can not open tail file %s", filename_tail);
2408 rc = PTR_ERR(tail_filp);
2411 tail = igrab(tail_filp->f_dentry->d_inode);
2413 tlli = ll_i2info(tail);
2414 tail_dentry = tail_filp->f_dentry;
2415 LASSERT(tail_dentry);
2418 /*reorder the inode for lock sequence*/
2419 first = head->i_ino > tail->i_ino ? head : tail;
2420 second = head->i_ino > tail->i_ino ? tail : head;
2421 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2422 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2424 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2425 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2426 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2427 if (IS_ERR(first_node)){
2428 rc = PTR_ERR(first_node);
2431 first_tree.lt_fd = first_filp->private_data;
2432 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2437 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2438 if (IS_ERR(second_node)){
2439 rc = PTR_ERR(second_node);
2442 second_tree.lt_fd = second_filp->private_data;
2443 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2448 rc = join_sanity_check(head, tail);
2452 rc = join_file(head, filp, tail_filp);
/* Unwind in reverse acquisition order; each phase falls through to the
 * previous one (case labels are in elided lines). */
2456 switch (cleanup_phase) {
2458 ll_tree_unlock(&second_tree);
2459 obd_cancel_unused(ll_i2dtexp(second),
2460 ll_i2info(second)->lli_smd, 0, NULL);
2462 ll_tree_unlock(&first_tree);
2463 obd_cancel_unused(ll_i2dtexp(first),
2464 ll_i2info(first)->lli_smd, 0, NULL);
2466 filp_close(tail_filp, 0);
2469 if (head && rc == 0) {
/* Join succeeded: drop the stale stripe md so it is refetched. */
2470 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2472 hlli->lli_smd = NULL;
2477 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2482 #endif /* LUSTRE_FIX >= 50 */
2485 * Close inode open handle
2487 * \param dentry [in] dentry which contains the inode
2488 * \param it [in,out] intent which contains open info and result
2491 * \retval <0 failure
/*
 * Close the MDS open handle carried by \a it for \a dentry's inode.
 * Used when an intent enqueue opened the file as a side effect but the
 * caller does not want to keep it open (see the lock-release path above).
 */
2493 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2495 struct inode *inode = dentry->d_inode;
2496 struct obd_client_handle *och;
2502 /* Root ? Do nothing. */
2503 if (dentry->d_inode->i_sb->s_root == dentry)
2506 /* No open handle to close? Move away */
2507 if (!it_disposition(it, DISP_OPEN_OPEN))
2510 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2512 OBD_ALLOC(och, sizeof(*och));
2514 GOTO(out, rc = -ENOMEM);
/* Fill the client handle from the intent reply, then close it on the MDS. */
2516 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2517 ll_i2info(inode), it, och);
2519 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/* Drop the request reference that ll_file_open would otherwise consume. */
2522 /* this one is in place of ll_file_open */
2523 if (it_disposition(it, DISP_ENQ_OPEN_REF))
2524 ptlrpc_req_finished(it->d.lustre.it_data);
2525 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2530 * Get size for inode for which FIEMAP mapping is requested.
2531 * Make the FIEMAP get_info call and returns the result.
/*
 * Serve a FIEMAP extent-mapping request for \a inode by forwarding the
 * \a fiemap buffer (of \a num_bytes) to the data export via obd_get_info.
 * Returns 0 on success or a negative errno.
 */
2533 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
2536 struct obd_export *exp = ll_i2dtexp(inode);
2537 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2538 struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
2539 int vallen = num_bytes;
2543 /* If the stripe_count > 1 and the application does not understand
2544 * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
2546 if (lsm->lsm_stripe_count > 1 &&
2547 !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
/* Identify the object and attach current inode attributes to the key. */
2550 fm_key.oa.o_id = lsm->lsm_object_id;
2551 fm_key.oa.o_gr = lsm->lsm_object_gr;
2552 fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2554 obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
2557 /* If filesize is 0, then there would be no objects for mapping */
2558 if (fm_key.oa.o_size == 0) {
2559 fiemap->fm_mapped_extents = 0;
/* The user's fiemap header rides inside the key; the reply is written
 * back into the same fiemap buffer. */
2563 memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
2565 rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
2567 CERROR("obd_get_info failed: rc = %d\n", rc);
/*
 * ioctl entry point for regular Lustre files (wired into the
 * file_operations tables below). Dispatches Lustre-specific commands
 * (striping, flags, group locks, FIEMAP, join) and falls through to
 * dynamically registered handlers / obd_iocontrol for anything else.
 */
2572 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2575 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2579 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2580 inode->i_generation, inode, cmd);
2581 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2583 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2584 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2588 case LL_IOC_GETFLAGS:
2589 /* Get the current value of the file flags */
2590 return put_user(fd->fd_flags, (int *)arg);
2591 case LL_IOC_SETFLAGS:
2592 case LL_IOC_CLRFLAGS:
2593 /* Set or clear specific file flags */
2594 /* XXX This probably needs checks to ensure the flags are
2595 * not abused, and to handle any flag side effects.
2597 if (get_user(flags, (int *) arg))
/* LL_FILE_IGNORE_LOCK is only meaningful for O_DIRECT I/O. */
2600 if (cmd == LL_IOC_SETFLAGS) {
2601 if ((flags & LL_FILE_IGNORE_LOCK) &&
2602 !(file->f_flags & O_DIRECT)) {
2603 CERROR("%s: unable to disable locking on "
2604 "non-O_DIRECT file\n", current->comm);
2608 fd->fd_flags |= flags;
2610 fd->fd_flags &= ~flags;
2613 case LL_IOC_LOV_SETSTRIPE:
2614 RETURN(ll_lov_setstripe(inode, file, arg));
2615 case LL_IOC_LOV_SETEA:
2616 RETURN(ll_lov_setea(inode, file, arg));
2617 case LL_IOC_LOV_GETSTRIPE:
2618 RETURN(ll_lov_getstripe(inode, arg));
2619 case LL_IOC_RECREATE_OBJ:
2620 RETURN(ll_lov_recreate_obj(inode, file, arg));
2621 case EXT3_IOC_FIEMAP: {
2622 struct ll_user_fiemap *fiemap_s;
2623 size_t num_bytes, ret_bytes;
2624 unsigned int extent_count;
2627 /* Get the extent count so we can calculate the size of
2628 * required fiemap buffer */
2629 if (get_user(extent_count,
2630 &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
/* NOTE(review): extent_count comes from userspace; the multiplication
 * below is not visibly overflow-checked in this excerpt — confirm a
 * bound (e.g. against FIEMAP_MAX_EXTENTS) exists on an elided line. */
2632 num_bytes = sizeof(*fiemap_s) + (extent_count *
2633 sizeof(struct ll_fiemap_extent));
2634 OBD_VMALLOC(fiemap_s, num_bytes);
2635 if (fiemap_s == NULL)
2638 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
2640 GOTO(error, rc = -EFAULT);
/* Reject flags we do not support, echoing the incompatible bits back. */
2642 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2643 fiemap_s->fm_flags = fiemap_s->fm_flags &
2644 ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2645 if (copy_to_user((char *)arg, fiemap_s,
2647 GOTO(error, rc = -EFAULT);
2649 GOTO(error, rc = -EBADR);
2652 /* If fm_extent_count is non-zero, read the first extent since
2653 * it is used to calculate end_offset and device from previous
2656 if (copy_from_user(&fiemap_s->fm_extents[0],
2657 (char __user *)arg + sizeof(*fiemap_s),
2658 sizeof(struct ll_fiemap_extent)))
2659 GOTO(error, rc = -EFAULT);
/* FIEMAP_FLAG_SYNC asks for dirty data to be flushed before mapping. */
2662 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
2665 rc = filemap_fdatawrite(inode->i_mapping);
2670 rc = ll_fiemap(inode, fiemap_s, num_bytes);
/* Copy back only the header plus the extents actually mapped. */
2674 ret_bytes = sizeof(struct ll_user_fiemap);
2676 if (extent_count != 0)
2677 ret_bytes += (fiemap_s->fm_mapped_extents *
2678 sizeof(struct ll_fiemap_extent));
2680 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2684 OBD_VFREE(fiemap_s, num_bytes);
2687 case EXT3_IOC_GETFLAGS:
2688 case EXT3_IOC_SETFLAGS:
2689 RETURN(ll_iocontrol(inode, file, cmd, arg));
2690 case EXT3_IOC_GETVERSION_OLD:
2691 case EXT3_IOC_GETVERSION:
2692 RETURN(put_user(inode->i_generation, (int *)arg));
2694 #if LUSTRE_FIX >= 50
2695 /* Allow file join in beta builds to allow debugging */
2699 ftail = getname((const char *)arg);
2701 RETURN(PTR_ERR(ftail));
2702 rc = ll_file_join(inode, file, ftail);
2706 CWARN("file join is not supported in this version of Lustre\n");
2710 case LL_IOC_GROUP_LOCK:
2711 RETURN(ll_get_grouplock(inode, file, arg));
2712 case LL_IOC_GROUP_UNLOCK:
2713 RETURN(ll_put_grouplock(inode, file, arg));
2714 case IOC_OBD_STATFS:
2715 RETURN(ll_obd_statfs(inode, (void *)arg));
2717 /* We need to special case any other ioctls we want to handle,
2718 * to send them to the MDS/OST as appropriate and to properly
2719 * network encode the arg field.
2720 case EXT3_IOC_SETVERSION_OLD:
2721 case EXT3_IOC_SETVERSION:
2723 case LL_IOC_FLUSHCTX:
2724 RETURN(ll_flush_ctx(inode));
/* Default: try dynamically registered handlers first (see
 * ll_iocontrol_register below), then punt to the data export. */
2729 ll_iocontrol_call(inode, file, cmd, arg, &err))
2732 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek for Lustre files. For SEEK_END the cluster-wide file size must
 * first be refreshed with a glimpse lock, since another client may have
 * extended the file.
 */
2738 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2740 struct inode *inode = file->f_dentry->d_inode;
2741 struct ll_inode_info *lli = ll_i2info(inode);
2742 struct lov_stripe_md *lsm = lli->lli_smd;
2745 retval = offset + ((origin == 2) ? i_size_read(inode) :
2746 (origin == 1) ? file->f_pos : 0);
2747 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2748 inode->i_ino, inode->i_generation, inode, retval, retval,
2749 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2750 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2752 if (origin == 2) { /* SEEK_END */
2753 int nonblock = 0, rc;
/* O_NONBLOCK maps to a non-blocking glimpse enqueue. */
2755 if (file->f_flags & O_NONBLOCK)
2756 nonblock = LDLM_FL_BLOCK_NOWAIT;
2759 rc = ll_glimpse_size(inode, nonblock);
/* Read i_size under the inode size lock after the glimpse. */
2764 ll_inode_size_lock(inode, 0);
2765 offset += i_size_read(inode);
2766 ll_inode_size_unlock(inode, 0);
2767 } else if (origin == 1) { /* SEEK_CUR */
2768 offset += file->f_pos;
/* Only accept offsets within [0, maxbytes]; update f_pos on change. */
2772 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2773 if (offset != file->f_pos) {
2774 file->f_pos = offset;
/*
 * fsync for Lustre files: wait for page writeback, pick up any recorded
 * async write errors, sync the metadata on the MDS, and (for files with
 * striping) sync the data objects on the OSTs.
 */
2782 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2784 struct inode *inode = dentry->d_inode;
2785 struct ll_inode_info *lli = ll_i2info(inode);
2786 struct lov_stripe_md *lsm = lli->lli_smd;
2787 struct ptlrpc_request *req;
2788 struct obd_capa *oc;
2791 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2792 inode->i_generation, inode);
2793 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2795 /* fsync's caller has already called _fdata{sync,write}, we want
2796 * that IO to finish before calling the osc and mdc sync methods */
2797 rc = filemap_fdatawait(inode->i_mapping);
2799 /* catch async errors that were recorded back when async writeback
2800 * failed for pages in this mapping. */
2801 err = lli->lli_async_rc;
2802 lli->lli_async_rc = 0;
2806 err = lov_test_and_clear_async_rc(lsm);
/* Sync the inode's metadata on the MDS (capa-protected if enabled). */
2811 oc = ll_mdscapa_get(inode);
2812 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2818 ptlrpc_req_finished(req);
2825 RETURN(rc ? rc : -ENOMEM);
/* Sync the full data extent [0, EOF] on the OSTs for this object. */
2827 oa->o_id = lsm->lsm_object_id;
2828 oa->o_gr = lsm->lsm_object_gr;
2829 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2830 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2831 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2834 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2835 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2836 0, OBD_OBJECT_EOF, oc);
/*
 * Cluster-coherent fcntl()/flock() locking: translate the kernel
 * file_lock into an LDLM flock enqueue on the MDS, then mirror the
 * result into the local lock tables so the VFS bookkeeping stays
 * consistent. Used by ll_file_operations_flock below.
 */
2846 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2848 struct inode *inode = file->f_dentry->d_inode;
2849 struct ll_sb_info *sbi = ll_i2sbi(inode);
2850 struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2851 .ei_cb_cp =ldlm_flock_completion_ast,
2852 .ei_cbdata = file_lock };
2853 struct md_op_data *op_data;
2854 struct lustre_handle lockh = {0};
2855 ldlm_policy_data_t flock;
2860 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2861 inode->i_ino, file_lock);
2863 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
/* flock() requests arrive as whole-file locks; fill in the fields the
 * flock path does not set. */
2865 if (file_lock->fl_flags & FL_FLOCK) {
2866 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2867 /* set missing params for flock() calls */
2868 file_lock->fl_end = OFFSET_MAX;
2869 file_lock->fl_pid = current->tgid;
2871 flock.l_flock.pid = file_lock->fl_pid;
2872 flock.l_flock.start = file_lock->fl_start;
2873 flock.l_flock.end = file_lock->fl_end;
/* Map POSIX lock types onto LDLM modes: read -> PR, write -> PW,
 * unlock -> NL (see comment below). */
2875 switch (file_lock->fl_type) {
2877 einfo.ei_mode = LCK_PR;
2880 /* An unlock request may or may not have any relation to
2881 * existing locks so we may not be able to pass a lock handle
2882 * via a normal ldlm_lock_cancel() request. The request may even
2883 * unlock a byte range in the middle of an existing lock. In
2884 * order to process an unlock request we need all of the same
2885 * information that is given with a normal read or write record
2886 * lock request. To avoid creating another ldlm unlock (cancel)
2887 * message we'll treat a LCK_NL flock request as an unlock. */
2888 einfo.ei_mode = LCK_NL;
2891 einfo.ei_mode = LCK_PW;
2894 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Non-blocking set -> BLOCK_NOWAIT; F_GETLK-style test -> TEST_LOCK. */
2909 flags = LDLM_FL_BLOCK_NOWAIT;
2915 flags = LDLM_FL_TEST_LOCK;
2916 /* Save the old mode so that if the mode in the lock changes we
2917 * can decrement the appropriate reader or writer refcount. */
2918 file_lock->fl_type = einfo.ei_mode;
2921 CERROR("unknown fcntl lock command: %d\n", cmd);
2925 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2926 LUSTRE_OPC_ANY, NULL);
2927 if (IS_ERR(op_data))
2928 RETURN(PTR_ERR(op_data));
2930 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2931 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2932 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2934 rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2935 op_data, &lockh, &flock, 0, NULL /* req */, flags);
2937 ll_finish_md_op_data(op_data);
/* Mirror a successful set (or any unlock) into the local VFS state. */
2939 if ((file_lock->fl_flags & FL_FLOCK) &&
2940 (rc == 0 || file_lock->fl_type == F_UNLCK))
2941 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2942 #ifdef HAVE_F_OP_FLOCK
2943 if ((file_lock->fl_flags & FL_POSIX) &&
2944 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2945 !(flags & LDLM_FL_TEST_LOCK))
2946 posix_lock_file_wait(file, file_lock);
/*
 * Lock entry point for -o noflock mounts (see ll_file_operations_noflock
 * below, whose header comment says these return ENOSYS); the body is on
 * lines elided from this excerpt.
 */
2952 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * Test (without taking a reference) whether this client already holds a
 * granted MDS inodebits lock covering \a bits on \a inode, in any of the
 * CR/CW/PR/PW modes. LDLM_FL_TEST_LOCK makes the match side-effect free.
 */
2959 int ll_have_md_lock(struct inode *inode, __u64 bits)
2961 struct lustre_handle lockh;
2962 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2970 fid = &ll_i2info(inode)->lli_fid;
2971 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2973 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2974 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2975 LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * Like ll_have_md_lock(), but actually takes a reference on a matching
 * granted inodebits lock: no TEST_LOCK flag, and the handle is returned
 * to the caller via \a lockh. Returns the matched mode (0 if none).
 */
2981 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2982 struct lustre_handle *lockh)
2984 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2990 fid = &ll_i2info(inode)->lli_fid;
2991 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2993 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2994 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2995 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * Common tail for the revalidate paths below: translate a getattr result.
 * -ENOENT on a non-regular, non-directory inode means it was unlinked
 * remotely and is treated as success; other errors are logged.
 */
2999 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
3000 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
3001 * and return success */
3003 /* This path cannot be hit for regular files unless in
3004 * case of obscure races, so no need to validate
3006 if (!S_ISREG(inode->i_mode) &&
3007 !S_ISDIR(inode->i_mode))
3012 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * Revalidate \a dentry's inode attributes against the MDS. Two paths:
 * with OBD_CONNECT_ATTRFID, do an intent getattr by FID (no name needed);
 * otherwise, if no UPDATE|LOOKUP inodebits lock is cached, issue a plain
 * md_getattr. Finishes by glimpsing the file size from the OSTs.
 */
3020 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
3022 struct inode *inode = dentry->d_inode;
3023 struct ptlrpc_request *req = NULL;
3024 struct ll_sb_info *sbi;
3025 struct obd_export *exp;
/* NOTE(review): this error fires when inode is NULL here — debugging aid. */
3030 CERROR("REPORT THIS LINE TO PETER\n");
3033 sbi = ll_i2sbi(inode);
3035 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3036 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3038 exp = ll_i2mdexp(inode);
3040 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
3041 struct lookup_intent oit = { .it_op = IT_GETATTR };
3042 struct md_op_data *op_data;
3044 /* Call getattr by fid, so do not provide name at all. */
3045 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
3046 dentry->d_inode, NULL, 0, 0,
3047 LUSTRE_OPC_ANY, NULL);
3048 if (IS_ERR(op_data))
3049 RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE asks the MDS to verify the cached FID is still valid. */
3051 oit.it_flags |= O_CHECK_STALE;
3052 rc = md_intent_lock(exp, op_data, NULL, 0,
3053 /* we are not interested in name
3056 ll_md_blocking_ast, 0);
3057 ll_finish_md_op_data(op_data);
3058 oit.it_flags &= ~O_CHECK_STALE;
3060 rc = ll_inode_revalidate_fini(inode, rc);
3064 rc = ll_revalidate_it_finish(req, &oit, dentry);
3066 ll_intent_release(&oit);
3070 /* Unlinked? Unhash dentry, so it is not picked up later by
3071 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3072 here to preserve get_cwd functionality on 2.6.
3074 if (!dentry->d_inode->i_nlink) {
3075 spin_lock(&ll_lookup_lock);
3076 spin_lock(&dcache_lock);
3077 ll_drop_dentry(dentry);
3078 spin_unlock(&dcache_lock);
3079 spin_unlock(&ll_lookup_lock);
3082 ll_lookup_finish_locks(&oit, dentry);
3083 } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
3084 MDS_INODELOCK_LOOKUP)) {
3085 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3086 obd_valid valid = OBD_MD_FLGETATTR;
3087 struct obd_capa *oc;
/* Regular files also need the striping EA, sized to the MDS max. */
3090 if (S_ISREG(inode->i_mode)) {
3091 rc = ll_get_max_mdsize(sbi, &ealen);
3094 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3096 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
3097 * capa for this inode. Because we only keep capas of dirs
3099 oc = ll_mdscapa_get(inode);
3100 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
3104 rc = ll_inode_revalidate_fini(inode, rc);
3108 rc = ll_prep_inode(&inode, req, NULL);
3113 /* if object not yet allocated, don't validate size */
3114 if (ll_i2info(inode)->lli_smd == NULL)
3117 /* ll_glimpse_size will prefer locally cached writes if they extend
3119 rc = ll_glimpse_size(inode, 0);
3122 ptlrpc_req_finished(req);
/*
 * getattr with an explicit lookup intent: revalidate the inode against
 * the MDS/OSTs, then populate \a stat from the (now fresh) inode fields.
 * i_size/i_blocks are read under the inode size lock.
 */
3126 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3127 struct lookup_intent *it, struct kstat *stat)
3129 struct inode *inode = de->d_inode;
3132 res = ll_inode_revalidate_it(de, it);
3133 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
3138 stat->dev = inode->i_sb->s_dev;
3139 stat->ino = inode->i_ino;
3140 stat->mode = inode->i_mode;
3141 stat->nlink = inode->i_nlink;
3142 stat->uid = inode->i_uid;
3143 stat->gid = inode->i_gid;
3144 stat->rdev = kdev_t_to_nr(inode->i_rdev);
3145 stat->atime = inode->i_atime;
3146 stat->mtime = inode->i_mtime;
3147 stat->ctime = inode->i_ctime;
3148 #ifdef HAVE_INODE_BLKSIZE
3149 stat->blksize = inode->i_blksize;
3151 stat->blksize = 1 << inode->i_blkbits;
/* size/blocks must be sampled atomically w.r.t. concurrent size updates. */
3154 ll_inode_size_lock(inode, 0);
3155 stat->size = i_size_read(inode);
3156 stat->blocks = inode->i_blocks;
3157 ll_inode_size_unlock(inode, 0);
/* Plain getattr: wrap ll_getattr_it() with a fresh IT_GETATTR intent. */
3161 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3163 struct lookup_intent it = { .it_op = IT_GETATTR };
3165 return ll_getattr_it(mnt, de, &it, stat);
/*
 * ACL callback for generic_permission(): check \a mask against the POSIX
 * ACL cached on the inode. The ACL is duplicated under lli_lock so the
 * permission check itself runs without holding the spinlock. Compiles to
 * a no-op body when CONFIG_FS_POSIX_ACL is disabled.
 */
3169 int lustre_check_acl(struct inode *inode, int mask)
3171 #ifdef CONFIG_FS_POSIX_ACL
3172 struct ll_inode_info *lli = ll_i2info(inode);
3173 struct posix_acl *acl;
3177 spin_lock(&lli->lli_lock);
3178 acl = posix_acl_dup(lli->lli_posix_acl);
3179 spin_unlock(&lli->lli_lock);
3184 rc = posix_acl_permission(inode, acl, mask);
3185 posix_acl_release(acl);
3193 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/*
 * permission() for kernels >= 2.6.10: remote-client mounts are checked on
 * the MDS via lustre_check_remote_perm(); otherwise defer to the kernel's
 * generic_permission() with lustre_check_acl as the ACL hook.
 */
3194 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3196 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3197 inode->i_ino, inode->i_generation, inode, mask);
3198 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3199 return lustre_check_remote_perm(inode, mask);
3201 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3202 return generic_permission(inode, mask, lustre_check_acl);
/*
 * permission() fallback for older kernels without the ACL-aware
 * generic_permission(): open-coded owner/group/other mode checks plus
 * ACL and capability overrides, mirroring the classic VFS algorithm.
 */
3205 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3207 int mode = inode->i_mode;
3210 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3211 inode->i_ino, inode->i_generation, inode, mask);
3213 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
3214 return lustre_check_remote_perm(inode, mask);
3216 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes to regular files/dirs/symlinks on a read-only or immutable
 * inode are rejected outright. */
3218 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3219 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3221 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
3223 if (current->fsuid == inode->i_uid) {
/* Group bits insufficient: consult the ACL before falling through. */
3226 if (((mode >> 3) & mask & S_IRWXO) != mask)
3228 rc = lustre_check_acl(inode, mask);
3232 goto check_capabilities;
3236 if (in_group_p(inode->i_gid))
3239 if ((mode & mask & S_IRWXO) == mask)
/* Capability overrides: DAC_OVERRIDE for everything but exec of
 * non-executables; DAC_READ_SEARCH for reads and directory searches. */
3243 if (!(mask & MAY_EXEC) ||
3244 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3245 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
3248 if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3249 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
3256 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock entries, so the kernel's
 * local (per-node) lock handling applies — used for -o localflock. */
3257 struct file_operations ll_file_operations = {
3258 .read = ll_file_read,
3259 .write = ll_file_write,
3260 .ioctl = ll_file_ioctl,
3261 .open = ll_file_open,
3262 .release = ll_file_release,
3263 .mmap = ll_file_mmap,
3264 .llseek = ll_file_seek,
3265 .sendfile = ll_file_sendfile,
/* file_operations with cluster-coherent locking: flock/fcntl requests go
 * through ll_file_flock (LDLM enqueue on the MDS). */
3269 struct file_operations ll_file_operations_flock = {
3270 .read = ll_file_read,
3271 .write = ll_file_write,
3272 .ioctl = ll_file_ioctl,
3273 .open = ll_file_open,
3274 .release = ll_file_release,
3275 .mmap = ll_file_mmap,
3276 .llseek = ll_file_seek,
3277 .sendfile = ll_file_sendfile,
3279 #ifdef HAVE_F_OP_FLOCK
3280 .flock = ll_file_flock,
3282 .lock = ll_file_flock
3285 /* These are for -o noflock - to return ENOSYS on flock calls */
/* file_operations for -o noflock mounts: lock requests are routed to
 * ll_file_noflock (see the comment on line 3285 above: returns ENOSYS). */
3286 struct file_operations ll_file_operations_noflock = {
3287 .read = ll_file_read,
3288 .write = ll_file_write,
3289 .ioctl = ll_file_ioctl,
3290 .open = ll_file_open,
3291 .release = ll_file_release,
3292 .mmap = ll_file_mmap,
3293 .llseek = ll_file_seek,
3294 .sendfile = ll_file_sendfile,
3296 #ifdef HAVE_F_OP_FLOCK
3297 .flock = ll_file_noflock,
3299 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; setattr entry depends on
 * whether the kernel carries the VFS intent patches. */
3302 struct inode_operations ll_file_inode_operations = {
3303 #ifdef HAVE_VFS_INTENT_PATCHES
3304 .setattr_raw = ll_setattr_raw,
3306 .setattr = ll_setattr,
3307 .truncate = ll_truncate,
3308 .getattr = ll_getattr,
3309 .permission = ll_inode_permission,
3310 .setxattr = ll_setxattr,
3311 .getxattr = ll_getxattr,
3312 .listxattr = ll_listxattr,
3313 .removexattr = ll_removexattr,
3316 /* dynamic ioctl number support routines */
/* Registry of dynamically registered ioctl handlers: a list of
 * llioc_data entries protected by a rw_semaphore (readers dispatch,
 * writers register/unregister). */
3317 static struct llioc_ctl_data {
3318 struct rw_semaphore ioc_sem;
3319 struct list_head ioc_head;
3321 __RWSEM_INITIALIZER(llioc.ioc_sem),
3322 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: callback plus the ioctl command numbers it
 * serves, stored in a trailing variable-length array. */
3327 struct list_head iocd_list;
3328 unsigned int iocd_size;
3329 llioc_callback_t iocd_cb;
3330 unsigned int iocd_count;
3331 unsigned int iocd_cmd[0];
/*
 * Register callback \a cb for the \a count ioctl commands in \a cmd.
 * Returns an opaque cookie (the allocation itself) for later
 * ll_iocontrol_unregister(), or NULL on bad arguments / allocation
 * failure. The entry is appended under the registry write lock.
 */
3334 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3337 struct llioc_data *in_data = NULL;
3340 if (cb == NULL || cmd == NULL ||
3341 count > LLIOC_MAX_CMD || count < 0)
/* Size includes the trailing iocd_cmd[] flexible array. */
3344 size = sizeof(*in_data) + count * sizeof(unsigned int);
3345 OBD_ALLOC(in_data, size);
3346 if (in_data == NULL)
3349 memset(in_data, 0, sizeof(*in_data));
3350 in_data->iocd_size = size;
3351 in_data->iocd_cb = cb;
3352 in_data->iocd_count = count;
3353 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3355 down_write(&llioc.ioc_sem);
3356 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3357 up_write(&llioc.ioc_sem);
/*
 * Remove and free the handler previously registered under cookie
 * \a magic. Warns (but does nothing else) if the cookie is unknown.
 */
3362 void ll_iocontrol_unregister(void *magic)
3364 struct llioc_data *tmp;
3369 down_write(&llioc.ioc_sem);
3370 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Size is cached before the entry is freed outside the lock. */
3372 unsigned int size = tmp->iocd_size;
3374 list_del(&tmp->iocd_list);
3375 up_write(&llioc.ioc_sem);
3377 OBD_FREE(tmp, size);
3381 up_write(&llioc.ioc_sem);
3383 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3386 EXPORT_SYMBOL(ll_iocontrol_register);
3387 EXPORT_SYMBOL(ll_iocontrol_unregister);
3389 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3390 unsigned int cmd, unsigned long arg, int *rcp)
3392 enum llioc_iter ret = LLIOC_CONT;
3393 struct llioc_data *data;
3394 int rc = -EINVAL, i;
3396 down_read(&llioc.ioc_sem);
3397 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3398 for (i = 0; i < data->iocd_count; i++) {
3399 if (cmd != data->iocd_cmd[i])
3402 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3406 if (ret == LLIOC_STOP)
3409 up_read(&llioc.ioc_sem);