lustre/llite/file.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  * GPL HEADER START
   5  *
   6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7  *
   8  * This program is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License version 2 only,
  10  * as published by the Free Software Foundation.
  11  *
  12  * This program is distributed in the hope that it will be useful, but
  13  * WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * General Public License version 2 for more details (a copy is included
  16  * in the LICENSE file that accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License
  19  * version 2 along with this program; If not, see
  20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  21  *
  22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
  23  * CA 95054 USA or visit www.sun.com if you need additional information or
  24  * have any questions.
  25  *
  26  * GPL HEADER END
  27  */
  28 /*
  29  * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
  30  * Use is subject to license terms.
  31  */
  32 /*
  33  * This file is part of Lustre, http://www.lustre.org/
  34  * Lustre is a trademark of Sun Microsystems, Inc.
  35  *
  36  * lustre/llite/file.c
  37  *
  38  * Author: Peter Braam <braam@clusterfs.com>
  39  * Author: Phil Schwan <phil@clusterfs.com>
  40  * Author: Andreas Dilger <adilger@clusterfs.com>
  41  */
  42
  43 #define DEBUG_SUBSYSTEM S_LLITE
  44 #include <lustre_dlm.h>
  45 #include <lustre_lite.h>
  46 #include <linux/pagemap.h>
  47 #include <linux/file.h>
  48 #include "llite_internal.h"
  49 #include <lustre/ll_fiemap.h>
  50
  51 /* also used by llite/special.c:ll_special_open() */
  52 struct ll_file_data *ll_file_data_get(void)
  53 {
  54         struct ll_file_data *fd;
  55
  56         OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
  57         return fd;
  58 }
  59
  60 static void ll_file_data_put(struct ll_file_data *fd)
  61 {
  62         if (fd != NULL)
  63                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
  64 }
  65
  66 static int ll_close_inode_openhandle(struct inode *inode,
  67                                      struct obd_client_handle *och)
  68 {
  69         struct ptlrpc_request *req = NULL;
  70         struct obd_device *obd;
  71         struct obdo *oa;
  72         struct mdc_op_data data = { { 0 } };
  73         obd_flag valid;
  74         int rc;
  75         ENTRY;
  76
  77         obd = class_exp2obd(ll_i2mdcexp(inode));
  78         if (obd == NULL) {
  79                 CERROR("Invalid MDC connection handle "LPX64"\n",
  80                        ll_i2mdcexp(inode)->exp_handle.h_cookie);
  81                 GOTO(out, rc = 0);
  82         }
  83
  84         /*
  85          * here we check if this is forced umount. If so this is called on
  86          * canceling "open lock" and we do not call mdc_close() in this case, as
  87          * it will not be successful, as import is already deactivated.
  88          */
  89         if (obd->obd_force)
  90                 GOTO(out, rc = 0);
  91
  92         OBDO_ALLOC(oa);
  93         if (!oa)
  94                 RETURN(-ENOMEM); // XXX We leak openhandle and request here.
  95
  96         oa->o_id = inode->i_ino;
  97         oa->o_valid = OBD_MD_FLID;
  98         valid = OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME |
  99                 OBD_MD_FLMTIME | OBD_MD_FLCTIME;
 100         if (S_ISREG(inode->i_mode))
 101                 valid |=  OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
 102         obdo_from_inode(oa, inode, valid);
 103         if (ll_is_inode_dirty(inode)) {
 104                 oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES;
 105                 oa->o_valid |= OBD_MD_FLFLAGS;
 106         }
 107         ll_inode2fid(&data.fid1, inode);
 108         rc = mdc_close(ll_i2mdcexp(inode), &data, oa, och, &req);
 109         if (rc == EAGAIN) {
 110                 /* We are the last writer, so the MDS has instructed us to get
 111                  * the file size and any write cookies, then close again. */
 112                 ll_queue_done_writing(inode);
 113                 rc = 0;
 114         } else if (rc) {
 115                 CERROR("inode %lu mdc close failed: rc = %d\n",
 116                        inode->i_ino, rc);
 117         }
 118
 119         OBDO_FREE(oa);
 120
 121         if (rc == 0) {
 122                 rc = ll_objects_destroy(req, inode);
 123                 if (rc)
 124                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
 125                                inode->i_ino, rc);
 126         }
 127
 128         ptlrpc_req_finished(req); /* This is close request */
 129         EXIT;
 130 out:
 131         mdc_clear_open_replay_data(och);
 132
 133         return rc;
 134 }
 135
 136 int ll_mdc_real_close(struct inode *inode, int flags)
 137 {
 138         struct ll_inode_info *lli = ll_i2info(inode);
 139         int rc = 0;
 140         struct obd_client_handle **och_p;
 141         struct obd_client_handle *och;
 142         __u64 *och_usecount;
 143
 144         ENTRY;
 145
 146         if (flags & FMODE_WRITE) {
 147                 och_p = &lli->lli_mds_write_och;
 148                 och_usecount = &lli->lli_open_fd_write_count;
 149         } else if (flags & FMODE_EXEC) {
 150                 och_p = &lli->lli_mds_exec_och;
 151                 och_usecount = &lli->lli_open_fd_exec_count;
 152          } else {
 153                 LASSERT(flags & FMODE_READ);
 154                 och_p = &lli->lli_mds_read_och;
 155                 och_usecount = &lli->lli_open_fd_read_count;
 156         }
 157
 158         down(&lli->lli_och_sem);
 159         if (*och_usecount) { /* There are still users of this handle, so
 160                                 skip freeing it. */
 161                 up(&lli->lli_och_sem);
 162                 RETURN(0);
 163         }
 164         och=*och_p;
 165         *och_p = NULL;
 166         up(&lli->lli_och_sem);
 167
 168         if (och) { /* There might be a race and somebody have freed this och
 169                       already */
 170                 rc = ll_close_inode_openhandle(inode, och);
 171                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
 172                 OBD_FREE(och, sizeof *och);
 173         }
 174
 175         RETURN(rc);
 176 }
 177
 178 int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode,
 179                         struct file *file)
 180 {
 181         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
 182         struct ll_inode_info *lli = ll_i2info(inode);
 183         int rc = 0;
 184         ENTRY;
 185
 186         /* clear group lock, if present */
 187         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
 188                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
 189                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
 190                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
 191                                       &fd->fd_cwlockh);
 192         }
 193
 194         /* Let's see if we have good enough OPEN lock on the file and if
 195            we can skip talking to MDS */
 196         if (file->f_dentry->d_inode) { /* Can this ever be false? */
 197                 int lockmode;
 198                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
 199                 struct lustre_handle lockh;
 200                 struct inode *inode = file->f_dentry->d_inode;
 201                 struct ldlm_res_id file_res_id;
 202
 203                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
 204                 fid_build_reg_res_name(ll_inode_lu_fid(inode), &file_res_id);
 205
 206                 down(&lli->lli_och_sem);
 207                 if (fd->fd_omode & FMODE_WRITE) {
 208                         lockmode = LCK_CW;
 209                         LASSERT(lli->lli_open_fd_write_count);
 210                         lli->lli_open_fd_write_count--;
 211                 } else if (fd->fd_omode & FMODE_EXEC) {
 212                         lockmode = LCK_PR;
 213                         LASSERT(lli->lli_open_fd_exec_count);
 214                         lli->lli_open_fd_exec_count--;
 215                 } else {
 216                         lockmode = LCK_CR;
 217                         LASSERT(lli->lli_open_fd_read_count);
 218                         lli->lli_open_fd_read_count--;
 219                 }
 220                 up(&lli->lli_och_sem);
 221
 222                 if (!ldlm_lock_match(mdc_exp->exp_obd->obd_namespace, flags,
 223                                      &file_res_id, LDLM_IBITS, &policy,lockmode,
 224                                      &lockh)) {
 225                         rc = ll_mdc_real_close(file->f_dentry->d_inode,
 226                                                 fd->fd_omode);
 227                 }
 228         } else {
 229                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
 230                        file, file->f_dentry, file->f_dentry->d_name.name);
 231         }
 232
 233         LUSTRE_FPRIVATE(file) = NULL;
 234         ll_file_data_put(fd);
 235
 236         RETURN(rc);
 237 }
 238
 239 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
 240
 241 /* While this returns an error code, fput() the caller does not, so we need
 242  * to make every effort to clean up all of our state here.  Also, applications
 243  * rarely check close errors and even if an error is returned they will not
 244  * re-try the close call.
 245  */
 246 int ll_file_release(struct inode *inode, struct file *file)
 247 {
 248         struct ll_file_data *fd;
 249         struct ll_sb_info *sbi = ll_i2sbi(inode);
 250         struct ll_inode_info *lli = ll_i2info(inode);
 251         struct lov_stripe_md *lsm = lli->lli_smd;
 252         int rc;
 253         ENTRY;
 254
 255         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
 256                inode->i_generation, inode);
 257
 258         if (inode->i_sb->s_root != file->f_dentry)
 259                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
 260         fd = LUSTRE_FPRIVATE(file);
 261         LASSERT(fd != NULL);
 262
 263         /* The last ref on @file, maybe not the the owner pid of statahead.
 264          * Different processes can open the same dir, "ll_opendir_key" means:
 265          * it is me that should stop the statahead thread. */
 266         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
 267                 ll_stop_statahead(inode, lli->lli_opendir_key);
 268
 269         if (inode->i_sb->s_root == file->f_dentry) {
 270                 LUSTRE_FPRIVATE(file) = NULL;
 271                 ll_file_data_put(fd);
 272                 RETURN(0);
 273         }
 274
 275         if (lsm)
 276                 lov_test_and_clear_async_rc(lsm);
 277         lli->lli_async_rc = 0;
 278
 279         rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
 280
 281         if (OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, obd_fail_val))
 282                 libcfs_debug_dumplog();
 283
 284         RETURN(rc);
 285 }
 286
 287 static int ll_intent_file_open(struct file *file, void *lmm,
 288                                int lmmsize, struct lookup_intent *itp)
 289 {
 290         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
 291         struct mdc_op_data data = { { 0 } };
 292         struct dentry *parent = file->f_dentry->d_parent;
 293         const char *name = file->f_dentry->d_name.name;
 294         const int len = file->f_dentry->d_name.len;
 295         struct inode *inode = file->f_dentry->d_inode;
 296         struct ptlrpc_request *req;
 297         int rc;
 298         ENTRY;
 299
 300         if (!parent)
 301                 RETURN(-ENOENT);
 302
 303         ll_prepare_mdc_op_data(&data, parent->d_inode, inode,
 304                                name, len, O_RDWR, NULL);
 305
 306         /* Usually we come here only for NFSD, and we want open lock.
 307            But we can also get here with pre 2.6.15 patchless kernels, and in
 308            that case that lock is also ok */
 309         /* We can also get here if there was cached open handle in revalidate_it
 310          * but it disappeared while we were getting from there to ll_file_open.
 311          * But this means this file was closed and immediatelly opened which
 312          * makes a good candidate for using OPEN lock */
 313         /* If lmmsize & lmm are not 0, we are just setting stripe info
 314          * parameters. No need for the open lock */
 315         if (!lmm && !lmmsize)
 316                 itp->it_flags |= MDS_OPEN_LOCK;
 317
 318         rc = mdc_intent_lock(sbi->ll_mdc_exp, &data, lmm, lmmsize, itp,
 319                               0 /*unused */, &req, ll_mdc_blocking_ast, 0);
 320         if (rc == -ESTALE) {
 321                 /* reason for keep own exit path - don`t flood log
 322                 * with messages with -ESTALE errors.
 323                 */
 324                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
 325                      it_open_error(DISP_OPEN_OPEN, itp))
 326                         GOTO(out, rc);
 327                 ll_release_openhandle(file->f_dentry, itp);
 328                 GOTO(out, rc);
 329         }
 330
 331         if (it_disposition(itp, DISP_LOOKUP_NEG))
 332                 GOTO(out, rc = -ENOENT);
 333
 334         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
 335                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
 336                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
 337                 GOTO(out, rc);
 338         }
 339
 340         rc = ll_prep_inode(sbi->ll_osc_exp, &file->f_dentry->d_inode,
 341                            req, DLM_REPLY_REC_OFF, NULL);
 342         if (itp->d.lustre.it_lock_mode)
 343                 mdc_set_lock_data(&itp->d.lustre.it_lock_handle,
 344                                   inode, NULL);
 345
 346 out:
 347         ptlrpc_req_finished(itp->d.lustre.it_data);
 348         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
 349         ll_intent_drop_lock(itp);
 350
 351         RETURN(rc);
 352 }
 353
 354
 355 static void ll_och_fill(struct ll_inode_info *lli, struct lookup_intent *it,
 356                         struct obd_client_handle *och)
 357 {
 358         struct ptlrpc_request *req = it->d.lustre.it_data;
 359         struct mds_body *body;
 360
 361         LASSERT(och);
 362
 363         body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
 364         LASSERT(body != NULL);                  /* reply already checked out */
 365         /* and swabbed in mdc_enqueue */
 366         LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
 367
 368         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
 369         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
 370         lli->lli_io_epoch = body->io_epoch;
 371
 372         mdc_set_open_replay_data(och, it->d.lustre.it_data);
 373 }
 374
 375 int ll_local_open(struct file *file, struct lookup_intent *it,
 376                   struct ll_file_data *fd, struct obd_client_handle *och)
 377 {
 378         ENTRY;
 379
 380         LASSERT(!LUSTRE_FPRIVATE(file));
 381
 382         LASSERT(fd != NULL);
 383
 384         if (och)
 385                 ll_och_fill(ll_i2info(file->f_dentry->d_inode), it, och);
 386         LUSTRE_FPRIVATE(file) = fd;
 387         ll_readahead_init(file->f_dentry->d_inode, &fd->fd_ras);
 388         fd->fd_omode = it->it_flags;
 389
 390         RETURN(0);
 391 }
 392
 393 /* Open a file, and (for the very first open) create objects on the OSTs at
 394  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 395  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
 396  * lli_open_sem to ensure no other process will create objects, send the
 397  * stripe MD to the MDS, or try to destroy the objects if that fails.
 398  *
 399  * If we already have the stripe MD locally then we don't request it in
 400  * mdc_open(), by passing a lmm_size = 0.
 401  *
 402  * It is up to the application to ensure no other processes open this file
 403  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 404  * used.  We might be able to avoid races of that sort by getting lli_open_sem
 405  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 406  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 407  */
 408 int ll_file_open(struct inode *inode, struct file *file)
 409 {
 410         struct ll_inode_info *lli = ll_i2info(inode);
 411         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
 412                                           .it_flags = file->f_flags };
 413         struct lov_stripe_md *lsm;
 414         struct obd_client_handle **och_p = NULL;
 415         __u64 *och_usecount = NULL;
 416         struct ll_file_data *fd;
 417         int rc = 0, opendir_set = 0;
 418         ENTRY;
 419
 420         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
 421                inode->i_generation, inode, file->f_flags);
 422
 423 #ifdef HAVE_VFS_INTENT_PATCHES
 424         it = file->f_it;
 425 #else
 426         it = file->private_data; /* XXX: compat macro */
 427         file->private_data = NULL; /* prevent ll_local_open assertion */
 428 #endif
 429
 430         fd = ll_file_data_get();
 431         if (fd == NULL)
 432                 GOTO(out_och_free, rc = -ENOMEM);
 433
 434         if (S_ISDIR(inode->i_mode)) {
 435                 spin_lock(&lli->lli_lock);
 436                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
 437                         LASSERT(lli->lli_sai == NULL);
 438                         lli->lli_opendir_key = fd;
 439                         lli->lli_opendir_pid = cfs_curproc_pid();
 440                         opendir_set = 1;
 441                 }
 442                 spin_unlock(&lli->lli_lock);
 443         }
 444
 445         if (inode->i_sb->s_root == file->f_dentry) {
 446                 LUSTRE_FPRIVATE(file) = fd;
 447                 RETURN(0);
 448         }
 449
 450         if (!it || !it->d.lustre.it_disposition) {
 451                 /* Convert f_flags into access mode. We cannot use file->f_mode,
 452                  * because everything but O_ACCMODE mask was stripped from it */
 453                 if ((oit.it_flags + 1) & O_ACCMODE)
 454                         oit.it_flags++;
 455                 if (file->f_flags & O_TRUNC)
 456                         oit.it_flags |= FMODE_WRITE;
 457
 458                 /* kernel only call f_op->open in dentry_open.  filp_open calls
 459                  * dentry_open after call to open_namei that checks permissions.
 460                  * Only nfsd_open call dentry_open directly without checking
 461                  * permissions and because of that this code below is safe. */
 462                 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
 463                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
 464
 465                 /* We do not want O_EXCL here, presumably we opened the file
 466                  * already? XXX - NFS implications? */
 467                 oit.it_flags &= ~O_EXCL;
 468
 469                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
 470                  * created if necessary, then "IT_CREAT" should be set to keep
 471                  * consistent with it */
 472                 if (oit.it_flags & O_CREAT)
 473                         oit.it_op |= IT_CREAT;
 474
 475                 it = &oit;
 476         }
 477
 478         if (ll_i2sbi(inode)->ll_direct_io_default &&
 479             !S_ISDIR(inode->i_mode) &&
 480             !(it->it_flags & FMODE_EXEC))
 481                 file->f_flags |= O_DIRECT;
 482
 483 restart:
 484         /* Let's see if we have file open on MDS already. */
 485         if (it->it_flags & FMODE_WRITE) {
 486                 och_p = &lli->lli_mds_write_och;
 487                 och_usecount = &lli->lli_open_fd_write_count;
 488         } else if (it->it_flags & FMODE_EXEC) {
 489                 och_p = &lli->lli_mds_exec_och;
 490                 och_usecount = &lli->lli_open_fd_exec_count;
 491          } else {
 492                 och_p = &lli->lli_mds_read_och;
 493                 och_usecount = &lli->lli_open_fd_read_count;
 494         }
 495
 496         LASSERTF(it->it_flags != 0, "it %p dist %d \n", it,
 497                  it->d.lustre.it_disposition);
 498
 499         down(&lli->lli_och_sem);
 500         if (*och_p) { /* Open handle is present */
 501                 if (it_disposition(it, DISP_OPEN_OPEN)) {
 502                         /* Well, there's extra open request that we do not need,
 503                            let's close it somehow. This will decref request. */
 504                         rc = it_open_error(DISP_OPEN_OPEN, it);
 505                         if (rc) {
 506                                 up(&lli->lli_och_sem);
 507                                 GOTO(out_openerr, rc);
 508                         }
 509                         ll_release_openhandle(file->f_dentry, it);
 510                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
 511                                              LPROC_LL_OPEN);
 512                 }
 513                 (*och_usecount)++;
 514
 515                 rc = ll_local_open(file, it, fd, NULL);
 516
 517                 LASSERTF(rc == 0, "rc = %d\n", rc);
 518         } else {
 519                 LASSERT(*och_usecount == 0);
 520                 if (!it->d.lustre.it_disposition) {
 521                         /* We cannot just request lock handle now, new ELC code
 522                            means that one of other OPEN locks for this file
 523                            could be cancelled, and since blocking ast handler
 524                            would attempt to grab och_sem as well, that would
 525                            result in a deadlock */
 526                         up(&lli->lli_och_sem);
 527                         it->it_create_mode |= M_CHECK_STALE;
 528                         rc = ll_intent_file_open(file, NULL, 0, it);
 529                         it->it_create_mode &= ~M_CHECK_STALE;
 530                         if (rc)
 531                                 GOTO(out_openerr, rc);
 532
 533                         goto restart;
 534                 }
 535
 536                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
 537                 if (!*och_p)
 538                         GOTO(out_och_free, rc = -ENOMEM);
 539
 540                 (*och_usecount)++;
 541
 542                 /* mdc_intent_lock() didn't get a request ref if there was an
 543                  * open error, so don't do cleanup on the request here
 544                  * (bug 3430) */
 545                 /* XXX (green): Should not we bail out on any error here, not
 546                  * just open error? */
 547                 rc = it_open_error(DISP_OPEN_OPEN, it);
 548                 if (rc)
 549                         GOTO(out_och_free, rc);
 550
 551                 LASSERT(it_disposition(it, DISP_ENQ_OPEN_REF));
 552
 553                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
 554                 rc = ll_local_open(file, it, fd, *och_p);
 555                 LASSERTF(rc == 0, "rc = %d\n", rc);
 556         }
 557         up(&lli->lli_och_sem);
 558         fd = NULL;
 559
 560         /* Must do this outside lli_och_sem lock to prevent deadlock where
 561            different kind of OPEN lock for this same inode gets cancelled
 562            by ldlm_cancel_lru */
 563         if (!S_ISREG(inode->i_mode))
 564                 GOTO(out_och_free, rc);
 565
 566         lsm = lli->lli_smd;
 567         if (lsm == NULL) {
 568                 if (file->f_flags & O_LOV_DELAY_CREATE ||
 569                     !(file->f_mode & FMODE_WRITE)) {
 570                         CDEBUG(D_INODE, "object creation was delayed\n");
 571                         GOTO(out_och_free, rc);
 572                 }
 573         }
 574         file->f_flags &= ~O_LOV_DELAY_CREATE;
 575         GOTO(out_och_free, rc);
 576
 577 out_och_free:
 578         if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
 579                 ptlrpc_req_finished(it->d.lustre.it_data);
 580                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 581         }
 582
 583         if (rc == 0) {
 584                 ll_open_complete(inode);
 585         } else {
 586                 if (och_p && *och_p) {
 587                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
 588                         *och_p = NULL; /* OBD_FREE writes some magic there */
 589                         (*och_usecount)--;
 590                 }
 591                 up(&lli->lli_och_sem);
 592 out_openerr:
 593                 if (opendir_set != 0)
 594                         ll_stop_statahead(inode, lli->lli_opendir_key);
 595                 if (fd != NULL)
 596                         ll_file_data_put(fd);
 597         }
 598
 599         return rc;
 600 }
 601
 602 /* Fills the obdo with the attributes for the inode defined by lsm */
 603 int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm,
 604                    struct obdo *oa)
 605 {
 606         struct ptlrpc_request_set *set;
 607         struct obd_info oinfo = { { { 0 } } };
 608         int rc;
 609         ENTRY;
 610
 611         LASSERT(lsm != NULL);
 612
 613         memset(oa, 0, sizeof *oa);
 614         oinfo.oi_md = lsm;
 615         oinfo.oi_oa = oa;
 616         oa->o_id = lsm->lsm_object_id;
 617         oa->o_gr = lsm->lsm_object_gr;
 618         oa->o_mode = S_IFREG;
 619         oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE |
 620                 OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
 621                 OBD_MD_FLCTIME | OBD_MD_FLGROUP;
 622
 623         set = ptlrpc_prep_set();
 624         if (set == NULL) {
 625                 rc = -ENOMEM;
 626         } else {
 627                 rc = obd_getattr_async(exp, &oinfo, set);
 628                 if (rc == 0)
 629                         rc = ptlrpc_set_wait(set);
 630                 ptlrpc_set_destroy(set);
 631         }
 632         if (rc)
 633                 RETURN(rc);
 634
 635         oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
 636                         OBD_MD_FLCTIME | OBD_MD_FLSIZE);
 637         RETURN(0);
 638 }
 639
 640 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
 641 {
 642         struct ll_inode_info *lli = ll_i2info(inode);
 643         struct lov_stripe_md *lsm = lli->lli_smd;
 644         struct obd_export *exp = ll_i2obdexp(inode);
 645         struct {
 646                 char name[16];
 647                 struct ldlm_lock *lock;
 648         } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
 649         __u32 stripe, vallen = sizeof(stripe);
 650         struct lov_oinfo *loinfo;
 651         int rc;
 652         ENTRY;
 653
 654         if (lsm->lsm_stripe_count == 1)
 655                 GOTO(check, stripe = 0);
 656
 657         /* get our offset in the lov */
 658         rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
 659         if (rc != 0) {
 660                 CERROR("obd_get_info: rc = %d\n", rc);
 661                 RETURN(rc);
 662         }
 663         LASSERT(stripe < lsm->lsm_stripe_count);
 664
 665 check:
 666         loinfo = lsm->lsm_oinfo[stripe];
 667         if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
 668                             &lock->l_resource->lr_name)) {
 669                 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
 670                            loinfo->loi_id, loinfo->loi_gr);
 671                 RETURN(-ELDLM_NO_LOCK_DATA);
 672         }
 673
 674         RETURN(stripe);
 675 }
 676
 677 /* Get extra page reference to ensure it is not going away */
 678 void ll_pin_extent_cb(void *data)
 679 {
 680         struct page *page = data;
 681
 682         page_cache_get(page);
 683
 684         return;
 685 }
 686 /* Flush the page from page cache for an extent as its canceled.
 687  * Page to remove is delivered as @data.
 688  *
 689  * No one can dirty the extent until we've finished our work and they cannot
 690  * enqueue another lock.  The DLM protects us from ll_file_read/write here,
 691  * but other kernel actors could have pages locked.
 692  *
 693  * If @discard is set, there is no need to write the page if it is dirty.
 694  *
 695  * Called with the DLM lock held. */
 696 int ll_page_removal_cb(void *data, int discard)
 697 {
 698         int rc;
 699         struct page *page = data;
 700         struct address_space *mapping;
 701
 702         ENTRY;
 703
 704         /* We have page reference already from ll_pin_page */
 705         lock_page(page);
 706
 707         /* Already truncated by somebody */
 708         if (!page->mapping)
 709                 GOTO(out, rc = 0);
 710
 711         mapping = page->mapping;
 712
 713         ll_teardown_mmaps(mapping,
 714                           (__u64)page->index << PAGE_CACHE_SHIFT,
 715                           ((__u64)page->index<<PAGE_CACHE_SHIFT)|
 716                                                               ~PAGE_CACHE_MASK);
 717         LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
 718         if (!discard && PageWriteback(page))
 719                 wait_on_page_writeback(page);
 720
 721         if (!discard && clear_page_dirty_for_io(page)) {
 722                 rc = ll_call_writepage(page->mapping->host, page);
 723                 /* either waiting for io to complete or reacquiring
 724                  * the lock that the failed writepage released */
 725                 lock_page(page);
 726                 wait_on_page_writeback(page);
 727                 if (rc < 0) {
 728                         CERROR("writepage inode %lu(%p) of page %p "
 729                                "failed: %d\n", mapping->host->i_ino,
 730                                mapping->host, page, rc);
 731                         if (rc == -ENOSPC)
 732                                 set_bit(AS_ENOSPC, &mapping->flags);
 733                         else
 734                                 set_bit(AS_EIO, &mapping->flags);
 735                 }
 736         }
 737         if (page->mapping != NULL) {
 738                 struct ll_async_page *llap = llap_cast_private(page);
 739                 // checking again to account for writeback's lock_page()
 740                 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
 741                 if (llap)
 742                         ll_ra_accounting(llap, page->mapping);
 743                 ll_truncate_complete_page(page);
 744         }
 745         EXIT;
 746 out:
 747         LASSERT(!PageWriteback(page));
 748         unlock_page(page);
 749         page_cache_release(page);
 750
 751         return 0;
 752 }
 753
 754 int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
 755                              void *data, int flag)
 756 {
 757         struct inode *inode;
 758         struct ll_inode_info *lli;
 759         struct lov_stripe_md *lsm;
 760         int stripe;
 761         __u64 kms;
 762
 763         ENTRY;
 764
 765         if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
 766                 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
 767                 LBUG();
 768         }
 769
 770         inode = ll_inode_from_lock(lock);
 771         if (inode == NULL)
 772                 RETURN(0);
 773         lli = ll_i2info(inode);
 774         if (lli == NULL)
 775                 GOTO(iput, 0);
 776         if (lli->lli_smd == NULL)
 777                 GOTO(iput, 0);
 778         lsm = lli->lli_smd;
 779
 780         stripe = ll_lock_to_stripe_offset(inode, lock);
 781         if (stripe < 0)
 782                 GOTO(iput, 0);
 783
 784         lov_stripe_lock(lsm);
 785         lock_res_and_lock(lock);
 786         kms = ldlm_extent_shift_kms(lock,
 787                                     lsm->lsm_oinfo[stripe]->loi_kms);
 788
 789         if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
 790                 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
 791                            lsm->lsm_oinfo[stripe]->loi_kms, kms);
 792         lsm->lsm_oinfo[stripe]->loi_kms = kms;
 793         unlock_res_and_lock(lock);
 794         lov_stripe_unlock(lsm);
 795         ll_try_done_writing(inode);
 796         EXIT;
 797 iput:
 798         iput(inode);
 799
 800         return 0;
 801 }
 802
 803 #if 0
 804 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
 805 {
 806         /* XXX ALLOCATE - 160 bytes */
 807         struct inode *inode = ll_inode_from_lock(lock);
 808         struct ll_inode_info *lli = ll_i2info(inode);
 809         struct lustre_handle lockh = { 0 };
 810         struct ost_lvb *lvb;
 811         int stripe;
 812         ENTRY;
 813
 814         if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
 815                      LDLM_FL_BLOCK_CONV)) {
 816                 LBUG(); /* not expecting any blocked async locks yet */
 817                 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
 818                            "lock, returning");
 819                 ldlm_lock_dump(D_OTHER, lock, 0);
 820                 ldlm_reprocess_all(lock->l_resource);
 821                 RETURN(0);
 822         }
 823
 824         LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
 825
 826         stripe = ll_lock_to_stripe_offset(inode, lock);
 827         if (stripe < 0)
 828                 goto iput;
 829
 830         if (lock->l_lvb_len) {
 831                 struct lov_stripe_md *lsm = lli->lli_smd;
 832                 __u64 kms;
 833                 lvb = lock->l_lvb_data;
 834                 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
 835
 836                 lock_res_and_lock(lock);
 837                 ll_inode_size_lock(inode, 1);
 838                 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
 839                 kms = ldlm_extent_shift_kms(NULL, kms);
 840                 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
 841                         LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
 842                                    lsm->lsm_oinfo[stripe].loi_kms, kms);
 843                 lsm->lsm_oinfo[stripe].loi_kms = kms;
 844                 ll_inode_size_unlock(inode, 1);
 845                 unlock_res_and_lock(lock);
 846         }
 847
 848 iput:
 849         iput(inode);
 850         wake_up(&lock->l_waitq);
 851
 852         ldlm_lock2handle(lock, &lockh);
 853         ldlm_lock_decref(&lockh, LCK_PR);
 854         RETURN(0);
 855 }
 856 #endif
 857
 858 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
 859 {
 860         struct ptlrpc_request *req = reqp;
 861         struct inode *inode = ll_inode_from_lock(lock);
 862         struct ll_inode_info *lli;
 863         struct lov_stripe_md *lsm;
 864         struct ost_lvb *lvb;
 865         int rc, stripe;
 866         int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
 867         ENTRY;
 868
 869         if (inode == NULL)
 870                 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
 871         lli = ll_i2info(inode);
 872         if (lli == NULL)
 873                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
 874         lsm = lli->lli_smd;
 875         if (lsm == NULL)
 876                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
 877
 878         /* First, find out which stripe index this lock corresponds to. */
 879         stripe = ll_lock_to_stripe_offset(inode, lock);
 880         if (stripe < 0)
 881                 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
 882
 883         rc = lustre_pack_reply(req, 2, size, NULL);
 884         if (rc)
 885                 GOTO(iput, rc);
 886
 887         lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
 888         lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
 889         lvb->lvb_mtime = LTIME_S(inode->i_mtime);
 890         lvb->lvb_atime = LTIME_S(inode->i_atime);
 891         lvb->lvb_ctime = LTIME_S(inode->i_ctime);
 892
 893         LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
 894                    " atime "LPU64", mtime "LPU64", ctime "LPU64,
 895                    i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_atime,
 896                    lvb->lvb_mtime, lvb->lvb_ctime);
 897  iput:
 898         iput(inode);
 899
 900  out:
 901         /* These errors are normal races, so we don't want to fill the console
 902          * with messages by calling ptlrpc_error() */
 903         if (rc == -ELDLM_NO_LOCK_DATA)
 904                 lustre_pack_reply(req, 1, NULL, NULL);
 905
 906         req->rq_status = rc;
 907         return rc;
 908 }
 909
 910 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
 911                      lstat_t *st)
 912 {
 913         struct lustre_handle lockh = { 0 };
 914         struct ldlm_enqueue_info einfo = { 0 };
 915         struct obd_info oinfo = { { { 0 } } };
 916         struct ost_lvb lvb;
 917         int rc;
 918
 919         ENTRY;
 920
 921         einfo.ei_type = LDLM_EXTENT;
 922         einfo.ei_mode = LCK_PR;
 923         einfo.ei_cb_bl = osc_extent_blocking_cb;
 924         einfo.ei_cb_cp = ldlm_completion_ast;
 925         einfo.ei_cb_gl = ll_glimpse_callback;
 926         einfo.ei_cbdata = NULL;
 927
 928         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
 929         oinfo.oi_lockh = &lockh;
 930         oinfo.oi_md = lsm;
 931         oinfo.oi_flags = LDLM_FL_HAS_INTENT;
 932
 933         rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
 934         if (rc == -ENOENT)
 935                 RETURN(rc);
 936         if (rc != 0) {
 937                 CERROR("obd_enqueue returned rc %d, "
 938                        "returning -EIO\n", rc);
 939                 RETURN(rc > 0 ? -EIO : rc);
 940         }
 941
 942         lov_stripe_lock(lsm);
 943         memset(&lvb, 0, sizeof(lvb));
 944         obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 0);
 945         st->st_size = lvb.lvb_size;
 946         st->st_blocks = lvb.lvb_blocks;
 947         st->st_mtime = lvb.lvb_mtime;
 948         st->st_atime = lvb.lvb_atime;
 949         st->st_ctime = lvb.lvb_ctime;
 950         lov_stripe_unlock(lsm);
 951
 952         RETURN(rc);
 953 }
 954
 955 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
 956  * file (because it prefers KMS over RSS when larger) */
 957 int ll_glimpse_size(struct inode *inode, int ast_flags)
 958 {
 959         struct ll_inode_info *lli = ll_i2info(inode);
 960         struct ll_sb_info *sbi = ll_i2sbi(inode);
 961         struct lustre_handle lockh = { 0 };
 962         struct ldlm_enqueue_info einfo = { 0 };
 963         struct obd_info oinfo = { { { 0 } } };
 964         struct ost_lvb lvb;
 965         int rc;
 966         ENTRY;
 967
 968         CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
 969
 970         if (!lli->lli_smd) {
 971                 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
 972                 RETURN(0);
 973         }
 974
 975         /* NOTE: this looks like DLM lock request, but it may not be one. Due
 976          *       to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
 977          *       won't revoke any conflicting DLM locks held. Instead,
 978          *       ll_glimpse_callback() will be called on each client
 979          *       holding a DLM lock against this file, and resulting size
 980          *       will be returned for each stripe. DLM lock on [0, EOF] is
 981          *       acquired only if there were no conflicting locks. */
 982         einfo.ei_type = LDLM_EXTENT;
 983         einfo.ei_mode = LCK_PR;
 984         einfo.ei_cb_bl = osc_extent_blocking_cb;
 985         einfo.ei_cb_cp = ldlm_completion_ast;
 986         einfo.ei_cb_gl = ll_glimpse_callback;
 987         einfo.ei_cbdata = inode;
 988
 989         oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
 990         oinfo.oi_lockh = &lockh;
 991         oinfo.oi_md = lli->lli_smd;
 992         oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
 993
 994         rc = obd_enqueue_rqset(sbi->ll_osc_exp, &oinfo, &einfo);
 995         if (rc == -ENOENT)
 996                 RETURN(rc);
 997         if (rc != 0) {
 998                 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
 999                 RETURN(rc > 0 ? -EIO : rc);
1000         }
1001
1002         ll_inode_size_lock(inode, 1);
1003         inode_init_lvb(inode, &lvb);
1004         /* merge timestamps the most recently obtained from mds with
1005            timestamps obtained from osts */
1006         lvb.lvb_atime = lli->lli_lvb.lvb_atime;
1007         lvb.lvb_mtime = lli->lli_lvb.lvb_mtime;
1008         lvb.lvb_ctime = lli->lli_lvb.lvb_ctime;
1009         rc = obd_merge_lvb(sbi->ll_osc_exp, lli->lli_smd, &lvb, 0);
1010         i_size_write(inode, lvb.lvb_size);
1011         inode->i_blocks = lvb.lvb_blocks;
1012         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1013         LTIME_S(inode->i_atime) = lvb.lvb_atime;
1014         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1015         ll_inode_size_unlock(inode, 1);
1016
1017         CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
1018                i_size_read(inode), (long long)inode->i_blocks);
1019
1020         RETURN(rc);
1021 }
1022
1023 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1024                    struct lov_stripe_md *lsm, int mode,
1025                    ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1026                    int ast_flags)
1027 {
1028         struct ll_sb_info *sbi = ll_i2sbi(inode);
1029         struct ost_lvb lvb;
1030         struct ldlm_enqueue_info einfo = { 0 };
1031         struct obd_info oinfo = { { { 0 } } };
1032         int rc;
1033         ENTRY;
1034
1035         LASSERT(!lustre_handle_is_used(lockh));
1036         LASSERT(lsm != NULL);
1037
1038         /* don't drop the mmapped file to LRU */
1039         if (mapping_mapped(inode->i_mapping))
1040                 ast_flags |= LDLM_FL_NO_LRU;
1041
1042         /* XXX phil: can we do this?  won't it screw the file size up? */
1043         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1044             (sbi->ll_flags & LL_SBI_NOLCK))
1045                 RETURN(0);
1046
1047         CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1048                inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1049
1050         einfo.ei_type = LDLM_EXTENT;
1051         einfo.ei_mode = mode;
1052         einfo.ei_cb_bl = osc_extent_blocking_cb;
1053         einfo.ei_cb_cp = ldlm_completion_ast;
1054         einfo.ei_cb_gl = ll_glimpse_callback;
1055         einfo.ei_cbdata = inode;
1056
1057         oinfo.oi_policy = *policy;
1058         oinfo.oi_lockh = lockh;
1059         oinfo.oi_md = lsm;
1060         oinfo.oi_flags = ast_flags;
1061
1062         rc = obd_enqueue(sbi->ll_osc_exp, &oinfo, &einfo, NULL);
1063         *policy = oinfo.oi_policy;
1064         if (rc > 0)
1065                 rc = -EIO;
1066
1067         ll_inode_size_lock(inode, 1);
1068         inode_init_lvb(inode, &lvb);
1069         obd_merge_lvb(sbi->ll_osc_exp, lsm, &lvb, 1);
1070
1071         if (policy->l_extent.start == 0 &&
1072             policy->l_extent.end == OBD_OBJECT_EOF) {
1073                 /* vmtruncate()->ll_truncate() first sets the i_size and then
1074                  * the kms under both a DLM lock and the
1075                  * ll_inode_size_lock().  If we don't get the
1076                  * ll_inode_size_lock() here we can match the DLM lock and
1077                  * reset i_size from the kms before the truncating path has
1078                  * updated the kms.  generic_file_write can then trust the
1079                  * stale i_size when doing appending writes and effectively
1080                  * cancel the result of the truncate.  Getting the
1081                  * ll_inode_size_lock() after the enqueue maintains the DLM
1082                  * -> ll_inode_size_lock() acquiring order. */
1083                 i_size_write(inode, lvb.lvb_size);
1084                 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1085                        inode->i_ino, i_size_read(inode));
1086         }
1087
1088         if (rc == 0) {
1089                 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1090                 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1091                 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1092         }
1093         ll_inode_size_unlock(inode, 1);
1094
1095         RETURN(rc);
1096 }
1097
1098 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1099                      struct lov_stripe_md *lsm, int mode,
1100                      struct lustre_handle *lockh)
1101 {
1102         struct ll_sb_info *sbi = ll_i2sbi(inode);
1103         int rc;
1104         ENTRY;
1105
1106         /* XXX phil: can we do this?  won't it screw the file size up? */
1107         if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1108             (sbi->ll_flags & LL_SBI_NOLCK))
1109                 RETURN(0);
1110
1111         rc = obd_cancel(sbi->ll_osc_exp, lsm, mode, lockh, 0, 0);
1112
1113         RETURN(rc);
1114 }
1115
1116 static void ll_set_file_contended(struct inode *inode)
1117 {
1118         struct ll_inode_info *lli = ll_i2info(inode);
1119
1120         lli->lli_contention_time = cfs_time_current();
1121         set_bit(LLI_F_CONTENDED, &lli->lli_flags);
1122 }
1123
1124 void ll_clear_file_contended(struct inode *inode)
1125 {
1126         struct ll_inode_info *lli = ll_i2info(inode);
1127
1128         clear_bit(LLI_F_CONTENDED, &lli->lli_flags);
1129 }
1130
1131 static int ll_is_file_contended(struct file *file)
1132 {
1133         struct inode *inode = file->f_dentry->d_inode;
1134         struct ll_inode_info *lli = ll_i2info(inode);
1135         struct ll_sb_info *sbi = ll_i2sbi(inode);
1136         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1137         ENTRY;
1138
1139         if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
1140                 CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
1141                        " osc connect flags = 0x"LPX64"\n",
1142                        sbi->ll_lco.lco_flags);
1143                 RETURN(0);
1144         }
1145
1146         if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
1147                 RETURN(0);
1148
1149         /* The semantics here is a bit complicated due to compatibility.
1150          * The user may be aware of per-file LL_FILE_LOCKED_DIRECTIO,
1151          * but not of per-client lockless_direct_io, so the file bit takes
1152          * precedence if it is set. If the file bit is not set, we use
1153          * lockless I/O unless per-client lockless_direct_io is set to zero.
1154          */
1155         CLASSERT(SBI_DEFAULT_LOCKLESS_DIRECT_IO == 1);
1156         if ((file->f_flags & O_DIRECT) &&
1157             !(fd && (fd->fd_flags & LL_FILE_LOCKED_DIRECTIO)) &&
1158             sbi->ll_lockless_direct_io)
1159                 RETURN(1);
1160
1161         /* server-side locking for cached I/O with LL_FILE_LOCKLESS_IO */
1162         if (!(file->f_flags & O_DIRECT) &&
1163             fd && fd->fd_flags & LL_FILE_LOCKLESS_IO)
1164                 RETURN(1);
1165
1166         if (test_bit(LLI_F_CONTENDED, &lli->lli_flags)) {
1167                 cfs_time_t cur_time = cfs_time_current();
1168                 cfs_time_t retry_time;
1169
1170                 retry_time = cfs_time_add(
1171                         lli->lli_contention_time,
1172                         cfs_time_seconds(sbi->ll_contention_time));
1173                 if (cfs_time_after(cur_time, retry_time)) {
1174                         ll_clear_file_contended(inode);
1175                         RETURN(0);
1176                 }
1177                 RETURN(1);
1178         }
1179         RETURN(0);
1180 }
1181
1182 static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree,
1183                                      struct file *file, const struct iovec *iov,
1184                                      unsigned long nr_segs,
1185                                      obd_off start, obd_off end, int rw)
1186 {
1187         int append;
1188         int tree_locked = 0;
1189         int rc;
1190         struct inode * inode = file->f_dentry->d_inode;
1191         ENTRY;
1192
1193         append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
1194
1195         if (append || !ll_is_file_contended(file)) {
1196                 struct ll_lock_tree_node *node;
1197                 int ast_flags;
1198
1199                 ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
1200                 if (file->f_flags & O_NONBLOCK)
1201                         ast_flags |= LDLM_FL_BLOCK_NOWAIT;
1202                 node = ll_node_from_inode(inode, start, end,
1203                                           (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
1204                 if (IS_ERR(node)) {
1205                         rc = PTR_ERR(node);
1206                         GOTO(out, rc);
1207                 }
1208                 tree->lt_fd = LUSTRE_FPRIVATE(file);
1209                 rc = ll_tree_lock_iov(tree, node, iov, nr_segs, ast_flags);
1210                 if (rc == 0)
1211                         tree_locked = 1;
1212                 else if (rc == -EUSERS)
1213                         ll_set_file_contended(inode);
1214                 else
1215                         GOTO(out, rc);
1216         }
1217         RETURN(tree_locked);
1218 out:
1219         return rc;
1220 }
1221
1222 /* XXX: exact copy from kernel code (__generic_file_aio_write_nolock from rhel4)
1223  */
1224 static size_t ll_file_get_iov_count(const struct iovec *iov,
1225                                      unsigned long *nr_segs)
1226 {
1227         size_t count = 0;
1228         unsigned long seg;
1229
1230         for (seg = 0; seg < *nr_segs; seg++) {
1231                 const struct iovec *iv = &iov[seg];
1232
1233                 /*
1234                  * If any segment has a negative length, or the cumulative
1235                  * length ever wraps negative then return -EINVAL.
1236                  */
1237                 count += iv->iov_len;
1238                 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
1239                         return -EINVAL;
1240                 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
1241                         continue;
1242                 if (seg == 0)
1243                         return -EFAULT;
1244                 *nr_segs = seg;
1245                 count -= iv->iov_len;   /* This segment is no good */
1246                 break;
1247         }
1248         return count;
1249 }
1250
/*
 * Build in @iov_copy an iovec array describing the next @size bytes of
 * the vector *@iov_out, starting *@offset bytes into its first segment.
 *
 * On return:
 *  - *@iov_out and *@nr_segs have been advanced past the segments that
 *    were consumed completely;
 *  - *@nrsegs_copy is the number of entries filled in @iov_copy;
 *  - *@offset is the position inside the (new) first segment of *@iov_out
 *    at which the next call has to continue.
 *
 * The caller has to make sure that @size does not exceed the bytes that
 * remain in the vector; otherwise *@nrsegs_copy ends up one larger than
 * the number of entries actually written.
 *
 * Always returns 0.
 */
static int iov_copy_update(unsigned long *nr_segs, const struct iovec **iov_out,
                           unsigned long *nrsegs_copy,
                           struct iovec *iov_copy, size_t *offset,
                           size_t size)
{
        const struct iovec *src = *iov_out;
        unsigned long used = 0;

        while (used < *nr_segs) {
                struct iovec *dst = &iov_copy[used];

                *dst = src[used];

                /* only the first segment can be partially consumed already */
                if (used == 0) {
                        dst->iov_base += *offset;
                        dst->iov_len -= *offset;
                }

                if (dst->iov_len >= size) {
                        /* this segment completes the chunk: trim it and
                         * remember where the next chunk starts inside it */
                        dst->iov_len = size;
                        *offset = (used == 0) ? *offset + size : size;
                        break;
                }

                size -= dst->iov_len;
                used++;
        }

        *iov_out += used;
        *nr_segs -= used;
        *nrsegs_copy = used + 1;

        return 0;
}
1283
1284 static int ll_get_short_lock(struct page *page, int rw, obd_off start,
1285                              obd_off end, struct lustre_handle *lockh)
1286 {
1287         struct ll_async_page *llap;
1288         struct obd_export *exp;
1289         struct inode *inode = page->mapping->host;
1290
1291         ENTRY;
1292
1293         exp = ll_i2obdexp(inode);
1294         if (exp == NULL)
1295                 RETURN(0);
1296
1297         llap = llap_cast_private(page);
1298         if (llap == NULL)
1299                 RETURN(0);
1300
1301         RETURN(obd_get_lock(exp, ll_i2info(inode)->lli_smd,
1302                             &llap->llap_cookie, rw, start, end, lockh,
1303                             OBD_FAST_LOCK));
1304 }
1305
1306 static void ll_release_short_lock(struct inode *inode, obd_off end,
1307                                   struct lustre_handle *lockh, int rw)
1308 {
1309         struct obd_export *exp;
1310         int rc;
1311
1312         exp = ll_i2obdexp(inode);
1313         if (exp == NULL)
1314                 return;
1315
1316         rc = obd_cancel(exp, ll_i2info(inode)->lli_smd,
1317                         rw = OBD_BRW_READ ? LCK_PR : LCK_PW, lockh,
1318                         OBD_FAST_LOCK, end);
1319         if (rc < 0)
1320                 CERROR("unlock failed (%d)\n", rc);
1321 }
1322
1323 static inline int ll_file_get_fast_lock(struct file *file,
1324                                         obd_off ppos, obd_off end,
1325                                         const struct iovec *iov,
1326                                         unsigned long nr_segs,
1327                                         struct lustre_handle *lockh,
1328                                         int rw)
1329 {
1330         int rc = 0, seg;
1331         struct page *page;
1332
1333         ENTRY;
1334
1335         /* we would like this read request to be lockfree */
1336         for (seg = 0; seg < nr_segs; seg++) {
1337                 const struct iovec *iv = &iov[seg];
1338                 if (ll_region_mapped((unsigned long)iv->iov_base, iv->iov_len))
1339                         GOTO(out, rc);
1340         }
1341
1342         page = find_lock_page(file->f_dentry->d_inode->i_mapping,
1343                               ppos >> CFS_PAGE_SHIFT);
1344         if (page) {
1345                 if (ll_get_short_lock(page, rw, ppos, end, lockh))
1346                         rc = 1;
1347
1348                 unlock_page(page);
1349                 page_cache_release(page);
1350         }
1351
1352 out:
1353         RETURN(rc);
1354 }
1355
1356 static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
1357                                          struct lustre_handle *lockh, int rw)
1358 {
1359         ll_release_short_lock(inode, end, lockh, rw);
1360 }
1361
1362 static inline int ll_file_get_lock(struct file *file, obd_off ppos,
1363                                    obd_off end, const struct iovec *iov,
1364                                    unsigned long nr_segs,
1365                                    struct lustre_handle *lockh,
1366                                    struct ll_lock_tree *tree, int rw)
1367 {
1368         int rc;
1369
1370         ENTRY;
1371
1372         if (ll_file_get_fast_lock(file, ppos, end, iov, nr_segs, lockh, rw))
1373                 RETURN(LL_LOCK_STYLE_FASTLOCK);
1374
1375         rc = ll_file_get_tree_lock_iov(tree, file, iov, nr_segs,
1376                                        ppos, end, rw);
1377         /* rc: 1 for tree lock, 0 for no lock, <0 for error */
1378         switch (rc) {
1379         case 1:
1380                 RETURN(LL_LOCK_STYLE_TREELOCK);
1381         case 0:
1382                 RETURN(LL_LOCK_STYLE_NOLOCK);
1383         }
1384
1385         /* an error happened if we reached this point, rc = -errno here */
1386         RETURN(rc);
1387 }
1388
1389 static inline void ll_file_put_lock(struct inode *inode, obd_off end,
1390                                     enum ll_lock_style lock_style,
1391                                     struct lustre_handle *lockh,
1392                                     struct ll_lock_tree *tree, int rw)
1393
1394 {
1395         switch (lock_style) {
1396         case LL_LOCK_STYLE_TREELOCK:
1397                 ll_tree_unlock(tree);
1398                 break;
1399         case LL_LOCK_STYLE_FASTLOCK:
1400                 ll_file_put_fast_lock(inode, end, lockh, rw);
1401                 break;
1402         default:
1403                 CERROR("invalid locking style (%d)\n", lock_style);
1404         }
1405 }
1406
1407 #ifdef HAVE_FILE_READV
1408 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
1409                               unsigned long nr_segs, loff_t *ppos)
1410 {
1411 #else
1412 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1413                                 unsigned long nr_segs, loff_t pos)
1414 {
1415         struct file *file = iocb->ki_filp;
1416         loff_t *ppos = &iocb->ki_pos;
1417 #endif
1418         struct inode *inode = file->f_dentry->d_inode;
1419         struct ll_inode_info *lli = ll_i2info(inode);
1420         struct lov_stripe_md *lsm = lli->lli_smd;
1421         struct ll_sb_info *sbi = ll_i2sbi(inode);
1422         struct ll_thread_data ltd = { 0 };
1423         struct ost_lvb lvb;
1424         struct ll_ra_read bead;
1425         int ra = 0;
1426         obd_off end;
1427         ssize_t retval, chunk, sum = 0;
1428         struct iovec *iov_copy = NULL;
1429         unsigned long nrsegs_copy, nrsegs_orig = 0;
1430         size_t count, iov_offset = 0;
1431         __u64 kms;
1432         ENTRY;
1433
1434         count = ll_file_get_iov_count(iov, &nr_segs);
1435         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%lld\n",
1436                inode->i_ino, inode->i_generation, inode, (unsigned long)count,
1437                *ppos);
1438         /* "If nbyte is 0, read() will return 0 and have no other results."
1439          *                      -- Single Unix Spec */
1440         if (count == 0)
1441                 RETURN(0);
1442
1443         ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1444
1445         if (!lsm) {
1446                 /* Read on file with no objects should return zero-filled
1447                  * buffers up to file size (we can get non-zero sizes with
1448                  * mknod + truncate, then opening file for read. This is a
1449                  * common pattern in NFS case, it seems). Bug 6243 */
1450                 int notzeroed;
1451                 /* Since there are no objects on OSTs, we have nothing to get
1452                  * lock on and so we are forced to access inode->i_size
1453                  * unguarded */
1454
1455                 /* Read beyond end of file */
1456                 if (*ppos >= i_size_read(inode))
1457                         RETURN(0);
1458
1459                 if (count > i_size_read(inode) - *ppos)
1460                         count = i_size_read(inode) - *ppos;
1461                 /* Make sure to correctly adjust the file pos pointer for
1462                  * EFAULT case */
1463                 for (nrsegs_copy = 0; nrsegs_copy < nr_segs; nrsegs_copy++) {
1464                         const struct iovec *iv = &iov[nrsegs_copy];
1465
1466                         if (count < iv->iov_len)
1467                                 chunk = count;
1468                         else
1469                                 chunk = iv->iov_len;
1470                         notzeroed = clear_user(iv->iov_base, chunk);
1471                         sum += (chunk - notzeroed);
1472                         count -= (chunk - notzeroed);
1473                         if (notzeroed || !count)
1474                                 break;
1475                 }
1476                 *ppos += sum;
1477                 if (!sum)
1478                         RETURN(-EFAULT);
1479                 RETURN(sum);
1480         }
1481
1482         ltd.ltd_magic = LTD_MAGIC;
1483         ll_td_set(&ltd);
1484 repeat:
1485         memset(&ltd, 0, sizeof(ltd));
1486         ltd.ltd_magic = LTD_MAGIC;
1487         if (sbi->ll_max_rw_chunk != 0 && !(file->f_flags & O_DIRECT)) {
1488                 /* first, let's know the end of the current stripe */
1489                 end = *ppos;
1490                 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,&end);
1491
1492                 /* correct, the end is beyond the request */
1493                 if (end > *ppos + count - 1)
1494                         end = *ppos + count - 1;
1495
1496                 /* and chunk shouldn't be too large even if striping is wide */
1497                 if (end - *ppos > sbi->ll_max_rw_chunk)
1498                         end = *ppos + sbi->ll_max_rw_chunk - 1;
1499
1500                 chunk = end - *ppos + 1;
1501                 if ((count == chunk) && (iov_offset == 0)) {
1502                         if (iov_copy)
1503                                 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1504
1505                         iov_copy = (struct iovec *)iov;
1506                         nrsegs_copy = nr_segs;
1507                 } else {
1508                         if (!iov_copy) {
1509                                 nrsegs_orig = nr_segs;
1510                                 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1511                                 if (!iov_copy)
1512                                         GOTO(out, retval = -ENOMEM);
1513                         }
1514
1515                         iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1516                                         &iov_offset, chunk);
1517                 }
1518         } else {
1519                 end = *ppos + count - 1;
1520                 iov_copy = (struct iovec *)iov;
1521                 nrsegs_copy = nr_segs;
1522         }
1523
1524         down_read(&lli->lli_truncate_rwsem); /* Bug 18233 */
1525
1526         ltd.lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
1527                                           iov_copy, nrsegs_copy,
1528                                           &ltd.u.lockh, &ltd.u.tree,
1529                                           OBD_BRW_READ);
1530         if (ltd.lock_style < 0 || ltd.lock_style == LL_LOCK_STYLE_NOLOCK)
1531                 up_read(&lli->lli_truncate_rwsem);
1532         if (ltd.lock_style < 0)
1533                 GOTO(out, retval = ltd.lock_style);
1534
1535         ll_inode_size_lock(inode, 1);
1536         /*
1537          * Consistency guarantees: following possibilities exist for the
1538          * relation between region being read and real file size at this
1539          * moment:
1540          *
1541          *  (A): the region is completely inside of the file;
1542          *
1543          *  (B-x): x bytes of region are inside of the file, the rest is
1544          *  outside;
1545          *
1546          *  (C): the region is completely outside of the file.
1547          *
1548          * This classification is stable under DLM lock acquired by
1549          * ll_tree_lock() above, because to change class, other client has to
1550          * take DLM lock conflicting with our lock. Also, any updates to
1551          * ->i_size by other threads on this client are serialized by
1552          * ll_inode_size_lock(). This guarantees that short reads are handled
1553          * correctly in the face of concurrent writes and truncates.
1554          */
1555         inode_init_lvb(inode, &lvb);
1556         obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
1557         kms = lvb.lvb_size;
1558         if (*ppos + count - 1 > kms) {
1559                 /* A glimpse is necessary to determine whether we return a
1560                  * short read (B) or some zeroes at the end of the buffer (C) */
1561                 ll_inode_size_unlock(inode, 1);
1562                 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1563                 if (retval) {
1564                         if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1565                                 ll_file_put_lock(inode, end, ltd.lock_style,
1566                                                  &ltd.u.lockh, &ltd.u.tree,
1567                                                  OBD_BRW_READ);
1568                                 up_read(&lli->lli_truncate_rwsem);
1569                         }
1570                         goto out;
1571                 } else {
1572                         /* If objective page index exceed the end-of-file page
1573                          * index, return directly. Do not expect kernel will
1574                          * check such case correctly. linux-2.6.18-128.1.1 miss
1575                          * to do that. --bug 17336 */
1576                         loff_t size = i_size_read(inode);
1577                         unsigned long cur_index = *ppos >> CFS_PAGE_SHIFT;
1578
1579                         if ((size == 0 && cur_index != 0) ||
1580                             (((size - 1) >> CFS_PAGE_SHIFT) < cur_index)) {
1581                                 if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1582
1583                                         ll_file_put_lock(inode, end,
1584                                                          ltd.lock_style,
1585                                                          &ltd.u.lockh,
1586                                                          &ltd.u.tree,
1587                                                          OBD_BRW_READ);
1588                                         up_read(&lli->lli_truncate_rwsem);
1589                                 }
1590                                 goto out;
1591                         }
1592                 }
1593         } else {
1594                 /* region is within kms and, hence, within real file size (A).
1595                  * We need to increase i_size to cover the read region so that
1596                  * generic_file_read() will do its job, but that doesn't mean
1597                  * the kms size is _correct_, it is only the _minimum_ size.
1598                  * If someone does a stat they will get the correct size which
1599                  * will always be >= the kms value here.  b=11081 */
1600                 if (i_size_read(inode) < kms)
1601                         i_size_write(inode, kms);
1602                 ll_inode_size_unlock(inode, 1);
1603         }
1604
1605         chunk = end - *ppos + 1;
1606         CDEBUG(D_INODE,"Read ino %lu, %ld bytes, offset %lld, i_size %llu\n",
1607                inode->i_ino, (long)chunk, *ppos, i_size_read(inode));
1608
1609         /* turn off the kernel's read-ahead */
1610         if (ltd.lock_style != LL_LOCK_STYLE_NOLOCK) {
1611                 struct ost_lvb *xtimes;
1612                 /* read under locks
1613                  *
1614                  * 1. update inode's atime as long as concurrent stat
1615                  * (via ll_glimpse_size) might bring out-of-date ones
1616                  *
1617                  * 2. update lsm so that next stat (via
1618                  * ll_glimpse_size) could get correct values in lsm */
1619                 OBD_ALLOC_PTR(xtimes);
1620                 if (NULL == xtimes) {
1621                         ll_file_put_lock(inode, end, ltd.lock_style,
1622                                          &ltd.u.lockh, &ltd.u.tree,
1623                                          OBD_BRW_READ);
1624                         up_read(&lli->lli_truncate_rwsem);
1625                         GOTO(out, retval = -ENOMEM);
1626                 }
1627
1628                 lov_stripe_lock(lsm);
1629                 LTIME_S(inode->i_atime) = LTIME_S(CURRENT_TIME);
1630                 xtimes->lvb_atime = LTIME_S(inode->i_atime);
1631                 obd_update_lvb(sbi->ll_osc_exp, lsm, xtimes,
1632                                OBD_MD_FLATIME);
1633                 lov_stripe_unlock(lsm);
1634                 OBD_FREE_PTR(xtimes);
1635
1636                 file->f_ra.ra_pages = 0;
1637                 /* initialize read-ahead window once per syscall */
1638                 if (ra == 0) {
1639                         ra = 1;
1640                         ll_ra_read_init(file, &bead, *ppos, count);
1641                 }
1642
1643                 /* BUG: 5972 */
1644                 file_accessed(file);
1645 #ifdef HAVE_FILE_READV
1646                 retval = generic_file_readv(file, iov_copy, nrsegs_copy, ppos);
1647 #else
1648                 retval = generic_file_aio_read(iocb, iov_copy, nrsegs_copy,
1649                                                *ppos);
1650 #endif
1651                 ll_file_put_lock(inode, end, ltd.lock_style, &ltd.u.lockh,
1652                                  &ltd.u.tree, OBD_BRW_READ);
1653                 up_read(&lli->lli_truncate_rwsem);
1654         } else {
1655                 file_accessed(file);
1656                 retval = ll_direct_IO(READ, file, iov_copy, *ppos, nr_segs, 0);
1657                 if (retval > 0) {
1658                         lprocfs_counter_add(sbi->ll_stats,
1659                                             LPROC_LL_LOCKLESS_READ,
1660                                             (long)retval);
1661                         *ppos += retval;
1662                 }
1663         }
1664         ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1665         if (retval > 0) {
1666                 count -= retval;
1667                 sum += retval;
1668                 if (retval == chunk && count > 0)
1669                         goto repeat;
1670         }
1671
1672  out:
1673         ll_td_set(NULL);
1674         if (ra != 0)
1675                 ll_ra_read_ex(file, &bead);
1676         retval = (sum > 0) ? sum : retval;
1677
1678         if (iov_copy && iov_copy != iov)
1679                 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1680
1681         RETURN(retval);
1682 }
1683
1684 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1685                             loff_t *ppos)
1686 {
1687         struct iovec local_iov = { .iov_base = (void __user *)buf,
1688                                    .iov_len = count };
1689 #ifdef HAVE_FILE_READV
1690         return ll_file_readv(file, &local_iov, 1, ppos);
1691 #else
1692         struct kiocb kiocb;
1693         ssize_t ret;
1694
1695         init_sync_kiocb(&kiocb, file);
1696         kiocb.ki_pos = *ppos;
1697         kiocb.ki_left = count;
1698
1699         ret = ll_file_aio_read(&kiocb, &local_iov, 1, kiocb.ki_pos);
1700         *ppos = kiocb.ki_pos;
1701         return ret;
1702 #endif
1703 }
1704
/*
 * iov_shorten from linux kernel.
 *
 * Trim the vector \a iov so that it describes at most \a to bytes.  The
 * segment in which the limit is reached has its iov_len cut down to the
 * bytes still needed; segments after it are left untouched.
 *
 * \param iov      segments to trim (modified in place)
 * \param nr_segs  number of entries in \a iov
 * \param to       byte limit
 *
 * \retval number of leading segments that make up the shortened vector;
 *         \a nr_segs if the vector holds fewer than \a to bytes
 */
static unsigned long ll_iov_shorten(struct iovec *iov,
                                    unsigned long nr_segs,
                                    size_t to)
{
        unsigned long idx;
        size_t consumed = 0;

        for (idx = 0; idx < nr_segs; idx++) {
                struct iovec *cur = &iov[idx];

                if (consumed + cur->iov_len >= to) {
                        /* limit reached inside (or at the end of) \a cur */
                        cur->iov_len = to - consumed;
                        return idx + 1;
                }
                consumed += cur->iov_len;
        }
        return nr_segs;
}
1724
1725 /* 2.6.22 and 2.6.27 export this as generic_segment_checks */
1726 static int ll_generic_segment_checks(const struct iovec *iov,
1727                                      unsigned long *nr_segs,
1728                                      size_t *count,
1729                                      int access_flags)
1730 {
1731         unsigned long   seg;
1732         size_t cnt = 0;
1733         for (seg = 0; seg < *nr_segs; seg++) {
1734                 const struct iovec *iv = &iov[seg];
1735
1736                 /*
1737                  * If any segment has a negative length, or the cumulative
1738                  * length ever wraps negative then return -EINVAL.
1739                  */
1740                 cnt += iv->iov_len;
1741                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1742                         return -EINVAL;
1743                 if (access_ok(access_flags, iv->iov_base, iv->iov_len))
1744                         continue;
1745                 if (seg == 0)
1746                         return -EFAULT;
1747                 *nr_segs = seg;
1748                 cnt -= iv->iov_len;  /* This segment is no good */
1749                 break;
1750         }
1751         *count = cnt;
1752         return 0;
1753 }
1754
1755 /*
1756  * Write to a file (through the page cache).
1757  */
1758 #ifdef HAVE_FILE_WRITEV
1759 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1760                               unsigned long nr_segs, loff_t *ppos)
1761 {
1762 #else /* AIO stuff */
1763 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1764                                  unsigned long nr_segs, loff_t pos)
1765 {
1766         struct file *file = iocb->ki_filp;
1767         loff_t *ppos = &iocb->ki_pos;
1768 #endif
1769         struct inode *inode = file->f_dentry->d_inode;
1770         struct ll_sb_info *sbi = ll_i2sbi(inode);
1771         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1772         struct ll_thread_data ltd = { 0 };
1773         loff_t maxbytes = ll_file_maxbytes(inode);
1774         loff_t lock_start, lock_end, end;
1775         ssize_t retval, chunk, sum = 0;
1776         int tree_locked;
1777         struct iovec *iov_copy = NULL;
1778         unsigned long nrsegs_copy, nrsegs_orig = 0;
1779         size_t count, iov_offset = 0;
1780         int got_write_sem = 0;
1781         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1782         ENTRY;
1783
1784         count = ll_file_get_iov_count(iov, &nr_segs);
1785
1786         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%Ld\n",
1787                inode->i_ino, inode->i_generation, inode, (unsigned long)count,
1788                *ppos);
1789
1790         SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1791
1792         /* POSIX, but surprised the VFS doesn't check this already */
1793         if (count == 0)
1794                 RETURN(0);
1795
1796         /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1797          * called on the file, don't fail the below assertion (bug 2388). */
1798         if (file->f_flags & O_LOV_DELAY_CREATE &&
1799             ll_i2info(inode)->lli_smd == NULL)
1800                 RETURN(-EBADF);
1801
1802         LASSERT(ll_i2info(inode)->lli_smd != NULL);
1803
1804         /* signal(7) specifies that write(2) and writev(2) should be restarted */
1805         if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK)) {
1806                 got_write_sem = 1;
1807                 if (down_interruptible(&ll_i2info(inode)->lli_write_sem))
1808                         RETURN(-ERESTARTSYS);
1809         }
1810
1811         ltd.ltd_magic = LTD_MAGIC;
1812         ll_td_set(&ltd);
1813 repeat:
1814         memset(&ltd, 0, sizeof(ltd));
1815         ltd.ltd_magic = LTD_MAGIC;
1816
1817         chunk = 0; /* just to fix gcc's warning */
1818         end = *ppos + count - 1;
1819
1820         if (file->f_flags & O_APPEND) {
1821                 lock_start = 0;
1822                 lock_end = OBD_OBJECT_EOF;
1823                 iov_copy = (struct iovec *)iov;
1824                 nrsegs_copy = nr_segs;
1825         } else if (sbi->ll_max_rw_chunk != 0 && !(file->f_flags & O_DIRECT)) {
1826                 /* first, let's know the end of the current stripe */
1827                 end = *ppos;
1828                 obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,
1829                                 (obd_off *)&end);
1830
1831                 /* correct, the end is beyond the request */
1832                 if (end > *ppos + count - 1)
1833                         end = *ppos + count - 1;
1834
1835                 /* and chunk shouldn't be too large even if striping is wide */
1836                 if (end - *ppos > sbi->ll_max_rw_chunk)
1837                         end = *ppos + sbi->ll_max_rw_chunk - 1;
1838                 lock_start = *ppos;
1839                 lock_end = end;
1840                 chunk = end - *ppos + 1;
1841                 if ((count == chunk) && (iov_offset == 0)) {
1842                         if (iov_copy)
1843                                 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1844
1845                         iov_copy = (struct iovec *)iov;
1846                         nrsegs_copy = nr_segs;
1847                 } else {
1848                         if (!iov_copy) {
1849                                 nrsegs_orig = nr_segs;
1850                                 OBD_ALLOC(iov_copy, sizeof(*iov) * nr_segs);
1851                                 if (!iov_copy)
1852                                         GOTO(out, retval = -ENOMEM);
1853                         }
1854                         iov_copy_update(&nr_segs, &iov, &nrsegs_copy, iov_copy,
1855                                         &iov_offset, chunk);
1856                 }
1857         } else {
1858                 lock_start = *ppos;
1859                 lock_end = end;
1860                 iov_copy = (struct iovec *)iov;
1861                 nrsegs_copy = nr_segs;
1862         }
1863
1864         tree_locked = ll_file_get_tree_lock_iov(&ltd.u.tree, file, iov_copy,
1865                                                 nrsegs_copy,
1866                                                 (obd_off)lock_start,
1867                                                 (obd_off)lock_end,
1868                                                 OBD_BRW_WRITE);
1869         if (tree_locked < 0)
1870                 GOTO(out, retval = tree_locked);
1871
1872         /* This is ok, g_f_w will overwrite this under i_sem if it races
1873          * with a local truncate, it just makes our maxbyte checking easier.
1874          * The i_size value gets updated in ll_extent_lock() as a consequence
1875          * of the [0,EOF] extent lock we requested above. */
1876         if (file->f_flags & O_APPEND) {
1877                 *ppos = i_size_read(inode);
1878                 end = *ppos + count - 1;
1879         }
1880
1881         if (*ppos >= maxbytes) {
1882                 send_sig(SIGXFSZ, current, 0);
1883                 GOTO(out_unlock, retval = -EFBIG);
1884         }
1885         if (end > maxbytes - 1)
1886                 end = maxbytes - 1;
1887
1888         /* generic_file_write handles O_APPEND after getting i_mutex */
1889         chunk = end - *ppos + 1;
1890         CDEBUG(D_INFO, "Writing inode %lu, %ld bytes, offset %Lu\n",
1891                inode->i_ino, (long)chunk, *ppos);
1892         if (tree_locked) {
1893                 struct ost_lvb *xtimes;
1894                 /* write under locks
1895                  *
1896                  * 1. update inode's mtime and ctime as long as
1897                  * concurrent stat (via ll_glimpse_size) might bring
1898                  * out-of-date ones
1899                  *
1900                  * 2. update lsm so that next stat (via
1901                  * ll_glimpse_size) could get correct values in lsm */
1902                 OBD_ALLOC_PTR(xtimes);
1903                 if (NULL == xtimes)
1904                         GOTO(out_unlock, retval = -ENOMEM);
1905
1906                 lov_stripe_lock(lsm);
1907                 LTIME_S(inode->i_mtime) = LTIME_S(CURRENT_TIME);
1908                 LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME);
1909                 xtimes->lvb_mtime = LTIME_S(inode->i_mtime);
1910                 xtimes->lvb_ctime = LTIME_S(inode->i_ctime);
1911                 obd_update_lvb(sbi->ll_osc_exp, lsm, xtimes,
1912                                OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1913                 lov_stripe_unlock(lsm);
1914                 OBD_FREE_PTR(xtimes);
1915
1916                 ltd.lock_style = LL_LOCK_STYLE_TREELOCK;
1917
1918 #ifdef HAVE_FILE_WRITEV
1919                 retval = generic_file_writev(file, iov_copy, nrsegs_copy, ppos);
1920 #else
1921                 retval = generic_file_aio_write(iocb, iov_copy, nrsegs_copy,
1922                                                 *ppos);
1923 #endif
1924         } else {
1925                 size_t ocount, ncount;
1926
1927                 retval = ll_generic_segment_checks(iov_copy, &nrsegs_copy,
1928                                                    &ocount, VERIFY_READ);
1929                 if (retval)
1930                         GOTO(out, retval);
1931
1932                 ncount = ocount;
1933
1934                 retval = generic_write_checks(file, ppos, &ncount, 0);
1935                 if (retval)
1936                         GOTO(out, retval);
1937
1938                 if (unlikely(ocount != ncount)) {
1939                         /* we are allowed to modify the original iov too */
1940                         nrsegs_copy = ll_iov_shorten(iov_copy, nrsegs_copy,
1941                                                      ncount);
1942                         chunk = 0; /* no repetition after the short write */
1943                 }
1944
1945                 retval = ll_remove_suid(file, file->f_vfsmnt);
1946                 if (retval)
1947                         GOTO(out, retval);
1948
1949                 ll_update_time(file);
1950                 retval = ll_direct_IO(WRITE, file, iov_copy, *ppos, nr_segs, 0);
1951                 if (retval > 0) {
1952                         lprocfs_counter_add(sbi->ll_stats,
1953                                             LPROC_LL_LOCKLESS_WRITE,
1954                                             (long)retval);
1955                         *ppos += retval;
1956                 }
1957         }
1958         ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
1959
1960 out_unlock:
1961         if (tree_locked)
1962                 ll_tree_unlock(&ltd.u.tree);
1963
1964 out:
1965         if (retval > 0) {
1966                 count -= retval;
1967                 sum += retval;
1968                 if (retval == chunk && count > 0)
1969                         goto repeat;
1970         }
1971
1972         if (got_write_sem)
1973                 up(&ll_i2info(inode)->lli_write_sem);
1974
1975         ll_td_set(NULL);
1976         if (iov_copy && iov_copy != iov)
1977                 OBD_FREE(iov_copy, sizeof(*iov) * nrsegs_orig);
1978
1979         retval = (sum > 0) ? sum : retval;
1980         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1981                            retval > 0 ? retval : 0);
1982         RETURN(retval);
1983 }
1984
1985 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1986                              loff_t *ppos)
1987 {
1988         struct iovec local_iov = { .iov_base = (void __user *)buf,
1989                                    .iov_len = count };
1990
1991 #ifdef HAVE_FILE_WRITEV
1992         return ll_file_writev(file, &local_iov, 1, ppos);
1993 #else
1994         struct kiocb kiocb;
1995         ssize_t ret;
1996
1997         init_sync_kiocb(&kiocb, file);
1998         kiocb.ki_pos = *ppos;
1999         kiocb.ki_left = count;
2000
2001         ret = ll_file_aio_write(&kiocb, &local_iov, 1, kiocb.ki_pos);
2002         *ppos = kiocb.ki_pos;
2003
2004         return ret;
2005 #endif
2006 }
2007
#ifdef HAVE_KERNEL_SENDFILE
/*
 * Send file content (through pagecache) somewhere with helper
 *
 * Files without a stripe md are passed straight to generic_file_sendfile().
 * Otherwise an LCK_PR tree lock is taken on [*ppos, *ppos + count - 1], the
 * inode size is brought up to date, and generic_file_sendfile() runs under
 * that lock.
 *
 * \param in_file  file to read from
 * \param ppos     file position; advanced by generic_file_sendfile()
 * \param count    number of bytes requested
 * \param actor    read actor handed to generic_file_sendfile()
 * \param target   opaque argument for \a actor
 *
 * \retval result of generic_file_sendfile() on the normal path
 * \retval 0 for a zero-length request
 * \retval negative error from ll_node_from_inode(), ll_tree_lock() or
 *         ll_glimpse_size()
 */
static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,
                                size_t count, read_actor_t actor, void *target)
{
        struct inode *inode = in_file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        struct ll_lock_tree tree;
        struct ll_lock_tree_node *node;
        struct ost_lvb lvb;
        struct ll_ra_read bead;
        ssize_t rc;
        __u64 kms;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%Ld\n",
               inode->i_ino, inode->i_generation, inode, (unsigned long)count,
               *ppos);

        /* "If nbyte is 0, read() will return 0 and have no other results."
         *                      -- Single Unix Spec */
        if (count == 0)
                RETURN(0);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
        /* turn off the kernel's read-ahead */
        in_file->f_ra.ra_pages = 0;

        /* File with no objects, nothing to lock */
        if (!lsm) {
                rc = generic_file_sendfile(in_file, ppos, count, actor, target);
                RETURN(rc);
        }

        node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
        if (IS_ERR(node))
                RETURN(PTR_ERR(node));

        tree.lt_fd = LUSTRE_FPRIVATE(in_file);
        rc = ll_tree_lock(&tree, node, NULL, count,
                          in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
        if (rc != 0)
                RETURN(rc);

        ll_clear_file_contended(inode);
        ll_inode_size_lock(inode, 1);
        /*
         * Consistency guarantees: following possibilities exist for the
         * relation between region being read and real file size at this
         * moment:
         *
         *  (A): the region is completely inside of the file;
         *
         *  (B-x): x bytes of region are inside of the file, the rest is
         *  outside;
         *
         *  (C): the region is completely outside of the file.
         *
         * This classification is stable under DLM lock acquired by
         * ll_tree_lock() above, because to change class, other client has to
         * take DLM lock conflicting with our lock. Also, any updates to
         * ->i_size by other threads on this client are serialized by
         * ll_inode_size_lock(). This guarantees that short reads are handled
         * correctly in the face of concurrent writes and truncates.
         */
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
        kms = lvb.lvb_size;
        if (*ppos + count - 1 > kms) {
                /* A glimpse is necessary to determine whether we return a
                 * short read (B) or some zeroes at the end of the buffer (C) */
                ll_inode_size_unlock(inode, 1);
                rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
                if (rc)
                        goto out;
        } else {
                /* region is within kms and, hence, within real file size (A).
                 * kms is only the _minimum_ size, so i_size may be raised to
                 * cover the region but must never be lowered to kms; this
                 * matches the regular read path (b=11081). */
                if (i_size_read(inode) < kms)
                        i_size_write(inode, kms);
                ll_inode_size_unlock(inode, 1);
        }

        CDEBUG(D_INFO, "Send ino %lu, %lu bytes, offset %lld, i_size %llu\n",
               inode->i_ino, (unsigned long)count, *ppos, i_size_read(inode));

        ll_ra_read_init(in_file, &bead, *ppos, count);
        /* BUG: 5972 */
        file_accessed(in_file);
        rc = generic_file_sendfile(in_file, ppos, count, actor, target);
        ll_ra_read_ex(in_file, &bead);

 out:
        ll_tree_unlock(&tree);
        RETURN(rc);
}
#endif
2106
/* change based on
 * http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=f0930fffa99e7fe0a0c4b6c7d9a244dc88288c27
 */
#ifdef HAVE_KERNEL_SPLICE_READ
/*
 * Splice file content (through pagecache) into a pipe.
 *
 * Files without a stripe md are passed straight to
 * generic_file_splice_read().  Otherwise an LCK_PR tree lock is taken on
 * [*ppos, *ppos + count - 1], the inode size is brought up to date, and
 * generic_file_splice_read() runs under that lock.
 *
 * \param in_file  file to read from
 * \param ppos     file position; advanced by generic_file_splice_read()
 * \param pipe     destination pipe
 * \param count    number of bytes requested
 * \param flags    splice flags, passed through unchanged
 *
 * \retval result of generic_file_splice_read() on the normal path
 * \retval 0 for a zero-length request
 * \retval negative error from ll_node_from_inode(), ll_tree_lock() or
 *         ll_glimpse_size()
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
                                   struct pipe_inode_info *pipe, size_t count,
                                   unsigned int flags)
{
        struct inode *inode = in_file->f_dentry->d_inode;
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lov_stripe_md *lsm = lli->lli_smd;
        struct ll_lock_tree tree;
        struct ll_lock_tree_node *node;
        struct ost_lvb lvb;
        struct ll_ra_read bead;
        ssize_t rc;
        __u64 kms;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size=%lu,offset=%Ld\n",
               inode->i_ino, inode->i_generation, inode, (unsigned long)count,
               *ppos);

        /* "If nbyte is 0, read() will return 0 and have no other results."
         *                      -- Single Unix Spec */
        if (count == 0)
                RETURN(0);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
        /* turn off the kernel's read-ahead */
        in_file->f_ra.ra_pages = 0;

        /* File with no objects, nothing to lock */
        if (!lsm) {
                rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
                RETURN(rc);
        }

        node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
        if (IS_ERR(node))
                RETURN(PTR_ERR(node));

        tree.lt_fd = LUSTRE_FPRIVATE(in_file);
        rc = ll_tree_lock(&tree, node, NULL, count,
                          in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
        if (rc != 0)
                RETURN(rc);

        ll_clear_file_contended(inode);
        ll_inode_size_lock(inode, 1);
        /*
         * Consistency guarantees: following possibilities exist for the
         * relation between region being read and real file size at this
         * moment:
         *
         *  (A): the region is completely inside of the file;
         *
         *  (B-x): x bytes of region are inside of the file, the rest is
         *  outside;
         *
         *  (C): the region is completely outside of the file.
         *
         * This classification is stable under DLM lock acquired by
         * ll_tree_lock() above, because to change class, other client has to
         * take DLM lock conflicting with our lock. Also, any updates to
         * ->i_size by other threads on this client are serialized by
         * ll_inode_size_lock(). This guarantees that short reads are handled
         * correctly in the face of concurrent writes and truncates.
         */
        inode_init_lvb(inode, &lvb);
        obd_merge_lvb(ll_i2sbi(inode)->ll_osc_exp, lsm, &lvb, 1);
        kms = lvb.lvb_size;
        if (*ppos + count - 1 > kms) {
                /* A glimpse is necessary to determine whether we return a
                 * short read (B) or some zeroes at the end of the buffer (C) */
                ll_inode_size_unlock(inode, 1);
                rc = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
                if (rc)
                        goto out;
        } else {
                /* region is within kms and, hence, within real file size (A).
                 * kms is only the _minimum_ size, so i_size may be raised to
                 * cover the region but must never be lowered to kms; this
                 * matches the regular read path (b=11081). */
                if (i_size_read(inode) < kms)
                        i_size_write(inode, kms);
                ll_inode_size_unlock(inode, 1);
        }

        CDEBUG(D_INFO, "Send ino %lu, %lu bytes, offset %lld, i_size %llu\n",
               inode->i_ino, (unsigned long)count, *ppos, i_size_read(inode));

        ll_ra_read_init(in_file, &bead, *ppos, count);
        /* BUG: 5972 */
        file_accessed(in_file);
        rc = generic_file_splice_read(in_file, ppos, pipe, count, flags);
        ll_ra_read_ex(in_file, &bead);

 out:
        ll_tree_unlock(&tree);
        RETURN(rc);
}
#endif
2206
2207 static int ll_lov_recreate(struct inode *inode, obd_id id, obd_gr gr,
2208                            obd_count ost_idx)
2209 {
2210         struct ll_inode_info *lli = ll_i2info(inode);
2211         struct obd_export *exp = ll_i2obdexp(inode);
2212         struct obd_trans_info oti = { 0 };
2213         struct obdo *oa = NULL;
2214         int lsm_size;
2215         int rc = 0;
2216         struct lov_stripe_md *lsm, *lsm2;
2217         ENTRY;
2218
2219         OBDO_ALLOC(oa);
2220         if (oa == NULL)
2221                 RETURN(-ENOMEM);
2222
2223         down(&lli->lli_size_sem);
2224         lsm = lli->lli_smd;
2225         if (lsm == NULL)
2226                 GOTO(out, rc = -ENOENT);
2227         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
2228                    (lsm->lsm_stripe_count));
2229
2230         OBD_ALLOC(lsm2, lsm_size);
2231         if (lsm2 == NULL)
2232                 GOTO(out, rc = -ENOMEM);
2233
2234         oa->o_id = id;
2235         oa->o_gr = gr;
2236         oa->o_nlink = ost_idx;
2237         oa->o_flags |= OBD_FL_RECREATE_OBJS;
2238         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
2239         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2240                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
2241
2242         memcpy(lsm2, lsm, lsm_size);
2243         rc = obd_create(exp, oa, &lsm2, &oti);
2244
2245         OBD_FREE(lsm2, lsm_size);
2246         GOTO(out, rc);
2247 out:
2248         up(&lli->lli_size_sem);
2249         OBDO_FREE(oa);
2250         return rc;
2251 }
2252
2253 static int ll_lov_recreate_obj(struct inode *inode, unsigned long arg)
2254 {
2255         struct ll_recreate_obj ucreat;
2256         ENTRY;
2257
2258         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2259                 RETURN(-EPERM);
2260
2261         if (copy_from_user(&ucreat, (struct ll_recreate_obj *)arg,
2262                            sizeof(struct ll_recreate_obj)))
2263                 RETURN(-EFAULT);
2264
2265         RETURN(ll_lov_recreate(inode, ucreat.lrc_id, 0,
2266                                ucreat.lrc_ost_idx));
2267 }
2268
2269 static int ll_lov_recreate_fid(struct inode *inode, unsigned long arg)
2270 {
2271         struct lu_fid fid;
2272         obd_id id;
2273         obd_count ost_idx;
2274         ENTRY;
2275
2276         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2277                 RETURN(-EPERM);
2278
2279         if (copy_from_user(&fid, (struct lu_fid *)arg,
2280                            sizeof(struct lu_fid)))
2281                 RETURN(-EFAULT);
2282
2283         id = fid_oid(&fid) | ((fid_seq(&fid) & 0xffff) << 32);
2284         ost_idx = (fid_seq(&fid) >> 16) & 0xffff;
2285         RETURN(ll_lov_recreate(inode, id, 0, ost_idx));
2286 }
2287
2288 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
2289                                     int flags, struct lov_user_md *lum,
2290                                     int lum_size)
2291 {
2292         struct ll_inode_info *lli = ll_i2info(inode);
2293         struct lov_stripe_md *lsm;
2294         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
2295         int rc = 0;
2296         ENTRY;
2297
2298         down(&lli->lli_size_sem);
2299         lsm = lli->lli_smd;
2300         if (lsm) {
2301                 up(&lli->lli_size_sem);
2302                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
2303                        inode->i_ino);
2304                 RETURN(-EEXIST);
2305         }
2306
2307         rc = ll_intent_file_open(file, lum, lum_size, &oit);
2308         if (rc)
2309                 GOTO(out, rc);
2310         rc = oit.d.lustre.it_status;
2311         if (rc < 0)
2312                 GOTO(out_req_free, rc);
2313
2314         ll_release_openhandle(file->f_dentry, &oit);
2315
2316  out:
2317         up(&lli->lli_size_sem);
2318         ll_intent_release(&oit);
2319         RETURN(rc);
2320 out_req_free:
2321         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2322         goto out;
2323 }
2324
2325 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2326                              struct lov_mds_md **lmmp, int *lmm_size,
2327                              struct ptlrpc_request **request)
2328 {
2329         struct ll_sb_info *sbi = ll_i2sbi(inode);
2330         struct ll_fid  fid;
2331         struct mds_body  *body;
2332         struct lov_mds_md *lmm = NULL;
2333         struct ptlrpc_request *req = NULL;
2334         int rc, lmmsize;
2335
2336         ll_inode2fid(&fid, inode);
2337
2338         rc = ll_get_max_mdsize(sbi, &lmmsize);
2339         if (rc)
2340                 RETURN(rc);
2341
2342         rc = mdc_getattr_name(sbi->ll_mdc_exp, &fid,
2343                         filename, strlen(filename) + 1,
2344                         OBD_MD_FLEASIZE | OBD_MD_FLDIREA,
2345                         lmmsize, &req);
2346         if (rc < 0) {
2347                 CDEBUG(D_INFO, "mdc_getattr_name failed "
2348                                 "on %s: rc %d\n", filename, rc);
2349                 GOTO(out, rc);
2350         }
2351
2352         body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
2353                         sizeof(*body));
2354         LASSERT(body != NULL); /* checked by mdc_getattr_name */
2355         /* swabbed by mdc_getattr_name */
2356         LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
2357
2358         lmmsize = body->eadatasize;
2359
2360         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2361                         lmmsize == 0) {
2362                 GOTO(out, rc = -ENODATA);
2363         }
2364
2365         lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1,
2366                         lmmsize);
2367         LASSERT(lmm != NULL);
2368         LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
2369
2370         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
2371             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
2372             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
2373                 GOTO(out, rc = -EPROTO);
2374         }
2375         /*
2376          * This is coming from the MDS, so is probably in
2377          * little endian.  We convert it to host endian before
2378          * passing it to userspace.
2379          */
2380         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2381                 /* if function called for directory - we should
2382                  * avoid swab not existent lsm objects */
2383                 if ((lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) ||
2384                     (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3))) {
2385                         lustre_swab_lov_user_md((struct lov_user_md*)lmm);
2386                         if (S_ISREG(body->mode))
2387                                 lustre_swab_lov_user_md_objects(
2388                                                 (struct lov_user_md*)lmm);
2389                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
2390                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
2391                 }
2392         }
2393
2394         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
2395                 struct lov_stripe_md *lsm;
2396                 struct lov_user_md_join *lmj;
2397                 int lmj_size, i, aindex = 0;
2398
2399                 rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize);
2400                 if (rc < 0)
2401                         GOTO(out, rc = -ENOMEM);
2402                 rc = obd_checkmd(sbi->ll_osc_exp, sbi->ll_mdc_exp, lsm);
2403                 if (rc)
2404                         GOTO(out_free_memmd, rc);
2405
2406                 lmj_size = sizeof(struct lov_user_md_join) +
2407                         lsm->lsm_stripe_count *
2408                         sizeof(struct lov_user_ost_data_join);
2409                 OBD_ALLOC(lmj, lmj_size);
2410                 if (!lmj)
2411                         GOTO(out_free_memmd, rc = -ENOMEM);
2412
2413                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
2414                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
2415                         struct lov_extent *lex =
2416                                 &lsm->lsm_array->lai_ext_array[aindex];
2417
2418                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
2419                                 aindex ++;
2420                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
2421                                         LPU64" len %d\n", aindex, i,
2422                                         lex->le_start, (int)lex->le_len);
2423                         lmj->lmm_objects[i].l_extent_start =
2424                                 lex->le_start;
2425
2426                         if ((int)lex->le_len == -1)
2427                                 lmj->lmm_objects[i].l_extent_end = -1;
2428                         else
2429                                 lmj->lmm_objects[i].l_extent_end =
2430                                         lex->le_start + lex->le_len;
2431                         lmj->lmm_objects[i].l_object_id =
2432                                 lsm->lsm_oinfo[i]->loi_id;
2433                         lmj->lmm_objects[i].l_object_gr =
2434                                 lsm->lsm_oinfo[i]->loi_gr;
2435                         lmj->lmm_objects[i].l_ost_gen =
2436                                 lsm->lsm_oinfo[i]->loi_ost_gen;
2437                         lmj->lmm_objects[i].l_ost_idx =
2438                                 lsm->lsm_oinfo[i]->loi_ost_idx;
2439                 }
2440                 lmm = (struct lov_mds_md *)lmj;
2441                 lmmsize = lmj_size;
2442 out_free_memmd:
2443                 obd_free_memmd(sbi->ll_osc_exp, &lsm);
2444         }
2445 out:
2446         *lmmp = lmm;
2447         *lmm_size = lmmsize;
2448         *request = req;
2449         return rc;
2450 }
2451 static int ll_lov_setea(struct inode *inode, struct file *file,
2452                             unsigned long arg)
2453 {
2454         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2455         struct lov_user_md  *lump;
2456         int lum_size = sizeof(struct lov_user_md) +
2457                        sizeof(struct lov_user_ost_data);
2458         int rc;
2459         ENTRY;
2460
2461         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2462                 RETURN(-EPERM);
2463
2464         OBD_ALLOC(lump, lum_size);
2465         if (lump == NULL) {
2466                 RETURN(-ENOMEM);
2467         }
2468         rc = copy_from_user(lump, (struct lov_user_md  *)arg, lum_size);
2469         if (rc) {
2470                 OBD_FREE(lump, lum_size);
2471                 RETURN(-EFAULT);
2472         }
2473
2474         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
2475
2476         OBD_FREE(lump, lum_size);
2477         RETURN(rc);
2478 }
2479
2480 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2481                             unsigned long arg)
2482 {
2483         struct lov_user_md_v3 lumv3;
2484         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
2485         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
2486         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
2487         int lum_size;
2488         int rc;
2489         int flags = FMODE_WRITE;
2490         ENTRY;
2491
2492         /* first try with v1 which is smaller than v3 */
2493         lum_size = sizeof(struct lov_user_md_v1);
2494         rc = copy_from_user(lumv1, lumv1p, lum_size);
2495         if (rc)
2496                 RETURN(-EFAULT);
2497
2498         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
2499                 lum_size = sizeof(struct lov_user_md_v3);
2500                 rc = copy_from_user(&lumv3, lumv3p, lum_size);
2501                 if (rc)
2502                         RETURN(-EFAULT);
2503         }
2504
2505         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
2506         if (rc == 0) {
2507                  put_user(0, &lumv1p->lmm_stripe_count);
2508                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode),
2509                                     0, ll_i2info(inode)->lli_smd,
2510                                     (void *)arg);
2511         }
2512         RETURN(rc);
2513 }
2514
2515 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
2516 {
2517         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2518
2519         if (!lsm)
2520                 RETURN(-ENODATA);
2521
2522         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2obdexp(inode), 0, lsm,
2523                             (void *)arg);
2524 }
2525
2526 static int ll_get_grouplock(struct inode *inode, struct file *file,
2527                             unsigned long arg)
2528 {
2529         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2530         ldlm_policy_data_t policy = { .l_extent = { .start = 0,
2531                                                     .end = OBD_OBJECT_EOF}};
2532         struct lustre_handle lockh = { 0 };
2533         struct ll_inode_info *lli = ll_i2info(inode);
2534         struct lov_stripe_md *lsm = lli->lli_smd;
2535         int flags = 0, rc;
2536         ENTRY;
2537
2538         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2539                 RETURN(-EINVAL);
2540         }
2541
2542         policy.l_extent.gid = arg;
2543         if (file->f_flags & O_NONBLOCK)
2544                 flags = LDLM_FL_BLOCK_NOWAIT;
2545
2546         rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
2547         if (rc)
2548                 RETURN(rc);
2549
2550         fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
2551         fd->fd_gid = arg;
2552         memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
2553
2554         RETURN(0);
2555 }
2556
2557 static int ll_put_grouplock(struct inode *inode, struct file *file,
2558                             unsigned long arg)
2559 {
2560         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2561         struct ll_inode_info *lli = ll_i2info(inode);
2562         struct lov_stripe_md *lsm = lli->lli_smd;
2563         int rc;
2564         ENTRY;
2565
2566         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2567                 /* Ugh, it's already unlocked. */
2568                 RETURN(-EINVAL);
2569         }
2570
2571         if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2572                 RETURN(-EINVAL);
2573
2574         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2575
2576         rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2577         if (rc)
2578                 RETURN(rc);
2579
2580         fd->fd_gid = 0;
2581         memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
2582
2583         RETURN(0);
2584 }
2585
2586 #if LUSTRE_FIX >= 50
2587 static int join_sanity_check(struct inode *head, struct inode *tail)
2588 {
2589         ENTRY;
2590         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2591                 CERROR("server do not support join \n");
2592                 RETURN(-EINVAL);
2593         }
2594         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2595                 CERROR("tail ino %lu and ino head %lu must be regular\n",
2596                        head->i_ino, tail->i_ino);
2597                 RETURN(-EINVAL);
2598         }
2599         if (head->i_ino == tail->i_ino) {
2600                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2601                 RETURN(-EINVAL);
2602         }
2603         if (i_size_read(head) % JOIN_FILE_ALIGN) {
2604                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
2605                 RETURN(-EINVAL);
2606         }
2607         RETURN(0);
2608 }
2609
2610 static int join_file(struct inode *head_inode, struct file *head_filp,
2611                      struct file *tail_filp)
2612 {
2613         struct dentry *tail_dentry = tail_filp->f_dentry;
2614         struct lookup_intent oit = {.it_op = IT_OPEN,
2615                                     .it_flags = head_filp->f_flags,
2616                                     .it_create_mode = M_JOIN_FILE};
2617         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_PW,
2618                 ll_mdc_blocking_ast, ldlm_completion_ast, NULL, NULL };
2619
2620         struct lustre_handle lockh;
2621         struct mdc_op_data *op_data;
2622         int    rc;
2623         loff_t data;
2624         ENTRY;
2625
2626         tail_dentry = tail_filp->f_dentry;
2627
2628         OBD_ALLOC_PTR(op_data);
2629         if (op_data == NULL) {
2630                 RETURN(-ENOMEM);
2631         }
2632
2633         data = i_size_read(head_inode);
2634         ll_prepare_mdc_op_data(op_data, head_inode,
2635                                tail_dentry->d_parent->d_inode,
2636                                tail_dentry->d_name.name,
2637                                tail_dentry->d_name.len, 0, &data);
2638         rc = mdc_enqueue(ll_i2mdcexp(head_inode), &einfo, &oit,
2639                          op_data, &lockh, NULL, 0, 0);
2640
2641         if (rc < 0)
2642                 GOTO(out, rc);
2643
2644         rc = oit.d.lustre.it_status;
2645
2646         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2647                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2648                 ptlrpc_req_finished((struct ptlrpc_request *)
2649                                     oit.d.lustre.it_data);
2650                 GOTO(out, rc);
2651         }
2652
2653         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2654                                            * away */
2655                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2656                 oit.d.lustre.it_lock_mode = 0;
2657         }
2658         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
2659         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
2660         ll_release_openhandle(head_filp->f_dentry, &oit);
2661 out:
2662         if (op_data)
2663                 OBD_FREE_PTR(op_data);
2664         ll_intent_release(&oit);
2665         RETURN(rc);
2666 }
2667
2668 static int ll_file_join(struct inode *head, struct file *filp,
2669                         char *filename_tail)
2670 {
2671         struct inode *tail = NULL, *first = NULL, *second = NULL;
2672         struct dentry *tail_dentry;
2673         struct file *tail_filp, *first_filp, *second_filp;
2674         struct ll_lock_tree first_tree, second_tree;
2675         struct ll_lock_tree_node *first_node, *second_node;
2676         struct ll_inode_info *hlli = ll_i2info(head);
2677         int rc = 0, cleanup_phase = 0;
2678         ENTRY;
2679
2680         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2681                head->i_ino, head->i_generation, head, filename_tail);
2682
2683         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2684         if (IS_ERR(tail_filp)) {
2685                 CERROR("Can not open tail file %s", filename_tail);
2686                 rc = PTR_ERR(tail_filp);
2687                 GOTO(cleanup, rc);
2688         }
2689         tail = igrab(tail_filp->f_dentry->d_inode);
2690
2691         tail_dentry = tail_filp->f_dentry;
2692         LASSERT(tail_dentry);
2693         cleanup_phase = 1;
2694
2695         /*reorder the inode for lock sequence*/
2696         first = head->i_ino > tail->i_ino ? head : tail;
2697         second = head->i_ino > tail->i_ino ? tail : head;
2698         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2699         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2700
2701         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2702                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2703         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2704         if (IS_ERR(first_node)){
2705                 rc = PTR_ERR(first_node);
2706                 GOTO(cleanup, rc);
2707         }
2708         first_tree.lt_fd = first_filp->private_data;
2709         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2710         if (rc != 0)
2711                 GOTO(cleanup, rc);
2712         cleanup_phase = 2;
2713
2714         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2715         if (IS_ERR(second_node)){
2716                 rc = PTR_ERR(second_node);
2717                 GOTO(cleanup, rc);
2718         }
2719         second_tree.lt_fd = second_filp->private_data;
2720         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2721         if (rc != 0)
2722                 GOTO(cleanup, rc);
2723         cleanup_phase = 3;
2724
2725         rc = join_sanity_check(head, tail);
2726         if (rc)
2727                 GOTO(cleanup, rc);
2728
2729         rc = join_file(head, filp, tail_filp);
2730         if (rc)
2731                 GOTO(cleanup, rc);
2732 cleanup:
2733         switch (cleanup_phase) {
2734         case 3:
2735                 ll_tree_unlock(&second_tree);
2736                 obd_cancel_unused(ll_i2obdexp(second),
2737                                   ll_i2info(second)->lli_smd, 0, NULL);
2738         case 2:
2739                 ll_tree_unlock(&first_tree);
2740                 obd_cancel_unused(ll_i2obdexp(first),
2741                                   ll_i2info(first)->lli_smd, 0, NULL);
2742         case 1:
2743                 filp_close(tail_filp, 0);
2744                 if (tail)
2745                         iput(tail);
2746                 if (head && rc == 0) {
2747                         obd_free_memmd(ll_i2sbi(head)->ll_osc_exp,
2748                                        &hlli->lli_smd);
2749                         hlli->lli_smd = NULL;
2750                 }
2751         case 0:
2752                 break;
2753         default:
2754                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
2755                 LBUG();
2756         }
2757         RETURN(rc);
2758 }
2759 #endif  /* LUSTRE_FIX >= 50 */
2760
2761 /**
2762  * Close inode open handle
2763  *
2764  * \param dentry [in]     dentry which contains the inode
2765  * \param it     [in,out] intent which contains open info and result
2766  *
2767  * \retval 0     success
2768  * \retval <0    failure
2769  */
2770 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2771 {
2772         struct inode *inode = dentry->d_inode;
2773         struct obd_client_handle *och;
2774         int rc;
2775         ENTRY;
2776
2777         LASSERT(inode);
2778
2779         /* Root ? Do nothing. */
2780         if (dentry->d_inode->i_sb->s_root == dentry)
2781                 RETURN(0);
2782
2783         /* No open handle to close? Move away */
2784         if (!it_disposition(it, DISP_OPEN_OPEN))
2785                 RETURN(0);
2786
2787         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2788
2789         OBD_ALLOC(och, sizeof(*och));
2790         if (!och)
2791                 GOTO(out, rc = -ENOMEM);
2792
2793         ll_och_fill(ll_i2info(inode), it, och);
2794
2795         rc = ll_close_inode_openhandle(inode, och);
2796
2797         OBD_FREE(och, sizeof(*och));
2798  out:
2799         /* this one is in place of ll_file_open */
2800         if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2801                 ptlrpc_req_finished(it->d.lustre.it_data);
2802                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2803         }
2804         RETURN(rc);
2805 }
2806
2807 int ll_do_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
2808               int num_bytes)
2809 {
2810         struct obd_export *exp = ll_i2obdexp(inode);
2811         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
2812         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
2813         int vallen = num_bytes;
2814         int rc;
2815         ENTRY;
2816
2817         /* Checks for fiemap flags */
2818         if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2819                 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2820                 return -EBADR;
2821         }
2822
2823         /* Check for FIEMAP_FLAG_SYNC */
2824         if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2825                 rc = filemap_fdatawrite(inode->i_mapping);
2826                 if (rc)
2827                         return rc;
2828         }
2829
2830         /* If the stripe_count > 1 and the application does not understand
2831          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
2832          */
2833         if (lsm->lsm_stripe_count > 1 &&
2834             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
2835                 return -EOPNOTSUPP;
2836
2837         fm_key.oa.o_id = lsm->lsm_object_id;
2838         fm_key.oa.o_valid = OBD_MD_FLID;
2839
2840         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLSIZE);
2841
2842         /* If filesize is 0, then there would be no objects for mapping */
2843         if (fm_key.oa.o_size == 0) {
2844                 fiemap->fm_mapped_extents = 0;
2845                 RETURN(0);
2846         }
2847
2848         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
2849
2850         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
2851         if (rc)
2852                 CERROR("obd_get_info failed: rc = %d\n", rc);
2853
2854         RETURN(rc);
2855 }
2856
2857 static int ll_ioctl_fiemap(struct inode *inode, unsigned long arg)
2858 {
2859         struct ll_user_fiemap *fiemap_s;
2860         size_t num_bytes, ret_bytes;
2861         unsigned int extent_count;
2862         int rc = 0;
2863
2864         /* Get the extent count so we can calculate the size of
2865          * required fiemap buffer */
2866         if (get_user(extent_count,
2867             &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
2868                 RETURN(-EFAULT);
2869         num_bytes = sizeof(*fiemap_s) + (extent_count *
2870                                          sizeof(struct ll_fiemap_extent));
2871
2872         OBD_VMALLOC(fiemap_s, num_bytes);
2873         if (fiemap_s == NULL)
2874                 RETURN(-ENOMEM);
2875
2876         /* get the fiemap value */
2877         if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
2878                            sizeof(*fiemap_s)))
2879                 GOTO(error, rc = -EFAULT);
2880
2881         /* If fm_extent_count is non-zero, read the first extent since
2882          * it is used to calculate end_offset and device from previous
2883          * fiemap call. */
2884         if (extent_count) {
2885                 if (copy_from_user(&fiemap_s->fm_extents[0],
2886                     (char __user *)arg + sizeof(*fiemap_s),
2887                     sizeof(struct ll_fiemap_extent)))
2888                         GOTO(error, rc = -EFAULT);
2889         }
2890
2891         rc = ll_do_fiemap(inode, fiemap_s, num_bytes);
2892         if (rc)
2893                 GOTO(error, rc);
2894
2895         ret_bytes = sizeof(struct ll_user_fiemap);
2896
2897         if (extent_count != 0)
2898                 ret_bytes += (fiemap_s->fm_mapped_extents *
2899                                  sizeof(struct ll_fiemap_extent));
2900
2901         if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
2902                 rc = -EFAULT;
2903
2904 error:
2905         OBD_VFREE(fiemap_s, num_bytes);
2906         RETURN(rc);
2907 }
2908
2909 #ifdef HAVE_UNLOCKED_IOCTL
2910 long ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2911 {
2912         struct inode *inode = file->f_dentry->d_inode;
2913 #else
2914 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2915                   unsigned long arg)
2916 {
2917 #endif
2918         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2919         int flags;
2920         ENTRY;
2921
2922         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2923                inode->i_generation, inode, cmd);
2924         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2925
2926         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2927         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2928                 RETURN(-ENOTTY);
2929
2930         switch(cmd) {
2931         case LL_IOC_GETFLAGS:
2932                 /* Get the current value of the file flags */
2933                 return put_user(fd->fd_flags, (int *)arg);
2934         case LL_IOC_SETFLAGS:
2935         case LL_IOC_CLRFLAGS:
2936                 /* Set or clear specific file flags */
2937                 /* XXX This probably needs checks to ensure the flags are
2938                  *     not abused, and to handle any flag side effects.
2939                  */
2940                 if (get_user(flags, (int *) arg))
2941                         RETURN(-EFAULT);
2942
2943                 if (cmd == LL_IOC_SETFLAGS) {
2944                         if ((flags & LL_FILE_IGNORE_LOCK) &&
2945                             !(file->f_flags & O_DIRECT)) {
2946                                 CERROR("%s: unable to disable locking on "
2947                                        "non-O_DIRECT file\n", current->comm);
2948                                 RETURN(-EINVAL);
2949                         }
2950
2951                         fd->fd_flags |= flags;
2952                 } else {
2953                         fd->fd_flags &= ~flags;
2954                 }
2955                 RETURN(0);
2956         case LL_IOC_LOV_SETSTRIPE:
2957                 RETURN(ll_lov_setstripe(inode, file, arg));
2958         case LL_IOC_LOV_SETEA:
2959                 RETURN(ll_lov_setea(inode, file, arg));
2960         case LL_IOC_LOV_GETSTRIPE:
2961                 RETURN(ll_lov_getstripe(inode, arg));
2962         case LL_IOC_RECREATE_OBJ:
2963                 RETURN(ll_lov_recreate_obj(inode, arg));
2964         case LL_IOC_RECREATE_FID:
2965                 RETURN(ll_lov_recreate_fid(inode, arg));
2966         case FSFILT_IOC_FIEMAP:
2967                 RETURN(ll_ioctl_fiemap(inode, arg));
2968         case FSFILT_IOC_GETFLAGS:
2969         case FSFILT_IOC_SETFLAGS:
2970                 RETURN(ll_iocontrol(inode, file, cmd, arg));
2971         case FSFILT_IOC_GETVERSION_OLD:
2972         case FSFILT_IOC_GETVERSION:
2973                 RETURN(put_user(inode->i_generation, (int *)arg));
2974         case LL_IOC_JOIN: {
2975 #if LUSTRE_FIX >= 50
2976                 /* Allow file join in beta builds to allow debuggging */
2977                 char *ftail;
2978                 int rc;
2979
2980                 ftail = getname((const char *)arg);
2981                 if (IS_ERR(ftail))
2982                         RETURN(PTR_ERR(ftail));
2983                 rc = ll_file_join(inode, file, ftail);
2984                 putname(ftail);
2985                 RETURN(rc);
2986 #else
2987                 CWARN("file join is not supported in this version of Lustre\n");
2988                 RETURN(-ENOTTY);
2989 #endif
2990         }
2991         case LL_IOC_GROUP_LOCK:
2992                 RETURN(ll_get_grouplock(inode, file, arg));
2993         case LL_IOC_GROUP_UNLOCK:
2994                 RETURN(ll_put_grouplock(inode, file, arg));
2995         case IOC_OBD_STATFS:
2996                 RETURN(ll_obd_statfs(inode, (void *)arg));
2997         case OBD_IOC_GETNAME_OLD:
2998         case OBD_IOC_GETNAME: {
2999                 struct obd_device *obd =
3000                         class_exp2obd(ll_i2sbi(inode)->ll_osc_exp);
3001                 if (!obd)
3002                         RETURN(-EFAULT);
3003                 if (copy_to_user((void *)arg, obd->obd_name,
3004                                 strlen(obd->obd_name) + 1))
3005                         RETURN (-EFAULT);
3006                 RETURN(0);
3007         }
3008         case LL_IOC_PATH2FID: {
3009                 if (copy_to_user((void *)arg, ll_inode_lu_fid(inode),
3010                                  sizeof(struct lu_fid)))
3011                         RETURN(-EFAULT);
3012
3013                 RETURN(0);
3014         }
3015
3016         /* We need to special case any other ioctls we want to handle,
3017          * to send them to the MDS/OST as appropriate and to properly
3018          * network encode the arg field.
3019         case EXT3_IOC_SETVERSION_OLD:
3020         case EXT3_IOC_SETVERSION:
3021         */
3022         default: {
3023                 int err;
3024
3025                 if (LLIOC_STOP ==
3026                     ll_iocontrol_call(inode, file, cmd, arg, &err))
3027                         RETURN(err);
3028
3029                 RETURN(obd_iocontrol(cmd, ll_i2obdexp(inode), 0, NULL,
3030                                      (void *)arg));
3031         }
3032         }
3033 }
3034
3035 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3036 {
3037         struct inode *inode = file->f_dentry->d_inode;
3038         struct ll_inode_info *lli = ll_i2info(inode);
3039         struct lov_stripe_md *lsm = lli->lli_smd;
3040         loff_t retval;
3041         ENTRY;
3042         retval = offset + ((origin == 2) ? i_size_read(inode) :
3043                            (origin == 1) ? file->f_pos : 0);
3044         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
3045                inode->i_ino, inode->i_generation, inode, retval, retval,
3046                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
3047         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3048
3049         if (origin == 2) { /* SEEK_END */
3050                 int nonblock = 0, rc;
3051
3052                 if (file->f_flags & O_NONBLOCK)
3053                         nonblock = LDLM_FL_BLOCK_NOWAIT;
3054
3055                 if (lsm != NULL) {
3056                         rc = ll_glimpse_size(inode, nonblock);
3057                         if (rc != 0)
3058                                 RETURN(rc);
3059                 }
3060
3061                 ll_inode_size_lock(inode, 0);
3062                 offset += i_size_read(inode);
3063                 ll_inode_size_unlock(inode, 0);
3064         } else if (origin == 1) { /* SEEK_CUR */
3065                 offset += file->f_pos;
3066         }
3067
3068         retval = -EINVAL;
3069         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
3070                 if (offset != file->f_pos) {
3071                         file->f_pos = offset;
3072                         file->f_version = 0;
3073                 }
3074                 retval = offset;
3075         }
3076
3077         RETURN(retval);
3078 }
3079
3080 #ifdef HAVE_FLUSH_OWNER_ID
3081 int ll_flush(struct file *file, fl_owner_t id)
3082 #else
3083 int ll_flush(struct file *file)
3084 #endif
3085 {
3086         struct inode *inode = file->f_dentry->d_inode;
3087         struct ll_inode_info *lli = ll_i2info(inode);
3088         struct lov_stripe_md *lsm = lli->lli_smd;
3089         int rc, err;
3090
3091         /* catch async errors that were recorded back when async writeback
3092          * failed for pages in this mapping. */
3093         rc = lli->lli_async_rc;
3094         lli->lli_async_rc = 0;
3095         if (lsm) {
3096                 err = lov_test_and_clear_async_rc(lsm);
3097                 if (rc == 0)
3098                         rc = err;
3099         }
3100
3101         return rc ? -EIO : 0;
3102 }
3103
3104 int ll_fsync(struct file *file, struct dentry *dentry, int data)
3105 {
3106         struct inode *inode = dentry->d_inode;
3107         struct ll_inode_info *lli = ll_i2info(inode);
3108         struct lov_stripe_md *lsm = lli->lli_smd;
3109         struct ll_fid fid;
3110         struct ptlrpc_request *req;
3111         int rc, err;
3112         ENTRY;
3113         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
3114                inode->i_generation, inode);
3115         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3116
3117         /* fsync's caller has already called _fdata{sync,write}, we want
3118          * that IO to finish before calling the osc and mdc sync methods */
3119         rc = filemap_fdatawait(inode->i_mapping);
3120
3121         /* catch async errors that were recorded back when async writeback
3122          * failed for pages in this mapping. */
3123         err = lli->lli_async_rc;
3124         lli->lli_async_rc = 0;
3125         if (rc == 0)
3126                 rc = err;
3127         if (lsm) {
3128                 err = lov_test_and_clear_async_rc(lsm);
3129                 if (rc == 0)
3130                         rc = err;
3131         }
3132
3133         ll_inode2fid(&fid, inode);
3134         err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
3135         if (!rc)
3136                 rc = err;
3137         if (!err)
3138                 ptlrpc_req_finished(req);
3139
3140         if (data && lsm) {
3141                 struct obd_info *oinfo;
3142
3143                 OBD_ALLOC_PTR(oinfo);
3144                 if (!oinfo)
3145                         RETURN(rc ? rc : -ENOMEM);
3146                 OBDO_ALLOC(oinfo->oi_oa);
3147                 if (!oinfo->oi_oa) {
3148                         OBD_FREE_PTR(oinfo);
3149                         RETURN(rc ? rc : -ENOMEM);
3150                 }
3151                 oinfo->oi_oa->o_id = lsm->lsm_object_id;
3152                 oinfo->oi_oa->o_gr = lsm->lsm_object_gr;
3153                 oinfo->oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
3154                 obdo_from_inode(oinfo->oi_oa, inode,
3155                                 OBD_MD_FLTYPE | OBD_MD_FLATIME |
3156                                 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
3157                 oinfo->oi_md = lsm;
3158                 err = obd_sync_rqset(ll_i2sbi(inode)->ll_osc_exp, oinfo,
3159                                      0, OBD_OBJECT_EOF);
3160                 if (!rc)
3161                         rc = err;
3162                 OBDO_FREE(oinfo->oi_oa);
3163                 OBD_FREE_PTR(oinfo);
3164         }
3165
3166         RETURN(rc);
3167 }
3168
3169 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3170 {
3171         struct inode *inode = file->f_dentry->d_inode;
3172         struct ll_sb_info *sbi = ll_i2sbi(inode);
3173         struct lu_fid *fid = ll_inode_lu_fid(inode);
3174         struct ldlm_res_id res_id =
3175                     { .name = { fid_seq(fid),
3176                                 fid_oid(fid),
3177                                 fid_ver(fid),
3178                                 LDLM_FLOCK} };
3179         struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
3180                 ldlm_flock_completion_ast, NULL, file_lock };
3181         struct lustre_handle lockh = {0};
3182         ldlm_policy_data_t flock;
3183         int flags = 0;
3184         int rc, rc2 = 0;
3185         ENTRY;
3186
3187         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
3188                inode->i_ino, file_lock);
3189         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3190
3191         if (fid_is_igif(fid)) {
3192                 /* If this is an IGIF inode, we need to keep the 1.6-style
3193                  * flock mapping for compatibility.  If it is a proper FID
3194                  * then we know any other client accessing it must also be
3195                  * accessing it as a FID and can use the CMD-style flock. */
3196                 res_id.name[2] = LDLM_FLOCK;
3197                 res_id.name[3] = 0;
3198         }
3199
3200         if (file_lock->fl_flags & FL_FLOCK) {
3201                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3202                 /* set missing params for flock() calls */
3203                 file_lock->fl_end = OFFSET_MAX;
3204                 file_lock->fl_pid = current->tgid;
3205         }
3206         flock.l_flock.pid = file_lock->fl_pid;
3207         flock.l_flock.start = file_lock->fl_start;
3208         flock.l_flock.end = file_lock->fl_end;
3209
3210         switch (file_lock->fl_type) {
3211         case F_RDLCK:
3212                 einfo.ei_mode = LCK_PR;
3213                 break;
3214         case F_UNLCK:
3215                 /* An unlock request may or may not have any relation to
3216                  * existing locks so we may not be able to pass a lock handle
3217                  * via a normal ldlm_lock_cancel() request. The request may even
3218                  * unlock a byte range in the middle of an existing lock. In
3219                  * order to process an unlock request we need all of the same
3220                  * information that is given with a normal read or write record
3221                  * lock request. To avoid creating another ldlm unlock (cancel)
3222                  * message we'll treat a LCK_NL flock request as an unlock. */
3223                 einfo.ei_mode = LCK_NL;
3224                 break;
3225         case F_WRLCK:
3226                 einfo.ei_mode = LCK_PW;
3227                 break;
3228         default:
3229                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
3230                 RETURN (-EINVAL);
3231         }
3232
3233         switch (cmd) {
3234         case F_SETLKW:
3235 #ifdef F_SETLKW64
3236         case F_SETLKW64:
3237 #endif
3238                 flags = 0;
3239                 break;
3240         case F_SETLK:
3241 #ifdef F_SETLK64
3242         case F_SETLK64:
3243 #endif
3244                 flags = LDLM_FL_BLOCK_NOWAIT;
3245                 break;
3246         case F_GETLK:
3247 #ifdef F_GETLK64
3248         case F_GETLK64:
3249 #endif
3250                 flags = LDLM_FL_TEST_LOCK;
3251                 /* Save the old mode so that if the mode in the lock changes we
3252                  * can decrement the appropriate reader or writer refcount. */
3253                 file_lock->fl_type = einfo.ei_mode;
3254                 break;
3255         default:
3256                 CERROR("unknown fcntl lock command: %d\n", cmd);
3257                 RETURN (-EINVAL);
3258         }
3259
3260         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
3261                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
3262                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
3263
3264         rc = ldlm_cli_enqueue(sbi->ll_mdc_exp, NULL, &einfo, res_id,
3265                               &flock, &flags, NULL, 0, NULL, &lockh, 0);
3266         if ((file_lock->fl_flags & FL_FLOCK) &&
3267             (rc == 0 || file_lock->fl_type == F_UNLCK))
3268                 rc2 = ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
3269 #ifdef HAVE_F_OP_FLOCK
3270         if ((file_lock->fl_flags & FL_POSIX) &&
3271             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
3272             !(flags & LDLM_FL_TEST_LOCK))
3273                 rc2 = posix_lock_file_wait(file, file_lock);
3274 #endif
3275
3276         RETURN(rc ? rc : rc2);
3277 }
3278
3279 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
3280 {
3281         ENTRY;
3282
3283         RETURN(-ENOSYS);
3284 }
3285
3286 int ll_have_md_lock(struct inode *inode, __u64 bits, ldlm_mode_t l_req_mode)
3287 {
3288         struct lustre_handle lockh;
3289         struct ldlm_res_id res_id;
3290         struct obd_device *obddev;
3291         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
3292         ldlm_mode_t mode = (l_req_mode == LCK_MINMODE) ?
3293                                 (LCK_CR|LCK_CW|LCK_PR|LCK_PW) : l_req_mode;
3294         int flags;
3295         ENTRY;
3296
3297         if (!inode)
3298                RETURN(0);
3299
3300         obddev = ll_i2mdcexp(inode)->exp_obd;
3301         fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id);
3302
3303         CDEBUG(D_INFO, "trying to match res "LPU64":"LPU64":"LPU64" mode %s\n",
3304                 res_id.name[0],
3305                 res_id.name[1],
3306                 res_id.name[2],
3307                 ldlm_lockname[mode]);
3308
3309         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
3310         if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS,
3311                             &policy, mode, &lockh)) {
3312                 RETURN(1);
3313         }
3314
3315         RETURN(0);
3316 }
3317
3318 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
3319         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
3320                               * and return success */
3321                 inode->i_nlink = 0;
3322                 /* This path cannot be hit for regular files unless in
3323                  * case of obscure races, so no need to to validate
3324                  * size. */
3325                 if (!S_ISREG(inode->i_mode) &&
3326                     !S_ISDIR(inode->i_mode))
3327                         return 0;
3328         }
3329
3330         if (rc) {
3331                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
3332                 return -abs(rc);
3333
3334         }
3335
3336         return 0;
3337 }
3338
3339 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
3340                              __u64 ibits)
3341 {
3342         struct inode *inode = dentry->d_inode;
3343         struct ptlrpc_request *req = NULL;
3344         struct obd_export *exp;
3345         int rc = 0;
3346         ENTRY;
3347
3348         if (!inode) {
3349                 CERROR("REPORT THIS LINE TO PETER\n");
3350                 RETURN(0);
3351         }
3352         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
3353                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
3354
3355         exp = ll_i2mdcexp(inode);
3356
3357         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
3358                 struct lookup_intent oit = { .it_op = IT_GETATTR };
3359                 struct mdc_op_data op_data = { { 0 } };
3360
3361                 /* Call getattr by fid, so do not provide name at all. */
3362                 ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode,
3363                                        dentry->d_inode, NULL, 0, 0, NULL);
3364                 oit.it_create_mode |= M_CHECK_STALE;
3365                 rc = mdc_intent_lock(exp, &op_data, NULL, 0,
3366                                      /* we are not interested in name
3367                                         based lookup */
3368                                      &oit, 0, &req,
3369                                      ll_mdc_blocking_ast, 0);
3370                 oit.it_create_mode &= ~M_CHECK_STALE;
3371                 if (rc < 0) {
3372                         rc = ll_inode_revalidate_fini(inode, rc);
3373                         GOTO (out, rc);
3374                 }
3375
3376                 rc = revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
3377                 if (rc != 0) {
3378                         ll_intent_release(&oit);
3379                         GOTO(out, rc);
3380                 }
3381
3382                 /* Unlinked? Unhash dentry, so it is not picked up later by
3383                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
3384                    here to preserve get_cwd functionality on 2.6.
3385                    Bug 10503 */
3386                 if (!dentry->d_inode->i_nlink) {
3387                         spin_lock(&ll_lookup_lock);
3388                         spin_lock(&dcache_lock);
3389                         ll_drop_dentry(dentry);
3390                         spin_unlock(&dcache_lock);
3391                         spin_unlock(&ll_lookup_lock);
3392                 }
3393
3394                 ll_lookup_finish_locks(&oit, dentry);
3395         } else if (!ll_have_md_lock(dentry->d_inode, ibits, LCK_MINMODE)) {
3396                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
3397                 struct ll_fid fid;
3398                 obd_valid valid = OBD_MD_FLGETATTR;
3399                 int ealen = 0;
3400
3401                 if (S_ISREG(inode->i_mode)) {
3402                         rc = ll_get_max_mdsize(sbi, &ealen);
3403                         if (rc)
3404                                 RETURN(rc);
3405                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
3406                 }
3407                 ll_inode2fid(&fid, inode);
3408                 rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req);
3409                 if (rc) {
3410                         rc = ll_inode_revalidate_fini(inode, rc);
3411                         RETURN(rc);
3412                 }
3413
3414                 rc = ll_prep_inode(sbi->ll_osc_exp, &inode, req, REPLY_REC_OFF,
3415                                    NULL);
3416         }
3417
3418 out:
3419         ptlrpc_req_finished(req);
3420         RETURN(rc);
3421 }
3422
3423 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
3424 {
3425         struct inode *inode = dentry->d_inode;
3426         int rc;
3427         ENTRY;
3428
3429         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
3430                                                   MDS_INODELOCK_LOOKUP);
3431
3432         /* if object not yet allocated, don't validate size */
3433         if (rc == 0 && ll_i2info(inode)->lli_smd == NULL) {
3434                 LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_lvb.lvb_atime;
3435                 LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_lvb.lvb_mtime;
3436                 LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_lvb.lvb_ctime;
3437                 RETURN(0);
3438         }
3439
3440         /* ll_glimpse_size will prefer locally cached writes if they extend
3441          * the file */
3442
3443         if (rc == 0)
3444                 rc = ll_glimpse_size(inode, 0);
3445
3446         RETURN(rc);
3447 }
3448
3449 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
3450                   struct lookup_intent *it, struct kstat *stat)
3451 {
3452         struct inode *inode = de->d_inode;
3453         struct ll_sb_info *sbi = ll_i2sbi(inode);
3454         struct ll_inode_info *lli = ll_i2info(inode);
3455         int res = 0;
3456
3457         res = ll_inode_revalidate_it(de, it);
3458         ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
3459
3460         if (res)
3461                 return res;
3462
3463         stat->dev = inode->i_sb->s_dev;
3464         if (ll_need_32bit_api(sbi))
3465                 stat->ino = ll_fid_build_ino((struct ll_fid *)&lli->lli_fid, 1);
3466         else
3467                 stat->ino = inode->i_ino;
3468         stat->mode = inode->i_mode;
3469         stat->nlink = inode->i_nlink;
3470         stat->uid = inode->i_uid;
3471         stat->gid = inode->i_gid;
3472         stat->rdev = kdev_t_to_nr(inode->i_rdev);
3473         stat->atime = inode->i_atime;
3474         stat->mtime = inode->i_mtime;
3475         stat->ctime = inode->i_ctime;
3476 #ifdef HAVE_INODE_BLKSIZE
3477         stat->blksize = inode->i_blksize;
3478 #else
3479         stat->blksize = 1<<inode->i_blkbits;
3480 #endif
3481
3482         ll_inode_size_lock(inode, 0);
3483         stat->size = i_size_read(inode);
3484         stat->blocks = inode->i_blocks;
3485         ll_inode_size_unlock(inode, 0);
3486
3487         return 0;
3488 }
3489 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
3490 {
3491         struct lookup_intent it = { .it_op = IT_GETATTR };
3492
3493         return ll_getattr_it(mnt, de, &it, stat);
3494 }
3495
#ifdef HAVE_LINUX_FIEMAP_H
/*
 * fiemap() inode method: run ll_do_fiemap() on a temporary buffer and
 * copy the result back into @fieinfo.
 *
 * @inode    inode to map
 * @fieinfo  request/result descriptor; fi_flags, fi_extents_max and
 *           fi_extents_start are read, fi_flags, fi_extents_mapped and
 *           the extent array are written
 * @start    start of the range to map
 * @len      length of the range to map
 *
 * A request with fi_extents_max == 0 carries no extent array: the
 * temporary buffer then has no extent slots, so no extent is copied in
 * either direction and only fi_flags / fi_extents_mapped are reported.
 *
 * Returns the ll_do_fiemap() result, or -ENOMEM if the temporary buffer
 * cannot be allocated.
 */
int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                __u64 start, __u64 len)
{
        int rc;
        size_t num_bytes;
        struct ll_user_fiemap *fiemap;
        unsigned int extent_count = fieinfo->fi_extents_max;
        ENTRY;

        /* header plus one slot per extent the caller has room for */
        num_bytes = sizeof(*fiemap) + (extent_count *
                                       sizeof(struct ll_fiemap_extent));
        OBD_VMALLOC(fiemap, num_bytes);

        if (fiemap == NULL)
                RETURN(-ENOMEM);

        fiemap->fm_flags = fieinfo->fi_flags;
        fiemap->fm_extent_count = fieinfo->fi_extents_max;
        fiemap->fm_start = start;
        fiemap->fm_length = len;

        /* NOTE(review): fi_extents_start is accessed with plain memcpy()
         * here; if it can be a userspace address on the kernels this is
         * built for, copy_from_user()/copy_to_user() are needed instead
         * -- confirm. */

        /* The first extent is passed through as input, but only if the
         * buffer actually has a slot for it; with extent_count == 0 the
         * copy would run past the end of the allocation. */
        if (extent_count > 0)
                memcpy(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
                       sizeof(struct ll_fiemap_extent));

        rc = ll_do_fiemap(inode, fiemap, num_bytes);

        fieinfo->fi_flags = fiemap->fm_flags;
        fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;

        /* Same guard on the way back: without extent slots there is
         * nothing to copy, whatever count fm_mapped_extents reports. */
        if (extent_count > 0)
                memcpy(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
                       fiemap->fm_mapped_extents *
                       sizeof(struct ll_fiemap_extent));

        OBD_VFREE(fiemap, num_bytes);
        RETURN(rc);
}
#endif
3530
3531
3532 static
3533 int lustre_check_acl(struct inode *inode, int mask)
3534 {
3535 #ifdef CONFIG_FS_POSIX_ACL
3536         struct ll_inode_info *lli = ll_i2info(inode);
3537         struct posix_acl *acl;
3538         int rc;
3539         ENTRY;
3540
3541         spin_lock(&lli->lli_lock);
3542         acl = posix_acl_dup(lli->lli_posix_acl);
3543         spin_unlock(&lli->lli_lock);
3544
3545         if (!acl)
3546                 RETURN(-EAGAIN);
3547
3548         rc = posix_acl_permission(inode, acl, mask);
3549         posix_acl_release(acl);
3550
3551         RETURN(rc);
3552 #else
3553         return -EAGAIN;
3554 #endif
3555 }
3556
3557 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
3558 #ifndef HAVE_INODE_PERMISION_2ARGS
3559 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3560 #else
3561 int ll_inode_permission(struct inode *inode, int mask)
3562 #endif
3563 {
3564         int rc = 0;
3565         ENTRY;
3566
3567        /* as root inode are NOT getting validated in lookup operation,
3568         * need to do it before permission check. */
3569
3570         if (inode == inode->i_sb->s_root->d_inode) {
3571                 struct lookup_intent it = { .it_op = IT_LOOKUP };
3572
3573                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
3574                                               MDS_INODELOCK_LOOKUP);
3575                 if (rc)
3576                         RETURN(rc);
3577         }
3578
3579         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
3580                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
3581
3582         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3583         rc = generic_permission(inode, mask, lustre_check_acl);
3584
3585         RETURN(rc);
3586 }
3587 #else
3588 #ifndef HAVE_INODE_PERMISION_2ARGS
3589 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
3590 #else
3591 int ll_inode_permission(struct inode *inode, int mask)
3592 #endif
3593 {
3594         int mode = inode->i_mode;
3595         int rc;
3596
3597         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
3598                inode->i_ino, inode->i_generation, inode, mask);
3599         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
3600
3601         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
3602             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
3603                 return -EROFS;
3604         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
3605                 return -EACCES;
3606         if (current->fsuid == inode->i_uid) {
3607                 mode >>= 6;
3608         } else if (1) {
3609                 if (((mode >> 3) & mask & S_IRWXO) != mask)
3610                         goto check_groups;
3611                 rc = lustre_check_acl(inode, mask);
3612                 if (rc == -EAGAIN)
3613                         goto check_groups;
3614                 if (rc == -EACCES)
3615                         goto check_capabilities;
3616                 return rc;
3617         } else {
3618 check_groups:
3619                 if (in_group_p(inode->i_gid))
3620                         mode >>= 3;
3621         }
3622         if ((mode & mask & S_IRWXO) == mask)
3623                 return 0;
3624
3625 check_capabilities:
3626         if (!(mask & MAY_EXEC) ||
3627             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3628                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
3629                         return 0;
3630
3631         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
3632             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
3633                 return 0;
3634
3635         return -EACCES;
3636 }
3637 #endif
3638
3639 /* -o localflock - only provides locally consistent flock locks */
3640 struct file_operations ll_file_operations = {
3641         .read           = ll_file_read,
3642 #ifdef HAVE_FILE_READV
3643         .readv          = ll_file_readv,
3644 #else
3645         .aio_read       = ll_file_aio_read,
3646 #endif
3647         .write          = ll_file_write,
3648 #ifdef HAVE_FILE_WRITEV
3649         .writev         = ll_file_writev,
3650 #else
3651         .aio_write      = ll_file_aio_write,
3652 #endif
3653 #ifdef HAVE_UNLOCKED_IOCTL
3654         .unlocked_ioctl = ll_file_ioctl,
3655 #else
3656         .ioctl          = ll_file_ioctl,
3657 #endif
3658         .open           = ll_file_open,
3659         .release        = ll_file_release,
3660         .mmap           = ll_file_mmap,
3661         .llseek         = ll_file_seek,
3662 #ifdef HAVE_KERNEL_SPLICE_READ
3663         .splice_read    = ll_file_splice_read,
3664 #endif
3665 #ifdef HAVE_KERNEL_SENDFILE
3666         .sendfile       = ll_file_sendfile,
3667 #endif
3668         .fsync          = ll_fsync,
3669         .flush          = ll_flush
3670 };
3671
3672 struct file_operations ll_file_operations_flock = {
3673         .read           = ll_file_read,
3674 #ifdef HAVE_FILE_READV
3675         .readv          = ll_file_readv,
3676 #else
3677         .aio_read       = ll_file_aio_read,
3678 #endif
3679         .write          = ll_file_write,
3680 #ifdef HAVE_FILE_WRITEV
3681         .writev         = ll_file_writev,
3682 #else
3683         .aio_write      = ll_file_aio_write,
3684 #endif
3685 #ifdef HAVE_UNLOCKED_IOCTL
3686         .unlocked_ioctl = ll_file_ioctl,
3687 #else
3688         .ioctl          = ll_file_ioctl,
3689 #endif
3690         .open           = ll_file_open,
3691         .release        = ll_file_release,
3692         .mmap           = ll_file_mmap,
3693         .llseek         = ll_file_seek,
3694 #ifdef HAVE_KERNEL_SPLICE_READ
3695         .splice_read    = ll_file_splice_read,
3696 #endif
3697 #ifdef HAVE_KERNEL_SENDFILE
3698         .sendfile       = ll_file_sendfile,
3699 #endif
3700         .fsync          = ll_fsync,
3701         .flush          = ll_flush,
3702 #ifdef HAVE_F_OP_FLOCK
3703         .flock          = ll_file_flock,
3704 #endif
3705         .lock           = ll_file_flock
3706 };
3707
3708 /* These are for -o noflock - to return ENOSYS on flock calls */
3709 struct file_operations ll_file_operations_noflock = {
3710         .read           = ll_file_read,
3711 #ifdef HAVE_FILE_READV
3712         .readv          = ll_file_readv,
3713 #else
3714         .aio_read       = ll_file_aio_read,
3715 #endif
3716         .write          = ll_file_write,
3717 #ifdef HAVE_FILE_WRITEV
3718         .writev         = ll_file_writev,
3719 #else
3720         .aio_write      = ll_file_aio_write,
3721 #endif
3722 #ifdef HAVE_UNLOCKED_IOCTL
3723         .unlocked_ioctl = ll_file_ioctl,
3724 #else
3725         .ioctl          = ll_file_ioctl,
3726 #endif
3727         .open           = ll_file_open,
3728         .release        = ll_file_release,
3729         .mmap           = ll_file_mmap,
3730         .llseek         = ll_file_seek,
3731 #ifdef HAVE_KERNEL_SPLICE_READ
3732         .splice_read    = ll_file_splice_read,
3733 #endif
3734 #ifdef HAVE_KERNEL_SENDFILE
3735         .sendfile       = ll_file_sendfile,
3736 #endif
3737         .fsync          = ll_fsync,
3738         .flush          = ll_flush,
3739 #ifdef HAVE_F_OP_FLOCK
3740         .flock          = ll_file_noflock,
3741 #endif
3742         .lock           = ll_file_noflock
3743 };
3744
3745 struct inode_operations ll_file_inode_operations = {
3746 #ifdef HAVE_VFS_INTENT_PATCHES
3747         .setattr_raw    = ll_setattr_raw,
3748 #endif
3749         .setattr        = ll_setattr,
3750         .truncate       = ll_truncate,
3751         .getattr        = ll_getattr,
3752         .permission     = ll_inode_permission,
3753         .setxattr       = ll_setxattr,
3754         .getxattr       = ll_getxattr,
3755         .listxattr      = ll_listxattr,
3756         .removexattr    = ll_removexattr,
3757 #ifdef  HAVE_LINUX_FIEMAP_H
3758         .fiemap         = ll_fiemap,
3759 #endif
3760 };
3761
3762 /* dynamic ioctl number support routins */
3763 static struct llioc_ctl_data {
3764         struct rw_semaphore ioc_sem;
3765         struct list_head    ioc_head;
3766 } llioc = {
3767         __RWSEM_INITIALIZER(llioc.ioc_sem),
3768         CFS_LIST_HEAD_INIT(llioc.ioc_head)
3769 };
3770
3771
3772 struct llioc_data {
3773         struct list_head        iocd_list;
3774         unsigned int            iocd_size;
3775         llioc_callback_t        iocd_cb;
3776         unsigned int            iocd_count;
3777         unsigned int            iocd_cmd[0];
3778 };
3779
3780 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
3781 {
3782         unsigned int size;
3783         struct llioc_data *in_data = NULL;
3784         ENTRY;
3785
3786         if (cb == NULL || cmd == NULL ||
3787             count > LLIOC_MAX_CMD || count < 0)
3788                 RETURN(NULL);
3789
3790         size = sizeof(*in_data) + count * sizeof(unsigned int);
3791         OBD_ALLOC(in_data, size);
3792         if (in_data == NULL)
3793                 RETURN(NULL);
3794
3795         memset(in_data, 0, sizeof(*in_data));
3796         in_data->iocd_size = size;
3797         in_data->iocd_cb = cb;
3798         in_data->iocd_count = count;
3799         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
3800
3801         down_write(&llioc.ioc_sem);
3802         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
3803         up_write(&llioc.ioc_sem);
3804
3805         RETURN(in_data);
3806 }
3807
3808 void ll_iocontrol_unregister(void *magic)
3809 {
3810         struct llioc_data *tmp;
3811
3812         if (magic == NULL)
3813                 return;
3814
3815         down_write(&llioc.ioc_sem);
3816         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
3817                 if (tmp == magic) {
3818                         unsigned int size = tmp->iocd_size;
3819
3820                         list_del(&tmp->iocd_list);
3821                         up_write(&llioc.ioc_sem);
3822
3823                         OBD_FREE(tmp, size);
3824                         return;
3825                 }
3826         }
3827         up_write(&llioc.ioc_sem);
3828
3829         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
3830 }
3831
3832 EXPORT_SYMBOL(ll_iocontrol_register);
3833 EXPORT_SYMBOL(ll_iocontrol_unregister);
3834
3835 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
3836                         unsigned int cmd, unsigned long arg, int *rcp)
3837 {
3838         enum llioc_iter ret = LLIOC_CONT;
3839         struct llioc_data *data;
3840         int rc = -EINVAL, i;
3841
3842         down_read(&llioc.ioc_sem);
3843         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
3844                 for (i = 0; i < data->iocd_count; i++) {
3845                         if (cmd != data->iocd_cmd[i])
3846                                 continue;
3847
3848                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
3849                         break;
3850                 }
3851
3852                 if (ret == LLIOC_STOP)
3853                         break;
3854         }
3855         up_read(&llioc.ioc_sem);
3856
3857         if (rcp)
3858                 *rcp = rc;
3859         return ret;
3860 }