1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
32 #include <linux/lustre_compat25.h>
34 #include "llite_internal.h"
36 /* also used by llite/special.c:ll_special_open() */
/* Allocate a per-open-file ll_file_data object from the dedicated slab
 * cache.  NOTE(review): the listing is truncated here (brace/RETURN lines
 * missing); presumably returns fd, NULL on allocation failure -- confirm
 * against the full source. */
37 struct ll_file_data *ll_file_data_get(void)
39 struct ll_file_data *fd;
41 OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd);
/* Return a ll_file_data to its slab cache; pairs with ll_file_data_get(). */
45 static void ll_file_data_put(struct ll_file_data *fd)
48 OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd);
/* Snapshot the inode's current metadata (fid, mode, times, size, blocks,
 * flags), I/O epoch, the open file handle @fh, and an MDS capability into
 * @op_data, ready to be sent to the MDS in a close/setattr RPC. */
51 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
52 struct lustre_handle *fh)
54 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
55 op_data->op_attr.ia_mode = inode->i_mode;
56 op_data->op_attr.ia_atime = inode->i_atime;
57 op_data->op_attr.ia_mtime = inode->i_mtime;
58 op_data->op_attr.ia_ctime = inode->i_ctime;
59 op_data->op_attr.ia_size = inode->i_size;
60 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the Lustre-extended iattr wrapper, hence the cast. */
61 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
62 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
63 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* ll_mdscapa_get() takes a capability reference; the caller's RPC path is
 * presumably responsible for releasing it -- confirm in full source. */
64 op_data->op_capa1 = ll_mdscapa_get(inode);
/* Prepare @op_data for an MDS close RPC on @och: select which attributes
 * are valid, handle epoch-close bookkeeping, then pack the inode state.
 * NOTE(review): the bodies of the two if-statements below are partially
 * missing from this truncated listing -- do not infer their full effect. */
67 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
68 struct obd_client_handle *och)
72 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
73 ATTR_MTIME_SET | ATTR_CTIME_SET;
/* Read-only opens need no size/epoch handling (body truncated here). */
75 if (!(och->och_flags & FMODE_WRITE))
/* Non-regular files have no OST objects, so size/blocks come from the MDS. */
78 if (!S_ISREG(inode->i_mode))
79 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
81 ll_epoch_close(inode, op_data, &och, 0);
84 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/* Close an MDS open handle: build the close op_data, send md_close(),
 * optionally perform a Size-on-MDS update, destroy OST objects named in the
 * close reply, and clear open-replay state.  Marks the handle cookie
 * DEAD_HANDLE_MAGIC once the epoch is closed so callers know it may be
 * freed.  NOTE(review): several lines (parameter list, error branches,
 * RETURN) are missing from this truncated listing. */
88 static int ll_close_inode_openhandle(struct obd_export *md_exp,
90 struct obd_client_handle *och)
92 struct md_op_data *op_data;
93 struct ptlrpc_request *req = NULL;
94 struct obd_device *obd;
95 int rc, clear_ord = 0;
99 obd = class_exp2obd(ll_i2mdexp(inode));
102 * XXX: in case of LMV, is this correct to access
105 CERROR("Invalid MDC connection handle "LPX64"\n",
106 ll_i2mdexp(inode)->exp_handle.h_cookie);
111 * here we check if this is forced umount. If so this is called on
112 * canceling "open lock" and we do not call md_close() in this case, as
113 * it will not be successful, as import is already deactivated.
115 if (obd->obd_no_recov)
118 OBD_ALLOC_PTR(op_data);
120 GOTO(out, rc = -ENOMEM);
122 ll_prepare_close(inode, op_data, och);
/* The close ends the I/O epoch when we held a write handle and either the
 * MDS flagged MF_EPOCH_CLOSE or the file has no OST objects (non-regular). */
123 epoch_close = (och->och_flags & FMODE_WRITE) &&
124 ((op_data->op_flags & MF_EPOCH_CLOSE) ||
125 !S_ISREG(inode->i_mode));
126 rc = md_close(md_exp, op_data, och, &req);
128 ll_finish_md_op_data(op_data);
130 /* This close must have closed the epoch. */
131 LASSERT(epoch_close);
132 /* MDS has instructed us to obtain Size-on-MDS attribute from
133 * OSTs and send setattr to back to MDS. */
134 rc = ll_sizeonmds_update(inode, &och->och_fh);
136 CERROR("inode %lu mdc Size-on-MDS update failed: "
137 "rc = %d\n", inode->i_ino, rc);
141 CERROR("inode %lu mdc close failed: rc = %d\n",
/* Epoch not yet closed on a write handle: defer to the DONE_WRITING path. */
145 if (!epoch_close && (och->och_flags & FMODE_WRITE)) {
146 md_clear_open_replay_data(md_exp, och);
149 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
153 rc = ll_objects_destroy(req, inode);
155 CERROR("inode %lu ll_objects destroy: rc = %d\n",
159 ptlrpc_req_finished(req); /* This is close request */
163 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so ll_md_real_close() knows it is safe to free. */
165 if (epoch_close || !(och->och_flags & FMODE_WRITE))
166 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
/* Drop one open mode's MDS open handle for @inode if no other local file
 * descriptors of that mode remain.  @flags selects which of the three
 * per-mode handle slots (write/exec/read) is being closed; lli_och_sem
 * serializes against concurrent open/close of the same slot.
 * NOTE(review): truncated listing -- the lines taking *och_p into the local
 * 'och' and resetting the slot are not visible here. */
170 int ll_md_real_close(struct inode *inode, int flags)
172 struct ll_inode_info *lli = ll_i2info(inode);
173 struct obd_client_handle **och_p;
174 struct obd_client_handle *och;
179 if (flags & FMODE_WRITE) {
180 och_p = &lli->lli_mds_write_och;
181 och_usecount = &lli->lli_open_fd_write_count;
182 } else if (flags & FMODE_EXEC) {
183 och_p = &lli->lli_mds_exec_och;
184 och_usecount = &lli->lli_open_fd_exec_count;
186 LASSERT(flags & FMODE_READ);
187 och_p = &lli->lli_mds_read_och;
188 och_usecount = &lli->lli_open_fd_read_count;
191 down(&lli->lli_och_sem);
192 if (*och_usecount) { /* There are still users of this handle, so
194 up(&lli->lli_och_sem);
199 up(&lli->lli_och_sem);
201 if (och) { /* There might be a race and somebody have freed this och
203 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
205 /* Do not free @och is it is waiting for DONE_WRITING. */
206 if (och->och_fh.cookie == DEAD_HANDLE_MAGIC)
207 OBD_FREE(och, sizeof *och);
/* Per-file-descriptor close: release any group lock, decrement the
 * per-mode open count under lli_och_sem, and -- unless a cached OPEN DLM
 * lock lets us skip the MDS round trip -- call ll_md_real_close().
 * Finally detach and free the ll_file_data and drop the OSS capability. */
213 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
216 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
217 struct ll_inode_info *lli = ll_i2info(inode);
221 /* clear group lock, if present */
222 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
223 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
224 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
225 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
229 /* Let's see if we have good enough OPEN lock on the file and if
230 we can skip talking to MDS */
231 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: only probe for a matching granted lock, take no reference. */
233 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
234 struct lustre_handle lockh;
235 struct inode *inode = file->f_dentry->d_inode;
236 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
238 down(&lli->lli_och_sem);
239 if (fd->fd_omode & FMODE_WRITE) {
241 LASSERT(lli->lli_open_fd_write_count);
242 lli->lli_open_fd_write_count--;
243 } else if (fd->fd_omode & FMODE_EXEC) {
245 LASSERT(lli->lli_open_fd_exec_count);
246 lli->lli_open_fd_exec_count--;
249 LASSERT(lli->lli_open_fd_read_count);
250 lli->lli_open_fd_read_count--;
252 up(&lli->lli_och_sem);
/* No cached OPEN ibits lock => must do the real close RPC to the MDS. */
254 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
255 LDLM_IBITS, &policy, lockmode,
257 rc = ll_md_real_close(file->f_dentry->d_inode,
261 CERROR("Releasing a file %p with negative dentry %p. Name %s",
262 file, file->f_dentry, file->f_dentry->d_name.name);
265 LUSTRE_FPRIVATE(file) = NULL;
266 ll_file_data_put(fd);
267 ll_capa_close(inode);
/* Forward declaration; defined in the LOV layer. */
272 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
274 /* While this returns an error code, fput() the caller does not, so we need
275 * to make every effort to clean up all of our state here. Also, applications
276 * rarely check close errors and even if an error is returned they will not
277 * re-try the close call.
/* VFS ->release() entry point: bumps the RELEASE stat, short-circuits for
 * the filesystem root, harvests any pending async write error from the
 * stripe MD, and closes the MDS open handle via ll_md_close(). */
279 int ll_file_release(struct inode *inode, struct file *file)
281 struct ll_file_data *fd;
282 struct ll_sb_info *sbi = ll_i2sbi(inode);
283 struct ll_inode_info *lli = ll_i2info(inode);
284 struct lov_stripe_md *lsm = lli->lli_smd;
288 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
289 inode->i_generation, inode);
291 lprocfs_counter_incr(sbi->ll_stats, LPROC_LL_RELEASE);
292 fd = LUSTRE_FPRIVATE(file);
295 /* don't do anything for / */
296 if (inode->i_sb->s_root == file->f_dentry) {
297 LUSTRE_FPRIVATE(file) = NULL;
298 ll_file_data_put(fd);
303 lov_test_and_clear_async_rc(lsm);
304 lli->lli_async_rc = 0;
306 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/* Perform an intent-based open against the MDS for a file whose dentry is
 * already instantiated (typically the NFSD / patchless-kernel path, or when
 * a cached open handle vanished between revalidate and ll_file_open()).
 * Enqueues an IBITS lock with the open intent, drops any lock actually
 * granted (only the open result is wanted), then updates the inode from
 * the reply.  @lmm/@lmmsize non-zero means we are only setting stripe
 * info, so no OPEN lock is requested. */
310 static int ll_intent_file_open(struct file *file, void *lmm,
311 int lmmsize, struct lookup_intent *itp)
313 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
314 struct dentry *parent = file->f_dentry->d_parent;
315 const char *name = file->f_dentry->d_name.name;
316 const int len = file->f_dentry->d_name.len;
317 struct lustre_handle lockh;
318 struct md_op_data *op_data;
324 /* Usually we come here only for NFSD, and we want open lock.
325 But we can also get here with pre 2.6.15 patchless kernels, and in
326 that case that lock is also ok */
327 /* We can also get here if there was cached open handle in revalidate_it
328 * but it disappeared while we were getting from there to ll_file_open.
329 * But this means this file was closed and immediatelly opened which
330 * makes a good candidate for using OPEN lock */
331 /* If lmmsize & lmm are not 0, we are just setting stripe info
332 * parameters. No need for the open lock */
333 if (!lmm && !lmmsize)
334 itp->it_flags |= MDS_OPEN_LOCK;
336 op_data = ll_prep_md_op_data(NULL, parent->d_inode, NULL, name, len,
341 rc = md_enqueue(sbi->ll_md_exp, LDLM_IBITS, itp, LCK_CW, op_data,
342 &lockh, lmm, lmmsize, ldlm_completion_ast,
343 ll_md_blocking_ast, NULL, 0);
345 ll_finish_md_op_data(op_data);
347 CERROR("lock enqueue: err: %d\n", rc);
351 if (itp->d.lustre.it_lock_mode) { /* If we got lock - release it right
353 ldlm_lock_decref(&lockh, itp->d.lustre.it_lock_mode);
354 itp->d.lustre.it_lock_mode = 0;
356 rc = ll_prep_inode(&file->f_dentry->d_inode,
357 (struct ptlrpc_request *)itp->d.lustre.it_data,
358 DLM_REPLY_REC_OFF, NULL);
/* Populate an obd_client_handle from the mdt_body in the intent's reply:
 * copy the MDS file handle, record fid/flags, pick up the I/O epoch, and
 * register the open for replay on recovery. */
362 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
363 struct lookup_intent *it, struct obd_client_handle *och)
365 struct ptlrpc_request *req = it->d.lustre.it_data;
366 struct mdt_body *body;
370 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
371 LASSERT(body != NULL); /* reply already checked out */
372 LASSERT_REPSWABBED(req, DLM_REPLY_REC_OFF); /* and swabbed in md_enqueue */
374 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
375 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
376 och->och_fid = lli->lli_fid;
377 och->och_flags = it->it_flags;
378 lli->lli_ioepoch = body->ioepoch;
380 return md_set_open_replay_data(md_exp, och, req);
/* Complete the client-local part of an open: optionally fill @och from the
 * intent reply, then attach @fd to the struct file, initialize readahead
 * state, and remember the open mode.  NOTE(review): truncated listing --
 * the condition guarding the och-fill branch and the use of 'body' after
 * the FMODE_WRITE/OBD_MD_FLSIZE test are not fully visible. */
383 int ll_local_open(struct file *file, struct lookup_intent *it,
384 struct ll_file_data *fd, struct obd_client_handle *och)
386 struct inode *inode = file->f_dentry->d_inode;
387 struct ll_inode_info *lli = ll_i2info(inode);
390 LASSERT(!LUSTRE_FPRIVATE(file));
395 struct ptlrpc_request *req = it->d.lustre.it_data;
396 struct mdt_body *body;
399 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
403 body = lustre_msg_buf(req->rq_repmsg,
404 DLM_REPLY_REC_OFF, sizeof(*body));
406 if ((it->it_flags & FMODE_WRITE) &&
407 (body->valid & OBD_MD_FLSIZE))
409 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
410 lli->lli_ioepoch, PFID(&lli->lli_fid));
414 LUSTRE_FPRIVATE(file) = fd;
415 ll_readahead_init(inode, &fd->fd_ras);
416 fd->fd_omode = it->it_flags;
420 /* Open a file, and (for the very first open) create objects on the OSTs at
421 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
422 * creation or open until ll_lov_setstripe() ioctl is called. We grab
423 * lli_open_sem to ensure no other process will create objects, send the
424 * stripe MD to the MDS, or try to destroy the objects if that fails.
426 * If we already have the stripe MD locally then we don't request it in
427 * md_open(), by passing a lmm_size = 0.
429 * It is up to the application to ensure no other processes open this file
430 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
431 * used. We might be able to avoid races of that sort by getting lli_open_sem
432 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
433 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() entry point.  Reuses a cached per-mode MDS open handle if
 * one exists; otherwise performs (or consumes) an intent open and records
 * the new handle.  NOTE(review): this listing is heavily truncated --
 * several closing braces, GOTO labels and the tail of the function are
 * missing; read the full source before reasoning about control flow. */
435 int ll_file_open(struct inode *inode, struct file *file)
437 struct ll_inode_info *lli = ll_i2info(inode);
438 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
439 .it_flags = file->f_flags };
440 struct lov_stripe_md *lsm;
441 struct ptlrpc_request *req = NULL;
442 struct obd_client_handle **och_p;
444 struct ll_file_data *fd;
448 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
449 inode->i_generation, inode, file->f_flags);
451 #ifdef LUSTRE_KERNEL_VERSION
454 it = file->private_data; /* XXX: compat macro */
455 file->private_data = NULL; /* prevent ll_local_open assertion */
458 fd = ll_file_data_get();
462 /* don't do anything for / */
463 if (inode->i_sb->s_root == file->f_dentry) {
464 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own open intent in 'oit'. */
468 if (!it || !it->d.lustre.it_disposition) {
469 /* Convert f_flags into access mode. We cannot use file->f_mode,
470 * because everything but O_ACCMODE mask was stripped from
/* O_ACCMODE trick: O_RDONLY/O_WRONLY/O_RDWR + 1 maps onto FMODE_READ/WRITE bits. */
472 if ((oit.it_flags + 1) & O_ACCMODE)
474 if (oit.it_flags & O_TRUNC)
475 oit.it_flags |= FMODE_WRITE;
477 if (oit.it_flags & O_CREAT)
478 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
480 /* NFS hack - some strange NFS clients create files with zero
481 * permission bits, and then expect to be able to open such
482 * files. We are relying on real VFS client to do ll_permission
483 * first before coming here, so if we got here, we either came
484 * from NFS or all access checks ar eok, so it is safe to set
485 * this flag in any case (XXX - race with chmod?)
487 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
489 /* We do not want O_EXCL here, presumably we opened the file
490 * already? XXX - NFS implications? */
491 oit.it_flags &= ~O_EXCL;
496 /* Let's see if we have file open on MDS already. */
497 if (it->it_flags & FMODE_WRITE) {
498 och_p = &lli->lli_mds_write_och;
499 och_usecount = &lli->lli_open_fd_write_count;
500 } else if (it->it_flags & FMODE_EXEC) {
501 och_p = &lli->lli_mds_exec_och;
502 och_usecount = &lli->lli_open_fd_exec_count;
504 och_p = &lli->lli_mds_read_och;
505 och_usecount = &lli->lli_open_fd_read_count;
507 down(&lli->lli_och_sem);
508 if (*och_p) { /* Open handle is present */
509 if (it_disposition(it, DISP_OPEN_OPEN)) {
510 /* Well, there's extra open request that we do not need,
511 let's close it somehow. This will decref request. */
512 ll_release_openhandle(file->f_dentry, it);
516 rc = ll_local_open(file, it, fd, NULL);
518 up(&lli->lli_och_sem);
519 ll_file_data_put(fd);
/* No cached handle for this mode: allocate a fresh och slot. */
523 LASSERT(*och_usecount == 0);
524 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
526 ll_file_data_put(fd);
527 GOTO(out_och_free, rc = -ENOMEM);
530 if (!it->d.lustre.it_disposition) {
531 rc = ll_intent_file_open(file, NULL, 0, it);
533 ll_file_data_put(fd);
534 GOTO(out_och_free, rc);
537 /* Got some error? Release the request */
538 if (it->d.lustre.it_status < 0) {
539 req = it->d.lustre.it_data;
540 ptlrpc_req_finished(req);
542 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
543 &it->d.lustre.it_lock_handle,
544 file->f_dentry->d_inode);
546 req = it->d.lustre.it_data;
548 /* md_intent_lock() didn't get a request ref if there was an
549 * open error, so don't do cleanup on the request here
551 /* XXX (green): Should not we bail out on any error here, not
552 * just open error? */
553 rc = it_open_error(DISP_OPEN_OPEN, it);
555 ll_file_data_put(fd);
556 GOTO(out_och_free, rc);
559 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_OPEN);
560 rc = ll_local_open(file, it, fd, *och_p);
562 up(&lli->lli_och_sem);
563 ll_file_data_put(fd);
564 GOTO(out_och_free, rc);
567 up(&lli->lli_och_sem);
569 /* Must do this outside lli_och_sem lock to prevent deadlock where
570 different kind of OPEN lock for this same inode gets cancelled
571 by ldlm_cancel_lru */
572 if (!S_ISREG(inode->i_mode))
579 if (file->f_flags & O_LOV_DELAY_CREATE ||
580 !(file->f_mode & FMODE_WRITE)) {
581 CDEBUG(D_INODE, "object creation was delayed\n");
585 file->f_flags &= ~O_LOV_DELAY_CREATE;
588 ptlrpc_req_finished(req);
590 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* out_och_free error path: release the handle slot allocated above. */
594 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
595 *och_p = NULL; /* OBD_FREE writes some magic there */
598 up(&lli->lli_och_sem);
604 /* Fills the obdo with the attributes for the inode defined by lsm */
/* Asynchronously fetch object attributes (size/blocks/times) from the OSTs
 * for every stripe of the file, wait for completion, and refresh the inode
 * from the merged result.  NOTE(review): truncated listing -- oinfo.oi_oa
 * assignment, error RETURNs and the closing of the function are missing. */
605 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
607 struct ptlrpc_request_set *set;
608 struct ll_inode_info *lli = ll_i2info(inode);
609 struct lov_stripe_md *lsm = lli->lli_smd;
611 struct obd_info oinfo = { { { 0 } } };
615 LASSERT(lsm != NULL);
619 oinfo.oi_oa->o_id = lsm->lsm_object_id;
620 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
621 oinfo.oi_oa->o_mode = S_IFREG;
622 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
623 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
624 OBD_MD_FLBLKSZ | OBD_MD_FLMTIME |
625 OBD_MD_FLCTIME | OBD_MD_FLGROUP;
626 oinfo.oi_capa = ll_mdscapa_get(inode);
628 set = ptlrpc_prep_set();
630 CERROR("can't allocate ptlrpc set\n");
633 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
635 rc = ptlrpc_set_wait(set);
636 ptlrpc_set_destroy(set);
638 capa_put(oinfo.oi_capa);
/* Only trust the attribute bits the OSTs actually returned. */
642 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
643 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
646 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
647 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
648 lli->lli_smd->lsm_object_id, inode->i_size, inode->i_blocks,
/* Strip setuid (and setgid when group-execute is set) from the inode's
 * mode on write by an unprivileged process, mirroring the kernel's own
 * remove_suid() semantics.  NOTE(review): truncated -- the tail of the
 * capability branch is not visible. */
653 static inline void ll_remove_suid(struct inode *inode)
657 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
/* (S_ISGID/S_IXGRP) scales the S_IXGRP bit up to the S_ISGID position. */
658 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
660 /* was any of the uid bits set? */
661 mode &= inode->i_mode;
662 if (mode && !capable(CAP_FSETID)) {
663 inode->i_mode &= ~mode;
664 // XXX careful here - we cannot change the size
/* Map an extent DLM lock back to the stripe index it covers within the
 * file's LOV stripe MD, via obd_get_info("lock_to_stripe").  Sanity-checks
 * that the lock's resource really names the object of that stripe; returns
 * -ELDLM_NO_LOCK_DATA on mismatch.  NOTE(review): truncated -- the RETURN
 * of 'stripe' and the 'check:' label body are not fully visible. */
668 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
670 struct ll_inode_info *lli = ll_i2info(inode);
671 struct lov_stripe_md *lsm = lli->lli_smd;
672 struct obd_export *exp = ll_i2dtexp(inode);
675 struct ldlm_lock *lock;
676 struct lov_stripe_md *lsm;
677 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
678 __u32 stripe, vallen = sizeof(stripe);
/* Single-stripe file: the answer is trivially stripe 0. */
682 if (lsm->lsm_stripe_count == 1)
683 GOTO(check, stripe = 0);
685 /* get our offset in the lov */
686 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
688 CERROR("obd_get_info: rc = %d\n", rc);
691 LASSERT(stripe < lsm->lsm_stripe_count);
/* Resource name[0] carries the object id, name[2] the object group. */
694 if (lsm->lsm_oinfo[stripe].loi_id != lock->l_resource->lr_name.name[0]||
695 lsm->lsm_oinfo[stripe].loi_gr != lock->l_resource->lr_name.name[2]){
696 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
697 lsm->lsm_oinfo[stripe].loi_id,
698 lsm->lsm_oinfo[stripe].loi_gr);
699 RETURN(-ELDLM_NO_LOCK_DATA);
705 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
706 * we get a lock cancellation for each stripe, so we have to map the obd's
707 * region back onto the stripes in the file that it held.
709 * No one can dirty the extent until we've finished our work and they can
710 * enqueue another lock. The DLM protects us from ll_file_read/write here,
711 * but other kernel actors could have pages locked.
713 * Called with the DLM lock held. */
/* NOTE(review): truncated listing -- loop bodies, 'continue'/'break'
 * statements and closing braces are partially missing; the per-page logic
 * below should be read alongside the full source. */
714 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
715 struct ldlm_lock *lock, __u32 stripe)
717 ldlm_policy_data_t tmpex;
718 unsigned long start, end, count, skip, i, j;
720 int rc, rc2, l_flags, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
721 struct lustre_handle lockh;
724 memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex));
725 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
726 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
729 /* our locks are page granular thanks to osc_enqueue, we invalidate the
731 if ((tmpex.l_extent.start & ~PAGE_CACHE_MASK) != 0 ||
732 ((tmpex.l_extent.end + 1) & ~PAGE_CACHE_MASK) != 0)
733 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",PAGE_SIZE);
734 LASSERT((tmpex.l_extent.start & ~PAGE_CACHE_MASK) == 0);
735 LASSERT(((tmpex.l_extent.end + 1) & ~PAGE_CACHE_MASK) == 0);
/* Translate the per-object (stripe) extent into file page indices,
 * accounting for RAID0-style striping across lsm_stripe_count objects. */
739 start = tmpex.l_extent.start >> PAGE_CACHE_SHIFT;
740 end = tmpex.l_extent.end >> PAGE_CACHE_SHIFT;
741 if (lsm->lsm_stripe_count > 1) {
742 count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT;
743 skip = (lsm->lsm_stripe_count - 1) * count;
744 start += start/count * skip + stripe * count;
746 end += end/count * skip + stripe * count;
748 if (end < tmpex.l_extent.end >> PAGE_CACHE_SHIFT)
/* Clamp the walk to the current i_size; beyond it there are no pages. */
751 i = inode->i_size ? (inode->i_size - 1) >> PAGE_CACHE_SHIFT : 0;
755 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
756 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
757 count, skip, end, discard ? " (DISCARDING)" : "");
759 /* walk through the vmas on the inode and tear down mmaped pages that
760 * intersect with the lock. this stops immediately if there are no
761 * mmap()ed regions of the file. This is not efficient at all and
762 * should be short lived. We'll associate mmap()ed pages with the lock
763 * and will be able to find them directly */
764 for (i = start; i <= end; i += (j + skip)) {
765 j = min(count - (i % count), end - i + 1);
767 LASSERT(inode->i_mapping);
768 if (ll_teardown_mmaps(inode->i_mapping,
769 (__u64)i << PAGE_CACHE_SHIFT,
770 ((__u64)(i+j) << PAGE_CACHE_SHIFT) - 1) )
774 /* this is the simplistic implementation of page eviction at
775 * cancelation. It is careful to get races with other page
776 * lockers handled correctly. fixes from bug 20 will make it
777 * more efficient by associating locks with pages and with
778 * batching writeback under the lock explicitly. */
779 for (i = start, j = start % count; i <= end;
780 j++, i++, tmpex.l_extent.start += PAGE_CACHE_SIZE) {
782 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
788 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
789 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
790 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
793 if (!mapping_has_pages(inode->i_mapping)) {
794 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
800 page = find_get_page(inode->i_mapping, i);
803 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
804 i, tmpex.l_extent.start);
807 /* page->mapping to check with racing against teardown */
808 if (!discard && clear_page_dirty_for_io(page)) {
809 rc = ll_call_writepage(inode, page);
811 CERROR("writepage of page %p failed: %d\n",
813 /* either waiting for io to complete or reacquiring
814 * the lock that the failed writepage released */
818 tmpex.l_extent.end = tmpex.l_extent.start + PAGE_CACHE_SIZE - 1;
819 l_flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
820 /* check to see if another DLM lock covers this page b=2765 */
821 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
822 l_flags, &lock->l_resource->lr_name,
823 LDLM_EXTENT, &tmpex, LCK_PR | LCK_PW, &lockh);
/* No other lock protects this page: safe to drop it from the cache. */
825 if (rc2 <= 0 && page->mapping != NULL) {
826 struct ll_async_page *llap = llap_cast_private(page);
827 // checking again to account for writeback's lock_page()
828 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
830 ll_ra_accounting(llap, inode->i_mapping);
831 ll_truncate_complete_page(page);
834 page_cache_release(page);
836 LASSERTF(tmpex.l_extent.start <=
837 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
838 lock->l_policy_data.l_extent.end + 1),
839 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
840 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/* DLM blocking/cancel callback for extent locks.  On BLOCKING, cancel the
 * lock; on CANCELING, flush the covered page-cache pages of the matching
 * stripe and shrink the stripe's known-minimum-size (kms) accordingly.
 * NOTE(review): truncated -- switch framing, iput/RETURN paths and the
 * default case are not fully visible. */
845 static int ll_extent_lock_callback(struct ldlm_lock *lock,
846 struct ldlm_lock_desc *new, void *data,
849 struct lustre_handle lockh = { 0 };
/* Small non-NULL 'data' values indicate a corrupted cbdata pointer. */
853 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
854 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
859 case LDLM_CB_BLOCKING:
860 ldlm_lock2handle(lock, &lockh);
861 rc = ldlm_cli_cancel(&lockh);
863 CERROR("ldlm_cli_cancel failed: %d\n", rc);
865 case LDLM_CB_CANCELING: {
867 struct ll_inode_info *lli;
868 struct lov_stripe_md *lsm;
872 /* This lock wasn't granted, don't try to evict pages */
873 if (lock->l_req_mode != lock->l_granted_mode)
876 inode = ll_inode_from_lock(lock);
879 lli = ll_i2info(inode);
882 if (lli->lli_smd == NULL)
886 stripe = ll_lock_to_stripe_offset(inode, lock);
890 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* Recompute kms for this stripe with the canceled lock excluded;
 * lov_stripe_lock + lock_res_and_lock order matters here. */
892 lov_stripe_lock(lsm);
893 lock_res_and_lock(lock);
894 kms = ldlm_extent_shift_kms(lock,
895 lsm->lsm_oinfo[stripe].loi_kms);
897 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
898 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
899 lsm->lsm_oinfo[stripe].loi_kms, kms);
900 lsm->lsm_oinfo[stripe].loi_kms = kms;
901 unlock_res_and_lock(lock);
902 lov_stripe_unlock(lsm);
/* Completion AST for client-side async extent enqueues (glimpse path):
 * when the lock is granted, pull the size from the lock value block (LVB),
 * raise the stripe's kms under the inode mutex + resource lock, then wake
 * waiters and drop the enqueue reference.  Blocked async locks are not
 * expected (LBUG).  NOTE(review): truncated listing. */
915 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
917 /* XXX ALLOCATE - 160 bytes */
918 struct inode *inode = ll_inode_from_lock(lock);
919 struct ll_inode_info *lli = ll_i2info(inode);
920 struct lustre_handle lockh = { 0 };
925 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
926 LDLM_FL_BLOCK_CONV)) {
927 LBUG(); /* not expecting any blocked async locks yet */
928 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
930 ldlm_lock_dump(D_OTHER, lock, 0);
931 ldlm_reprocess_all(lock->l_resource);
935 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
937 stripe = ll_lock_to_stripe_offset(inode, lock);
941 if (lock->l_lvb_len) {
942 struct lov_stripe_md *lsm = lli->lli_smd;
944 lvb = lock->l_lvb_data;
945 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
947 LOCK_INODE_MUTEX(inode);
948 lock_res_and_lock(lock);
/* kms can only grow here: take the max of current kms and the LVB size. */
949 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
950 kms = ldlm_extent_shift_kms(NULL, kms);
951 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
952 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
953 lsm->lsm_oinfo[stripe].loi_kms, kms);
954 lsm->lsm_oinfo[stripe].loi_kms = kms;
955 unlock_res_and_lock(lock);
956 UNLOCK_INODE_MUTEX(inode);
961 wake_up(&lock->l_waitq);
963 ldlm_lock2handle(lock, &lockh);
964 ldlm_lock_decref(&lockh, LCK_PR);
/* Glimpse AST: another client asked for this file's size.  Reply with an
 * LVB holding this client's kms for the lock's stripe plus the inode
 * times.  -ELDLM_NO_LOCK_DATA races are normal and answered with an empty
 * reply rather than ptlrpc_error().  NOTE(review): truncated -- the NULL
 * checks guarding the GOTOs and the out/iput labels are not visible. */
969 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
971 struct ptlrpc_request *req = reqp;
972 struct inode *inode = ll_inode_from_lock(lock);
973 struct ll_inode_info *lli;
974 struct lov_stripe_md *lsm;
977 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
981 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
982 lli = ll_i2info(inode);
984 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
987 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
989 /* First, find out which stripe index this lock corresponds to. */
990 stripe = ll_lock_to_stripe_offset(inode, lock);
992 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
994 rc = lustre_pack_reply(req, 2, size, NULL);
996 CERROR("lustre_pack_reply: %d\n", rc);
1000 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1001 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe].loi_kms;
1002 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1003 lvb->lvb_atime = LTIME_S(inode->i_atime);
1004 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1006 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1007 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1008 inode->i_size, stripe, lvb->lvb_size, lvb->lvb_mtime,
1009 lvb->lvb_atime, lvb->lvb_ctime);
1014 /* These errors are normal races, so we don't want to fill the console
1015 * with messages by calling ptlrpc_error() */
1016 if (rc == -ELDLM_NO_LOCK_DATA)
1017 lustre_pack_reply(req, 1, NULL, NULL);
1019 req->rq_status = rc;
/* Merge the per-stripe lock value blocks into a single LVB and copy the
 * result (size, blocks, m/a/ctime) into the inode, under the llite inode
 * size lock. */
1023 static void ll_merge_lvb(struct inode *inode)
1025 struct ll_inode_info *lli = ll_i2info(inode);
1026 struct ll_sb_info *sbi = ll_i2sbi(inode);
1030 ll_inode_size_lock(inode, 1);
1031 inode_init_lvb(inode, &lvb);
1032 obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1033 inode->i_size = lvb.lvb_size;
1034 inode->i_blocks = lvb.lvb_blocks;
1035 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1036 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1037 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1038 ll_inode_size_unlock(inode, 1);
/* Try to compute file size purely from locally cached [0, EOF] extent
 * locks: match an existing PR|PW lock, merge the LVBs into the inode, then
 * drop the match reference via obd_cancel().  NOTE(review): truncated --
 * 'flags' initialization and the rc handling between match and merge are
 * not visible. */
1042 int ll_local_size(struct inode *inode)
1044 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1045 struct ll_inode_info *lli = ll_i2info(inode);
1046 struct ll_sb_info *sbi = ll_i2sbi(inode);
1047 struct lustre_handle lockh = { 0 };
1052 if (lli->lli_smd->lsm_stripe_count == 0)
1055 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1056 &policy, LCK_PR | LCK_PW, &flags, inode, &lockh);
1062 ll_merge_lvb(inode);
1063 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR | LCK_PW, &lockh);
/* Glimpse-size helper for ioctls operating on an arbitrary @lsm (not the
 * inode's own): issue an intent (HAS_INTENT) extent enqueue, merge the
 * returned per-stripe LVBs, and fill a stat-like structure 'st' with the
 * result.  NOTE(review): truncated -- parameter list tail ('st'), oinfo
 * extent start/md assignments and RETURN lines are not visible. */
1067 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1070 struct lustre_handle lockh = { 0 };
1071 struct obd_enqueue_info einfo = { 0 };
1072 struct obd_info oinfo = { { { 0 } } };
1078 einfo.ei_type = LDLM_EXTENT;
1079 einfo.ei_mode = LCK_PR;
/* HAS_INTENT: glimpse request, does not revoke conflicting locks. */
1080 einfo.ei_flags = LDLM_FL_HAS_INTENT;
1081 einfo.ei_cb_bl = ll_extent_lock_callback;
1082 einfo.ei_cb_cp = ldlm_completion_ast;
1083 einfo.ei_cb_gl = ll_glimpse_callback;
1084 einfo.ei_cbdata = NULL;
1086 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1087 oinfo.oi_lockh = &lockh;
1090 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1094 CERROR("obd_enqueue returned rc %d, "
1095 "returning -EIO\n", rc);
1096 RETURN(rc > 0 ? -EIO : rc);
1099 lov_stripe_lock(lsm);
1100 memset(&lvb, 0, sizeof(lvb));
1101 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1102 st->st_size = lvb.lvb_size;
1103 st->st_blocks = lvb.lvb_blocks;
1104 st->st_mtime = lvb.lvb_mtime;
1105 st->st_atime = lvb.lvb_atime;
1106 st->st_ctime = lvb.lvb_ctime;
1107 lov_stripe_unlock(lsm);
1112 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1113 * file (because it prefers KMS over RSS when larger) */
/* Refresh i_size/i_blocks for @inode by glimpsing all stripes.  Skips the
 * RPC when the MDS already holds the authoritative size
 * (LLIF_MDS_SIZE_LOCK) or when the file has no objects. */
1114 int ll_glimpse_size(struct inode *inode, int ast_flags)
1116 struct ll_inode_info *lli = ll_i2info(inode);
1117 struct ll_sb_info *sbi = ll_i2sbi(inode);
1118 struct lustre_handle lockh = { 0 };
1119 struct obd_enqueue_info einfo = { 0 };
1120 struct obd_info oinfo = { { { 0 } } };
1124 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1127 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1129 if (!lli->lli_smd) {
1130 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1134 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1135 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1136 * won't revoke any conflicting DLM locks held. Instead,
1137 * ll_glimpse_callback() will be called on each client
1138 * holding a DLM lock against this file, and resulting size
1139 * will be returned for each stripe. DLM lock on [0, EOF] is
1140 * acquired only if there were no conflicting locks. */
1141 einfo.ei_type = LDLM_EXTENT;
1142 einfo.ei_mode = LCK_PR;
1143 einfo.ei_flags = ast_flags | LDLM_FL_HAS_INTENT;
1144 einfo.ei_cb_bl = ll_extent_lock_callback;
1145 einfo.ei_cb_cp = ldlm_completion_ast;
1146 einfo.ei_cb_gl = ll_glimpse_callback;
1147 einfo.ei_cbdata = inode;
1149 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1150 oinfo.oi_lockh = &lockh;
1151 oinfo.oi_md = lli->lli_smd;
1153 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1157 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1158 RETURN(rc > 0 ? -EIO : rc);
1161 ll_merge_lvb(inode);
1163 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1164 inode->i_size, inode->i_blocks);
/* Take an extent DLM lock of @mode over @policy's range on @lsm's objects,
 * then refresh inode attributes from the merged LVBs.  Whole-file locks
 * additionally update i_size under ll_inode_size_lock() (see the long
 * comment below for the truncate-race ordering).  No-op for IGNORE_LOCK
 * fds or NOLCK mounts.  NOTE(review): truncated -- lvb declaration, the
 * enqueue error path and the RETURN are not visible. */
1169 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1170 struct lov_stripe_md *lsm, int mode,
1171 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1174 struct ll_sb_info *sbi = ll_i2sbi(inode);
1176 struct obd_enqueue_info einfo = { 0 };
1177 struct obd_info oinfo = { { { 0 } } };
1181 LASSERT(!lustre_handle_is_used(lockh));
1182 LASSERT(lsm != NULL);
1184 /* don't drop the mmapped file to LRU */
1185 if (mapping_mapped(inode->i_mapping))
1186 ast_flags |= LDLM_FL_NO_LRU;
1188 /* XXX phil: can we do this? won't it screw the file size up? */
1189 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1190 (sbi->ll_flags & LL_SBI_NOLCK))
1193 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1194 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1196 einfo.ei_type = LDLM_EXTENT;
1197 einfo.ei_mode = mode;
1198 einfo.ei_flags = ast_flags;
1199 einfo.ei_cb_bl = ll_extent_lock_callback;
1200 einfo.ei_cb_cp = ldlm_completion_ast;
1201 einfo.ei_cb_gl = ll_glimpse_callback;
1202 einfo.ei_cbdata = inode;
1204 oinfo.oi_policy = *policy;
1205 oinfo.oi_lockh = lockh;
1208 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo);
/* The enqueue may have rewritten the policy (e.g. lock extent growth). */
1209 *policy = oinfo.oi_policy;
1213 ll_inode_size_lock(inode, 1);
1214 inode_init_lvb(inode, &lvb);
1215 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1217 if (policy->l_extent.start == 0 &&
1218 policy->l_extent.end == OBD_OBJECT_EOF) {
1219 /* vmtruncate()->ll_truncate() first sets the i_size and then
1220 * the kms under both a DLM lock and the
1221 * ll_inode_size_lock(). If we don't get the
1222 * ll_inode_size_lock() here we can match the DLM lock and
1223 * reset i_size from the kms before the truncating path has
1224 * updated the kms. generic_file_write can then trust the
1225 * stale i_size when doing appending writes and effectively
1226 * cancel the result of the truncate. Getting the
1227 * ll_inode_size_lock() after the enqueue maintains the DLM
1228 * -> ll_inode_size_lock() acquiring order. */
1229 inode->i_size = lvb.lvb_size;
1233 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1234 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1235 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1237 ll_inode_size_unlock(inode, 1);
/* Release an extent lock previously acquired by ll_extent_lock().
 * Mirrors the IGNORE_LOCK / NOLCK short-circuit of the lock path so
 * that lock and unlock stay symmetric, then cancels via obd_cancel(). */
1242 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1243 struct lov_stripe_md *lsm, int mode,
1244 struct lustre_handle *lockh)
1246 struct ll_sb_info *sbi = ll_i2sbi(inode);
1250 /* XXX phil: can we do this? won't it screw the file size up? */
1251 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1252 (sbi->ll_flags & LL_SBI_NOLCK))
1255 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/* read(2) entry point: take a PR extent lock covering the request (or
 * one chunk of it when ll_max_rw_chunk limits the lock to the current
 * stripe), settle i_size against the known minimum size (kms), then let
 * generic_file_read() do the page-cache copy.  Files with no objects
 * are satisfied with zero-filled buffers up to i_size (bug 6243).
 * NOTE(review): elided listing — the enclosing loop over chunks and the
 * final RETURN are not visible here. */
1260 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1263 struct inode *inode = file->f_dentry->d_inode;
1264 struct ll_inode_info *lli = ll_i2info(inode);
1265 struct lov_stripe_md *lsm = lli->lli_smd;
1266 struct ll_sb_info *sbi = ll_i2sbi(inode);
1267 struct ll_lock_tree tree;
1268 struct ll_lock_tree_node *node;
1270 struct ll_ra_read bead;
1273 ssize_t retval, chunk, sum = 0;
1277 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1278 inode->i_ino, inode->i_generation, inode, count, *ppos);
1280 /* "If nbyte is 0, read() will return 0 and have no other results."
1281 * -- Single Unix Spec */
1285 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
1289 /* Read on file with no objects should return zero-filled
1290 * buffers up to file size (we can get non-zero sizes with
1291 * mknod + truncate, then opening file for read. This is a
1292 * common pattern in NFS case, it seems). Bug 6243 */
1294 /* Since there are no objects on OSTs, we have nothing to get
1295 * lock on and so we are forced to access inode->i_size
1298 /* Read beyond end of file */
1299 if (*ppos >= inode->i_size)
/* clamp the request so it does not run past the (MDS-known) size */
1302 if (count > inode->i_size - *ppos)
1303 count = inode->i_size - *ppos;
1304 /* Make sure to correctly adjust the file pos pointer for
1306 notzeroed = clear_user(buf, count);
1315 if (sbi->ll_max_rw_chunk != 0) {
1316 /* first, let's know the end of the current stripe */
1318 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1320 /* correct, the end is beyond the request */
1321 if (end > *ppos + count - 1)
1322 end = *ppos + count - 1;
1324 /* and chunk shouldn't be too large even if striping is wide */
1325 if (end - *ppos > sbi->ll_max_rw_chunk)
1326 end = *ppos + sbi->ll_max_rw_chunk - 1;
1328 end = *ppos + count - 1;
/* take the PR lock over [*ppos, end]; non-blocking for O_NONBLOCK */
1331 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1332 tree.lt_fd = LUSTRE_FPRIVATE(file);
1333 rc = ll_tree_lock(&tree, node, buf, count,
1334 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1336 GOTO(out, retval = rc);
1338 ll_inode_size_lock(inode, 1);
1340 * Consistency guarantees: following possibilities exist for the
1341 * relation between region being read and real file size at this
1344 * (A): the region is completely inside of the file;
1346 * (B-x): x bytes of region are inside of the file, the rest is
1349 * (C): the region is completely outside of the file.
1351 * This classification is stable under DLM lock acquired by
1352 * ll_tree_lock() above, because to change class, other client has to
1353 * take DLM lock conflicting with our lock. Also, any updates to
1354 * ->i_size by other threads on this client are serialized by
1355 * ll_inode_size_lock(). This guarantees that short reads are handled
1356 * correctly in the face of concurrent writes and truncates.
1358 inode_init_lvb(inode, &lvb);
1359 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1361 if (*ppos + count - 1 > kms) {
1362 /* A glimpse is necessary to determine whether we return a
1363 * short read (B) or some zeroes at the end of the buffer (C) */
1364 ll_inode_size_unlock(inode, 1);
1365 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1367 ll_tree_unlock(&tree);
1371 /* region is within kms and, hence, within real file size (A) */
1372 inode->i_size = kms;
1373 ll_inode_size_unlock(inode, 1);
1376 chunk = end - *ppos + 1;
1377 CDEBUG(D_VFSTRACE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1378 inode->i_ino, chunk, *ppos, inode->i_size);
1380 /* turn off the kernel's read-ahead */
1381 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
1384 file->f_ra.ra_pages = 0;
1386 /* initialize read-ahead window once per syscall */
1389 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1390 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1391 ll_ra_read_in(file, &bead);
1395 file_accessed(file);
1396 retval = generic_file_read(file, buf, chunk, ppos);
1397 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 0);
1399 ll_tree_unlock(&tree);
/* a full chunk with bytes remaining means another chunked iteration */
1405 if (retval == chunk && count > 0)
1411 ll_ra_read_ex(file, &bead);
/* report total bytes read across all chunks, else the last error */
1412 retval = (sum > 0) ? sum : retval;
1414 CERROR("Read error inode=%lu/%u(%p),size="LPSZ",off=%Ld rc %d\n",
1415 inode->i_ino, inode->i_generation, inode, count, *ppos,
1422 * Write to a file (through the page cache).
/* write(2) entry point: serialize against other writers on this inode
 * via lli_write_sem, take a PW extent lock for the chunk (O_APPEND
 * locks [pos, EOF]), enforce the maxbytes limit, then hand the copy to
 * generic_file_write().  Chunking mirrors ll_file_read().
 * NOTE(review): elided listing — the chunking loop and final RETURN are
 * not visible here. */
1424 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1427 struct inode *inode = file->f_dentry->d_inode;
1428 struct ll_sb_info *sbi = ll_i2sbi(inode);
1429 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1430 struct ll_lock_tree tree;
1431 struct ll_lock_tree_node *node;
1432 loff_t maxbytes = ll_file_maxbytes(inode);
1433 loff_t lock_start, lock_end, end;
1434 ssize_t retval, chunk, sum = 0;
1437 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1438 inode->i_ino, inode->i_generation, inode, count, *ppos);
1440 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1442 /* POSIX, but surprised the VFS doesn't check this already */
1446 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1447 * called on the file, don't fail the below assertion (bug 2388). */
1448 if (file->f_flags & O_LOV_DELAY_CREATE &&
1449 ll_i2info(inode)->lli_smd == NULL)
1452 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* one writer at a time per inode on this client */
1454 down(&ll_i2info(inode)->lli_write_sem);
1457 chunk = 0; /* just to fix gcc's warning */
1458 end = *ppos + count - 1;
1460 if (file->f_flags & O_APPEND) {
/* append: actual offset unknown until i_mutex, so lock to EOF */
1462 lock_end = OBD_OBJECT_EOF;
1463 } else if (sbi->ll_max_rw_chunk != 0) {
1464 /* first, let's know the end of the current stripe */
1466 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
1468 /* correct, the end is beyond the request */
1469 if (end > *ppos + count - 1)
1470 end = *ppos + count - 1;
1472 /* and chunk shouldn't be too large even if striping is wide */
1473 if (end - *ppos > sbi->ll_max_rw_chunk)
1474 end = *ppos + sbi->ll_max_rw_chunk - 1;
1479 lock_end = *ppos + count - 1;
1481 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1484 GOTO(out, retval = PTR_ERR(node));
1486 tree.lt_fd = LUSTRE_FPRIVATE(file);
1487 rc = ll_tree_lock(&tree, node, buf, count,
1488 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1490 GOTO(out, retval = rc);
1492 /* this is ok, g_f_w will overwrite this under i_mutex if it races
1493 * with a local truncate, it just makes our maxbyte checking easier */
1494 if (file->f_flags & O_APPEND) {
1495 *ppos = inode->i_size;
1496 end = *ppos + count - 1;
/* POSIX: writing at/past the limit raises SIGXFSZ and fails EFBIG */
1499 if (*ppos >= maxbytes) {
1500 send_sig(SIGXFSZ, current, 0);
1501 GOTO(out, retval = -EFBIG);
1503 if (*ppos + count > maxbytes)
1504 count = maxbytes - *ppos;
1506 /* generic_file_write handles O_APPEND after getting i_mutex */
1507 chunk = end - *ppos + 1;
1508 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1509 inode->i_ino, chunk, *ppos);
1510 retval = generic_file_write(file, buf, chunk, ppos);
1511 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1514 ll_tree_unlock(&tree);
/* a full chunk with bytes remaining means another chunked iteration */
1520 if (retval == chunk && count > 0)
1524 up(&ll_i2info(inode)->lli_write_sem);
1526 retval = (sum > 0) ? sum : retval;
1527 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_WRITE_BYTES,
1528 retval > 0 ? retval : 0);
1533 * Send file content (through pagecache) somewhere with helper
/* sendfile(2) entry point (2.6 kernels only): same PR-lock + kms/size
 * settlement dance as ll_file_read(), then delegate the page-cache walk
 * to generic_file_sendfile() with the caller's actor.  Files with no
 * objects bypass locking entirely. */
1535 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
1536 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1537 read_actor_t actor, void *target)
1539 struct inode *inode = in_file->f_dentry->d_inode;
1540 struct ll_inode_info *lli = ll_i2info(inode);
1541 struct lov_stripe_md *lsm = lli->lli_smd;
1542 struct ll_lock_tree tree;
1543 struct ll_lock_tree_node *node;
1545 struct ll_ra_read bead;
1550 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1551 inode->i_ino, inode->i_generation, inode, count, *ppos);
1553 /* "If nbyte is 0, read() will return 0 and have no other results."
1554 * -- Single Unix Spec */
1558 lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, LPROC_LL_READ_BYTES,
1561 /* File with no objects, nothing to lock */
1563 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1565 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1566 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1567 rc = ll_tree_lock(&tree, node, NULL, count,
1568 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1572 ll_inode_size_lock(inode, 1);
1574 * Consistency guarantees: following possibilities exist for the
1575 * relation between region being read and real file size at this
1578 * (A): the region is completely inside of the file;
1580 * (B-x): x bytes of region are inside of the file, the rest is
1583 * (C): the region is completely outside of the file.
1585 * This classification is stable under DLM lock acquired by
1586 * ll_tree_lock() above, because to change class, other client has to
1587 * take DLM lock conflicting with our lock. Also, any updates to
1588 * ->i_size by other threads on this client are serialized by
1589 * ll_inode_size_lock(). This guarantees that short reads are handled
1590 * correctly in the face of concurrent writes and truncates.
1592 inode_init_lvb(inode, &lvb);
1593 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1595 if (*ppos + count - 1 > kms) {
1596 /* A glimpse is necessary to determine whether we return a
1597 * short read (B) or some zeroes at the end of the buffer (C) */
1598 ll_inode_size_unlock(inode, 1);
1599 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1603 /* region is within kms and, hence, within real file size (A) */
1604 inode->i_size = kms;
1605 ll_inode_size_unlock(inode, 1);
1608 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1609 inode->i_ino, count, *ppos, inode->i_size);
1611 /* turn off the kernel's read-ahead */
1612 in_file->f_ra.ra_pages = 0;
/* initialize llite's own read-ahead window for this call */
1614 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1615 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1616 ll_ra_read_in(in_file, &bead);
1618 file_accessed(in_file);
1619 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1620 ll_ra_read_ex(in_file, &bead);
1623 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ handler: recreate a lost OST object for this file
 * on the OST index given by the user (root only).  Copies the request
 * from userspace, clones the stripe md, and asks the OSC/LOV layer to
 * recreate via obd_create() with OBD_FL_RECREATE_OBJS set. */
1628 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1631 struct ll_inode_info *lli = ll_i2info(inode);
1632 struct obd_export *exp = ll_i2dtexp(inode);
1633 struct ll_recreate_obj ucreatp;
1634 struct obd_trans_info oti = { 0 };
1635 struct obdo *oa = NULL;
1638 struct lov_stripe_md *lsm, *lsm2;
/* object recreation is an administrative operation */
1641 if (!capable (CAP_SYS_ADMIN))
1644 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1645 sizeof(struct ll_recreate_obj));
/* lli_open_sem serializes against racing open/setstripe on the inode */
1653 down(&lli->lli_open_sem);
1656 GOTO(out, rc = -ENOENT);
1657 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1658 (lsm->lsm_stripe_count));
1660 OBD_ALLOC(lsm2, lsm_size);
1662 GOTO(out, rc = -ENOMEM);
/* describe the object to recreate: id/group from user, ost idx via
 * o_nlink (reused field), and RECREATE flag so the OST reuses the id */
1664 oa->o_id = ucreatp.lrc_id;
1665 oa->o_gr = ucreatp.lrc_group;
1666 oa->o_nlink = ucreatp.lrc_ost_idx;
1667 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1668 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1669 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1670 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1672 oti.oti_objid = NULL;
1673 memcpy(lsm2, lsm, lsm_size);
1674 rc = obd_create(exp, oa, &lsm2, &oti);
1676 OBD_FREE(lsm2, lsm_size);
1679 up(&lli->lli_open_sem);
/* Apply user-supplied striping (lov_user_md) to a file by re-opening it
 * with an IT_OPEN intent carrying the EA, under lli_open_sem.  Fails
 * (politely, with a debug message) if the file is already striped.
 * The intent's open handle is released again via ll_release_openhandle()
 * since the caller only wanted the setstripe side effect. */
1684 static int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1685 int flags, struct lov_user_md *lum,
1688 struct ll_inode_info *lli = ll_i2info(inode);
1689 struct lov_stripe_md *lsm;
1690 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1694 down(&lli->lli_open_sem);
1697 up(&lli->lli_open_sem);
1698 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1703 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1706 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1707 GOTO(out_req_free, rc = -ENOENT);
1708 rc = oit.d.lustre.it_status;
1710 GOTO(out_req_free, rc);
1712 ll_release_openhandle(file->f_dentry, &oit);
1715 up(&lli->lli_open_sem);
1716 ll_intent_release(&oit);
/* error path: drop the MD request pinned by the intent */
1719 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* LL_IOC_LOV_SETEA handler (root only): copy a lov_user_md with one
 * ost_data entry from userspace and apply it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS|FMODE_WRITE. */
1723 static int ll_lov_setea(struct inode *inode, struct file *file,
1726 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1727 struct lov_user_md *lump;
1728 int lum_size = sizeof(struct lov_user_md) +
1729 sizeof(struct lov_user_ost_data);
1733 if (!capable (CAP_SYS_ADMIN))
1736 OBD_ALLOC(lump, lum_size);
1740 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1742 OBD_FREE(lump, lum_size);
1746 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1748 OBD_FREE(lump, lum_size);
1752 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1755 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1757 int flags = FMODE_WRITE;
1760 /* Bug 1152: copy properly when this is no longer true */
1761 LASSERT(sizeof(lum) == sizeof(*lump));
1762 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1763 rc = copy_from_user(&lum, lump, sizeof(lum));
1767 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1769 put_user(0, &lump->lmm_stripe_count);
1770 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1771 0, ll_i2info(inode)->lli_smd, lump);
/* LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe md to the LOV
 * iocontrol, which serializes it into the user's buffer at @arg. */
1776 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1778 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1783 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK handler: take a whole-file (0..EOF) LCK_GROUP
 * extent lock with group id @arg, remember its handle in the fd, and
 * mark the fd GROUP_LOCKED|IGNORE_LOCK so normal extent locking is
 * bypassed while the group lock is held. */
1787 static int ll_get_grouplock(struct inode *inode, struct file *file,
1790 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1791 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1792 .end = OBD_OBJECT_EOF}};
1793 struct lustre_handle lockh = { 0 };
1794 struct ll_inode_info *lli = ll_i2info(inode);
1795 struct lov_stripe_md *lsm = lli->lli_smd;
/* only one group lock per file descriptor */
1799 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1803 policy.l_extent.gid = arg;
1804 if (file->f_flags & O_NONBLOCK)
1805 flags = LDLM_FL_BLOCK_NOWAIT;
1807 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1811 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1813 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK handler: verify the fd actually holds a group
 * lock with gid @arg, clear the group/ignore flags, release the lock
 * via ll_extent_unlock(), and wipe the saved handle. */
1818 static int ll_put_grouplock(struct inode *inode, struct file *file,
1821 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1822 struct ll_inode_info *lli = ll_i2info(inode);
1823 struct lov_stripe_md *lsm = lli->lli_smd;
1827 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1828 /* Ugh, it's already unlocked. */
1832 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1835 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1837 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
1842 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/* Validate a file-join request: the server must advertise JOIN support,
 * both inodes must be regular files, distinct, and the head's size must
 * be a multiple of JOIN_FILE_ALIGN (64K). */
1847 static int join_sanity_check(struct inode *head, struct inode *tail)
1850 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1851 CERROR("server do not support join \n");
1854 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1855 CERROR("tail ino %lu and ino head %lu must be regular\n",
1856 head->i_ino, tail->i_ino);
1859 if (head->i_ino == tail->i_ino) {
1860 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1863 if (head->i_size % JOIN_FILE_ALIGN) {
1864 CERROR("hsize %llu must be times of 64K\n", head->i_size);
1870 static int join_file(struct inode *head_inode, struct file *head_filp,
1871 struct file *tail_filp)
1873 struct inode *tail_inode, *tail_parent;
1874 struct dentry *tail_dentry = tail_filp->f_dentry;
1875 struct lookup_intent oit = {.it_op = IT_OPEN,
1876 .it_flags = head_filp->f_flags|O_JOIN_FILE};
1877 struct lustre_handle lockh;
1878 struct md_op_data *op_data;
1879 __u32 hsize = head_inode->i_size >> 32;
1880 __u32 tsize = head_inode->i_size;
1884 tail_dentry = tail_filp->f_dentry;
1885 tail_inode = tail_dentry->d_inode;
1886 tail_parent = tail_dentry->d_parent->d_inode;
1888 op_data = ll_prep_md_op_data(NULL, head_inode, tail_parent,
1889 tail_dentry->d_name.name,
1890 tail_dentry->d_name.len, 0);
1891 if (op_data == NULL)
1894 rc = md_enqueue(ll_i2mdexp(head_inode), LDLM_IBITS, &oit, LCK_CW,
1895 op_data, &lockh, &tsize, 0, ldlm_completion_ast,
1896 ll_md_blocking_ast, &hsize, 0);
1898 ll_finish_md_op_data(op_data);
1902 rc = oit.d.lustre.it_status;
1905 ptlrpc_req_finished((struct ptlrpc_request *)
1906 oit.d.lustre.it_data);
1910 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1912 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1913 oit.d.lustre.it_lock_mode = 0;
1915 ll_release_openhandle(head_filp->f_dentry, &oit);
1917 ll_intent_release(&oit);
/* LL_IOC_JOIN front end: open the tail file by name, EX-lock both files
 * in ascending-inode order to avoid deadlock with a concurrent join in
 * the opposite direction, sanity-check, then call join_file().  Cleanup
 * is staged through cleanup_phase; on success the head's cached stripe
 * md is freed since the join changed the file's layout on the MDS. */
1921 static int ll_file_join(struct inode *head, struct file *filp,
1922 char *filename_tail)
1924 struct inode *tail = NULL, *first = NULL, *second = NULL;
1925 struct dentry *tail_dentry;
1926 struct file *tail_filp, *first_filp, *second_filp;
1927 struct ll_lock_tree first_tree, second_tree;
1928 struct ll_lock_tree_node *first_node, *second_node;
1929 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
1930 int rc = 0, cleanup_phase = 0;
1933 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1934 head->i_ino, head->i_generation, head, filename_tail);
1936 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1937 if (IS_ERR(tail_filp)) {
1938 CERROR("Can not open tail file %s", filename_tail);
1939 rc = PTR_ERR(tail_filp);
/* hold our own reference on the tail inode across the join */
1942 tail = igrab(tail_filp->f_dentry->d_inode);
1944 tlli = ll_i2info(tail);
1945 tail_dentry = tail_filp->f_dentry;
1946 LASSERT(tail_dentry);
1949 /*reorder the inode for lock sequence*/
1950 first = head->i_ino > tail->i_ino ? head : tail;
1951 second = head->i_ino > tail->i_ino ? tail : head;
1952 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1953 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1955 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1956 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
/* EX-lock the whole [0, EOF] range of both files, in order */
1957 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1958 if (IS_ERR(first_node)){
1959 rc = PTR_ERR(first_node);
1962 first_tree.lt_fd = first_filp->private_data;
1963 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1968 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1969 if (IS_ERR(second_node)){
1970 rc = PTR_ERR(second_node);
1973 second_tree.lt_fd = second_filp->private_data;
1974 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1979 rc = join_sanity_check(head, tail);
1983 rc = join_file(head, filp, tail_filp);
/* staged teardown: each phase falls through to undo earlier phases */
1987 switch (cleanup_phase) {
1989 ll_tree_unlock(&second_tree);
1990 obd_cancel_unused(ll_i2dtexp(second),
1991 ll_i2info(second)->lli_smd, 0, NULL);
1993 ll_tree_unlock(&first_tree);
1994 obd_cancel_unused(ll_i2dtexp(first),
1995 ll_i2info(first)->lli_smd, 0, NULL);
1997 filp_close(tail_filp, 0);
2000 if (head && rc == 0) {
/* layout changed on the MDS: drop the stale cached stripe md */
2001 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2003 hlli->lli_smd = NULL;
2008 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/* Close the MDS open handle carried by an open intent when the caller
 * (e.g. setstripe/join) only wanted the intent's side effect, not a
 * usable file.  Root dentries and intents without DISP_OPEN_OPEN are
 * no-ops.  The och is kept alive if it is still waiting for
 * DONE_WRITING (DEAD_HANDLE_MAGIC check — see the comment at 2041). */
2014 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2016 struct inode *inode = dentry->d_inode;
2017 struct obd_client_handle *och;
2023 /* Root ? Do nothing. */
2024 if (dentry->d_inode->i_sb->s_root == dentry)
2027 /* No open handle to close? Move away */
2028 if (!it_disposition(it, DISP_OPEN_OPEN))
2031 OBD_ALLOC(och, sizeof(*och));
2033 GOTO(out, rc = -ENOMEM);
2035 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2036 ll_i2info(inode), it, och);
2038 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2041 /* Do not free @och is it is waiting for DONE_WRITING. */
2042 if (och->och_fh.cookie == DEAD_HANDLE_MAGIC)
2043 OBD_FREE(och, sizeof(*och));
2045 /* this one is in place of ll_file_open */
2046 ptlrpc_req_finished(it->d.lustre.it_data);
2047 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* ioctl(2) dispatcher for regular Lustre files.  Llite-specific cmds
 * (flags, striping, join, group locks, ACLs, statfs) are handled here;
 * tty ioctls are rejected up front, and anything unrecognized falls
 * through to obd_iocontrol() on the data export. */
2051 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2054 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2058 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2059 inode->i_generation, inode, cmd);
2061 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2062 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2065 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_IOCTL);
2067 case LL_IOC_GETFLAGS:
2068 /* Get the current value of the file flags */
2069 return put_user(fd->fd_flags, (int *)arg);
2070 case LL_IOC_SETFLAGS:
2071 case LL_IOC_CLRFLAGS:
2072 /* Set or clear specific file flags */
2073 /* XXX This probably needs checks to ensure the flags are
2074 * not abused, and to handle any flag side effects.
2076 if (get_user(flags, (int *) arg))
2079 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK only makes sense for O_DIRECT I/O */
2080 if ((flags & LL_FILE_IGNORE_LOCK) &&
2081 !(file->f_flags & O_DIRECT)) {
2082 CERROR("%s: unable to disable locking on "
2083 "non-O_DIRECT file\n", current->comm);
2087 fd->fd_flags |= flags;
2089 fd->fd_flags &= ~flags;
2092 case LL_IOC_LOV_SETSTRIPE:
2093 RETURN(ll_lov_setstripe(inode, file, arg));
2094 case LL_IOC_LOV_SETEA:
2095 RETURN(ll_lov_setea(inode, file, arg));
2096 case LL_IOC_LOV_GETSTRIPE:
2097 RETURN(ll_lov_getstripe(inode, arg));
2098 case LL_IOC_RECREATE_OBJ:
2099 RETURN(ll_lov_recreate_obj(inode, file, arg));
2100 case EXT3_IOC_GETFLAGS:
2101 case EXT3_IOC_SETFLAGS:
2102 RETURN(ll_iocontrol(inode, file, cmd, arg));
2103 case EXT3_IOC_GETVERSION_OLD:
2104 case EXT3_IOC_GETVERSION:
2105 RETURN(put_user(inode->i_generation, (int *)arg));
/* file join: tail pathname comes from userspace via getname() */
2110 ftail = getname((const char *)arg);
2112 RETURN(PTR_ERR(ftail));
2113 rc = ll_file_join(inode, file, ftail);
2117 case LL_IOC_GROUP_LOCK:
2118 RETURN(ll_get_grouplock(inode, file, arg));
2119 case LL_IOC_GROUP_UNLOCK:
2120 RETURN(ll_put_grouplock(inode, file, arg));
2121 case IOC_OBD_STATFS:
2122 RETURN(ll_obd_statfs(inode, (void *)arg));
2124 /* We need to special case any other ioctls we want to handle,
2125 * to send them to the MDS/OST as appropriate and to properly
2126 * network encode the arg field.
2127 case EXT3_IOC_SETVERSION_OLD:
2128 case EXT3_IOC_SETVERSION:
2130 case LL_IOC_FLUSHCTX:
2131 RETURN(ll_flush_ctx(inode));
2132 case LL_IOC_GETFACL: {
2133 struct rmtacl_ioctl_data ioc;
2135 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2138 RETURN(ll_ioctl_getfacl(inode, &ioc));
2140 case LL_IOC_SETFACL: {
2141 struct rmtacl_ioctl_data ioc;
2143 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2146 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* default: pass unknown commands down to the OBD layer */
2149 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek(2) entry point.  SEEK_END must glimpse the cluster-wide size
 * first (other clients may have extended the file), then reads i_size
 * under ll_inode_size_lock().  Offsets beyond maxbytes are rejected. */
2154 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2156 struct inode *inode = file->f_dentry->d_inode;
2157 struct ll_inode_info *lli = ll_i2info(inode);
2158 struct lov_stripe_md *lsm = lli->lli_smd;
2161 retval = offset + ((origin == 2) ? inode->i_size :
2162 (origin == 1) ? file->f_pos : 0);
2163 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2164 inode->i_ino, inode->i_generation, inode, retval, retval,
2165 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2167 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_LLSEEK);
2168 if (origin == 2) { /* SEEK_END */
2169 int nonblock = 0, rc;
2171 if (file->f_flags & O_NONBLOCK)
2172 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* refresh i_size from the OSTs before seeking relative to EOF */
2175 rc = ll_glimpse_size(inode, nonblock);
2180 ll_inode_size_lock(inode, 0);
2181 offset += inode->i_size;
2182 ll_inode_size_unlock(inode, 0);
2183 } else if (origin == 1) { /* SEEK_CUR */
2184 offset += file->f_pos;
2188 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2189 if (offset != file->f_pos) {
2190 file->f_pos = offset;
2191 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2193 file->f_version = ++event;
2198 CERROR("invalid offset offset "LPX64" inode=%lu/%u(%p)"
2199 "seek (%s) isize "LPU64", f_ops "LPU64"\n",
2200 offset, inode->i_ino, inode->i_generation, inode,
2201 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR"
2202 : "SEEK_SET", inode->i_size, file->f_pos);
/* fsync(2) entry point: wait for in-flight pagecache writeback, pick up
 * any async write errors recorded on the inode/lsm, sync metadata via
 * md_sync() on the MDS, then sync data [0, EOF] via obd_sync() on the
 * OSTs.  Capabilities (oc) guard both RPCs when enabled. */
2208 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2210 struct inode *inode = dentry->d_inode;
2211 struct ll_inode_info *lli = ll_i2info(inode);
2212 struct lov_stripe_md *lsm = lli->lli_smd;
2213 struct ptlrpc_request *req;
2214 struct obd_capa *oc;
2217 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2218 inode->i_generation, inode);
2220 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_FSYNC);
2222 /* fsync's caller has already called _fdata{sync,write}, we want
2223 * that IO to finish before calling the osc and mdc sync methods */
2224 rc = filemap_fdatawait(inode->i_mapping);
2226 /* catch async errors that were recorded back when async writeback
2227 * failed for pages in this mapping. */
2228 err = lli->lli_async_rc;
2229 lli->lli_async_rc = 0;
2233 err = lov_test_and_clear_async_rc(lsm);
/* metadata sync to the MDS, under an MDS capability if present */
2238 oc = ll_mdscapa_get(inode);
2239 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2245 ptlrpc_req_finished(req);
2248 struct obdo *oa = obdo_alloc();
2251 RETURN(rc ? rc : -ENOMEM);
/* data sync: describe the object(s) and flush the full extent */
2253 oa->o_id = lsm->lsm_object_id;
2254 oa->o_gr = lsm->lsm_object_gr;
2255 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2256 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2257 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2260 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2261 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2262 0, OBD_OBJECT_EOF, oc);
/* fcntl/flock entry point: translate a kernel file_lock into an
 * LDLM_FLOCK enqueue on the MDS, keyed by the file's fid.  F_UNLCK is
 * modelled as an LCK_NL request (see comment at 2300), F_GETLK as a
 * TEST_LOCK enqueue, and non-blocking requests use BLOCK_NOWAIT. */
2272 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2274 struct inode *inode = file->f_dentry->d_inode;
2275 struct ll_sb_info *sbi = ll_i2sbi(inode);
2276 struct ldlm_res_id res_id =
2277 { .name = { fid_seq(ll_inode2fid(inode)),
2278 fid_oid(ll_inode2fid(inode)),
2279 fid_ver(ll_inode2fid(inode)),
2281 struct lustre_handle lockh = {0};
2282 ldlm_policy_data_t flock;
2283 ldlm_mode_t mode = 0;
2288 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2289 inode->i_ino, file_lock);
/* copy the POSIX lock description into the LDLM flock policy */
2291 flock.l_flock.pid = file_lock->fl_pid;
2292 flock.l_flock.start = file_lock->fl_start;
2293 flock.l_flock.end = file_lock->fl_end;
2295 switch (file_lock->fl_type) {
2300 /* An unlock request may or may not have any relation to
2301 * existing locks so we may not be able to pass a lock handle
2302 * via a normal ldlm_lock_cancel() request. The request may even
2303 * unlock a byte range in the middle of an existing lock. In
2304 * order to process an unlock request we need all of the same
2305 * information that is given with a normal read or write record
2306 * lock request. To avoid creating another ldlm unlock (cancel)
2307 * message we'll treat a LCK_NL flock request as an unlock. */
2314 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2329 flags = LDLM_FL_BLOCK_NOWAIT;
2335 flags = LDLM_FL_TEST_LOCK;
2336 /* Save the old mode so that if the mode in the lock changes we
2337 * can decrement the appropriate reader or writer refcount. */
2338 file_lock->fl_type = mode;
2341 CERROR("unknown fcntl lock command: %d\n", cmd);
2345 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2346 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2347 flags, mode, flock.l_flock.start, flock.l_flock.end);
2349 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &res_id,
2350 LDLM_FLOCK, &flock, mode, &flags, NULL,
2351 ldlm_flock_completion_ast, NULL, file_lock,
2352 NULL, 0, NULL, &lockh, 0);
/* Test (without acquiring) whether this client already holds a granted
 * MDS inodebits lock covering @bits on @inode, in any of CR/CW/PR
 * modes.  Uses TEST_LOCK so no reference is taken on a match. */
2356 int ll_have_md_lock(struct inode *inode, __u64 bits)
2358 struct lustre_handle lockh;
2359 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2367 fid = &ll_i2info(inode)->lli_fid;
2368 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2370 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2371 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2372 LCK_CR|LCK_CW|LCK_PR, &lockh)) {
/* Post-process a revalidation RPC result: -ENOENT on a non-regular,
 * non-directory inode means it was simply unlinked and is mapped to
 * success (after updating nlink); other errors are logged. */
2379 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2380 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2381 * and return success */
2383 /* This path cannot be hit for regular files unless in
2384 * case of obscure races, so no need to to validate
2386 if (!S_ISREG(inode->i_mode) &&
2387 !S_ISDIR(inode->i_mode))
2392 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate a dentry's inode against the MDS.  If the server supports
 * ATTRFID, a getattr-by-fid intent lock is used (no name needed) and
 * an unlinked inode gets its dentry dropped; otherwise, when no UPDATE
 * inodebits lock is cached, a plain md_getattr() refreshes attributes.
 * Regular files finish with a glimpse to refresh i_size. */
2400 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2402 struct inode *inode = dentry->d_inode;
2403 struct ptlrpc_request *req = NULL;
2404 struct ll_sb_info *sbi;
2405 struct obd_export *exp;
2410 CERROR("REPORT THIS LINE TO PETER\n");
2413 sbi = ll_i2sbi(inode);
2415 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2416 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2417 #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0))
2418 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_REVALIDATE);
2421 exp = ll_i2mdexp(inode);
2423 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2424 struct lookup_intent oit = { .it_op = IT_GETATTR };
2425 struct md_op_data *op_data;
2427 /* Call getattr by fid, so do not provide name at all. */
2428 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2429 dentry->d_inode, NULL, 0, 0);
2430 if (op_data == NULL)
/* O_CHECK_STALE asks the MDS to verify the fid is still valid */
2432 it->it_flags |= O_CHECK_STALE;
2433 rc = md_intent_lock(exp, op_data, NULL, 0,
2434 /* we are not interested in name
2437 ll_md_blocking_ast, 0);
2438 ll_finish_md_op_data(op_data);
2439 it->it_flags &= ~ O_CHECK_STALE;
2441 rc = ll_inode_revalidate_fini(inode, rc);
2445 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2447 ll_intent_release(&oit);
2451 /* Unlinked? Unhash dentry, so it is not picked up later by
2452 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2453 here to preserve get_cwd functionality on 2.6.
2455 if (!dentry->d_inode->i_nlink) {
2456 spin_lock(&dcache_lock);
2457 ll_drop_dentry(dentry);
2458 spin_unlock(&dcache_lock);
2461 ll_lookup_finish_locks(&oit, dentry);
2462 } else if (!ll_have_md_lock(dentry->d_inode,
2463 MDS_INODELOCK_UPDATE)) {
/* no cached UPDATE lock: fetch fresh attributes with a getattr RPC */
2464 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2465 obd_valid valid = OBD_MD_FLGETATTR;
2467 struct obd_capa *oc;
2469 if (S_ISREG(inode->i_mode)) {
2470 rc = ll_get_max_mdsize(sbi, &ealen);
2473 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2475 oc = ll_mdscapa_get(inode);
2476 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2480 rc = ll_inode_revalidate_fini(inode, rc);
2484 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2490 /* if object not yet allocated, don't validate size */
2491 if (ll_i2info(inode)->lli_smd == NULL)
2494 /* ll_glimpse_size will prefer locally cached writes if they extend
2496 rc = ll_glimpse_size(inode, 0);
2499 ptlrpc_req_finished(req);
2503 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
/* Fill in @stat for the ->getattr() path, first revalidating the
 * inode with the MDS via ll_inode_revalidate_it() using intent @it.
 * size/blocks are copied under the llite inode size lock so the two
 * values form a mutually consistent snapshot. */
2504 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2505                   struct lookup_intent *it, struct kstat *stat)
2507         struct inode *inode = de->d_inode;
2510         res = ll_inode_revalidate_it(de, it);
2511         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats, LPROC_LL_GETATTR);
         /* straight attribute copy out of the (now revalidated) inode */
2516         stat->dev = inode->i_sb->s_dev;
2517         stat->ino = inode->i_ino;
2518         stat->mode = inode->i_mode;
2519         stat->nlink = inode->i_nlink;
2520         stat->uid = inode->i_uid;
2521         stat->gid = inode->i_gid;
2522         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2523         stat->atime = inode->i_atime;
2524         stat->mtime = inode->i_mtime;
2525         stat->ctime = inode->i_ctime;
2526         stat->blksize = inode->i_blksize;
         /* lock so i_size and i_blocks cannot change between the reads */
2528         ll_inode_size_lock(inode, 0);
2529         stat->size = inode->i_size;
2530         stat->blocks = inode->i_blocks;
2531         ll_inode_size_unlock(inode, 0);
2535 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2537 struct lookup_intent it = { .it_op = IT_GETATTR };
2539 return ll_getattr_it(mnt, de, &it, stat);
/* Check POSIX ACL permissions for @inode against @mask.  Takes a
 * reference on the cached ACL under lli_lock, evaluates it with
 * posix_acl_permission(), then drops the reference.  The
 * !CONFIG_FS_POSIX_ACL fallback (elided here) presumably returns a
 * "not handled" code so generic_permission() falls through to the
 * mode bits -- confirm against the full source. */
2544 int lustre_check_acl(struct inode *inode, int mask)
2546 #ifdef CONFIG_FS_POSIX_ACL
2547         struct ll_inode_info *lli = ll_i2info(inode);
2548         struct posix_acl *acl;
         /* duplicate under lli_lock so the cached ACL cannot be replaced
          * (and freed) while we are evaluating it */
2552         spin_lock(&lli->lli_lock);
2553         acl = posix_acl_dup(lli->lli_posix_acl);
2554         spin_unlock(&lli->lli_lock);
2559         rc = posix_acl_permission(inode, acl, mask);
2560         posix_acl_release(acl);
2568 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
/* ->permission() for kernels >= 2.6.10: defer to generic_permission()
 * with lustre_check_acl as the ACL callback.  On remote-client mounts
 * (LL_SBI_RMT_CLIENT) the check is instead performed against the MDS
 * via lustre_check_remote_perm(). */
2569 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2571         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2572                inode->i_ino, inode->i_generation, inode, mask);
2573         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2574                 return lustre_check_remote_perm(inode, mask);
2575         return generic_permission(inode, mask, lustre_check_acl);
/* Open-coded ->permission() for kernels older than 2.6.10 (where
 * generic_permission() takes no ACL callback).  Mirrors the classic
 * UNIX owner/group/other check, consulting the POSIX ACL for the
 * group class, then falls back to CAP_DAC_OVERRIDE /
 * CAP_DAC_READ_SEARCH.  Remote-client mounts are checked on the MDS.
 * The 2.6 signature carries the (unused here) nameidata argument. */
2578 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0))
2579 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2581 int ll_inode_permission(struct inode *inode, int mask)
2584         int mode = inode->i_mode;
2587         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2588                inode->i_ino, inode->i_generation, inode, mask);
2590         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2591                 return lustre_check_remote_perm(inode, mask);
         /* deny writes on read-only mounts for regular files, dirs and
          * symlinks (device nodes etc. stay writable through the mount) */
2593         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2594             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2596         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
         /* owner class: compare against the "user" permission bits */
2598         if (current->fsuid == inode->i_uid) {
         /* NOTE(review): (mode >> 3) selects the GROUP bits; presumably
          * this sits on the non-owner path with an ACL-present test on
          * the elided lines -- confirm against the full source */
2601                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2603                         rc = lustre_check_acl(inode, mask);
2607                                 goto check_capabilities;
2611         if (in_group_p(inode->i_gid))
         /* other class: "other" permission bits */
2614         if ((mode & mask & S_IRWXO) == mask)
         /* CAP_DAC_OVERRIDE grants everything except executing a file
          * with no exec bit set anywhere */
2618         if (!(mask & MAY_EXEC) ||
2619             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2620                 if (capable(CAP_DAC_OVERRIDE))
         /* CAP_DAC_READ_SEARCH grants plain reads, and any non-write
          * access to directories */
2623         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2624             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* File operations for Lustre regular files on mounts without flock
 * support: ->lock is deliberately left out (commented below). */
2630 struct file_operations ll_file_operations = {
2631         .read           = ll_file_read,
2632         .write          = ll_file_write,
2633         .ioctl          = ll_file_ioctl,
2634         .open           = ll_file_open,
2635         .release        = ll_file_release,
2636         .mmap           = ll_file_mmap,
2637         .llseek         = ll_file_seek,
2638 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2639         .sendfile       = ll_file_sendfile,
2642         /* .lock = ll_file_flock */
/* Same operations as ll_file_operations but with ->lock wired to
 * ll_file_flock; presumably selected when the mount enables flock --
 * confirm where the two tables are chosen in llite setup. */
2645 struct file_operations ll_file_operations_flock = {
2646         .read           = ll_file_read,
2647         .write          = ll_file_write,
2648         .ioctl          = ll_file_ioctl,
2649         .open           = ll_file_open,
2650         .release        = ll_file_release,
2651         .mmap           = ll_file_mmap,
2652         .llseek         = ll_file_seek,
2653 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
2654         .sendfile       = ll_file_sendfile,
2657         .lock           = ll_file_flock
2661 struct inode_operations ll_file_inode_operations = {
2662 #ifdef LUSTRE_KERNEL_VERSION
2663 .setattr_raw = ll_setattr_raw,
2665 .setattr = ll_setattr,
2666 .truncate = ll_truncate,
2667 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
2668 .getattr = ll_getattr,
2670 .revalidate_it = ll_inode_revalidate_it,
2672 .permission = ll_inode_permission,
2673 .setxattr = ll_setxattr,
2674 .getxattr = ll_getxattr,
2675 .listxattr = ll_listxattr,
2676 .removexattr = ll_removexattr,