Whamcloud - gitweb
OBD_SLAB_ALLOC_PTR_SAFE() is a new OBD_ macro for slab allocation that uses CFS_ALLOC...
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
96             !S_ISREG(inode->i_mode))
97                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
98         else
99                 ll_epoch_close(inode, op_data, &och, 0);
100
101 out:
102         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
103         EXIT;
104 }
105
106 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107                                      struct inode *inode,
108                                      struct obd_client_handle *och)
109 {
110         struct obd_export *exp = ll_i2mdexp(inode);
111         struct md_op_data *op_data;
112         struct ptlrpc_request *req = NULL;
113         struct obd_device *obd = class_exp2obd(exp);
114         int epoch_close = 1;
115         int seq_end = 0, rc;
116         ENTRY;
117
118         if (obd == NULL) {
119                 /*
120                  * XXX: in case of LMV, is this correct to access
121                  * ->exp_handle?
122                  */
123                 CERROR("Invalid MDC connection handle "LPX64"\n",
124                        ll_i2mdexp(inode)->exp_handle.h_cookie);
125                 GOTO(out, rc = 0);
126         }
127
128         /*
129          * here we check if this is forced umount. If so this is called on
130          * canceling "open lock" and we do not call md_close() in this case, as
131          * it will not be successful, as import is already deactivated.
132          */
133         if (obd->obd_force)
134                 GOTO(out, rc = 0);
135
136         OBD_ALLOC_PTR(op_data);
137         if (op_data == NULL)
138                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139
140         ll_prepare_close(inode, op_data, och);
141         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
142         rc = md_close(md_exp, op_data, och->och_mod, &req);
143         if (rc != -EAGAIN)
144                 seq_end = 1;
145
146         if (rc == -EAGAIN) {
147                 /* This close must have the epoch closed. */
148                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
149                 LASSERT(epoch_close);
150                 /* MDS has instructed us to obtain Size-on-MDS attribute from
151                  * OSTs and send setattr to back to MDS. */
152                 rc = ll_sizeonmds_update(inode, och->och_mod,
153                                          &och->och_fh, op_data->op_ioepoch);
154                 if (rc) {
155                         CERROR("inode %lu mdc Size-on-MDS update failed: "
156                                "rc = %d\n", inode->i_ino, rc);
157                         rc = 0;
158                 }
159         } else if (rc) {
160                 CERROR("inode %lu mdc close failed: rc = %d\n",
161                        inode->i_ino, rc);
162         }
163         ll_finish_md_op_data(op_data);
164
165         if (rc == 0) {
166                 rc = ll_objects_destroy(req, inode);
167                 if (rc)
168                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
169                                inode->i_ino, rc);
170         }
171
172         EXIT;
173 out:
174
175         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
176             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
177                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
178         } else {
179                 if (seq_end)
180                         ptlrpc_close_replay_seq(req);
181                 md_clear_open_replay_data(md_exp, och);
182                 /* Free @och if it is not waiting for DONE_WRITING. */
183                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
184                 OBD_FREE_PTR(och);
185         }
186         if (req) /* This is close request */
187                 ptlrpc_req_finished(req);
188         return rc;
189 }
190
191 int ll_md_real_close(struct inode *inode, int flags)
192 {
193         struct ll_inode_info *lli = ll_i2info(inode);
194         struct obd_client_handle **och_p;
195         struct obd_client_handle *och;
196         __u64 *och_usecount;
197         int rc = 0;
198         ENTRY;
199
200         if (flags & FMODE_WRITE) {
201                 och_p = &lli->lli_mds_write_och;
202                 och_usecount = &lli->lli_open_fd_write_count;
203         } else if (flags & FMODE_EXEC) {
204                 och_p = &lli->lli_mds_exec_och;
205                 och_usecount = &lli->lli_open_fd_exec_count;
206         } else {
207                 LASSERT(flags & FMODE_READ);
208                 och_p = &lli->lli_mds_read_och;
209                 och_usecount = &lli->lli_open_fd_read_count;
210         }
211
212         down(&lli->lli_och_sem);
213         if (*och_usecount) { /* There are still users of this handle, so
214                                 skip freeing it. */
215                 up(&lli->lli_och_sem);
216                 RETURN(0);
217         }
218         och=*och_p;
219         *och_p = NULL;
220         up(&lli->lli_och_sem);
221
222         if (och) { /* There might be a race and somebody have freed this och
223                       already */
224                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
225                                                inode, och);
226         }
227
228         RETURN(rc);
229 }
230
231 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
232                 struct file *file)
233 {
234         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
235         struct ll_inode_info *lli = ll_i2info(inode);
236         int rc = 0;
237         ENTRY;
238
239         /* clear group lock, if present */
240         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
241 #if 0 /* XXX */
242                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
243                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
244                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
245                                       &fd->fd_cwlockh);
246 #endif
247         }
248
249         /* Let's see if we have good enough OPEN lock on the file and if
250            we can skip talking to MDS */
251         if (file->f_dentry->d_inode) { /* Can this ever be false? */
252                 int lockmode;
253                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
254                 struct lustre_handle lockh;
255                 struct inode *inode = file->f_dentry->d_inode;
256                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
257
258                 down(&lli->lli_och_sem);
259                 if (fd->fd_omode & FMODE_WRITE) {
260                         lockmode = LCK_CW;
261                         LASSERT(lli->lli_open_fd_write_count);
262                         lli->lli_open_fd_write_count--;
263                 } else if (fd->fd_omode & FMODE_EXEC) {
264                         lockmode = LCK_PR;
265                         LASSERT(lli->lli_open_fd_exec_count);
266                         lli->lli_open_fd_exec_count--;
267                 } else {
268                         lockmode = LCK_CR;
269                         LASSERT(lli->lli_open_fd_read_count);
270                         lli->lli_open_fd_read_count--;
271                 }
272                 up(&lli->lli_och_sem);
273
274                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
275                                    LDLM_IBITS, &policy, lockmode,
276                                    &lockh)) {
277                         rc = ll_md_real_close(file->f_dentry->d_inode,
278                                               fd->fd_omode);
279                 }
280         } else {
281                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
282                        file, file->f_dentry, file->f_dentry->d_name.name);
283         }
284
285         LUSTRE_FPRIVATE(file) = NULL;
286         ll_file_data_put(fd);
287         ll_capa_close(inode);
288
289         RETURN(rc);
290 }
291
292 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
293
294 /* While this returns an error code, fput() the caller does not, so we need
295  * to make every effort to clean up all of our state here.  Also, applications
296  * rarely check close errors and even if an error is returned they will not
297  * re-try the close call.
298  */
299 int ll_file_release(struct inode *inode, struct file *file)
300 {
301         struct ll_file_data *fd;
302         struct ll_sb_info *sbi = ll_i2sbi(inode);
303         struct ll_inode_info *lli = ll_i2info(inode);
304         struct lov_stripe_md *lsm = lli->lli_smd;
305         int rc;
306         ENTRY;
307
308         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
309                inode->i_generation, inode);
310
311 #ifdef CONFIG_FS_POSIX_ACL
312         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
313             inode == inode->i_sb->s_root->d_inode) {
314                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
315
316                 LASSERT(fd != NULL);
317                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
318                         fd->fd_flags &= ~LL_FILE_RMTACL;
319                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
320                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
321                 }
322         }
323 #endif
324
325         if (inode->i_sb->s_root != file->f_dentry)
326                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
327         fd = LUSTRE_FPRIVATE(file);
328         LASSERT(fd != NULL);
329
330         /* The last ref on @file, maybe not the the owner pid of statahead.
331          * Different processes can open the same dir, "ll_opendir_key" means:
332          * it is me that should stop the statahead thread. */
333         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
334                 ll_stop_statahead(inode, lli->lli_opendir_key);
335
336         if (inode->i_sb->s_root == file->f_dentry) {
337                 LUSTRE_FPRIVATE(file) = NULL;
338                 ll_file_data_put(fd);
339                 RETURN(0);
340         }
341
342         if (lsm)
343                 lov_test_and_clear_async_rc(lsm);
344         lli->lli_async_rc = 0;
345
346         rc = ll_md_close(sbi->ll_md_exp, inode, file);
347         RETURN(rc);
348 }
349
350 static int ll_intent_file_open(struct file *file, void *lmm,
351                                int lmmsize, struct lookup_intent *itp)
352 {
353         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
354         struct dentry *parent = file->f_dentry->d_parent;
355         const char *name = file->f_dentry->d_name.name;
356         const int len = file->f_dentry->d_name.len;
357         struct md_op_data *op_data;
358         struct ptlrpc_request *req;
359         int rc;
360         ENTRY;
361
362         if (!parent)
363                 RETURN(-ENOENT);
364
365         /* Usually we come here only for NFSD, and we want open lock.
366            But we can also get here with pre 2.6.15 patchless kernels, and in
367            that case that lock is also ok */
368         /* We can also get here if there was cached open handle in revalidate_it
369          * but it disappeared while we were getting from there to ll_file_open.
370          * But this means this file was closed and immediatelly opened which
371          * makes a good candidate for using OPEN lock */
372         /* If lmmsize & lmm are not 0, we are just setting stripe info
373          * parameters. No need for the open lock */
374         if (!lmm && !lmmsize)
375                 itp->it_flags |= MDS_OPEN_LOCK;
376
377         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
378                                       file->f_dentry->d_inode, name, len,
379                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
380         if (IS_ERR(op_data))
381                 RETURN(PTR_ERR(op_data));
382
383         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
384                             0 /*unused */, &req, ll_md_blocking_ast, 0);
385         ll_finish_md_op_data(op_data);
386         if (rc == -ESTALE) {
387                 /* reason for keep own exit path - don`t flood log
388                 * with messages with -ESTALE errors.
389                 */
390                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
391                      it_open_error(DISP_OPEN_OPEN, itp))
392                         GOTO(out, rc);
393                 ll_release_openhandle(file->f_dentry, itp);
394                 GOTO(out, rc);
395         }
396
397         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
398                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
399                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
400                 GOTO(out, rc);
401         }
402
403         if (itp->d.lustre.it_lock_mode)
404                 md_set_lock_data(sbi->ll_md_exp,
405                                  &itp->d.lustre.it_lock_handle,
406                                  file->f_dentry->d_inode);
407
408         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
409 out:
410         ptlrpc_req_finished(itp->d.lustre.it_data);
411         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
412         ll_intent_drop_lock(itp);
413
414         RETURN(rc);
415 }
416
417 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
418                        struct lookup_intent *it, struct obd_client_handle *och)
419 {
420         struct ptlrpc_request *req = it->d.lustre.it_data;
421         struct mdt_body *body;
422
423         LASSERT(och);
424
425         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
426         LASSERT(body != NULL);                      /* reply already checked out */
427
428         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
429         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430         och->och_fid = lli->lli_fid;
431         och->och_flags = it->it_flags;
432         lli->lli_ioepoch = body->ioepoch;
433
434         return md_set_open_replay_data(md_exp, och, req);
435 }
436
437 int ll_local_open(struct file *file, struct lookup_intent *it,
438                   struct ll_file_data *fd, struct obd_client_handle *och)
439 {
440         struct inode *inode = file->f_dentry->d_inode;
441         struct ll_inode_info *lli = ll_i2info(inode);
442         ENTRY;
443
444         LASSERT(!LUSTRE_FPRIVATE(file));
445
446         LASSERT(fd != NULL);
447
448         if (och) {
449                 struct ptlrpc_request *req = it->d.lustre.it_data;
450                 struct mdt_body *body;
451                 int rc;
452
453                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
454                 if (rc)
455                         RETURN(rc);
456
457                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
458                 if ((it->it_flags & FMODE_WRITE) &&
459                     (body->valid & OBD_MD_FLSIZE))
460                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
461                                lli->lli_ioepoch, PFID(&lli->lli_fid));
462         }
463
464         LUSTRE_FPRIVATE(file) = fd;
465         ll_readahead_init(inode, &fd->fd_ras);
466         fd->fd_omode = it->it_flags;
467         RETURN(0);
468 }
469
470 /* Open a file, and (for the very first open) create objects on the OSTs at
471  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
472  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
473  * lli_open_sem to ensure no other process will create objects, send the
474  * stripe MD to the MDS, or try to destroy the objects if that fails.
475  *
476  * If we already have the stripe MD locally then we don't request it in
477  * md_open(), by passing a lmm_size = 0.
478  *
479  * It is up to the application to ensure no other processes open this file
480  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
481  * used.  We might be able to avoid races of that sort by getting lli_open_sem
482  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
483  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
484  */
485 int ll_file_open(struct inode *inode, struct file *file)
486 {
487         struct ll_inode_info *lli = ll_i2info(inode);
488         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
489                                           .it_flags = file->f_flags };
490         struct lov_stripe_md *lsm;
491         struct ptlrpc_request *req = NULL;
492         struct obd_client_handle **och_p;
493         __u64 *och_usecount;
494         struct ll_file_data *fd;
495         int rc = 0, opendir_set = 0;
496         ENTRY;
497
498         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
499                inode->i_generation, inode, file->f_flags);
500
501 #ifdef HAVE_VFS_INTENT_PATCHES
502         it = file->f_it;
503 #else
504         it = file->private_data; /* XXX: compat macro */
505         file->private_data = NULL; /* prevent ll_local_open assertion */
506 #endif
507
508         fd = ll_file_data_get();
509         if (fd == NULL)
510                 RETURN(-ENOMEM);
511
512         fd->fd_file = file;
513         if (S_ISDIR(inode->i_mode)) {
514 again:
515                 spin_lock(&lli->lli_lock);
516                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
517                         LASSERT(lli->lli_sai == NULL);
518                         lli->lli_opendir_key = fd;
519                         lli->lli_opendir_pid = cfs_curproc_pid();
520                         opendir_set = 1;
521                 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
522                                     lli->lli_opendir_key != NULL)) {
523                         /* Two cases for this:
524                          * (1) The same process open such directory many times.
525                          * (2) The old process opened the directory, and exited
526                          *     before its children processes. Then new process
527                          *     with the same pid opens such directory before the
528                          *     old process's children processes exit.
529                          * reset stat ahead for such cases. */
530                         spin_unlock(&lli->lli_lock);
531                         CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
532                                " reset it.\n", file->f_dentry->d_name.len,
533                                file->f_dentry->d_name.name,
534                                PFID(&lli->lli_fid));
535                         ll_stop_statahead(inode, lli->lli_opendir_key);
536                         goto again;
537                 }
538                 spin_unlock(&lli->lli_lock);
539         }
540
541         if (inode->i_sb->s_root == file->f_dentry) {
542                 LUSTRE_FPRIVATE(file) = fd;
543                 RETURN(0);
544         }
545
546         if (!it || !it->d.lustre.it_disposition) {
547                 /* Convert f_flags into access mode. We cannot use file->f_mode,
548                  * because everything but O_ACCMODE mask was stripped from
549                  * there */
550                 if ((oit.it_flags + 1) & O_ACCMODE)
551                         oit.it_flags++;
552                 if (file->f_flags & O_TRUNC)
553                         oit.it_flags |= FMODE_WRITE;
554
555                 /* kernel only call f_op->open in dentry_open.  filp_open calls
556                  * dentry_open after call to open_namei that checks permissions.
557                  * Only nfsd_open call dentry_open directly without checking
558                  * permissions and because of that this code below is safe. */
559                 if (oit.it_flags & FMODE_WRITE)
560                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
561
562                 /* We do not want O_EXCL here, presumably we opened the file
563                  * already? XXX - NFS implications? */
564                 oit.it_flags &= ~O_EXCL;
565
566                 it = &oit;
567         }
568
569 restart:
570         /* Let's see if we have file open on MDS already. */
571         if (it->it_flags & FMODE_WRITE) {
572                 och_p = &lli->lli_mds_write_och;
573                 och_usecount = &lli->lli_open_fd_write_count;
574         } else if (it->it_flags & FMODE_EXEC) {
575                 och_p = &lli->lli_mds_exec_och;
576                 och_usecount = &lli->lli_open_fd_exec_count;
577          } else {
578                 och_p = &lli->lli_mds_read_och;
579                 och_usecount = &lli->lli_open_fd_read_count;
580         }
581
582         down(&lli->lli_och_sem);
583         if (*och_p) { /* Open handle is present */
584                 if (it_disposition(it, DISP_OPEN_OPEN)) {
585                         /* Well, there's extra open request that we do not need,
586                            let's close it somehow. This will decref request. */
587                         rc = it_open_error(DISP_OPEN_OPEN, it);
588                         if (rc) {
589                                 up(&lli->lli_och_sem);
590                                 ll_file_data_put(fd);
591                                 GOTO(out_openerr, rc);
592                         }
593                         ll_release_openhandle(file->f_dentry, it);
594                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
595                                              LPROC_LL_OPEN);
596                 }
597                 (*och_usecount)++;
598
599                 rc = ll_local_open(file, it, fd, NULL);
600                 if (rc) {
601                         (*och_usecount)--;
602                         up(&lli->lli_och_sem);
603                         ll_file_data_put(fd);
604                         GOTO(out_openerr, rc);
605                 }
606         } else {
607                 LASSERT(*och_usecount == 0);
608                 if (!it->d.lustre.it_disposition) {
609                         /* We cannot just request lock handle now, new ELC code
610                            means that one of other OPEN locks for this file
611                            could be cancelled, and since blocking ast handler
612                            would attempt to grab och_sem as well, that would
613                            result in a deadlock */
614                         up(&lli->lli_och_sem);
615                         it->it_flags |= O_CHECK_STALE;
616                         rc = ll_intent_file_open(file, NULL, 0, it);
617                         it->it_flags &= ~O_CHECK_STALE;
618                         if (rc) {
619                                 ll_file_data_put(fd);
620                                 GOTO(out_openerr, rc);
621                         }
622
623                         /* Got some error? Release the request */
624                         if (it->d.lustre.it_status < 0) {
625                                 req = it->d.lustre.it_data;
626                                 ptlrpc_req_finished(req);
627                         }
628                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
629                                          &it->d.lustre.it_lock_handle,
630                                          file->f_dentry->d_inode);
631                         goto restart;
632                 }
633                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
634                 if (!*och_p) {
635                         ll_file_data_put(fd);
636                         GOTO(out_och_free, rc = -ENOMEM);
637                 }
638                 (*och_usecount)++;
639                 req = it->d.lustre.it_data;
640
641                 /* md_intent_lock() didn't get a request ref if there was an
642                  * open error, so don't do cleanup on the request here
643                  * (bug 3430) */
644                 /* XXX (green): Should not we bail out on any error here, not
645                  * just open error? */
646                 rc = it_open_error(DISP_OPEN_OPEN, it);
647                 if (rc) {
648                         ll_file_data_put(fd);
649                         GOTO(out_och_free, rc);
650                 }
651
652                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
653                 rc = ll_local_open(file, it, fd, *och_p);
654                 if (rc) {
655                         ll_file_data_put(fd);
656                         GOTO(out_och_free, rc);
657                 }
658         }
659         up(&lli->lli_och_sem);
660
661         /* Must do this outside lli_och_sem lock to prevent deadlock where
662            different kind of OPEN lock for this same inode gets cancelled
663            by ldlm_cancel_lru */
664         if (!S_ISREG(inode->i_mode))
665                 GOTO(out, rc);
666
667         ll_capa_open(inode);
668
669         lsm = lli->lli_smd;
670         if (lsm == NULL) {
671                 if (file->f_flags & O_LOV_DELAY_CREATE ||
672                     !(file->f_mode & FMODE_WRITE)) {
673                         CDEBUG(D_INODE, "object creation was delayed\n");
674                         GOTO(out, rc);
675                 }
676         }
677         file->f_flags &= ~O_LOV_DELAY_CREATE;
678         GOTO(out, rc);
679 out:
680         ptlrpc_req_finished(req);
681         if (req)
682                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
683 out_och_free:
684         if (rc) {
685                 if (*och_p) {
686                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
687                         *och_p = NULL; /* OBD_FREE writes some magic there */
688                         (*och_usecount)--;
689                 }
690                 up(&lli->lli_och_sem);
691 out_openerr:
692                 if (opendir_set != 0)
693                         ll_stop_statahead(inode, lli->lli_opendir_key);
694         }
695
696         return rc;
697 }
698
699 /* Fills the obdo with the attributes for the lsm */
700 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
701                           struct obd_capa *capa, struct obdo *obdo)
702 {
703         struct ptlrpc_request_set *set;
704         struct obd_info            oinfo = { { { 0 } } };
705         int                        rc;
706
707         ENTRY;
708
709         LASSERT(lsm != NULL);
710
711         oinfo.oi_md = lsm;
712         oinfo.oi_oa = obdo;
713         oinfo.oi_oa->o_id = lsm->lsm_object_id;
714         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
715         oinfo.oi_oa->o_mode = S_IFREG;
716         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
717                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
718                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
719                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
720                                OBD_MD_FLGROUP;
721         oinfo.oi_capa = capa;
722
723         set = ptlrpc_prep_set();
724         if (set == NULL) {
725                 CERROR("can't allocate ptlrpc set\n");
726                 rc = -ENOMEM;
727         } else {
728                 rc = obd_getattr_async(exp, &oinfo, set);
729                 if (rc == 0)
730                         rc = ptlrpc_set_wait(set);
731                 ptlrpc_set_destroy(set);
732         }
733         if (rc == 0)
734                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
735                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
736                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
737         RETURN(rc);
738 }
739
740 /* Fills the obdo with the attributes for the inode defined by lsm */
741 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
742 {
743         struct ll_inode_info *lli  = ll_i2info(inode);
744         struct obd_capa      *capa = ll_mdscapa_get(inode);
745         int rc;
746         ENTRY;
747
748         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
749         capa_put(capa);
750         if (rc == 0) {
751                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
752                 CDEBUG(D_INODE,
753                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
754                        lli->lli_smd->lsm_object_id, i_size_read(inode),
755                        (unsigned long long)inode->i_blocks,
756                        (unsigned long)ll_inode_blksize(inode));
757         }
758         RETURN(rc);
759 }
760
761 int ll_merge_lvb(struct inode *inode)
762 {
763         struct ll_inode_info *lli = ll_i2info(inode);
764         struct ll_sb_info *sbi = ll_i2sbi(inode);
765         struct ost_lvb lvb;
766         int rc;
767
768         ENTRY;
769
770         ll_inode_size_lock(inode, 1);
771         inode_init_lvb(inode, &lvb);
772         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
773         i_size_write(inode, lvb.lvb_size);
774         inode->i_blocks = lvb.lvb_blocks;
775
776         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
777         LTIME_S(inode->i_atime) = lvb.lvb_atime;
778         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
779         ll_inode_size_unlock(inode, 1);
780
781         RETURN(rc);
782 }
783
784 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
785                      lstat_t *st)
786 {
787         struct obdo obdo = { 0 };
788         int rc;
789
790         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
791         if (rc == 0) {
792                 st->st_size   = obdo.o_size;
793                 st->st_blocks = obdo.o_blocks;
794                 st->st_mtime  = obdo.o_mtime;
795                 st->st_atime  = obdo.o_atime;
796                 st->st_ctime  = obdo.o_ctime;
797         }
798         return rc;
799 }
800
801 void ll_io_init(struct cl_io *io, const struct file *file, int write)
802 {
803         struct inode *inode     = file->f_dentry->d_inode;
804         struct ll_sb_info *sbi  = ll_i2sbi(inode);
805         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
806
807         LASSERT(fd != NULL);
808         memset(io, 0, sizeof *io);
809         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
810         if (write)
811                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
812         io->ci_obj     = ll_i2info(inode)->lli_clob;
813         io->ci_lockreq = CILR_MAYBE;
814         if (fd->fd_flags & LL_FILE_IGNORE_LOCK || sbi->ll_flags & LL_SBI_NOLCK)
815                 io->ci_lockreq = CILR_NEVER;
816         else if (file->f_flags & O_APPEND)
817                 io->ci_lockreq = CILR_MANDATORY;
818 }
819
820 static ssize_t ll_file_io_generic(const struct lu_env *env,
821                 struct ccc_io_args *args, struct file *file,
822                 enum cl_io_type iot, loff_t *ppos, size_t count)
823 {
824         struct cl_io       *io;
825         ssize_t             result;
826         ENTRY;
827
828         io = &ccc_env_info(env)->cti_io;
829         ll_io_init(io, file, iot == CIT_WRITE);
830
831         if (iot == CIT_READ)
832                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
833
834         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
835                 struct vvp_io *vio = vvp_env_io(env);
836                 struct ccc_io *cio = ccc_env_io(env);
837                 if (cl_io_is_sendfile(io)) {
838                         vio->u.read.cui_actor = args->cia_actor;
839                         vio->u.read.cui_target = args->cia_target;
840                 } else {
841                         cio->cui_iov = args->cia_iov;
842                         cio->cui_nrsegs = args->cia_nrsegs;
843 #ifndef HAVE_FILE_WRITEV
844                         cio->cui_iocb = args->cia_iocb;
845 #endif
846                 }
847                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
848                 result = cl_io_loop(env, io);
849         } else
850                 /* cl_io_rw_init() handled IO */
851                 result = io->ci_result;
852         if (io->ci_nob > 0) {
853                 result = io->ci_nob;
854                 *ppos = io->u.ci_wr.wr.crw_pos;
855         }
856         cl_io_fini(env, io);
857         RETURN(result);
858 }
859
860
861 /*
862  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
863  */
864 static int ll_file_get_iov_count(const struct iovec *iov,
865                                  unsigned long *nr_segs, size_t *count)
866 {
867         size_t cnt = 0;
868         unsigned long seg;
869
870         for (seg = 0; seg < *nr_segs; seg++) {
871                 const struct iovec *iv = &iov[seg];
872
873                 /*
874                  * If any segment has a negative length, or the cumulative
875                  * length ever wraps negative then return -EINVAL.
876                  */
877                 cnt += iv->iov_len;
878                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
879                         return -EINVAL;
880                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
881                         continue;
882                 if (seg == 0)
883                         return -EFAULT;
884                 *nr_segs = seg;
885                 cnt -= iv->iov_len;   /* This segment is no good */
886                 break;
887         }
888         *count = cnt;
889         return 0;
890 }
891
892 #ifdef HAVE_FILE_READV
893 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
894                               unsigned long nr_segs, loff_t *ppos)
895 {
896         struct lu_env      *env;
897         struct ccc_io_args *args;
898         size_t              count;
899         ssize_t             result;
900         int                 refcheck;
901         ENTRY;
902
903         result = ll_file_get_iov_count(iov, &nr_segs, &count);
904         if (result)
905                 RETURN(result);
906
907         env = cl_env_get(&refcheck);
908         if (IS_ERR(env))
909                 RETURN(PTR_ERR(env));
910
911         args = &vvp_env_info(env)->vti_args;
912         args->cia_is_sendfile = 0;
913         args->cia_iov = (struct iovec *)iov;
914         args->cia_nrsegs = nr_segs;
915         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
916         cl_env_put(env, &refcheck);
917         RETURN(result);
918 }
919
920 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
921                             loff_t *ppos)
922 {
923         struct lu_env *env;
924         struct iovec  *local_iov;
925         ssize_t        result;
926         int            refcheck;
927         ENTRY;
928
929         env = cl_env_get(&refcheck);
930         if (IS_ERR(env))
931                 RETURN(PTR_ERR(env));
932
933         local_iov = &vvp_env_info(env)->vti_local_iov;
934         local_iov->iov_base = (void __user *)buf;
935         local_iov->iov_len = count;
936         result = ll_file_readv(file, local_iov, 1, ppos);
937         cl_env_put(env, &refcheck);
938         RETURN(result);
939 }
940
941 #else
942 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
943                                 unsigned long nr_segs, loff_t pos)
944 {
945         struct lu_env      *env;
946         struct ccc_io_args *args;
947         size_t              count;
948         ssize_t             result;
949         int                 refcheck;
950         ENTRY;
951
952         result = ll_file_get_iov_count(iov, &nr_segs, &count);
953         if (result)
954                 RETURN(result);
955
956         env = cl_env_get(&refcheck);
957         if (IS_ERR(env))
958                 RETURN(PTR_ERR(env));
959
960         args = &vvp_env_info(env)->vti_args;
961         args->cia_is_sendfile = 0;
962         args->cia_iov = (struct iovec *)iov;
963         args->cia_nrsegs = nr_segs;
964         args->cia_iocb = iocb;
965         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
966                                     &iocb->ki_pos, count);
967         cl_env_put(env, &refcheck);
968         RETURN(result);
969 }
970
971 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
972                             loff_t *ppos)
973 {
974         struct lu_env *env;
975         struct iovec  *local_iov;
976         struct kiocb  *kiocb;
977         ssize_t        result;
978         int            refcheck;
979         ENTRY;
980
981         env = cl_env_get(&refcheck);
982         if (IS_ERR(env))
983                 RETURN(PTR_ERR(env));
984
985         local_iov = &vvp_env_info(env)->vti_local_iov;
986         kiocb = &vvp_env_info(env)->vti_kiocb;
987         local_iov->iov_base = (void __user *)buf;
988         local_iov->iov_len = count;
989         init_sync_kiocb(kiocb, file);
990         kiocb->ki_pos = *ppos;
991         kiocb->ki_left = count;
992
993         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
994         *ppos = kiocb->ki_pos;
995
996         cl_env_put(env, &refcheck);
997         RETURN(result);
998 }
999 #endif
1000
1001 /*
1002  * Write to a file (through the page cache).
1003  */
1004 #ifdef HAVE_FILE_WRITEV
1005 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1006                               unsigned long nr_segs, loff_t *ppos)
1007 {
1008         struct lu_env      *env;
1009         struct ccc_io_args *args;
1010         size_t              count;
1011         ssize_t             result;
1012         int                 refcheck;
1013         ENTRY;
1014
1015         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1016         if (result)
1017                 RETURN(result);
1018
1019         env = cl_env_get(&refcheck);
1020         if (IS_ERR(env))
1021                 RETURN(PTR_ERR(env));
1022
1023         args = &vvp_env_info(env)->vti_args;
1024         args->cia_iov = (struct iovec *)iov;
1025         args->cia_nrsegs = nr_segs;
1026         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1027         cl_env_put(env, &refcheck);
1028         RETURN(result);
1029 }
1030
1031 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1032                              loff_t *ppos)
1033 {
1034         struct lu_env    *env;
1035         struct iovec     *local_iov;
1036         ssize_t           result;
1037         int               refcheck;
1038         ENTRY;
1039
1040         env = cl_env_get(&refcheck);
1041         if (IS_ERR(env))
1042                 RETURN(PTR_ERR(env));
1043
1044         local_iov = &vvp_env_info(env)->vti_local_iov;
1045         local_iov->iov_base = (void __user *)buf;
1046         local_iov->iov_len = count;
1047
1048         result = ll_file_writev(file, local_iov, 1, ppos);
1049         cl_env_put(env, &refcheck);
1050         RETURN(result);
1051 }
1052
1053 #else /* AIO stuff */
1054 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1055                                  unsigned long nr_segs, loff_t pos)
1056 {
1057         struct lu_env      *env;
1058         struct ccc_io_args *args;
1059         size_t              count;
1060         ssize_t             result;
1061         int                 refcheck;
1062         ENTRY;
1063
1064         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1065         if (result)
1066                 RETURN(result);
1067
1068         env = cl_env_get(&refcheck);
1069         if (IS_ERR(env))
1070                 RETURN(PTR_ERR(env));
1071
1072         args = &vvp_env_info(env)->vti_args;
1073         args->cia_iov = (struct iovec *)iov;
1074         args->cia_nrsegs = nr_segs;
1075         args->cia_iocb = iocb;
1076         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1077                                   &iocb->ki_pos, count);
1078         cl_env_put(env, &refcheck);
1079         RETURN(result);
1080 }
1081
1082 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1083                              loff_t *ppos)
1084 {
1085         struct lu_env *env;
1086         struct iovec  *local_iov;
1087         struct kiocb  *kiocb;
1088         ssize_t        result;
1089         int            refcheck;
1090         ENTRY;
1091
1092         env = cl_env_get(&refcheck);
1093         if (IS_ERR(env))
1094                 RETURN(PTR_ERR(env));
1095
1096         local_iov = &vvp_env_info(env)->vti_local_iov;
1097         kiocb = &vvp_env_info(env)->vti_kiocb;
1098         local_iov->iov_base = (void __user *)buf;
1099         local_iov->iov_len = count;
1100         init_sync_kiocb(kiocb, file);
1101         kiocb->ki_pos = *ppos;
1102         kiocb->ki_left = count;
1103
1104         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1105         *ppos = kiocb->ki_pos;
1106
1107         cl_env_put(env, &refcheck);
1108         RETURN(result);
1109 }
1110 #endif
1111
1112
1113 /*
1114  * Send file content (through pagecache) somewhere with helper
1115  */
1116 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1117                                 read_actor_t actor, void *target)
1118 {
1119         struct lu_env      *env;
1120         struct ccc_io_args *args;
1121         ssize_t             result;
1122         int                 refcheck;
1123         ENTRY;
1124
1125         env = cl_env_get(&refcheck);
1126         if (IS_ERR(env))
1127                 RETURN(PTR_ERR(env));
1128
1129         args = &vvp_env_info(env)->vti_args;
1130         args->cia_is_sendfile = 1;
1131         args->cia_target = target;
1132         args->cia_actor = actor;
1133         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1134         cl_env_put(env, &refcheck);
1135         RETURN(result);
1136 }
1137
1138 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1139                                unsigned long arg)
1140 {
1141         struct obd_export *exp = ll_i2dtexp(inode);
1142         struct ll_recreate_obj ucreatp;
1143         struct obd_trans_info oti = { 0 };
1144         struct obdo *oa = NULL;
1145         int lsm_size;
1146         int rc = 0;
1147         struct lov_stripe_md *lsm, *lsm2;
1148         ENTRY;
1149
1150         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1151                 RETURN(-EPERM);
1152
1153         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1154                            sizeof(struct ll_recreate_obj)))
1155                 RETURN(-EFAULT);
1156
1157         OBDO_ALLOC(oa);
1158         if (oa == NULL)
1159                 RETURN(-ENOMEM);
1160
1161         ll_inode_size_lock(inode, 0);
1162         lsm = ll_i2info(inode)->lli_smd;
1163         if (lsm == NULL)
1164                 GOTO(out, rc = -ENOENT);
1165         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1166                    (lsm->lsm_stripe_count));
1167
1168         OBD_ALLOC(lsm2, lsm_size);
1169         if (lsm2 == NULL)
1170                 GOTO(out, rc = -ENOMEM);
1171
1172         oa->o_id = ucreatp.lrc_id;
1173         oa->o_gr = ucreatp.lrc_group;
1174         oa->o_nlink = ucreatp.lrc_ost_idx;
1175         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1176         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1177         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1178                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1179
1180         memcpy(lsm2, lsm, lsm_size);
1181         rc = obd_create(exp, oa, &lsm2, &oti);
1182
1183         OBD_FREE(lsm2, lsm_size);
1184         GOTO(out, rc);
1185 out:
1186         ll_inode_size_unlock(inode, 0);
1187         OBDO_FREE(oa);
1188         return rc;
1189 }
1190
1191 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1192                              int flags, struct lov_user_md *lum, int lum_size)
1193 {
1194         struct lov_stripe_md *lsm;
1195         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1196         int rc = 0;
1197         ENTRY;
1198
1199         ll_inode_size_lock(inode, 0);
1200         lsm = ll_i2info(inode)->lli_smd;
1201         if (lsm) {
1202                 ll_inode_size_unlock(inode, 0);
1203                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1204                        inode->i_ino);
1205                 RETURN(-EEXIST);
1206         }
1207
1208         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1209         if (rc)
1210                 GOTO(out, rc);
1211         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1212                 GOTO(out_req_free, rc = -ENOENT);
1213         rc = oit.d.lustre.it_status;
1214         if (rc < 0)
1215                 GOTO(out_req_free, rc);
1216
1217         ll_release_openhandle(file->f_dentry, &oit);
1218
1219  out:
1220         ll_inode_size_unlock(inode, 0);
1221         ll_intent_release(&oit);
1222         RETURN(rc);
1223 out_req_free:
1224         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1225         goto out;
1226 }
1227
1228 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1229                              struct lov_mds_md **lmmp, int *lmm_size,
1230                              struct ptlrpc_request **request)
1231 {
1232         struct ll_sb_info *sbi = ll_i2sbi(inode);
1233         struct mdt_body  *body;
1234         struct lov_mds_md *lmm = NULL;
1235         struct ptlrpc_request *req = NULL;
1236         struct obd_capa *oc;
1237         int rc, lmmsize;
1238
1239         rc = ll_get_max_mdsize(sbi, &lmmsize);
1240         if (rc)
1241                 RETURN(rc);
1242
1243         oc = ll_mdscapa_get(inode);
1244         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1245                              oc, filename, strlen(filename) + 1,
1246                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1247                              ll_i2suppgid(inode), &req);
1248         capa_put(oc);
1249         if (rc < 0) {
1250                 CDEBUG(D_INFO, "md_getattr_name failed "
1251                        "on %s: rc %d\n", filename, rc);
1252                 GOTO(out, rc);
1253         }
1254
1255         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1256         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1257
1258         lmmsize = body->eadatasize;
1259
1260         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1261                         lmmsize == 0) {
1262                 GOTO(out, rc = -ENODATA);
1263         }
1264
1265         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1266         LASSERT(lmm != NULL);
1267
1268         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1269             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1270             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1271                 GOTO(out, rc = -EPROTO);
1272         }
1273
1274         /*
1275          * This is coming from the MDS, so is probably in
1276          * little endian.  We convert it to host endian before
1277          * passing it to userspace.
1278          */
1279         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1280                 /* if function called for directory - we should
1281                  * avoid swab not existent lsm objects */
1282                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1283                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1284                         if (S_ISREG(body->mode))
1285                                 lustre_swab_lov_user_md_objects(
1286                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1287                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1288                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1289                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1290                         if (S_ISREG(body->mode))
1291                                 lustre_swab_lov_user_md_objects(
1292                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1293                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1294                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1295                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1296                 }
1297         }
1298
1299         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1300                 struct lov_stripe_md *lsm;
1301                 struct lov_user_md_join *lmj;
1302                 int lmj_size, i, aindex = 0;
1303
1304                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1305                 if (rc < 0)
1306                         GOTO(out, rc = -ENOMEM);
1307                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1308                 if (rc)
1309                         GOTO(out_free_memmd, rc);
1310
1311                 lmj_size = sizeof(struct lov_user_md_join) +
1312                            lsm->lsm_stripe_count *
1313                            sizeof(struct lov_user_ost_data_join);
1314                 OBD_ALLOC(lmj, lmj_size);
1315                 if (!lmj)
1316                         GOTO(out_free_memmd, rc = -ENOMEM);
1317
1318                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1319                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1320                         struct lov_extent *lex =
1321                                 &lsm->lsm_array->lai_ext_array[aindex];
1322
1323                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1324                                 aindex ++;
1325                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1326                                         LPU64" len %d\n", aindex, i,
1327                                         lex->le_start, (int)lex->le_len);
1328                         lmj->lmm_objects[i].l_extent_start =
1329                                 lex->le_start;
1330
1331                         if ((int)lex->le_len == -1)
1332                                 lmj->lmm_objects[i].l_extent_end = -1;
1333                         else
1334                                 lmj->lmm_objects[i].l_extent_end =
1335                                         lex->le_start + lex->le_len;
1336                         lmj->lmm_objects[i].l_object_id =
1337                                 lsm->lsm_oinfo[i]->loi_id;
1338                         lmj->lmm_objects[i].l_object_gr =
1339                                 lsm->lsm_oinfo[i]->loi_gr;
1340                         lmj->lmm_objects[i].l_ost_gen =
1341                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1342                         lmj->lmm_objects[i].l_ost_idx =
1343                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1344                 }
1345                 lmm = (struct lov_mds_md *)lmj;
1346                 lmmsize = lmj_size;
1347 out_free_memmd:
1348                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1349         }
1350 out:
1351         *lmmp = lmm;
1352         *lmm_size = lmmsize;
1353         *request = req;
1354         return rc;
1355 }
1356
1357 static int ll_lov_setea(struct inode *inode, struct file *file,
1358                             unsigned long arg)
1359 {
1360         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1361         struct lov_user_md  *lump;
1362         int lum_size = sizeof(struct lov_user_md) +
1363                        sizeof(struct lov_user_ost_data);
1364         int rc;
1365         ENTRY;
1366
1367         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1368                 RETURN(-EPERM);
1369
1370         OBD_ALLOC(lump, lum_size);
1371         if (lump == NULL) {
1372                 RETURN(-ENOMEM);
1373         }
1374         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1375                 OBD_FREE(lump, lum_size);
1376                 RETURN(-EFAULT);
1377         }
1378
1379         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1380
1381         OBD_FREE(lump, lum_size);
1382         RETURN(rc);
1383 }
1384
1385 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1386                             unsigned long arg)
1387 {
1388         struct lov_user_md_v3 lumv3;
1389         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1390         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1391         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1392         int lum_size;
1393         int rc;
1394         int flags = FMODE_WRITE;
1395         ENTRY;
1396
1397         /* first try with v1 which is smaller than v3 */
1398         lum_size = sizeof(struct lov_user_md_v1);
1399         if (copy_from_user(lumv1, lumv1p, lum_size))
1400                 RETURN(-EFAULT);
1401
1402         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1403                 lum_size = sizeof(struct lov_user_md_v3);
1404                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1405                         RETURN(-EFAULT);
1406         }
1407
1408         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1409         if (rc == 0) {
1410                  put_user(0, &lumv1p->lmm_stripe_count);
1411                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1412                                     0, ll_i2info(inode)->lli_smd,
1413                                     (void *)arg);
1414         }
1415         RETURN(rc);
1416 }
1417
1418 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1419 {
1420         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1421
1422         if (!lsm)
1423                 RETURN(-ENODATA);
1424
1425         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1426                             (void *)arg);
1427 }
1428
1429 static int ll_get_grouplock(struct inode *inode, struct file *file,
1430                             unsigned long arg)
1431 {
1432         /* XXX */
1433         return -ENOSYS;
1434 }
1435
1436 static int ll_put_grouplock(struct inode *inode, struct file *file,
1437                             unsigned long arg)
1438 {
1439         /* XXX */
1440         return -ENOSYS;
1441 }
1442
1443 #if LUSTRE_FIX >= 50
1444 static int join_sanity_check(struct inode *head, struct inode *tail)
1445 {
1446         ENTRY;
1447         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1448                 CERROR("server do not support join \n");
1449                 RETURN(-EINVAL);
1450         }
1451         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1452                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1453                        head->i_ino, tail->i_ino);
1454                 RETURN(-EINVAL);
1455         }
1456         if (head->i_ino == tail->i_ino) {
1457                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1458                 RETURN(-EINVAL);
1459         }
1460         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1461                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1462                 RETURN(-EINVAL);
1463         }
1464         RETURN(0);
1465 }
1466
1467 static int join_file(struct inode *head_inode, struct file *head_filp,
1468                      struct file *tail_filp)
1469 {
1470         struct dentry *tail_dentry = tail_filp->f_dentry;
1471         struct lookup_intent oit = {.it_op = IT_OPEN,
1472                                    .it_flags = head_filp->f_flags|O_JOIN_FILE};
1473         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1474                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1475
1476         struct lustre_handle lockh;
1477         struct md_op_data *op_data;
1478         int    rc;
1479         loff_t data;
1480         ENTRY;
1481
1482         tail_dentry = tail_filp->f_dentry;
1483
1484         data = i_size_read(head_inode);
1485         op_data = ll_prep_md_op_data(NULL, head_inode,
1486                                      tail_dentry->d_parent->d_inode,
1487                                      tail_dentry->d_name.name,
1488                                      tail_dentry->d_name.len, 0,
1489                                      LUSTRE_OPC_ANY, &data);
1490         if (IS_ERR(op_data))
1491                 RETURN(PTR_ERR(op_data));
1492
1493         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1494                          op_data, &lockh, NULL, 0, NULL, 0);
1495
1496         ll_finish_md_op_data(op_data);
1497         if (rc < 0)
1498                 GOTO(out, rc);
1499
1500         rc = oit.d.lustre.it_status;
1501
1502         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1503                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1504                 ptlrpc_req_finished((struct ptlrpc_request *)
1505                                     oit.d.lustre.it_data);
1506                 GOTO(out, rc);
1507         }
1508
1509         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1510                                            * away */
1511                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1512                 oit.d.lustre.it_lock_mode = 0;
1513         }
1514         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1515         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1516         ll_release_openhandle(head_filp->f_dentry, &oit);
1517 out:
1518         ll_intent_release(&oit);
1519         RETURN(rc);
1520 }
1521
1522 static int ll_file_join(struct inode *head, struct file *filp,
1523                         char *filename_tail)
1524 {
1525         struct inode *tail = NULL, *first = NULL, *second = NULL;
1526         struct dentry *tail_dentry;
1527         struct file *tail_filp, *first_filp, *second_filp;
1528         struct ll_lock_tree first_tree, second_tree;
1529         struct ll_lock_tree_node *first_node, *second_node;
1530         struct ll_inode_info *hlli = ll_i2info(head), *tlli;
1531         int rc = 0, cleanup_phase = 0;
1532         ENTRY;
1533
1534         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1535                head->i_ino, head->i_generation, head, filename_tail);
1536
1537         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1538         if (IS_ERR(tail_filp)) {
1539                 CERROR("Can not open tail file %s", filename_tail);
1540                 rc = PTR_ERR(tail_filp);
1541                 GOTO(cleanup, rc);
1542         }
1543         tail = igrab(tail_filp->f_dentry->d_inode);
1544
1545         tlli = ll_i2info(tail);
1546         tail_dentry = tail_filp->f_dentry;
1547         LASSERT(tail_dentry);
1548         cleanup_phase = 1;
1549
1550         /*reorder the inode for lock sequence*/
1551         first = head->i_ino > tail->i_ino ? head : tail;
1552         second = head->i_ino > tail->i_ino ? tail : head;
1553         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1554         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1555
1556         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1557                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1558         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1559         if (IS_ERR(first_node)){
1560                 rc = PTR_ERR(first_node);
1561                 GOTO(cleanup, rc);
1562         }
1563         first_tree.lt_fd = first_filp->private_data;
1564         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1565         if (rc != 0)
1566                 GOTO(cleanup, rc);
1567         cleanup_phase = 2;
1568
1569         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1570         if (IS_ERR(second_node)){
1571                 rc = PTR_ERR(second_node);
1572                 GOTO(cleanup, rc);
1573         }
1574         second_tree.lt_fd = second_filp->private_data;
1575         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1576         if (rc != 0)
1577                 GOTO(cleanup, rc);
1578         cleanup_phase = 3;
1579
1580         rc = join_sanity_check(head, tail);
1581         if (rc)
1582                 GOTO(cleanup, rc);
1583
1584         rc = join_file(head, filp, tail_filp);
1585         if (rc)
1586                 GOTO(cleanup, rc);
1587 cleanup:
1588         switch (cleanup_phase) {
1589         case 3:
1590                 ll_tree_unlock(&second_tree);
1591                 obd_cancel_unused(ll_i2dtexp(second),
1592                                   ll_i2info(second)->lli_smd, 0, NULL);
1593         case 2:
1594                 ll_tree_unlock(&first_tree);
1595                 obd_cancel_unused(ll_i2dtexp(first),
1596                                   ll_i2info(first)->lli_smd, 0, NULL);
1597         case 1:
1598                 filp_close(tail_filp, 0);
1599                 if (tail)
1600                         iput(tail);
1601                 if (head && rc == 0) {
1602                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1603                                        &hlli->lli_smd);
1604                         hlli->lli_smd = NULL;
1605                 }
1606         case 0:
1607                 break;
1608         default:
1609                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1610                 LBUG();
1611         }
1612         RETURN(rc);
1613 }
1614 #endif /* LUSTRE_FIX >= 50 */
1615
1616 /**
1617  * Close inode open handle
1618  *
1619  * \param dentry [in]     dentry which contains the inode
1620  * \param it     [in,out] intent which contains open info and result
1621  *
1622  * \retval 0     success
1623  * \retval <0    failure
1624  */
1625 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1626 {
1627         struct inode *inode = dentry->d_inode;
1628         struct obd_client_handle *och;
1629         int rc;
1630         ENTRY;
1631
1632         LASSERT(inode);
1633
1634         /* Root ? Do nothing. */
1635         if (dentry->d_inode->i_sb->s_root == dentry)
1636                 RETURN(0);
1637
1638         /* No open handle to close? Move away */
1639         if (!it_disposition(it, DISP_OPEN_OPEN))
1640                 RETURN(0);
1641
1642         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1643
1644         OBD_ALLOC(och, sizeof(*och));
1645         if (!och)
1646                 GOTO(out, rc = -ENOMEM);
1647
1648         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1649                     ll_i2info(inode), it, och);
1650
1651         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1652                                        inode, och);
1653  out:
1654         /* this one is in place of ll_file_open */
1655         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1656                 ptlrpc_req_finished(it->d.lustre.it_data);
1657         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1658         RETURN(rc);
1659 }
1660
1661 /**
1662  * Get size for inode for which FIEMAP mapping is requested.
1663  * Make the FIEMAP get_info call and returns the result.
1664  */
1665 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1666               int num_bytes)
1667 {
1668         struct obd_export *exp = ll_i2dtexp(inode);
1669         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1670         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1671         int vallen = num_bytes;
1672         int rc;
1673         ENTRY;
1674
1675         /* If the stripe_count > 1 and the application does not understand
1676          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1677          */
1678         if (lsm->lsm_stripe_count > 1 &&
1679             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1680                 return -EOPNOTSUPP;
1681
1682         fm_key.oa.o_id = lsm->lsm_object_id;
1683         fm_key.oa.o_gr = lsm->lsm_object_gr;
1684         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1685
1686         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1687                         OBD_MD_FLSIZE);
1688
1689         /* If filesize is 0, then there would be no objects for mapping */
1690         if (fm_key.oa.o_size == 0) {
1691                 fiemap->fm_mapped_extents = 0;
1692                 RETURN(0);
1693         }
1694
1695         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1696
1697         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1698         if (rc)
1699                 CERROR("obd_get_info failed: rc = %d\n", rc);
1700
1701         RETURN(rc);
1702 }
1703
1704 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1705                   unsigned long arg)
1706 {
1707         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1708         int flags;
1709         ENTRY;
1710
1711         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1712                inode->i_generation, inode, cmd);
1713         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1714
1715         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1716         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1717                 RETURN(-ENOTTY);
1718
1719         switch(cmd) {
1720         case LL_IOC_GETFLAGS:
1721                 /* Get the current value of the file flags */
1722                 return put_user(fd->fd_flags, (int *)arg);
1723         case LL_IOC_SETFLAGS:
1724         case LL_IOC_CLRFLAGS:
1725                 /* Set or clear specific file flags */
1726                 /* XXX This probably needs checks to ensure the flags are
1727                  *     not abused, and to handle any flag side effects.
1728                  */
1729                 if (get_user(flags, (int *) arg))
1730                         RETURN(-EFAULT);
1731
1732                 if (cmd == LL_IOC_SETFLAGS) {
1733                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1734                             !(file->f_flags & O_DIRECT)) {
1735                                 CERROR("%s: unable to disable locking on "
1736                                        "non-O_DIRECT file\n", current->comm);
1737                                 RETURN(-EINVAL);
1738                         }
1739
1740                         fd->fd_flags |= flags;
1741                 } else {
1742                         fd->fd_flags &= ~flags;
1743                 }
1744                 RETURN(0);
1745         case LL_IOC_LOV_SETSTRIPE:
1746                 RETURN(ll_lov_setstripe(inode, file, arg));
1747         case LL_IOC_LOV_SETEA:
1748                 RETURN(ll_lov_setea(inode, file, arg));
1749         case LL_IOC_LOV_GETSTRIPE:
1750                 RETURN(ll_lov_getstripe(inode, arg));
1751         case LL_IOC_RECREATE_OBJ:
1752                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1753         case EXT3_IOC_FIEMAP: {
1754                 struct ll_user_fiemap *fiemap_s;
1755                 size_t num_bytes, ret_bytes;
1756                 unsigned int extent_count;
1757                 int rc = 0;
1758
1759                 /* Get the extent count so we can calculate the size of
1760                  * required fiemap buffer */
1761                 if (get_user(extent_count,
1762                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1763                         RETURN(-EFAULT);
1764                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1765                                                  sizeof(struct ll_fiemap_extent));
1766                 OBD_VMALLOC(fiemap_s, num_bytes);
1767                 if (fiemap_s == NULL)
1768                         RETURN(-ENOMEM);
1769
1770                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1771                                    sizeof(*fiemap_s)))
1772                         GOTO(error, rc = -EFAULT);
1773
1774                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1775                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1776                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1777                         if (copy_to_user((char *)arg, fiemap_s,
1778                                          sizeof(*fiemap_s)))
1779                                 GOTO(error, rc = -EFAULT);
1780
1781                         GOTO(error, rc = -EBADR);
1782                 }
1783
1784                 /* If fm_extent_count is non-zero, read the first extent since
1785                  * it is used to calculate end_offset and device from previous
1786                  * fiemap call. */
1787                 if (extent_count) {
1788                         if (copy_from_user(&fiemap_s->fm_extents[0],
1789                             (char __user *)arg + sizeof(*fiemap_s),
1790                             sizeof(struct ll_fiemap_extent)))
1791                                 GOTO(error, rc = -EFAULT);
1792                 }
1793
1794                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1795                         int rc;
1796
1797                         rc = filemap_fdatawrite(inode->i_mapping);
1798                         if (rc)
1799                                 GOTO(error, rc);
1800                 }
1801
1802                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1803                 if (rc)
1804                         GOTO(error, rc);
1805
1806                 ret_bytes = sizeof(struct ll_user_fiemap);
1807
1808                 if (extent_count != 0)
1809                         ret_bytes += (fiemap_s->fm_mapped_extents *
1810                                          sizeof(struct ll_fiemap_extent));
1811
1812                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1813                         rc = -EFAULT;
1814
1815 error:
1816                 OBD_VFREE(fiemap_s, num_bytes);
1817                 RETURN(rc);
1818         }
1819         case EXT3_IOC_GETFLAGS:
1820         case EXT3_IOC_SETFLAGS:
1821                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1822         case EXT3_IOC_GETVERSION_OLD:
1823         case EXT3_IOC_GETVERSION:
1824                 RETURN(put_user(inode->i_generation, (int *)arg));
1825         case LL_IOC_JOIN: {
1826 #if LUSTRE_FIX >= 50
1827                 /* Allow file join in beta builds to allow debuggging */
1828                 char *ftail;
1829                 int rc;
1830
1831                 ftail = getname((const char *)arg);
1832                 if (IS_ERR(ftail))
1833                         RETURN(PTR_ERR(ftail));
1834                 rc = ll_file_join(inode, file, ftail);
1835                 putname(ftail);
1836                 RETURN(rc);
1837 #else
1838                 CWARN("file join is not supported in this version of Lustre\n");
1839                 RETURN(-ENOTTY);
1840 #endif
1841         }
1842         case LL_IOC_GROUP_LOCK:
1843                 RETURN(ll_get_grouplock(inode, file, arg));
1844         case LL_IOC_GROUP_UNLOCK:
1845                 RETURN(ll_put_grouplock(inode, file, arg));
1846         case IOC_OBD_STATFS:
1847                 RETURN(ll_obd_statfs(inode, (void *)arg));
1848
1849         /* We need to special case any other ioctls we want to handle,
1850          * to send them to the MDS/OST as appropriate and to properly
1851          * network encode the arg field.
1852         case EXT3_IOC_SETVERSION_OLD:
1853         case EXT3_IOC_SETVERSION:
1854         */
1855         case LL_IOC_FLUSHCTX:
1856                 RETURN(ll_flush_ctx(inode));
1857         case LL_IOC_PATH2FID: {
1858                 if (copy_to_user((void *)arg, &ll_i2info(inode)->lli_fid,
1859                                  sizeof(struct lu_fid)))
1860                         RETURN(-EFAULT);
1861
1862                 RETURN(0);
1863         }
1864         default: {
1865                 int err;
1866
1867                 if (LLIOC_STOP ==
1868                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1869                         RETURN(err);
1870
1871                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1872                                      (void *)arg));
1873         }
1874         }
1875 }
1876
1877 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1878 {
1879         struct inode *inode = file->f_dentry->d_inode;
1880         loff_t retval;
1881         ENTRY;
1882         retval = offset + ((origin == 2) ? i_size_read(inode) :
1883                            (origin == 1) ? file->f_pos : 0);
1884         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1885                inode->i_ino, inode->i_generation, inode, retval, retval,
1886                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1887         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1888
1889         if (origin == 2) { /* SEEK_END */
1890                 int nonblock = 0, rc;
1891
1892                 if (file->f_flags & O_NONBLOCK)
1893                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1894
1895                 rc = cl_glimpse_size(inode);
1896                 if (rc != 0)
1897                         RETURN(rc);
1898
1899                 ll_inode_size_lock(inode, 0);
1900                 offset += i_size_read(inode);
1901                 ll_inode_size_unlock(inode, 0);
1902         } else if (origin == 1) { /* SEEK_CUR */
1903                 offset += file->f_pos;
1904         }
1905
1906         retval = -EINVAL;
1907         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1908                 if (offset != file->f_pos) {
1909                         file->f_pos = offset;
1910                 }
1911                 retval = offset;
1912         }
1913
1914         RETURN(retval);
1915 }
1916
1917 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1918 {
1919         struct inode *inode = dentry->d_inode;
1920         struct ll_inode_info *lli = ll_i2info(inode);
1921         struct lov_stripe_md *lsm = lli->lli_smd;
1922         struct ptlrpc_request *req;
1923         struct obd_capa *oc;
1924         int rc, err;
1925         ENTRY;
1926         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1927                inode->i_generation, inode);
1928         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
1929
1930         /* fsync's caller has already called _fdata{sync,write}, we want
1931          * that IO to finish before calling the osc and mdc sync methods */
1932         rc = filemap_fdatawait(inode->i_mapping);
1933
1934         /* catch async errors that were recorded back when async writeback
1935          * failed for pages in this mapping. */
1936         err = lli->lli_async_rc;
1937         lli->lli_async_rc = 0;
1938         if (rc == 0)
1939                 rc = err;
1940         if (lsm) {
1941                 err = lov_test_and_clear_async_rc(lsm);
1942                 if (rc == 0)
1943                         rc = err;
1944         }
1945
1946         oc = ll_mdscapa_get(inode);
1947         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
1948                       &req);
1949         capa_put(oc);
1950         if (!rc)
1951                 rc = err;
1952         if (!err)
1953                 ptlrpc_req_finished(req);
1954
1955         if (data && lsm) {
1956                 struct obdo *oa;
1957
1958                 OBDO_ALLOC(oa);
1959                 if (!oa)
1960                         RETURN(rc ? rc : -ENOMEM);
1961
1962                 oa->o_id = lsm->lsm_object_id;
1963                 oa->o_gr = lsm->lsm_object_gr;
1964                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1965                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1966                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
1967                                            OBD_MD_FLGROUP);
1968
1969                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
1970                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
1971                                0, OBD_OBJECT_EOF, oc);
1972                 capa_put(oc);
1973                 if (!rc)
1974                         rc = err;
1975                 OBDO_FREE(oa);
1976         }
1977
1978         RETURN(rc);
1979 }
1980
1981 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
1982 {
1983         struct inode *inode = file->f_dentry->d_inode;
1984         struct ll_sb_info *sbi = ll_i2sbi(inode);
1985         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
1986                                            .ei_cb_cp =ldlm_flock_completion_ast,
1987                                            .ei_cbdata = file_lock };
1988         struct md_op_data *op_data;
1989         struct lustre_handle lockh = {0};
1990         ldlm_policy_data_t flock;
1991         int flags = 0;
1992         int rc;
1993         ENTRY;
1994
1995         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
1996                inode->i_ino, file_lock);
1997
1998         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
1999
2000         if (file_lock->fl_flags & FL_FLOCK) {
2001                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2002                 /* set missing params for flock() calls */
2003                 file_lock->fl_end = OFFSET_MAX;
2004                 file_lock->fl_pid = current->tgid;
2005         }
2006         flock.l_flock.pid = file_lock->fl_pid;
2007         flock.l_flock.start = file_lock->fl_start;
2008         flock.l_flock.end = file_lock->fl_end;
2009
2010         switch (file_lock->fl_type) {
2011         case F_RDLCK:
2012                 einfo.ei_mode = LCK_PR;
2013                 break;
2014         case F_UNLCK:
2015                 /* An unlock request may or may not have any relation to
2016                  * existing locks so we may not be able to pass a lock handle
2017                  * via a normal ldlm_lock_cancel() request. The request may even
2018                  * unlock a byte range in the middle of an existing lock. In
2019                  * order to process an unlock request we need all of the same
2020                  * information that is given with a normal read or write record
2021                  * lock request. To avoid creating another ldlm unlock (cancel)
2022                  * message we'll treat a LCK_NL flock request as an unlock. */
2023                 einfo.ei_mode = LCK_NL;
2024                 break;
2025         case F_WRLCK:
2026                 einfo.ei_mode = LCK_PW;
2027                 break;
2028         default:
2029                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2030                 RETURN (-EINVAL);
2031         }
2032
2033         switch (cmd) {
2034         case F_SETLKW:
2035 #ifdef F_SETLKW64
2036         case F_SETLKW64:
2037 #endif
2038                 flags = 0;
2039                 break;
2040         case F_SETLK:
2041 #ifdef F_SETLK64
2042         case F_SETLK64:
2043 #endif
2044                 flags = LDLM_FL_BLOCK_NOWAIT;
2045                 break;
2046         case F_GETLK:
2047 #ifdef F_GETLK64
2048         case F_GETLK64:
2049 #endif
2050                 flags = LDLM_FL_TEST_LOCK;
2051                 /* Save the old mode so that if the mode in the lock changes we
2052                  * can decrement the appropriate reader or writer refcount. */
2053                 file_lock->fl_type = einfo.ei_mode;
2054                 break;
2055         default:
2056                 CERROR("unknown fcntl lock command: %d\n", cmd);
2057                 RETURN (-EINVAL);
2058         }
2059
2060         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2061                                      LUSTRE_OPC_ANY, NULL);
2062         if (IS_ERR(op_data))
2063                 RETURN(PTR_ERR(op_data));
2064
2065         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2066                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2067                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2068
2069         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2070                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2071
2072         ll_finish_md_op_data(op_data);
2073
2074         if ((file_lock->fl_flags & FL_FLOCK) &&
2075             (rc == 0 || file_lock->fl_type == F_UNLCK))
2076                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2077 #ifdef HAVE_F_OP_FLOCK
2078         if ((file_lock->fl_flags & FL_POSIX) &&
2079             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2080             !(flags & LDLM_FL_TEST_LOCK))
2081                 posix_lock_file_wait(file, file_lock);
2082 #endif
2083
2084         RETURN(rc);
2085 }
2086
2087 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2088 {
2089         ENTRY;
2090
2091         RETURN(-ENOSYS);
2092 }
2093
2094 int ll_have_md_lock(struct inode *inode, __u64 bits)
2095 {
2096         struct lustre_handle lockh;
2097         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2098         struct lu_fid *fid;
2099         int flags;
2100         ENTRY;
2101
2102         if (!inode)
2103                RETURN(0);
2104
2105         fid = &ll_i2info(inode)->lli_fid;
2106         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2107
2108         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2109         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2110                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2111                 RETURN(1);
2112         }
2113         RETURN(0);
2114 }
2115
2116 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2117                             struct lustre_handle *lockh)
2118 {
2119         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2120         struct lu_fid *fid;
2121         ldlm_mode_t rc;
2122         int flags;
2123         ENTRY;
2124
2125         fid = &ll_i2info(inode)->lli_fid;
2126         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2127
2128         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2129         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2130                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2131         RETURN(rc);
2132 }
2133
2134 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2135         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2136                               * and return success */
2137                 inode->i_nlink = 0;
2138                 /* This path cannot be hit for regular files unless in
2139                  * case of obscure races, so no need to to validate
2140                  * size. */
2141                 if (!S_ISREG(inode->i_mode) &&
2142                     !S_ISDIR(inode->i_mode))
2143                         return 0;
2144         }
2145
2146         if (rc) {
2147                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2148                 return -abs(rc);
2149
2150         }
2151
2152         return 0;
2153 }
2154
2155 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2156 {
2157         struct inode *inode = dentry->d_inode;
2158         struct ptlrpc_request *req = NULL;
2159         struct ll_sb_info *sbi;
2160         struct obd_export *exp;
2161         int rc;
2162         ENTRY;
2163
2164         if (!inode) {
2165                 CERROR("REPORT THIS LINE TO PETER\n");
2166                 RETURN(0);
2167         }
2168         sbi = ll_i2sbi(inode);
2169
2170         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2171                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2172
2173         exp = ll_i2mdexp(inode);
2174
2175         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2176                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2177                 struct md_op_data *op_data;
2178
2179                 /* Call getattr by fid, so do not provide name at all. */
2180                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2181                                              dentry->d_inode, NULL, 0, 0,
2182                                              LUSTRE_OPC_ANY, NULL);
2183                 if (IS_ERR(op_data))
2184                         RETURN(PTR_ERR(op_data));
2185
2186                 oit.it_flags |= O_CHECK_STALE;
2187                 rc = md_intent_lock(exp, op_data, NULL, 0,
2188                                     /* we are not interested in name
2189                                        based lookup */
2190                                     &oit, 0, &req,
2191                                     ll_md_blocking_ast, 0);
2192                 ll_finish_md_op_data(op_data);
2193                 oit.it_flags &= ~O_CHECK_STALE;
2194                 if (rc < 0) {
2195                         rc = ll_inode_revalidate_fini(inode, rc);
2196                         GOTO (out, rc);
2197                 }
2198
2199                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2200                 if (rc != 0) {
2201                         ll_intent_release(&oit);
2202                         GOTO(out, rc);
2203                 }
2204
2205                 /* Unlinked? Unhash dentry, so it is not picked up later by
2206                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2207                    here to preserve get_cwd functionality on 2.6.
2208                    Bug 10503 */
2209                 if (!dentry->d_inode->i_nlink) {
2210                         spin_lock(&ll_lookup_lock);
2211                         spin_lock(&dcache_lock);
2212                         ll_drop_dentry(dentry);
2213                         spin_unlock(&dcache_lock);
2214                         spin_unlock(&ll_lookup_lock);
2215                 }
2216
2217                 ll_lookup_finish_locks(&oit, dentry);
2218         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2219                                                      MDS_INODELOCK_LOOKUP)) {
2220                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2221                 obd_valid valid = OBD_MD_FLGETATTR;
2222                 struct obd_capa *oc;
2223                 int ealen = 0;
2224
2225                 if (S_ISREG(inode->i_mode)) {
2226                         rc = ll_get_max_mdsize(sbi, &ealen);
2227                         if (rc)
2228                                 RETURN(rc);
2229                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2230                 }
2231                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2232                  * capa for this inode. Because we only keep capas of dirs
2233                  * fresh. */
2234                 oc = ll_mdscapa_get(inode);
2235                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2236                                 ealen, &req);
2237                 capa_put(oc);
2238                 if (rc) {
2239                         rc = ll_inode_revalidate_fini(inode, rc);
2240                         RETURN(rc);
2241                 }
2242
2243                 rc = ll_prep_inode(&inode, req, NULL);
2244                 if (rc)
2245                         GOTO(out, rc);
2246         }
2247
2248         /* if object not yet allocated, don't validate size */
2249         if (ll_i2info(inode)->lli_smd == NULL)
2250                 GOTO(out, rc = 0);
2251
2252         /* cl_glimpse_size will prefer locally cached writes if they extend
2253          * the file */
2254         rc = cl_glimpse_size(inode);
2255         EXIT;
2256 out:
2257         ptlrpc_req_finished(req);
2258         return rc;
2259 }
2260
2261 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2262                   struct lookup_intent *it, struct kstat *stat)
2263 {
2264         struct inode *inode = de->d_inode;
2265         int res = 0;
2266
2267         res = ll_inode_revalidate_it(de, it);
2268         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2269
2270         if (res)
2271                 return res;
2272
2273         stat->dev = inode->i_sb->s_dev;
2274         stat->ino = inode->i_ino;
2275         stat->mode = inode->i_mode;
2276         stat->nlink = inode->i_nlink;
2277         stat->uid = inode->i_uid;
2278         stat->gid = inode->i_gid;
2279         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2280         stat->atime = inode->i_atime;
2281         stat->mtime = inode->i_mtime;
2282         stat->ctime = inode->i_ctime;
2283 #ifdef HAVE_INODE_BLKSIZE
2284         stat->blksize = inode->i_blksize;
2285 #else
2286         stat->blksize = 1 << inode->i_blkbits;
2287 #endif
2288
2289         ll_inode_size_lock(inode, 0);
2290         stat->size = i_size_read(inode);
2291         stat->blocks = inode->i_blocks;
2292         ll_inode_size_unlock(inode, 0);
2293
2294         return 0;
2295 }
2296 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2297 {
2298         struct lookup_intent it = { .it_op = IT_GETATTR };
2299
2300         return ll_getattr_it(mnt, de, &it, stat);
2301 }
2302
/**
 * ACL check callback: evaluate \a mask against the POSIX ACL cached in the
 * Lustre inode info.
 *
 * A private reference on the ACL is taken under lli_lock so the check runs
 * without holding the spinlock.
 *
 * \retval -EAGAIN  no ACL is cached, or CONFIG_FS_POSIX_ACL is not set
 * \retval other    result of posix_acl_permission()
 */
static
int lustre_check_acl(struct inode *inode, int mask)
{
#ifndef CONFIG_FS_POSIX_ACL
        return -EAGAIN;
#else
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl;
        int rc = -EAGAIN;
        ENTRY;

        spin_lock(&lli->lli_lock);
        acl = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        if (acl != NULL) {
                rc = posix_acl_permission(inode, acl, mask);
                posix_acl_release(acl);
        }

        RETURN(rc);
#endif
}
2327
2328 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2329 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2330 {
2331         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2332                inode->i_ino, inode->i_generation, inode, mask);
2333         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2334                 return lustre_check_remote_perm(inode, mask);
2335
2336         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2337         return generic_permission(inode, mask, lustre_check_acl);
2338 }
2339 #else
2340 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2341 {
2342         int mode = inode->i_mode;
2343         int rc;
2344
2345         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2346                inode->i_ino, inode->i_generation, inode, mask);
2347
2348         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2349                 return lustre_check_remote_perm(inode, mask);
2350
2351         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2352
2353         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2354             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2355                 return -EROFS;
2356         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2357                 return -EACCES;
2358         if (current->fsuid == inode->i_uid) {
2359                 mode >>= 6;
2360         } else if (1) {
2361                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2362                         goto check_groups;
2363                 rc = lustre_check_acl(inode, mask);
2364                 if (rc == -EAGAIN)
2365                         goto check_groups;
2366                 if (rc == -EACCES)
2367                         goto check_capabilities;
2368                 return rc;
2369         } else {
2370 check_groups:
2371                 if (in_group_p(inode->i_gid))
2372                         mode >>= 3;
2373         }
2374         if ((mode & mask & S_IRWXO) == mask)
2375                 return 0;
2376
2377 check_capabilities:
2378         if (!(mask & MAY_EXEC) ||
2379             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2380                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2381                         return 0;
2382
2383         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2384             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2385                 return 0;
2386
2387         return -EACCES;
2388 }
2389 #endif
2390
/*
 * Pick the file_operations member names used for the vectored/async I/O
 * entry points (READ_METHOD/WRITE_METHOD) and the llite functions assigned
 * to them (READ_FUNCTION/WRITE_FUNCTION), depending on HAVE_FILE_READV.
 */
#ifndef HAVE_FILE_READV
#define READ_METHOD     aio_read
#define READ_FUNCTION   ll_file_aio_read
#define WRITE_METHOD    aio_write
#define WRITE_FUNCTION  ll_file_aio_write
#else /* HAVE_FILE_READV */
#define READ_METHOD     readv
#define READ_FUNCTION   ll_file_readv
#define WRITE_METHOD    writev
#define WRITE_FUNCTION  ll_file_writev
#endif /* !HAVE_FILE_READV */
2402
2403 /* -o localflock - only provides locally consistent flock locks */
2404 struct file_operations ll_file_operations = {
2405         .read           = ll_file_read,
2406         .READ_METHOD    = READ_FUNCTION,
2407         .write          = ll_file_write,
2408         .WRITE_METHOD   = WRITE_FUNCTION,
2409         .ioctl          = ll_file_ioctl,
2410         .open           = ll_file_open,
2411         .release        = ll_file_release,
2412         .mmap           = ll_file_mmap,
2413         .llseek         = ll_file_seek,
2414         .sendfile       = ll_file_sendfile,
2415         .fsync          = ll_fsync,
2416 };
2417
2418 struct file_operations ll_file_operations_flock = {
2419         .read           = ll_file_read,
2420         .READ_METHOD    = READ_FUNCTION,
2421         .write          = ll_file_write,
2422         .WRITE_METHOD   = WRITE_FUNCTION,
2423         .ioctl          = ll_file_ioctl,
2424         .open           = ll_file_open,
2425         .release        = ll_file_release,
2426         .mmap           = ll_file_mmap,
2427         .llseek         = ll_file_seek,
2428         .sendfile       = ll_file_sendfile,
2429         .fsync          = ll_fsync,
2430 #ifdef HAVE_F_OP_FLOCK
2431         .flock          = ll_file_flock,
2432 #endif
2433         .lock           = ll_file_flock
2434 };
2435
2436 /* These are for -o noflock - to return ENOSYS on flock calls */
2437 struct file_operations ll_file_operations_noflock = {
2438         .read           = ll_file_read,
2439         .READ_METHOD    = READ_FUNCTION,
2440         .write          = ll_file_write,
2441         .WRITE_METHOD   = WRITE_FUNCTION,
2442         .ioctl          = ll_file_ioctl,
2443         .open           = ll_file_open,
2444         .release        = ll_file_release,
2445         .mmap           = ll_file_mmap,
2446         .llseek         = ll_file_seek,
2447         .sendfile       = ll_file_sendfile,
2448         .fsync          = ll_fsync,
2449 #ifdef HAVE_F_OP_FLOCK
2450         .flock          = ll_file_noflock,
2451 #endif
2452         .lock           = ll_file_noflock
2453 };
2454
2455 struct inode_operations ll_file_inode_operations = {
2456 #ifdef HAVE_VFS_INTENT_PATCHES
2457         .setattr_raw    = ll_setattr_raw,
2458 #endif
2459         .setattr        = ll_setattr,
2460         .truncate       = ll_truncate,
2461         .getattr        = ll_getattr,
2462         .permission     = ll_inode_permission,
2463         .setxattr       = ll_setxattr,
2464         .getxattr       = ll_getxattr,
2465         .listxattr      = ll_listxattr,
2466         .removexattr    = ll_removexattr,
2467 };
2468
2469 /* dynamic ioctl number support routins */
2470 static struct llioc_ctl_data {
2471         struct rw_semaphore ioc_sem;
2472         struct list_head    ioc_head;
2473 } llioc = {
2474         __RWSEM_INITIALIZER(llioc.ioc_sem),
2475         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2476 };
2477
2478
2479 struct llioc_data {
2480         struct list_head        iocd_list;
2481         unsigned int            iocd_size;
2482         llioc_callback_t        iocd_cb;
2483         unsigned int            iocd_count;
2484         unsigned int            iocd_cmd[0];
2485 };
2486
2487 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2488 {
2489         unsigned int size;
2490         struct llioc_data *in_data = NULL;
2491         ENTRY;
2492
2493         if (cb == NULL || cmd == NULL ||
2494             count > LLIOC_MAX_CMD || count < 0)
2495                 RETURN(NULL);
2496
2497         size = sizeof(*in_data) + count * sizeof(unsigned int);
2498         OBD_ALLOC(in_data, size);
2499         if (in_data == NULL)
2500                 RETURN(NULL);
2501
2502         memset(in_data, 0, sizeof(*in_data));
2503         in_data->iocd_size = size;
2504         in_data->iocd_cb = cb;
2505         in_data->iocd_count = count;
2506         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2507
2508         down_write(&llioc.ioc_sem);
2509         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2510         up_write(&llioc.ioc_sem);
2511
2512         RETURN(in_data);
2513 }
2514
2515 void ll_iocontrol_unregister(void *magic)
2516 {
2517         struct llioc_data *tmp;
2518
2519         if (magic == NULL)
2520                 return;
2521
2522         down_write(&llioc.ioc_sem);
2523         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2524                 if (tmp == magic) {
2525                         unsigned int size = tmp->iocd_size;
2526
2527                         list_del(&tmp->iocd_list);
2528                         up_write(&llioc.ioc_sem);
2529
2530                         OBD_FREE(tmp, size);
2531                         return;
2532                 }
2533         }
2534         up_write(&llioc.ioc_sem);
2535
2536         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2537 }
2538
/* Export the dynamic ioctl registration API for use outside this file. */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
2541
2542 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2543                         unsigned int cmd, unsigned long arg, int *rcp)
2544 {
2545         enum llioc_iter ret = LLIOC_CONT;
2546         struct llioc_data *data;
2547         int rc = -EINVAL, i;
2548
2549         down_read(&llioc.ioc_sem);
2550         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2551                 for (i = 0; i < data->iocd_count; i++) {
2552                         if (cmd != data->iocd_cmd[i])
2553                                 continue;
2554
2555                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2556                         break;
2557                 }
2558
2559                 if (ret == LLIOC_STOP)
2560                         break;
2561         }
2562         up_read(&llioc.ioc_sem);
2563
2564         if (rcp)
2565                 *rcp = rc;
2566         return ret;
2567 }