Whamcloud - gitweb
land clio.
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
96             !S_ISREG(inode->i_mode))
97                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
98         else
99                 ll_epoch_close(inode, op_data, &och, 0);
100
101 out:
102         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
103         EXIT;
104 }
105
106 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107                                      struct inode *inode,
108                                      struct obd_client_handle *och)
109 {
110         struct obd_export *exp = ll_i2mdexp(inode);
111         struct md_op_data *op_data;
112         struct ptlrpc_request *req = NULL;
113         struct obd_device *obd = class_exp2obd(exp);
114         int epoch_close = 1;
115         int seq_end = 0, rc;
116         ENTRY;
117
118         if (obd == NULL) {
119                 /*
120                  * XXX: in case of LMV, is this correct to access
121                  * ->exp_handle?
122                  */
123                 CERROR("Invalid MDC connection handle "LPX64"\n",
124                        ll_i2mdexp(inode)->exp_handle.h_cookie);
125                 GOTO(out, rc = 0);
126         }
127
128         /*
129          * here we check if this is forced umount. If so this is called on
130          * canceling "open lock" and we do not call md_close() in this case, as
131          * it will not be successful, as import is already deactivated.
132          */
133         if (obd->obd_force)
134                 GOTO(out, rc = 0);
135
136         OBD_ALLOC_PTR(op_data);
137         if (op_data == NULL)
138                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139
140         ll_prepare_close(inode, op_data, och);
141         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
142         rc = md_close(md_exp, op_data, och->och_mod, &req);
143         if (rc != -EAGAIN)
144                 seq_end = 1;
145
146         if (rc == -EAGAIN) {
147                 /* This close must have the epoch closed. */
148                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
149                 LASSERT(epoch_close);
150                 /* MDS has instructed us to obtain Size-on-MDS attribute from
151                  * OSTs and send setattr to back to MDS. */
152                 rc = ll_sizeonmds_update(inode, och->och_mod,
153                                          &och->och_fh, op_data->op_ioepoch);
154                 if (rc) {
155                         CERROR("inode %lu mdc Size-on-MDS update failed: "
156                                "rc = %d\n", inode->i_ino, rc);
157                         rc = 0;
158                 }
159         } else if (rc) {
160                 CERROR("inode %lu mdc close failed: rc = %d\n",
161                        inode->i_ino, rc);
162         }
163         ll_finish_md_op_data(op_data);
164
165         if (rc == 0) {
166                 rc = ll_objects_destroy(req, inode);
167                 if (rc)
168                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
169                                inode->i_ino, rc);
170         }
171
172         EXIT;
173 out:
174
175         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
176             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
177                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
178         } else {
179                 if (seq_end)
180                         ptlrpc_close_replay_seq(req);
181                 md_clear_open_replay_data(md_exp, och);
182                 /* Free @och if it is not waiting for DONE_WRITING. */
183                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
184                 OBD_FREE_PTR(och);
185         }
186         if (req) /* This is close request */
187                 ptlrpc_req_finished(req);
188         return rc;
189 }
190
191 int ll_md_real_close(struct inode *inode, int flags)
192 {
193         struct ll_inode_info *lli = ll_i2info(inode);
194         struct obd_client_handle **och_p;
195         struct obd_client_handle *och;
196         __u64 *och_usecount;
197         int rc = 0;
198         ENTRY;
199
200         if (flags & FMODE_WRITE) {
201                 och_p = &lli->lli_mds_write_och;
202                 och_usecount = &lli->lli_open_fd_write_count;
203         } else if (flags & FMODE_EXEC) {
204                 och_p = &lli->lli_mds_exec_och;
205                 och_usecount = &lli->lli_open_fd_exec_count;
206         } else {
207                 LASSERT(flags & FMODE_READ);
208                 och_p = &lli->lli_mds_read_och;
209                 och_usecount = &lli->lli_open_fd_read_count;
210         }
211
212         down(&lli->lli_och_sem);
213         if (*och_usecount) { /* There are still users of this handle, so
214                                 skip freeing it. */
215                 up(&lli->lli_och_sem);
216                 RETURN(0);
217         }
218         och=*och_p;
219         *och_p = NULL;
220         up(&lli->lli_och_sem);
221
222         if (och) { /* There might be a race and somebody have freed this och
223                       already */
224                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
225                                                inode, och);
226         }
227
228         RETURN(rc);
229 }
230
231 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
232                 struct file *file)
233 {
234         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
235         struct ll_inode_info *lli = ll_i2info(inode);
236         int rc = 0;
237         ENTRY;
238
239         /* clear group lock, if present */
240         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
241 #if 0 /* XXX */
242                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
243                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
244                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
245                                       &fd->fd_cwlockh);
246 #endif
247         }
248
249         /* Let's see if we have good enough OPEN lock on the file and if
250            we can skip talking to MDS */
251         if (file->f_dentry->d_inode) { /* Can this ever be false? */
252                 int lockmode;
253                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
254                 struct lustre_handle lockh;
255                 struct inode *inode = file->f_dentry->d_inode;
256                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
257
258                 down(&lli->lli_och_sem);
259                 if (fd->fd_omode & FMODE_WRITE) {
260                         lockmode = LCK_CW;
261                         LASSERT(lli->lli_open_fd_write_count);
262                         lli->lli_open_fd_write_count--;
263                 } else if (fd->fd_omode & FMODE_EXEC) {
264                         lockmode = LCK_PR;
265                         LASSERT(lli->lli_open_fd_exec_count);
266                         lli->lli_open_fd_exec_count--;
267                 } else {
268                         lockmode = LCK_CR;
269                         LASSERT(lli->lli_open_fd_read_count);
270                         lli->lli_open_fd_read_count--;
271                 }
272                 up(&lli->lli_och_sem);
273
274                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
275                                    LDLM_IBITS, &policy, lockmode,
276                                    &lockh)) {
277                         rc = ll_md_real_close(file->f_dentry->d_inode,
278                                               fd->fd_omode);
279                 }
280         } else {
281                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
282                        file, file->f_dentry, file->f_dentry->d_name.name);
283         }
284
285         LUSTRE_FPRIVATE(file) = NULL;
286         ll_file_data_put(fd);
287         ll_capa_close(inode);
288
289         RETURN(rc);
290 }
291
292 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
293
294 /* While this returns an error code, fput() the caller does not, so we need
295  * to make every effort to clean up all of our state here.  Also, applications
296  * rarely check close errors and even if an error is returned they will not
297  * re-try the close call.
298  */
299 int ll_file_release(struct inode *inode, struct file *file)
300 {
301         struct ll_file_data *fd;
302         struct ll_sb_info *sbi = ll_i2sbi(inode);
303         struct ll_inode_info *lli = ll_i2info(inode);
304         struct lov_stripe_md *lsm = lli->lli_smd;
305         int rc;
306         ENTRY;
307
308         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
309                inode->i_generation, inode);
310
311 #ifdef CONFIG_FS_POSIX_ACL
312         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
313             inode == inode->i_sb->s_root->d_inode) {
314                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
315
316                 LASSERT(fd != NULL);
317                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
318                         fd->fd_flags &= ~LL_FILE_RMTACL;
319                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
320                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
321                 }
322         }
323 #endif
324
325         if (inode->i_sb->s_root != file->f_dentry)
326                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
327         fd = LUSTRE_FPRIVATE(file);
328         LASSERT(fd != NULL);
329
330         /* The last ref on @file, maybe not the the owner pid of statahead.
331          * Different processes can open the same dir, "ll_opendir_key" means:
332          * it is me that should stop the statahead thread. */
333         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
334                 ll_stop_statahead(inode, fd);
335
336         if (inode->i_sb->s_root == file->f_dentry) {
337                 LUSTRE_FPRIVATE(file) = NULL;
338                 ll_file_data_put(fd);
339                 RETURN(0);
340         }
341
342         if (lsm)
343                 lov_test_and_clear_async_rc(lsm);
344         lli->lli_async_rc = 0;
345
346         rc = ll_md_close(sbi->ll_md_exp, inode, file);
347         RETURN(rc);
348 }
349
350 static int ll_intent_file_open(struct file *file, void *lmm,
351                                int lmmsize, struct lookup_intent *itp)
352 {
353         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
354         struct dentry *parent = file->f_dentry->d_parent;
355         const char *name = file->f_dentry->d_name.name;
356         const int len = file->f_dentry->d_name.len;
357         struct md_op_data *op_data;
358         struct ptlrpc_request *req;
359         int rc;
360         ENTRY;
361
362         if (!parent)
363                 RETURN(-ENOENT);
364
365         /* Usually we come here only for NFSD, and we want open lock.
366            But we can also get here with pre 2.6.15 patchless kernels, and in
367            that case that lock is also ok */
368         /* We can also get here if there was cached open handle in revalidate_it
369          * but it disappeared while we were getting from there to ll_file_open.
370          * But this means this file was closed and immediatelly opened which
371          * makes a good candidate for using OPEN lock */
372         /* If lmmsize & lmm are not 0, we are just setting stripe info
373          * parameters. No need for the open lock */
374         if (!lmm && !lmmsize)
375                 itp->it_flags |= MDS_OPEN_LOCK;
376
377         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
378                                       file->f_dentry->d_inode, name, len,
379                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
380         if (IS_ERR(op_data))
381                 RETURN(PTR_ERR(op_data));
382
383         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
384                             0 /*unused */, &req, ll_md_blocking_ast, 0);
385         ll_finish_md_op_data(op_data);
386         if (rc == -ESTALE) {
387                 /* reason for keep own exit path - don`t flood log
388                 * with messages with -ESTALE errors.
389                 */
390                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
391                      it_open_error(DISP_OPEN_OPEN, itp))
392                         GOTO(out, rc);
393                 ll_release_openhandle(file->f_dentry, itp);
394                 GOTO(out, rc);
395         }
396
397         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
398                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
399                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
400                 GOTO(out, rc);
401         }
402
403         if (itp->d.lustre.it_lock_mode)
404                 md_set_lock_data(sbi->ll_md_exp,
405                                  &itp->d.lustre.it_lock_handle,
406                                  file->f_dentry->d_inode);
407
408         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
409 out:
410         ptlrpc_req_finished(itp->d.lustre.it_data);
411         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
412         ll_intent_drop_lock(itp);
413
414         RETURN(rc);
415 }
416
417 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
418                        struct lookup_intent *it, struct obd_client_handle *och)
419 {
420         struct ptlrpc_request *req = it->d.lustre.it_data;
421         struct mdt_body *body;
422
423         LASSERT(och);
424
425         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
426         LASSERT(body != NULL);                      /* reply already checked out */
427
428         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
429         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
430         och->och_fid = lli->lli_fid;
431         och->och_flags = it->it_flags;
432         lli->lli_ioepoch = body->ioepoch;
433
434         return md_set_open_replay_data(md_exp, och, req);
435 }
436
437 int ll_local_open(struct file *file, struct lookup_intent *it,
438                   struct ll_file_data *fd, struct obd_client_handle *och)
439 {
440         struct inode *inode = file->f_dentry->d_inode;
441         struct ll_inode_info *lli = ll_i2info(inode);
442         ENTRY;
443
444         LASSERT(!LUSTRE_FPRIVATE(file));
445
446         LASSERT(fd != NULL);
447
448         if (och) {
449                 struct ptlrpc_request *req = it->d.lustre.it_data;
450                 struct mdt_body *body;
451                 int rc;
452
453                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
454                 if (rc)
455                         RETURN(rc);
456
457                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
458                 if ((it->it_flags & FMODE_WRITE) &&
459                     (body->valid & OBD_MD_FLSIZE))
460                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
461                                lli->lli_ioepoch, PFID(&lli->lli_fid));
462         }
463
464         LUSTRE_FPRIVATE(file) = fd;
465         ll_readahead_init(inode, &fd->fd_ras);
466         fd->fd_omode = it->it_flags;
467         RETURN(0);
468 }
469
470 /* Open a file, and (for the very first open) create objects on the OSTs at
471  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
472  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
473  * lli_open_sem to ensure no other process will create objects, send the
474  * stripe MD to the MDS, or try to destroy the objects if that fails.
475  *
476  * If we already have the stripe MD locally then we don't request it in
477  * md_open(), by passing a lmm_size = 0.
478  *
479  * It is up to the application to ensure no other processes open this file
480  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
481  * used.  We might be able to avoid races of that sort by getting lli_open_sem
482  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
483  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
484  */
485 int ll_file_open(struct inode *inode, struct file *file)
486 {
487         struct ll_inode_info *lli = ll_i2info(inode);
488         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
489                                           .it_flags = file->f_flags };
490         struct lov_stripe_md *lsm;
491         struct ptlrpc_request *req = NULL;
492         struct obd_client_handle **och_p;
493         __u64 *och_usecount;
494         struct ll_file_data *fd;
495         int rc = 0, opendir_set = 0;
496         ENTRY;
497
498         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
499                inode->i_generation, inode, file->f_flags);
500
501 #ifdef HAVE_VFS_INTENT_PATCHES
502         it = file->f_it;
503 #else
504         it = file->private_data; /* XXX: compat macro */
505         file->private_data = NULL; /* prevent ll_local_open assertion */
506 #endif
507
508         fd = ll_file_data_get();
509         if (fd == NULL)
510                 RETURN(-ENOMEM);
511
512         fd->fd_file = file;
513         if (S_ISDIR(inode->i_mode)) {
514 again:
515                 spin_lock(&lli->lli_lock);
516                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
517                         LASSERT(lli->lli_sai == NULL);
518                         lli->lli_opendir_key = fd;
519                         lli->lli_opendir_pid = cfs_curproc_pid();
520                         opendir_set = 1;
521                 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
522                                     lli->lli_opendir_key != NULL)) {
523                         /* Two cases for this:
524                          * (1) The same process open such directory many times.
525                          * (2) The old process opened the directory, and exited
526                          *     before its children processes. Then new process
527                          *     with the same pid opens such directory before the
528                          *     old process's children processes exit.
529                          * reset stat ahead for such cases. */
530                         spin_unlock(&lli->lli_lock);
531                         CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
532                                " reset it.\n", file->f_dentry->d_name.len,
533                                file->f_dentry->d_name.name,
534                                PFID(&lli->lli_fid));
535                         ll_stop_statahead(inode, lli->lli_opendir_key);
536                         goto again;
537                 }
538                 spin_unlock(&lli->lli_lock);
539         }
540
541         if (inode->i_sb->s_root == file->f_dentry) {
542                 LUSTRE_FPRIVATE(file) = fd;
543                 RETURN(0);
544         }
545
546         if (!it || !it->d.lustre.it_disposition) {
547                 /* Convert f_flags into access mode. We cannot use file->f_mode,
548                  * because everything but O_ACCMODE mask was stripped from
549                  * there */
550                 if ((oit.it_flags + 1) & O_ACCMODE)
551                         oit.it_flags++;
552                 if (file->f_flags & O_TRUNC)
553                         oit.it_flags |= FMODE_WRITE;
554
555                 /* kernel only call f_op->open in dentry_open.  filp_open calls
556                  * dentry_open after call to open_namei that checks permissions.
557                  * Only nfsd_open call dentry_open directly without checking
558                  * permissions and because of that this code below is safe. */
559                 if (oit.it_flags & FMODE_WRITE)
560                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
561
562                 /* We do not want O_EXCL here, presumably we opened the file
563                  * already? XXX - NFS implications? */
564                 oit.it_flags &= ~O_EXCL;
565
566                 it = &oit;
567         }
568
569 restart:
570         /* Let's see if we have file open on MDS already. */
571         if (it->it_flags & FMODE_WRITE) {
572                 och_p = &lli->lli_mds_write_och;
573                 och_usecount = &lli->lli_open_fd_write_count;
574         } else if (it->it_flags & FMODE_EXEC) {
575                 och_p = &lli->lli_mds_exec_och;
576                 och_usecount = &lli->lli_open_fd_exec_count;
577          } else {
578                 och_p = &lli->lli_mds_read_och;
579                 och_usecount = &lli->lli_open_fd_read_count;
580         }
581
582         down(&lli->lli_och_sem);
583         if (*och_p) { /* Open handle is present */
584                 if (it_disposition(it, DISP_OPEN_OPEN)) {
585                         /* Well, there's extra open request that we do not need,
586                            let's close it somehow. This will decref request. */
587                         rc = it_open_error(DISP_OPEN_OPEN, it);
588                         if (rc) {
589                                 up(&lli->lli_och_sem);
590                                 ll_file_data_put(fd);
591                                 GOTO(out_openerr, rc);
592                         }
593                         ll_release_openhandle(file->f_dentry, it);
594                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
595                                              LPROC_LL_OPEN);
596                 }
597                 (*och_usecount)++;
598
599                 rc = ll_local_open(file, it, fd, NULL);
600                 if (rc) {
601                         (*och_usecount)--;
602                         up(&lli->lli_och_sem);
603                         ll_file_data_put(fd);
604                         GOTO(out_openerr, rc);
605                 }
606         } else {
607                 LASSERT(*och_usecount == 0);
608                 if (!it->d.lustre.it_disposition) {
609                         /* We cannot just request lock handle now, new ELC code
610                            means that one of other OPEN locks for this file
611                            could be cancelled, and since blocking ast handler
612                            would attempt to grab och_sem as well, that would
613                            result in a deadlock */
614                         up(&lli->lli_och_sem);
615                         it->it_flags |= O_CHECK_STALE;
616                         rc = ll_intent_file_open(file, NULL, 0, it);
617                         it->it_flags &= ~O_CHECK_STALE;
618                         if (rc) {
619                                 ll_file_data_put(fd);
620                                 GOTO(out_openerr, rc);
621                         }
622
623                         /* Got some error? Release the request */
624                         if (it->d.lustre.it_status < 0) {
625                                 req = it->d.lustre.it_data;
626                                 ptlrpc_req_finished(req);
627                         }
628                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
629                                          &it->d.lustre.it_lock_handle,
630                                          file->f_dentry->d_inode);
631                         goto restart;
632                 }
633                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
634                 if (!*och_p) {
635                         ll_file_data_put(fd);
636                         GOTO(out_och_free, rc = -ENOMEM);
637                 }
638                 (*och_usecount)++;
639                 req = it->d.lustre.it_data;
640
641                 /* md_intent_lock() didn't get a request ref if there was an
642                  * open error, so don't do cleanup on the request here
643                  * (bug 3430) */
644                 /* XXX (green): Should not we bail out on any error here, not
645                  * just open error? */
646                 rc = it_open_error(DISP_OPEN_OPEN, it);
647                 if (rc) {
648                         ll_file_data_put(fd);
649                         GOTO(out_och_free, rc);
650                 }
651
652                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
653                 rc = ll_local_open(file, it, fd, *och_p);
654                 if (rc) {
655                         ll_file_data_put(fd);
656                         GOTO(out_och_free, rc);
657                 }
658         }
659         up(&lli->lli_och_sem);
660
661         /* Must do this outside lli_och_sem lock to prevent deadlock where
662            different kind of OPEN lock for this same inode gets cancelled
663            by ldlm_cancel_lru */
664         if (!S_ISREG(inode->i_mode))
665                 GOTO(out, rc);
666
667         ll_capa_open(inode);
668
669         lsm = lli->lli_smd;
670         if (lsm == NULL) {
671                 if (file->f_flags & O_LOV_DELAY_CREATE ||
672                     !(file->f_mode & FMODE_WRITE)) {
673                         CDEBUG(D_INODE, "object creation was delayed\n");
674                         GOTO(out, rc);
675                 }
676         }
677         file->f_flags &= ~O_LOV_DELAY_CREATE;
678         GOTO(out, rc);
679 out:
680         ptlrpc_req_finished(req);
681         if (req)
682                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
683 out_och_free:
684         if (rc) {
685                 if (*och_p) {
686                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
687                         *och_p = NULL; /* OBD_FREE writes some magic there */
688                         (*och_usecount)--;
689                 }
690                 up(&lli->lli_och_sem);
691 out_openerr:
692                 if (opendir_set != 0)
693                         ll_stop_statahead(inode, fd);
694         }
695
696         return rc;
697 }
698
699 /* Fills the obdo with the attributes for the inode defined by lsm */
700 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
701 {
702         struct ptlrpc_request_set *set;
703         struct ll_inode_info *lli = ll_i2info(inode);
704         struct lov_stripe_md *lsm = lli->lli_smd;
705
706         struct obd_info oinfo = { { { 0 } } };
707         int rc;
708         ENTRY;
709
710         LASSERT(lsm != NULL);
711
712         oinfo.oi_md = lsm;
713         oinfo.oi_oa = obdo;
714         oinfo.oi_oa->o_id = lsm->lsm_object_id;
715         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
716         oinfo.oi_oa->o_mode = S_IFREG;
717         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
718                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
719                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
720                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
721                                OBD_MD_FLGROUP;
722         oinfo.oi_capa = ll_mdscapa_get(inode);
723
724         set = ptlrpc_prep_set();
725         if (set == NULL) {
726                 CERROR("can't allocate ptlrpc set\n");
727                 rc = -ENOMEM;
728         } else {
729                 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
730                 if (rc == 0)
731                         rc = ptlrpc_set_wait(set);
732                 ptlrpc_set_destroy(set);
733         }
734         capa_put(oinfo.oi_capa);
735         if (rc)
736                 RETURN(rc);
737
738         oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
739                                  OBD_MD_FLATIME | OBD_MD_FLMTIME |
740                                  OBD_MD_FLCTIME | OBD_MD_FLSIZE);
741
742         obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
743         CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
744                lli->lli_smd->lsm_object_id, i_size_read(inode),
745                (unsigned long long)inode->i_blocks,
746                (unsigned long)ll_inode_blksize(inode));
747         RETURN(0);
748 }
749
750 int ll_merge_lvb(struct inode *inode)
751 {
752         struct ll_inode_info *lli = ll_i2info(inode);
753         struct ll_sb_info *sbi = ll_i2sbi(inode);
754         struct ost_lvb lvb;
755         int rc;
756
757         ENTRY;
758
759         ll_inode_size_lock(inode, 1);
760         inode_init_lvb(inode, &lvb);
761         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
762         i_size_write(inode, lvb.lvb_size);
763         inode->i_blocks = lvb.lvb_blocks;
764
765         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
766         LTIME_S(inode->i_atime) = lvb.lvb_atime;
767         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
768         ll_inode_size_unlock(inode, 1);
769
770         RETURN(rc);
771 }
772
773 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
774                      lstat_t *st)
775 {
776         /* XXX */
777         return -ENOSYS;
778 }
779
780 void ll_io_init(struct cl_io *io, const struct file *file, int write)
781 {
782         struct inode *inode     = file->f_dentry->d_inode;
783         struct ll_sb_info *sbi  = ll_i2sbi(inode);
784         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
785
786         LASSERT(fd != NULL);
787         memset(io, 0, sizeof *io);
788         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
789         if (write)
790                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
791         io->ci_obj     = ll_i2info(inode)->lli_clob;
792         io->ci_lockreq = CILR_MAYBE;
793         if (fd->fd_flags & LL_FILE_IGNORE_LOCK || sbi->ll_flags & LL_SBI_NOLCK)
794                 io->ci_lockreq = CILR_NEVER;
795         else if (file->f_flags & O_APPEND)
796                 io->ci_lockreq = CILR_MANDATORY;
797 }
798
799 static ssize_t ll_file_io_generic(const struct lu_env *env,
800                 struct ccc_io_args *args, struct file *file,
801                 enum cl_io_type iot, loff_t *ppos, size_t count)
802 {
803         struct cl_io       *io;
804         ssize_t             result;
805         ENTRY;
806
807         io = &ccc_env_info(env)->cti_io;
808         ll_io_init(io, file, iot == CIT_WRITE);
809
810         if (iot == CIT_READ)
811                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
812
813         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
814                 struct vvp_io *vio = vvp_env_io(env);
815                 struct ccc_io *cio = ccc_env_io(env);
816                 if (cl_io_is_sendfile(io)) {
817                         vio->u.read.cui_actor = args->cia_actor;
818                         vio->u.read.cui_target = args->cia_target;
819                 } else {
820                         cio->cui_iov = args->cia_iov;
821                         cio->cui_nrsegs = args->cia_nrsegs;
822 #ifndef HAVE_FILE_WRITEV
823                         cio->cui_iocb = args->cia_iocb;
824 #endif
825                 }
826                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
827                 result = cl_io_loop(env, io);
828         } else
829                 /* cl_io_rw_init() handled IO */
830                 result = io->ci_result;
831         if (io->ci_nob > 0) {
832                 result = io->ci_nob;
833                 *ppos = io->u.ci_wr.wr.crw_pos;
834         }
835         cl_io_fini(env, io);
836         RETURN(result);
837 }
838
839
840 /*
841  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
842  */
843 static int ll_file_get_iov_count(const struct iovec *iov,
844                                  unsigned long *nr_segs, size_t *count)
845 {
846         size_t cnt = 0;
847         unsigned long seg;
848
849         for (seg = 0; seg < *nr_segs; seg++) {
850                 const struct iovec *iv = &iov[seg];
851
852                 /*
853                  * If any segment has a negative length, or the cumulative
854                  * length ever wraps negative then return -EINVAL.
855                  */
856                 cnt += iv->iov_len;
857                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
858                         return -EINVAL;
859                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
860                         continue;
861                 if (seg == 0)
862                         return -EFAULT;
863                 *nr_segs = seg;
864                 cnt -= iv->iov_len;   /* This segment is no good */
865                 break;
866         }
867         *count = cnt;
868         return 0;
869 }
870
871 #ifdef HAVE_FILE_READV
872 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
873                               unsigned long nr_segs, loff_t *ppos)
874 {
875         struct lu_env      *env;
876         struct ccc_io_args *args;
877         size_t              count;
878         ssize_t             result;
879         int                 refcheck;
880         ENTRY;
881
882         result = ll_file_get_iov_count(iov, &nr_segs, &count);
883         if (result)
884                 RETURN(result);
885
886         env = cl_env_get(&refcheck);
887         if (IS_ERR(env))
888                 RETURN(PTR_ERR(env));
889
890         args = &vvp_env_info(env)->vti_args;
891         args->cia_is_sendfile = 0;
892         args->cia_iov = (struct iovec *)iov;
893         args->cia_nrsegs = nr_segs;
894         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
895         cl_env_put(env, &refcheck);
896         RETURN(result);
897 }
898
899 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
900                             loff_t *ppos)
901 {
902         struct lu_env *env;
903         struct iovec  *local_iov;
904         ssize_t        result;
905         int            refcheck;
906         ENTRY;
907
908         env = cl_env_get(&refcheck);
909         if (IS_ERR(env))
910                 RETURN(PTR_ERR(env));
911
912         local_iov = &vvp_env_info(env)->vti_local_iov;
913         local_iov->iov_base = (void __user *)buf;
914         local_iov->iov_len = count;
915         result = ll_file_readv(file, local_iov, 1, ppos);
916         cl_env_put(env, &refcheck);
917         RETURN(result);
918 }
919
920 #else
921 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
922                                 unsigned long nr_segs, loff_t pos)
923 {
924         struct lu_env      *env;
925         struct ccc_io_args *args;
926         size_t              count;
927         ssize_t             result;
928         int                 refcheck;
929         ENTRY;
930
931         result = ll_file_get_iov_count(iov, &nr_segs, &count);
932         if (result)
933                 RETURN(result);
934
935         env = cl_env_get(&refcheck);
936         if (IS_ERR(env))
937                 RETURN(PTR_ERR(env));
938
939         args = &vvp_env_info(env)->vti_args;
940         args->cia_is_sendfile = 0;
941         args->cia_iov = (struct iovec *)iov;
942         args->cia_nrsegs = nr_segs;
943         args->cia_iocb = iocb;
944         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
945                                     &iocb->ki_pos, count);
946         cl_env_put(env, &refcheck);
947         RETURN(result);
948 }
949
950 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
951                             loff_t *ppos)
952 {
953         struct lu_env *env;
954         struct iovec  *local_iov;
955         struct kiocb  *kiocb;
956         ssize_t        result;
957         int            refcheck;
958         ENTRY;
959
960         env = cl_env_get(&refcheck);
961         if (IS_ERR(env))
962                 RETURN(PTR_ERR(env));
963
964         local_iov = &vvp_env_info(env)->vti_local_iov;
965         kiocb = &vvp_env_info(env)->vti_kiocb;
966         local_iov->iov_base = (void __user *)buf;
967         local_iov->iov_len = count;
968         init_sync_kiocb(kiocb, file);
969         kiocb->ki_pos = *ppos;
970         kiocb->ki_left = count;
971
972         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
973         *ppos = kiocb->ki_pos;
974
975         cl_env_put(env, &refcheck);
976         RETURN(result);
977 }
978 #endif
979
980 /*
981  * Write to a file (through the page cache).
982  */
983 #ifdef HAVE_FILE_WRITEV
984 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
985                               unsigned long nr_segs, loff_t *ppos)
986 {
987         struct lu_env      *env;
988         struct ccc_io_args *args;
989         size_t              count;
990         ssize_t             result;
991         int                 refcheck;
992         ENTRY;
993
994         result = ll_file_get_iov_count(iov, &nr_segs, &count);
995         if (result)
996                 RETURN(result);
997
998         env = cl_env_get(&refcheck);
999         if (IS_ERR(env))
1000                 RETURN(PTR_ERR(env));
1001
1002         args = &vvp_env_info(env)->vti_args;
1003         args->cia_iov = (struct iovec *)iov;
1004         args->cia_nrsegs = nr_segs;
1005         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1006         cl_env_put(env, &refcheck);
1007         RETURN(result);
1008 }
1009
1010 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1011                              loff_t *ppos)
1012 {
1013         struct lu_env    *env;
1014         struct iovec     *local_iov;
1015         ssize_t           result;
1016         int               refcheck;
1017         ENTRY;
1018
1019         env = cl_env_get(&refcheck);
1020         if (IS_ERR(env))
1021                 RETURN(PTR_ERR(env));
1022
1023         local_iov = &vvp_env_info(env)->vti_local_iov;
1024         local_iov->iov_base = (void __user *)buf;
1025         local_iov->iov_len = count;
1026
1027         result = ll_file_writev(file, local_iov, 1, ppos);
1028         cl_env_put(env, &refcheck);
1029         RETURN(result);
1030 }
1031
1032 #else /* AIO stuff */
1033 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1034                                  unsigned long nr_segs, loff_t pos)
1035 {
1036         struct lu_env      *env;
1037         struct ccc_io_args *args;
1038         size_t              count;
1039         ssize_t             result;
1040         int                 refcheck;
1041         ENTRY;
1042
1043         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1044         if (result)
1045                 RETURN(result);
1046
1047         env = cl_env_get(&refcheck);
1048         if (IS_ERR(env))
1049                 RETURN(PTR_ERR(env));
1050
1051         args = &vvp_env_info(env)->vti_args;
1052         args->cia_iov = (struct iovec *)iov;
1053         args->cia_nrsegs = nr_segs;
1054         args->cia_iocb = iocb;
1055         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1056                                   &iocb->ki_pos, count);
1057         cl_env_put(env, &refcheck);
1058         RETURN(result);
1059 }
1060
1061 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1062                              loff_t *ppos)
1063 {
1064         struct lu_env *env;
1065         struct iovec  *local_iov;
1066         struct kiocb  *kiocb;
1067         ssize_t        result;
1068         int            refcheck;
1069         ENTRY;
1070
1071         env = cl_env_get(&refcheck);
1072         if (IS_ERR(env))
1073                 RETURN(PTR_ERR(env));
1074
1075         local_iov = &vvp_env_info(env)->vti_local_iov;
1076         kiocb = &vvp_env_info(env)->vti_kiocb;
1077         local_iov->iov_base = (void __user *)buf;
1078         local_iov->iov_len = count;
1079         init_sync_kiocb(kiocb, file);
1080         kiocb->ki_pos = *ppos;
1081         kiocb->ki_left = count;
1082
1083         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1084         *ppos = kiocb->ki_pos;
1085
1086         cl_env_put(env, &refcheck);
1087         RETURN(result);
1088 }
1089 #endif
1090
1091
1092 /*
1093  * Send file content (through pagecache) somewhere with helper
1094  */
1095 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1096                                 read_actor_t actor, void *target)
1097 {
1098         struct lu_env      *env;
1099         struct ccc_io_args *args;
1100         ssize_t             result;
1101         int                 refcheck;
1102         ENTRY;
1103
1104         env = cl_env_get(&refcheck);
1105         if (IS_ERR(env))
1106                 RETURN(PTR_ERR(env));
1107
1108         args = &vvp_env_info(env)->vti_args;
1109         args->cia_is_sendfile = 1;
1110         args->cia_target = target;
1111         args->cia_actor = actor;
1112         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1113         cl_env_put(env, &refcheck);
1114         RETURN(result);
1115 }
1116
1117 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1118                                unsigned long arg)
1119 {
1120         struct obd_export *exp = ll_i2dtexp(inode);
1121         struct ll_recreate_obj ucreatp;
1122         struct obd_trans_info oti = { 0 };
1123         struct obdo *oa = NULL;
1124         int lsm_size;
1125         int rc = 0;
1126         struct lov_stripe_md *lsm, *lsm2;
1127         ENTRY;
1128
1129         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1130                 RETURN(-EPERM);
1131
1132         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1133                            sizeof(struct ll_recreate_obj)))
1134                 RETURN(-EFAULT);
1135
1136         OBDO_ALLOC(oa);
1137         if (oa == NULL)
1138                 RETURN(-ENOMEM);
1139
1140         ll_inode_size_lock(inode, 0);
1141         lsm = ll_i2info(inode)->lli_smd;
1142         if (lsm == NULL)
1143                 GOTO(out, rc = -ENOENT);
1144         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1145                    (lsm->lsm_stripe_count));
1146
1147         OBD_ALLOC(lsm2, lsm_size);
1148         if (lsm2 == NULL)
1149                 GOTO(out, rc = -ENOMEM);
1150
1151         oa->o_id = ucreatp.lrc_id;
1152         oa->o_gr = ucreatp.lrc_group;
1153         oa->o_nlink = ucreatp.lrc_ost_idx;
1154         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1155         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1156         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1157                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1158
1159         memcpy(lsm2, lsm, lsm_size);
1160         rc = obd_create(exp, oa, &lsm2, &oti);
1161
1162         OBD_FREE(lsm2, lsm_size);
1163         GOTO(out, rc);
1164 out:
1165         ll_inode_size_unlock(inode, 0);
1166         OBDO_FREE(oa);
1167         return rc;
1168 }
1169
1170 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1171                              int flags, struct lov_user_md *lum, int lum_size)
1172 {
1173         struct lov_stripe_md *lsm;
1174         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1175         int rc = 0;
1176         ENTRY;
1177
1178         ll_inode_size_lock(inode, 0);
1179         lsm = ll_i2info(inode)->lli_smd;
1180         if (lsm) {
1181                 ll_inode_size_unlock(inode, 0);
1182                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1183                        inode->i_ino);
1184                 RETURN(-EEXIST);
1185         }
1186
1187         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1188         if (rc)
1189                 GOTO(out, rc);
1190         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1191                 GOTO(out_req_free, rc = -ENOENT);
1192         rc = oit.d.lustre.it_status;
1193         if (rc < 0)
1194                 GOTO(out_req_free, rc);
1195
1196         ll_release_openhandle(file->f_dentry, &oit);
1197
1198  out:
1199         ll_inode_size_unlock(inode, 0);
1200         ll_intent_release(&oit);
1201         RETURN(rc);
1202 out_req_free:
1203         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1204         goto out;
1205 }
1206
1207 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1208                              struct lov_mds_md **lmmp, int *lmm_size,
1209                              struct ptlrpc_request **request)
1210 {
1211         struct ll_sb_info *sbi = ll_i2sbi(inode);
1212         struct mdt_body  *body;
1213         struct lov_mds_md *lmm = NULL;
1214         struct ptlrpc_request *req = NULL;
1215         struct obd_capa *oc;
1216         int rc, lmmsize;
1217
1218         rc = ll_get_max_mdsize(sbi, &lmmsize);
1219         if (rc)
1220                 RETURN(rc);
1221
1222         oc = ll_mdscapa_get(inode);
1223         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1224                              oc, filename, strlen(filename) + 1,
1225                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1226                              ll_i2suppgid(inode), &req);
1227         capa_put(oc);
1228         if (rc < 0) {
1229                 CDEBUG(D_INFO, "md_getattr_name failed "
1230                        "on %s: rc %d\n", filename, rc);
1231                 GOTO(out, rc);
1232         }
1233
1234         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1235         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1236
1237         lmmsize = body->eadatasize;
1238
1239         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1240                         lmmsize == 0) {
1241                 GOTO(out, rc = -ENODATA);
1242         }
1243
1244         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1245         LASSERT(lmm != NULL);
1246
1247         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1248             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1249             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1250                 GOTO(out, rc = -EPROTO);
1251         }
1252
1253         /*
1254          * This is coming from the MDS, so is probably in
1255          * little endian.  We convert it to host endian before
1256          * passing it to userspace.
1257          */
1258         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1259                 /* if function called for directory - we should
1260                  * avoid swab not existent lsm objects */
1261                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1262                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1263                         if (S_ISREG(body->mode))
1264                                 lustre_swab_lov_user_md_objects(
1265                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1266                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1267                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1268                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1269                         if (S_ISREG(body->mode))
1270                                 lustre_swab_lov_user_md_objects(
1271                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1272                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1273                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1274                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1275                 }
1276         }
1277
1278         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1279                 struct lov_stripe_md *lsm;
1280                 struct lov_user_md_join *lmj;
1281                 int lmj_size, i, aindex = 0;
1282
1283                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1284                 if (rc < 0)
1285                         GOTO(out, rc = -ENOMEM);
1286                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1287                 if (rc)
1288                         GOTO(out_free_memmd, rc);
1289
1290                 lmj_size = sizeof(struct lov_user_md_join) +
1291                            lsm->lsm_stripe_count *
1292                            sizeof(struct lov_user_ost_data_join);
1293                 OBD_ALLOC(lmj, lmj_size);
1294                 if (!lmj)
1295                         GOTO(out_free_memmd, rc = -ENOMEM);
1296
1297                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1298                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1299                         struct lov_extent *lex =
1300                                 &lsm->lsm_array->lai_ext_array[aindex];
1301
1302                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1303                                 aindex ++;
1304                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1305                                         LPU64" len %d\n", aindex, i,
1306                                         lex->le_start, (int)lex->le_len);
1307                         lmj->lmm_objects[i].l_extent_start =
1308                                 lex->le_start;
1309
1310                         if ((int)lex->le_len == -1)
1311                                 lmj->lmm_objects[i].l_extent_end = -1;
1312                         else
1313                                 lmj->lmm_objects[i].l_extent_end =
1314                                         lex->le_start + lex->le_len;
1315                         lmj->lmm_objects[i].l_object_id =
1316                                 lsm->lsm_oinfo[i]->loi_id;
1317                         lmj->lmm_objects[i].l_object_gr =
1318                                 lsm->lsm_oinfo[i]->loi_gr;
1319                         lmj->lmm_objects[i].l_ost_gen =
1320                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1321                         lmj->lmm_objects[i].l_ost_idx =
1322                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1323                 }
1324                 lmm = (struct lov_mds_md *)lmj;
1325                 lmmsize = lmj_size;
1326 out_free_memmd:
1327                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1328         }
1329 out:
1330         *lmmp = lmm;
1331         *lmm_size = lmmsize;
1332         *request = req;
1333         return rc;
1334 }
1335
1336 static int ll_lov_setea(struct inode *inode, struct file *file,
1337                             unsigned long arg)
1338 {
1339         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1340         struct lov_user_md  *lump;
1341         int lum_size = sizeof(struct lov_user_md) +
1342                        sizeof(struct lov_user_ost_data);
1343         int rc;
1344         ENTRY;
1345
1346         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1347                 RETURN(-EPERM);
1348
1349         OBD_ALLOC(lump, lum_size);
1350         if (lump == NULL) {
1351                 RETURN(-ENOMEM);
1352         }
1353         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1354                 OBD_FREE(lump, lum_size);
1355                 RETURN(-EFAULT);
1356         }
1357
1358         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1359
1360         OBD_FREE(lump, lum_size);
1361         RETURN(rc);
1362 }
1363
1364 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1365                             unsigned long arg)
1366 {
1367         struct lov_user_md_v3 lumv3;
1368         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1369         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1370         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1371         int lum_size;
1372         int rc;
1373         int flags = FMODE_WRITE;
1374         ENTRY;
1375
1376         /* first try with v1 which is smaller than v3 */
1377         lum_size = sizeof(struct lov_user_md_v1);
1378         if (copy_from_user(lumv1, lumv1p, lum_size))
1379                 RETURN(-EFAULT);
1380
1381         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1382                 lum_size = sizeof(struct lov_user_md_v3);
1383                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1384                         RETURN(-EFAULT);
1385         }
1386
1387         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1388         if (rc == 0) {
1389                  put_user(0, &lumv1p->lmm_stripe_count);
1390                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1391                                     0, ll_i2info(inode)->lli_smd,
1392                                     (void *)arg);
1393         }
1394         RETURN(rc);
1395 }
1396
1397 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1398 {
1399         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1400
1401         if (!lsm)
1402                 RETURN(-ENODATA);
1403
1404         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1405                             (void *)arg);
1406 }
1407
1408 static int ll_get_grouplock(struct inode *inode, struct file *file,
1409                             unsigned long arg)
1410 {
1411         /* XXX */
1412         return -ENOSYS;
1413 }
1414
1415 static int ll_put_grouplock(struct inode *inode, struct file *file,
1416                             unsigned long arg)
1417 {
1418         /* XXX */
1419         return -ENOSYS;
1420 }
1421
1422 #if LUSTRE_FIX >= 50
1423 static int join_sanity_check(struct inode *head, struct inode *tail)
1424 {
1425         ENTRY;
1426         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1427                 CERROR("server do not support join \n");
1428                 RETURN(-EINVAL);
1429         }
1430         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1431                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1432                        head->i_ino, tail->i_ino);
1433                 RETURN(-EINVAL);
1434         }
1435         if (head->i_ino == tail->i_ino) {
1436                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1437                 RETURN(-EINVAL);
1438         }
1439         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1440                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1441                 RETURN(-EINVAL);
1442         }
1443         RETURN(0);
1444 }
1445
1446 static int join_file(struct inode *head_inode, struct file *head_filp,
1447                      struct file *tail_filp)
1448 {
1449         struct dentry *tail_dentry = tail_filp->f_dentry;
1450         struct lookup_intent oit = {.it_op = IT_OPEN,
1451                                    .it_flags = head_filp->f_flags|O_JOIN_FILE};
1452         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1453                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1454
1455         struct lustre_handle lockh;
1456         struct md_op_data *op_data;
1457         int    rc;
1458         loff_t data;
1459         ENTRY;
1460
1461         tail_dentry = tail_filp->f_dentry;
1462
1463         data = i_size_read(head_inode);
1464         op_data = ll_prep_md_op_data(NULL, head_inode,
1465                                      tail_dentry->d_parent->d_inode,
1466                                      tail_dentry->d_name.name,
1467                                      tail_dentry->d_name.len, 0,
1468                                      LUSTRE_OPC_ANY, &data);
1469         if (IS_ERR(op_data))
1470                 RETURN(PTR_ERR(op_data));
1471
1472         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1473                          op_data, &lockh, NULL, 0, NULL, 0);
1474
1475         ll_finish_md_op_data(op_data);
1476         if (rc < 0)
1477                 GOTO(out, rc);
1478
1479         rc = oit.d.lustre.it_status;
1480
1481         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1482                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1483                 ptlrpc_req_finished((struct ptlrpc_request *)
1484                                     oit.d.lustre.it_data);
1485                 GOTO(out, rc);
1486         }
1487
1488         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1489                                            * away */
1490                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1491                 oit.d.lustre.it_lock_mode = 0;
1492         }
1493         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1494         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1495         ll_release_openhandle(head_filp->f_dentry, &oit);
1496 out:
1497         ll_intent_release(&oit);
1498         RETURN(rc);
1499 }
1500
1501 static int ll_file_join(struct inode *head, struct file *filp,
1502                         char *filename_tail)
1503 {
1504         struct inode *tail = NULL, *first = NULL, *second = NULL;
1505         struct dentry *tail_dentry;
1506         struct file *tail_filp, *first_filp, *second_filp;
1507         struct ll_lock_tree first_tree, second_tree;
1508         struct ll_lock_tree_node *first_node, *second_node;
1509         struct ll_inode_info *hlli = ll_i2info(head), *tlli;
1510         int rc = 0, cleanup_phase = 0;
1511         ENTRY;
1512
1513         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1514                head->i_ino, head->i_generation, head, filename_tail);
1515
1516         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1517         if (IS_ERR(tail_filp)) {
1518                 CERROR("Can not open tail file %s", filename_tail);
1519                 rc = PTR_ERR(tail_filp);
1520                 GOTO(cleanup, rc);
1521         }
1522         tail = igrab(tail_filp->f_dentry->d_inode);
1523
1524         tlli = ll_i2info(tail);
1525         tail_dentry = tail_filp->f_dentry;
1526         LASSERT(tail_dentry);
1527         cleanup_phase = 1;
1528
1529         /*reorder the inode for lock sequence*/
1530         first = head->i_ino > tail->i_ino ? head : tail;
1531         second = head->i_ino > tail->i_ino ? tail : head;
1532         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1533         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1534
1535         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1536                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1537         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1538         if (IS_ERR(first_node)){
1539                 rc = PTR_ERR(first_node);
1540                 GOTO(cleanup, rc);
1541         }
1542         first_tree.lt_fd = first_filp->private_data;
1543         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1544         if (rc != 0)
1545                 GOTO(cleanup, rc);
1546         cleanup_phase = 2;
1547
1548         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1549         if (IS_ERR(second_node)){
1550                 rc = PTR_ERR(second_node);
1551                 GOTO(cleanup, rc);
1552         }
1553         second_tree.lt_fd = second_filp->private_data;
1554         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1555         if (rc != 0)
1556                 GOTO(cleanup, rc);
1557         cleanup_phase = 3;
1558
1559         rc = join_sanity_check(head, tail);
1560         if (rc)
1561                 GOTO(cleanup, rc);
1562
1563         rc = join_file(head, filp, tail_filp);
1564         if (rc)
1565                 GOTO(cleanup, rc);
1566 cleanup:
1567         switch (cleanup_phase) {
1568         case 3:
1569                 ll_tree_unlock(&second_tree);
1570                 obd_cancel_unused(ll_i2dtexp(second),
1571                                   ll_i2info(second)->lli_smd, 0, NULL);
1572         case 2:
1573                 ll_tree_unlock(&first_tree);
1574                 obd_cancel_unused(ll_i2dtexp(first),
1575                                   ll_i2info(first)->lli_smd, 0, NULL);
1576         case 1:
1577                 filp_close(tail_filp, 0);
1578                 if (tail)
1579                         iput(tail);
1580                 if (head && rc == 0) {
1581                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1582                                        &hlli->lli_smd);
1583                         hlli->lli_smd = NULL;
1584                 }
1585         case 0:
1586                 break;
1587         default:
1588                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1589                 LBUG();
1590         }
1591         RETURN(rc);
1592 }
1593 #endif /* LUSTRE_FIX >= 50 */
1594
1595 /**
1596  * Close inode open handle
1597  *
1598  * \param dentry [in]     dentry which contains the inode
1599  * \param it     [in,out] intent which contains open info and result
1600  *
1601  * \retval 0     success
1602  * \retval <0    failure
1603  */
1604 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1605 {
1606         struct inode *inode = dentry->d_inode;
1607         struct obd_client_handle *och;
1608         int rc;
1609         ENTRY;
1610
1611         LASSERT(inode);
1612
1613         /* Root ? Do nothing. */
1614         if (dentry->d_inode->i_sb->s_root == dentry)
1615                 RETURN(0);
1616
1617         /* No open handle to close? Move away */
1618         if (!it_disposition(it, DISP_OPEN_OPEN))
1619                 RETURN(0);
1620
1621         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1622
1623         OBD_ALLOC(och, sizeof(*och));
1624         if (!och)
1625                 GOTO(out, rc = -ENOMEM);
1626
1627         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1628                     ll_i2info(inode), it, och);
1629
1630         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1631                                        inode, och);
1632  out:
1633         /* this one is in place of ll_file_open */
1634         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1635                 ptlrpc_req_finished(it->d.lustre.it_data);
1636         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1637         RETURN(rc);
1638 }
1639
1640 /**
1641  * Get size for inode for which FIEMAP mapping is requested.
1642  * Make the FIEMAP get_info call and returns the result.
1643  */
1644 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1645               int num_bytes)
1646 {
1647         struct obd_export *exp = ll_i2dtexp(inode);
1648         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1649         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1650         int vallen = num_bytes;
1651         int rc;
1652         ENTRY;
1653
1654         /* If the stripe_count > 1 and the application does not understand
1655          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1656          */
1657         if (lsm->lsm_stripe_count > 1 &&
1658             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1659                 return -EOPNOTSUPP;
1660
1661         fm_key.oa.o_id = lsm->lsm_object_id;
1662         fm_key.oa.o_gr = lsm->lsm_object_gr;
1663         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1664
1665         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1666                         OBD_MD_FLSIZE);
1667
1668         /* If filesize is 0, then there would be no objects for mapping */
1669         if (fm_key.oa.o_size == 0) {
1670                 fiemap->fm_mapped_extents = 0;
1671                 RETURN(0);
1672         }
1673
1674         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1675
1676         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1677         if (rc)
1678                 CERROR("obd_get_info failed: rc = %d\n", rc);
1679
1680         RETURN(rc);
1681 }
1682
1683 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1684                   unsigned long arg)
1685 {
1686         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1687         int flags;
1688         ENTRY;
1689
1690         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1691                inode->i_generation, inode, cmd);
1692         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1693
1694         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1695         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1696                 RETURN(-ENOTTY);
1697
1698         switch(cmd) {
1699         case LL_IOC_GETFLAGS:
1700                 /* Get the current value of the file flags */
1701                 return put_user(fd->fd_flags, (int *)arg);
1702         case LL_IOC_SETFLAGS:
1703         case LL_IOC_CLRFLAGS:
1704                 /* Set or clear specific file flags */
1705                 /* XXX This probably needs checks to ensure the flags are
1706                  *     not abused, and to handle any flag side effects.
1707                  */
1708                 if (get_user(flags, (int *) arg))
1709                         RETURN(-EFAULT);
1710
1711                 if (cmd == LL_IOC_SETFLAGS) {
1712                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1713                             !(file->f_flags & O_DIRECT)) {
1714                                 CERROR("%s: unable to disable locking on "
1715                                        "non-O_DIRECT file\n", current->comm);
1716                                 RETURN(-EINVAL);
1717                         }
1718
1719                         fd->fd_flags |= flags;
1720                 } else {
1721                         fd->fd_flags &= ~flags;
1722                 }
1723                 RETURN(0);
1724         case LL_IOC_LOV_SETSTRIPE:
1725                 RETURN(ll_lov_setstripe(inode, file, arg));
1726         case LL_IOC_LOV_SETEA:
1727                 RETURN(ll_lov_setea(inode, file, arg));
1728         case LL_IOC_LOV_GETSTRIPE:
1729                 RETURN(ll_lov_getstripe(inode, arg));
1730         case LL_IOC_RECREATE_OBJ:
1731                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1732         case EXT3_IOC_FIEMAP: {
1733                 struct ll_user_fiemap *fiemap_s;
1734                 size_t num_bytes, ret_bytes;
1735                 unsigned int extent_count;
1736                 int rc = 0;
1737
1738                 /* Get the extent count so we can calculate the size of
1739                  * required fiemap buffer */
1740                 if (get_user(extent_count,
1741                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1742                         RETURN(-EFAULT);
1743                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1744                                                  sizeof(struct ll_fiemap_extent));
1745                 OBD_VMALLOC(fiemap_s, num_bytes);
1746                 if (fiemap_s == NULL)
1747                         RETURN(-ENOMEM);
1748
1749                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1750                                    sizeof(*fiemap_s)))
1751                         GOTO(error, rc = -EFAULT);
1752
1753                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1754                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1755                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1756                         if (copy_to_user((char *)arg, fiemap_s,
1757                                          sizeof(*fiemap_s)))
1758                                 GOTO(error, rc = -EFAULT);
1759
1760                         GOTO(error, rc = -EBADR);
1761                 }
1762
1763                 /* If fm_extent_count is non-zero, read the first extent since
1764                  * it is used to calculate end_offset and device from previous
1765                  * fiemap call. */
1766                 if (extent_count) {
1767                         if (copy_from_user(&fiemap_s->fm_extents[0],
1768                             (char __user *)arg + sizeof(*fiemap_s),
1769                             sizeof(struct ll_fiemap_extent)))
1770                                 GOTO(error, rc = -EFAULT);
1771                 }
1772
1773                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1774                         int rc;
1775
1776                         rc = filemap_fdatawrite(inode->i_mapping);
1777                         if (rc)
1778                                 GOTO(error, rc);
1779                 }
1780
1781                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1782                 if (rc)
1783                         GOTO(error, rc);
1784
1785                 ret_bytes = sizeof(struct ll_user_fiemap);
1786
1787                 if (extent_count != 0)
1788                         ret_bytes += (fiemap_s->fm_mapped_extents *
1789                                          sizeof(struct ll_fiemap_extent));
1790
1791                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1792                         rc = -EFAULT;
1793
1794 error:
1795                 OBD_VFREE(fiemap_s, num_bytes);
1796                 RETURN(rc);
1797         }
1798         case EXT3_IOC_GETFLAGS:
1799         case EXT3_IOC_SETFLAGS:
1800                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1801         case EXT3_IOC_GETVERSION_OLD:
1802         case EXT3_IOC_GETVERSION:
1803                 RETURN(put_user(inode->i_generation, (int *)arg));
1804         case LL_IOC_JOIN: {
1805 #if LUSTRE_FIX >= 50
1806                 /* Allow file join in beta builds to allow debuggging */
1807                 char *ftail;
1808                 int rc;
1809
1810                 ftail = getname((const char *)arg);
1811                 if (IS_ERR(ftail))
1812                         RETURN(PTR_ERR(ftail));
1813                 rc = ll_file_join(inode, file, ftail);
1814                 putname(ftail);
1815                 RETURN(rc);
1816 #else
1817                 CWARN("file join is not supported in this version of Lustre\n");
1818                 RETURN(-ENOTTY);
1819 #endif
1820         }
1821         case LL_IOC_GROUP_LOCK:
1822                 RETURN(ll_get_grouplock(inode, file, arg));
1823         case LL_IOC_GROUP_UNLOCK:
1824                 RETURN(ll_put_grouplock(inode, file, arg));
1825         case IOC_OBD_STATFS:
1826                 RETURN(ll_obd_statfs(inode, (void *)arg));
1827
1828         /* We need to special case any other ioctls we want to handle,
1829          * to send them to the MDS/OST as appropriate and to properly
1830          * network encode the arg field.
1831         case EXT3_IOC_SETVERSION_OLD:
1832         case EXT3_IOC_SETVERSION:
1833         */
1834         case LL_IOC_FLUSHCTX:
1835                 RETURN(ll_flush_ctx(inode));
1836         case LL_IOC_PATH2FID: {
1837                 if (copy_to_user((void *)arg, &ll_i2info(inode)->lli_fid,
1838                                  sizeof(struct lu_fid)))
1839                         RETURN(-EFAULT);
1840
1841                 RETURN(0);
1842         }
1843         default: {
1844                 int err;
1845
1846                 if (LLIOC_STOP ==
1847                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1848                         RETURN(err);
1849
1850                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1851                                      (void *)arg));
1852         }
1853         }
1854 }
1855
1856 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1857 {
1858         struct inode *inode = file->f_dentry->d_inode;
1859         loff_t retval;
1860         ENTRY;
1861         retval = offset + ((origin == 2) ? i_size_read(inode) :
1862                            (origin == 1) ? file->f_pos : 0);
1863         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1864                inode->i_ino, inode->i_generation, inode, retval, retval,
1865                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1866         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1867
1868         if (origin == 2) { /* SEEK_END */
1869                 int nonblock = 0, rc;
1870
1871                 if (file->f_flags & O_NONBLOCK)
1872                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1873
1874                 rc = cl_glimpse_size(inode);
1875                 if (rc != 0)
1876                         RETURN(rc);
1877
1878                 ll_inode_size_lock(inode, 0);
1879                 offset += i_size_read(inode);
1880                 ll_inode_size_unlock(inode, 0);
1881         } else if (origin == 1) { /* SEEK_CUR */
1882                 offset += file->f_pos;
1883         }
1884
1885         retval = -EINVAL;
1886         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1887                 if (offset != file->f_pos) {
1888                         file->f_pos = offset;
1889                 }
1890                 retval = offset;
1891         }
1892
1893         RETURN(retval);
1894 }
1895
1896 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1897 {
1898         struct inode *inode = dentry->d_inode;
1899         struct ll_inode_info *lli = ll_i2info(inode);
1900         struct lov_stripe_md *lsm = lli->lli_smd;
1901         struct ptlrpc_request *req;
1902         struct obd_capa *oc;
1903         int rc, err;
1904         ENTRY;
1905         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1906                inode->i_generation, inode);
1907         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
1908
1909         /* fsync's caller has already called _fdata{sync,write}, we want
1910          * that IO to finish before calling the osc and mdc sync methods */
1911         rc = filemap_fdatawait(inode->i_mapping);
1912
1913         /* catch async errors that were recorded back when async writeback
1914          * failed for pages in this mapping. */
1915         err = lli->lli_async_rc;
1916         lli->lli_async_rc = 0;
1917         if (rc == 0)
1918                 rc = err;
1919         if (lsm) {
1920                 err = lov_test_and_clear_async_rc(lsm);
1921                 if (rc == 0)
1922                         rc = err;
1923         }
1924
1925         oc = ll_mdscapa_get(inode);
1926         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
1927                       &req);
1928         capa_put(oc);
1929         if (!rc)
1930                 rc = err;
1931         if (!err)
1932                 ptlrpc_req_finished(req);
1933
1934         if (data && lsm) {
1935                 struct obdo *oa;
1936
1937                 OBDO_ALLOC(oa);
1938                 if (!oa)
1939                         RETURN(rc ? rc : -ENOMEM);
1940
1941                 oa->o_id = lsm->lsm_object_id;
1942                 oa->o_gr = lsm->lsm_object_gr;
1943                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1944                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1945                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
1946                                            OBD_MD_FLGROUP);
1947
1948                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
1949                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
1950                                0, OBD_OBJECT_EOF, oc);
1951                 capa_put(oc);
1952                 if (!rc)
1953                         rc = err;
1954                 OBDO_FREE(oa);
1955         }
1956
1957         RETURN(rc);
1958 }
1959
1960 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
1961 {
1962         struct inode *inode = file->f_dentry->d_inode;
1963         struct ll_sb_info *sbi = ll_i2sbi(inode);
1964         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
1965                                            .ei_cb_cp =ldlm_flock_completion_ast,
1966                                            .ei_cbdata = file_lock };
1967         struct md_op_data *op_data;
1968         struct lustre_handle lockh = {0};
1969         ldlm_policy_data_t flock;
1970         int flags = 0;
1971         int rc;
1972         ENTRY;
1973
1974         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
1975                inode->i_ino, file_lock);
1976
1977         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
1978
1979         if (file_lock->fl_flags & FL_FLOCK) {
1980                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
1981                 /* set missing params for flock() calls */
1982                 file_lock->fl_end = OFFSET_MAX;
1983                 file_lock->fl_pid = current->tgid;
1984         }
1985         flock.l_flock.pid = file_lock->fl_pid;
1986         flock.l_flock.start = file_lock->fl_start;
1987         flock.l_flock.end = file_lock->fl_end;
1988
1989         switch (file_lock->fl_type) {
1990         case F_RDLCK:
1991                 einfo.ei_mode = LCK_PR;
1992                 break;
1993         case F_UNLCK:
1994                 /* An unlock request may or may not have any relation to
1995                  * existing locks so we may not be able to pass a lock handle
1996                  * via a normal ldlm_lock_cancel() request. The request may even
1997                  * unlock a byte range in the middle of an existing lock. In
1998                  * order to process an unlock request we need all of the same
1999                  * information that is given with a normal read or write record
2000                  * lock request. To avoid creating another ldlm unlock (cancel)
2001                  * message we'll treat a LCK_NL flock request as an unlock. */
2002                 einfo.ei_mode = LCK_NL;
2003                 break;
2004         case F_WRLCK:
2005                 einfo.ei_mode = LCK_PW;
2006                 break;
2007         default:
2008                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2009                 RETURN (-EINVAL);
2010         }
2011
2012         switch (cmd) {
2013         case F_SETLKW:
2014 #ifdef F_SETLKW64
2015         case F_SETLKW64:
2016 #endif
2017                 flags = 0;
2018                 break;
2019         case F_SETLK:
2020 #ifdef F_SETLK64
2021         case F_SETLK64:
2022 #endif
2023                 flags = LDLM_FL_BLOCK_NOWAIT;
2024                 break;
2025         case F_GETLK:
2026 #ifdef F_GETLK64
2027         case F_GETLK64:
2028 #endif
2029                 flags = LDLM_FL_TEST_LOCK;
2030                 /* Save the old mode so that if the mode in the lock changes we
2031                  * can decrement the appropriate reader or writer refcount. */
2032                 file_lock->fl_type = einfo.ei_mode;
2033                 break;
2034         default:
2035                 CERROR("unknown fcntl lock command: %d\n", cmd);
2036                 RETURN (-EINVAL);
2037         }
2038
2039         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2040                                      LUSTRE_OPC_ANY, NULL);
2041         if (IS_ERR(op_data))
2042                 RETURN(PTR_ERR(op_data));
2043
2044         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2045                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2046                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2047
2048         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2049                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2050
2051         ll_finish_md_op_data(op_data);
2052
2053         if ((file_lock->fl_flags & FL_FLOCK) &&
2054             (rc == 0 || file_lock->fl_type == F_UNLCK))
2055                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2056 #ifdef HAVE_F_OP_FLOCK
2057         if ((file_lock->fl_flags & FL_POSIX) &&
2058             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2059             !(flags & LDLM_FL_TEST_LOCK))
2060                 posix_lock_file_wait(file, file_lock);
2061 #endif
2062
2063         RETURN(rc);
2064 }
2065
2066 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2067 {
2068         ENTRY;
2069
2070         RETURN(-ENOSYS);
2071 }
2072
2073 int ll_have_md_lock(struct inode *inode, __u64 bits)
2074 {
2075         struct lustre_handle lockh;
2076         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2077         struct lu_fid *fid;
2078         int flags;
2079         ENTRY;
2080
2081         if (!inode)
2082                RETURN(0);
2083
2084         fid = &ll_i2info(inode)->lli_fid;
2085         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2086
2087         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2088         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2089                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2090                 RETURN(1);
2091         }
2092         RETURN(0);
2093 }
2094
2095 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2096                             struct lustre_handle *lockh)
2097 {
2098         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2099         struct lu_fid *fid;
2100         ldlm_mode_t rc;
2101         int flags;
2102         ENTRY;
2103
2104         fid = &ll_i2info(inode)->lli_fid;
2105         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2106
2107         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2108         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2109                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2110         RETURN(rc);
2111 }
2112
2113 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2114         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2115                               * and return success */
2116                 inode->i_nlink = 0;
2117                 /* This path cannot be hit for regular files unless in
2118                  * case of obscure races, so no need to to validate
2119                  * size. */
2120                 if (!S_ISREG(inode->i_mode) &&
2121                     !S_ISDIR(inode->i_mode))
2122                         return 0;
2123         }
2124
2125         if (rc) {
2126                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2127                 return -abs(rc);
2128
2129         }
2130
2131         return 0;
2132 }
2133
2134 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2135 {
2136         struct inode *inode = dentry->d_inode;
2137         struct ptlrpc_request *req = NULL;
2138         struct ll_sb_info *sbi;
2139         struct obd_export *exp;
2140         int rc;
2141         ENTRY;
2142
2143         if (!inode) {
2144                 CERROR("REPORT THIS LINE TO PETER\n");
2145                 RETURN(0);
2146         }
2147         sbi = ll_i2sbi(inode);
2148
2149         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2150                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2151
2152         exp = ll_i2mdexp(inode);
2153
2154         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2155                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2156                 struct md_op_data *op_data;
2157
2158                 /* Call getattr by fid, so do not provide name at all. */
2159                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2160                                              dentry->d_inode, NULL, 0, 0,
2161                                              LUSTRE_OPC_ANY, NULL);
2162                 if (IS_ERR(op_data))
2163                         RETURN(PTR_ERR(op_data));
2164
2165                 oit.it_flags |= O_CHECK_STALE;
2166                 rc = md_intent_lock(exp, op_data, NULL, 0,
2167                                     /* we are not interested in name
2168                                        based lookup */
2169                                     &oit, 0, &req,
2170                                     ll_md_blocking_ast, 0);
2171                 ll_finish_md_op_data(op_data);
2172                 oit.it_flags &= ~O_CHECK_STALE;
2173                 if (rc < 0) {
2174                         rc = ll_inode_revalidate_fini(inode, rc);
2175                         GOTO (out, rc);
2176                 }
2177
2178                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2179                 if (rc != 0) {
2180                         ll_intent_release(&oit);
2181                         GOTO(out, rc);
2182                 }
2183
2184                 /* Unlinked? Unhash dentry, so it is not picked up later by
2185                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2186                    here to preserve get_cwd functionality on 2.6.
2187                    Bug 10503 */
2188                 if (!dentry->d_inode->i_nlink) {
2189                         spin_lock(&ll_lookup_lock);
2190                         spin_lock(&dcache_lock);
2191                         ll_drop_dentry(dentry);
2192                         spin_unlock(&dcache_lock);
2193                         spin_unlock(&ll_lookup_lock);
2194                 }
2195
2196                 ll_lookup_finish_locks(&oit, dentry);
2197         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2198                                                      MDS_INODELOCK_LOOKUP)) {
2199                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2200                 obd_valid valid = OBD_MD_FLGETATTR;
2201                 struct obd_capa *oc;
2202                 int ealen = 0;
2203
2204                 if (S_ISREG(inode->i_mode)) {
2205                         rc = ll_get_max_mdsize(sbi, &ealen);
2206                         if (rc)
2207                                 RETURN(rc);
2208                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2209                 }
2210                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2211                  * capa for this inode. Because we only keep capas of dirs
2212                  * fresh. */
2213                 oc = ll_mdscapa_get(inode);
2214                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2215                                 ealen, &req);
2216                 capa_put(oc);
2217                 if (rc) {
2218                         rc = ll_inode_revalidate_fini(inode, rc);
2219                         RETURN(rc);
2220                 }
2221
2222                 rc = ll_prep_inode(&inode, req, NULL);
2223                 if (rc)
2224                         GOTO(out, rc);
2225         }
2226
2227         /* if object not yet allocated, don't validate size */
2228         if (ll_i2info(inode)->lli_smd == NULL)
2229                 GOTO(out, rc = 0);
2230
2231         /* cl_glimpse_size will prefer locally cached writes if they extend
2232          * the file */
2233         rc = cl_glimpse_size(inode);
2234         EXIT;
2235 out:
2236         ptlrpc_req_finished(req);
2237         return rc;
2238 }
2239
2240 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2241                   struct lookup_intent *it, struct kstat *stat)
2242 {
2243         struct inode *inode = de->d_inode;
2244         int res = 0;
2245
2246         res = ll_inode_revalidate_it(de, it);
2247         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2248
2249         if (res)
2250                 return res;
2251
2252         stat->dev = inode->i_sb->s_dev;
2253         stat->ino = inode->i_ino;
2254         stat->mode = inode->i_mode;
2255         stat->nlink = inode->i_nlink;
2256         stat->uid = inode->i_uid;
2257         stat->gid = inode->i_gid;
2258         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2259         stat->atime = inode->i_atime;
2260         stat->mtime = inode->i_mtime;
2261         stat->ctime = inode->i_ctime;
2262 #ifdef HAVE_INODE_BLKSIZE
2263         stat->blksize = inode->i_blksize;
2264 #else
2265         stat->blksize = 1 << inode->i_blkbits;
2266 #endif
2267
2268         ll_inode_size_lock(inode, 0);
2269         stat->size = i_size_read(inode);
2270         stat->blocks = inode->i_blocks;
2271         ll_inode_size_unlock(inode, 0);
2272
2273         return 0;
2274 }
2275 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2276 {
2277         struct lookup_intent it = { .it_op = IT_GETATTR };
2278
2279         return ll_getattr_it(mnt, de, &it, stat);
2280 }
2281
/**
 * Check \a mask against the POSIX ACL attached to \a inode.
 *
 * A reference on the ACL is taken under lli_lock and dropped after the
 * check.
 *
 * \param inode [in] inode whose ACL is consulted
 * \param mask  [in] requested access mask
 *
 * \retval -EAGAIN no ACL is attached, or ACL support is compiled out
 * \retval other   result of posix_acl_permission()
 */
static
int lustre_check_acl(struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl;
        int rc = -EAGAIN;
        ENTRY;

        spin_lock(&lli->lli_lock);
        acl = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        if (acl != NULL) {
                rc = posix_acl_permission(inode, acl, mask);
                posix_acl_release(acl);
        }

        RETURN(rc);
#else
        return -EAGAIN;
#endif
}
2306
2307 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2308 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2309 {
2310         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2311                inode->i_ino, inode->i_generation, inode, mask);
2312         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2313                 return lustre_check_remote_perm(inode, mask);
2314
2315         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2316         return generic_permission(inode, mask, lustre_check_acl);
2317 }
2318 #else
2319 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2320 {
2321         int mode = inode->i_mode;
2322         int rc;
2323
2324         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2325                inode->i_ino, inode->i_generation, inode, mask);
2326
2327         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2328                 return lustre_check_remote_perm(inode, mask);
2329
2330         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2331
2332         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2333             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2334                 return -EROFS;
2335         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2336                 return -EACCES;
2337         if (current->fsuid == inode->i_uid) {
2338                 mode >>= 6;
2339         } else if (1) {
2340                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2341                         goto check_groups;
2342                 rc = lustre_check_acl(inode, mask);
2343                 if (rc == -EAGAIN)
2344                         goto check_groups;
2345                 if (rc == -EACCES)
2346                         goto check_capabilities;
2347                 return rc;
2348         } else {
2349 check_groups:
2350                 if (in_group_p(inode->i_gid))
2351                         mode >>= 3;
2352         }
2353         if ((mode & mask & S_IRWXO) == mask)
2354                 return 0;
2355
2356 check_capabilities:
2357         if (!(mask & MAY_EXEC) ||
2358             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2359                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2360                         return 0;
2361
2362         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2363             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2364                 return 0;
2365
2366         return -EACCES;
2367 }
2368 #endif
2369
/*
 * Select the struct file_operations member names used for the second
 * read/write entry points, and the llite handlers installed in them,
 * depending on whether HAVE_FILE_READV is defined.
 */
#ifndef HAVE_FILE_READV
#define READ_METHOD     aio_read
#define READ_FUNCTION   ll_file_aio_read
#define WRITE_METHOD    aio_write
#define WRITE_FUNCTION  ll_file_aio_write
#else
#define READ_METHOD     readv
#define READ_FUNCTION   ll_file_readv
#define WRITE_METHOD    writev
#define WRITE_FUNCTION  ll_file_writev
#endif
2381
2382 /* -o localflock - only provides locally consistent flock locks */
2383 struct file_operations ll_file_operations = {
2384         .read           = ll_file_read,
2385         .READ_METHOD    = READ_FUNCTION,
2386         .write          = ll_file_write,
2387         .WRITE_METHOD   = WRITE_FUNCTION,
2388         .ioctl          = ll_file_ioctl,
2389         .open           = ll_file_open,
2390         .release        = ll_file_release,
2391         .mmap           = ll_file_mmap,
2392         .llseek         = ll_file_seek,
2393         .sendfile       = ll_file_sendfile,
2394         .fsync          = ll_fsync,
2395 };
2396
2397 struct file_operations ll_file_operations_flock = {
2398         .read           = ll_file_read,
2399         .READ_METHOD    = READ_FUNCTION,
2400         .write          = ll_file_write,
2401         .WRITE_METHOD   = WRITE_FUNCTION,
2402         .ioctl          = ll_file_ioctl,
2403         .open           = ll_file_open,
2404         .release        = ll_file_release,
2405         .mmap           = ll_file_mmap,
2406         .llseek         = ll_file_seek,
2407         .sendfile       = ll_file_sendfile,
2408         .fsync          = ll_fsync,
2409 #ifdef HAVE_F_OP_FLOCK
2410         .flock          = ll_file_flock,
2411 #endif
2412         .lock           = ll_file_flock
2413 };
2414
2415 /* These are for -o noflock - to return ENOSYS on flock calls */
2416 struct file_operations ll_file_operations_noflock = {
2417         .read           = ll_file_read,
2418         .READ_METHOD    = READ_FUNCTION,
2419         .write          = ll_file_write,
2420         .WRITE_METHOD   = WRITE_FUNCTION,
2421         .ioctl          = ll_file_ioctl,
2422         .open           = ll_file_open,
2423         .release        = ll_file_release,
2424         .mmap           = ll_file_mmap,
2425         .llseek         = ll_file_seek,
2426         .sendfile       = ll_file_sendfile,
2427         .fsync          = ll_fsync,
2428 #ifdef HAVE_F_OP_FLOCK
2429         .flock          = ll_file_noflock,
2430 #endif
2431         .lock           = ll_file_noflock
2432 };
2433
2434 struct inode_operations ll_file_inode_operations = {
2435 #ifdef HAVE_VFS_INTENT_PATCHES
2436         .setattr_raw    = ll_setattr_raw,
2437 #endif
2438         .setattr        = ll_setattr,
2439         .truncate       = ll_truncate,
2440         .getattr        = ll_getattr,
2441         .permission     = ll_inode_permission,
2442         .setxattr       = ll_setxattr,
2443         .getxattr       = ll_getxattr,
2444         .listxattr      = ll_listxattr,
2445         .removexattr    = ll_removexattr,
2446 };
2447
2448 /* dynamic ioctl number support routins */
2449 static struct llioc_ctl_data {
2450         struct rw_semaphore ioc_sem;
2451         struct list_head    ioc_head;
2452 } llioc = {
2453         __RWSEM_INITIALIZER(llioc.ioc_sem),
2454         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2455 };
2456
2457
2458 struct llioc_data {
2459         struct list_head        iocd_list;
2460         unsigned int            iocd_size;
2461         llioc_callback_t        iocd_cb;
2462         unsigned int            iocd_count;
2463         unsigned int            iocd_cmd[0];
2464 };
2465
2466 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2467 {
2468         unsigned int size;
2469         struct llioc_data *in_data = NULL;
2470         ENTRY;
2471
2472         if (cb == NULL || cmd == NULL ||
2473             count > LLIOC_MAX_CMD || count < 0)
2474                 RETURN(NULL);
2475
2476         size = sizeof(*in_data) + count * sizeof(unsigned int);
2477         OBD_ALLOC(in_data, size);
2478         if (in_data == NULL)
2479                 RETURN(NULL);
2480
2481         memset(in_data, 0, sizeof(*in_data));
2482         in_data->iocd_size = size;
2483         in_data->iocd_cb = cb;
2484         in_data->iocd_count = count;
2485         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2486
2487         down_write(&llioc.ioc_sem);
2488         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2489         up_write(&llioc.ioc_sem);
2490
2491         RETURN(in_data);
2492 }
2493
2494 void ll_iocontrol_unregister(void *magic)
2495 {
2496         struct llioc_data *tmp;
2497
2498         if (magic == NULL)
2499                 return;
2500
2501         down_write(&llioc.ioc_sem);
2502         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2503                 if (tmp == magic) {
2504                         unsigned int size = tmp->iocd_size;
2505
2506                         list_del(&tmp->iocd_list);
2507                         up_write(&llioc.ioc_sem);
2508
2509                         OBD_FREE(tmp, size);
2510                         return;
2511                 }
2512         }
2513         up_write(&llioc.ioc_sem);
2514
2515         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2516 }
2517
/* Make the dynamic ioctl registration API available to other modules. */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
2520
2521 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2522                         unsigned int cmd, unsigned long arg, int *rcp)
2523 {
2524         enum llioc_iter ret = LLIOC_CONT;
2525         struct llioc_data *data;
2526         int rc = -EINVAL, i;
2527
2528         down_read(&llioc.ioc_sem);
2529         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2530                 for (i = 0; i < data->iocd_count; i++) {
2531                         if (cmd != data->iocd_cmd[i])
2532                                 continue;
2533
2534                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2535                         break;
2536                 }
2537
2538                 if (ret == LLIOC_STOP)
2539                         break;
2540         }
2541         up_read(&llioc.ioc_sem);
2542
2543         if (rcp)
2544                 *rcp = rc;
2545         return ret;
2546 }