Whamcloud - gitweb
b=20748
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(exp_connect_som(ll_i2mdexp(inode))) || !S_ISREG(inode->i_mode))
96                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
97         else
98                 ll_epoch_close(inode, op_data, &och, 0);
99
100 out:
101         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
102         EXIT;
103 }
104
/*
 * Close the open handle @och of @inode by sending md_close() through
 * @md_exp.
 *
 * No request is sent (and rc is 0) when the MD export of the inode has no
 * obd device or when obd_force is set.  If md_close() returns -EAGAIN,
 * ll_sizeonmds_update() is called and its failure is logged but not
 * returned.  When rc is 0 after the close, ll_objects_destroy() is called
 * with the close reply.
 *
 * On the common exit path @och is either left for DONE_WRITING processing
 * (ll_queue_done_writing()) or its replay data is cleared and it is freed.
 *
 * Returns 0 or a negative errno.
 */
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
106                                      struct inode *inode,
107                                      struct obd_client_handle *och)
108 {
109         struct obd_export *exp = ll_i2mdexp(inode);
110         struct md_op_data *op_data;
111         struct ptlrpc_request *req = NULL;
112         struct obd_device *obd = class_exp2obd(exp);
        /* Starts at 1 so that every GOTO(out) taken before md_close()
         * falls into the "free @och" branch at the out: label. */
113         int epoch_close = 1;
114         int rc;
115         ENTRY;
116
117         if (obd == NULL) {
118                 /*
119                  * XXX: in case of LMV, is this correct to access
120                  * ->exp_handle?
121                  */
122                 CERROR("Invalid MDC connection handle "LPX64"\n",
123                        ll_i2mdexp(inode)->exp_handle.h_cookie);
124                 GOTO(out, rc = 0);
125         }
126
127         /*
128          * here we check if this is forced umount. If so this is called on
129          * canceling "open lock" and we do not call md_close() in this case, as
130          * it will not be successful, as import is already deactivated.
131          */
132         if (obd->obd_force)
133                 GOTO(out, rc = 0);
134
135         OBD_ALLOC_PTR(op_data);
136         if (op_data == NULL)
137                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138
139         ll_prepare_close(inode, op_data, och);
        /* Remember whether ll_prepare_close() flagged this close as an
         * epoch close; checked again at the out: label. */
140         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141         rc = md_close(md_exp, op_data, och->och_mod, &req);
142         if (rc == -EAGAIN) {
143                 /* This close must have the epoch closed. */
144                 LASSERT(epoch_close);
145                 /* MDS has instructed us to obtain Size-on-MDS attribute from
146                  * OSTs and send setattr to back to MDS. */
147                 rc = ll_sizeonmds_update(inode, &och->och_fh,
148                                          op_data->op_ioepoch);
149                 if (rc) {
150                         CERROR("inode %lu mdc Size-on-MDS update failed: "
151                                "rc = %d\n", inode->i_ino, rc);
                        /* failure is only logged, not returned */
152                         rc = 0;
153                 }
154         } else if (rc) {
155                 CERROR("inode %lu mdc close failed: rc = %d\n",
156                        inode->i_ino, rc);
157         }
158         ll_finish_md_op_data(op_data);
159
160         if (rc == 0) {
161                 rc = ll_objects_destroy(req, inode);
162                 if (rc)
163                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
164                                inode->i_ino, rc);
165         }
166
167         EXIT;
168 out:
169
        /* A written regular file on a SOM-capable export whose epoch was
         * not closed by this request keeps @och and is queued for
         * DONE_WRITING; in every other case @och is released here. */
170         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
171             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173         } else {
174                 md_clear_open_replay_data(md_exp, och);
175                 /* Free @och if it is not waiting for DONE_WRITING. */
176                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177                 OBD_FREE_PTR(och);
178         }
179         if (req) /* This is close request */
180                 ptlrpc_req_finished(req);
181         return rc;
182 }
183
184 int ll_md_real_close(struct inode *inode, int flags)
185 {
186         struct ll_inode_info *lli = ll_i2info(inode);
187         struct obd_client_handle **och_p;
188         struct obd_client_handle *och;
189         __u64 *och_usecount;
190         int rc = 0;
191         ENTRY;
192
193         if (flags & FMODE_WRITE) {
194                 och_p = &lli->lli_mds_write_och;
195                 och_usecount = &lli->lli_open_fd_write_count;
196         } else if (flags & FMODE_EXEC) {
197                 och_p = &lli->lli_mds_exec_och;
198                 och_usecount = &lli->lli_open_fd_exec_count;
199         } else {
200                 LASSERT(flags & FMODE_READ);
201                 och_p = &lli->lli_mds_read_och;
202                 och_usecount = &lli->lli_open_fd_read_count;
203         }
204
205         down(&lli->lli_och_sem);
206         if (*och_usecount) { /* There are still users of this handle, so
207                                 skip freeing it. */
208                 up(&lli->lli_och_sem);
209                 RETURN(0);
210         }
211         och=*och_p;
212         *och_p = NULL;
213         up(&lli->lli_och_sem);
214
215         if (och) { /* There might be a race and somebody have freed this och
216                       already */
217                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
218                                                inode, och);
219         }
220
221         RETURN(rc);
222 }
223
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225                 struct file *file)
226 {
227         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228         struct ll_inode_info *lli = ll_i2info(inode);
229         int rc = 0;
230         ENTRY;
231
232         /* clear group lock, if present */
233         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235
236         /* Let's see if we have good enough OPEN lock on the file and if
237            we can skip talking to MDS */
238         if (file->f_dentry->d_inode) { /* Can this ever be false? */
239                 int lockmode;
240                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241                 struct lustre_handle lockh;
242                 struct inode *inode = file->f_dentry->d_inode;
243                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
244
245                 down(&lli->lli_och_sem);
246                 if (fd->fd_omode & FMODE_WRITE) {
247                         lockmode = LCK_CW;
248                         LASSERT(lli->lli_open_fd_write_count);
249                         lli->lli_open_fd_write_count--;
250                 } else if (fd->fd_omode & FMODE_EXEC) {
251                         lockmode = LCK_PR;
252                         LASSERT(lli->lli_open_fd_exec_count);
253                         lli->lli_open_fd_exec_count--;
254                 } else {
255                         lockmode = LCK_CR;
256                         LASSERT(lli->lli_open_fd_read_count);
257                         lli->lli_open_fd_read_count--;
258                 }
259                 up(&lli->lli_och_sem);
260
261                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262                                    LDLM_IBITS, &policy, lockmode,
263                                    &lockh)) {
264                         rc = ll_md_real_close(file->f_dentry->d_inode,
265                                               fd->fd_omode);
266                 }
267         } else {
268                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269                        file, file->f_dentry, file->f_dentry->d_name.name);
270         }
271
272         LUSTRE_FPRIVATE(file) = NULL;
273         ll_file_data_put(fd);
274         ll_capa_close(inode);
275
276         RETURN(rc);
277 }
278
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
280
281 /* While this returns an error code, fput() the caller does not, so we need
282  * to make every effort to clean up all of our state here.  Also, applications
283  * rarely check close errors and even if an error is returned they will not
284  * re-try the close call.
285  */
286 int ll_file_release(struct inode *inode, struct file *file)
287 {
288         struct ll_file_data *fd;
289         struct ll_sb_info *sbi = ll_i2sbi(inode);
290         struct ll_inode_info *lli = ll_i2info(inode);
291         struct lov_stripe_md *lsm = lli->lli_smd;
292         int rc;
293         ENTRY;
294
295         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296                inode->i_generation, inode);
297
298 #ifdef CONFIG_FS_POSIX_ACL
299         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300             inode == inode->i_sb->s_root->d_inode) {
301                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
302
303                 LASSERT(fd != NULL);
304                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305                         fd->fd_flags &= ~LL_FILE_RMTACL;
306                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
307                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
308                 }
309         }
310 #endif
311
312         if (inode->i_sb->s_root != file->f_dentry)
313                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314         fd = LUSTRE_FPRIVATE(file);
315         LASSERT(fd != NULL);
316
317         /* The last ref on @file, maybe not the the owner pid of statahead.
318          * Different processes can open the same dir, "ll_opendir_key" means:
319          * it is me that should stop the statahead thread. */
320         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
321                 ll_stop_statahead(inode, lli->lli_opendir_key);
322
323         if (inode->i_sb->s_root == file->f_dentry) {
324                 LUSTRE_FPRIVATE(file) = NULL;
325                 ll_file_data_put(fd);
326                 RETURN(0);
327         }
328
329         if (lsm)
330                 lov_test_and_clear_async_rc(lsm);
331         lli->lli_async_rc = 0;
332
333         rc = ll_md_close(sbi->ll_md_exp, inode, file);
334         RETURN(rc);
335 }
336
337 static int ll_intent_file_open(struct file *file, void *lmm,
338                                int lmmsize, struct lookup_intent *itp)
339 {
340         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
341         struct dentry *parent = file->f_dentry->d_parent;
342         const char *name = file->f_dentry->d_name.name;
343         const int len = file->f_dentry->d_name.len;
344         struct md_op_data *op_data;
345         struct ptlrpc_request *req;
346         int rc;
347         ENTRY;
348
349         if (!parent)
350                 RETURN(-ENOENT);
351
352         /* Usually we come here only for NFSD, and we want open lock.
353            But we can also get here with pre 2.6.15 patchless kernels, and in
354            that case that lock is also ok */
355         /* We can also get here if there was cached open handle in revalidate_it
356          * but it disappeared while we were getting from there to ll_file_open.
357          * But this means this file was closed and immediatelly opened which
358          * makes a good candidate for using OPEN lock */
359         /* If lmmsize & lmm are not 0, we are just setting stripe info
360          * parameters. No need for the open lock */
361         if (!lmm && !lmmsize)
362                 itp->it_flags |= MDS_OPEN_LOCK;
363
364         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
365                                       file->f_dentry->d_inode, name, len,
366                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
367         if (IS_ERR(op_data))
368                 RETURN(PTR_ERR(op_data));
369
370         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
371                             0 /*unused */, &req, ll_md_blocking_ast, 0);
372         ll_finish_md_op_data(op_data);
373         if (rc == -ESTALE) {
374                 /* reason for keep own exit path - don`t flood log
375                 * with messages with -ESTALE errors.
376                 */
377                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
378                      it_open_error(DISP_OPEN_OPEN, itp))
379                         GOTO(out, rc);
380                 ll_release_openhandle(file->f_dentry, itp);
381                 GOTO(out, rc);
382         }
383
384         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
385                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
386                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
387                 GOTO(out, rc);
388         }
389
390         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
391         if (!rc && itp->d.lustre.it_lock_mode)
392                 md_set_lock_data(sbi->ll_md_exp,
393                                  &itp->d.lustre.it_lock_handle,
394                                  file->f_dentry->d_inode, NULL);
395
396 out:
397         ptlrpc_req_finished(itp->d.lustre.it_data);
398         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
399         ll_intent_drop_lock(itp);
400
401         RETURN(rc);
402 }
403
404 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
405 {
406         if (ioepoch && lli->lli_ioepoch != ioepoch) {
407                 lli->lli_ioepoch = ioepoch;
408                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
409                        ioepoch, PFID(&lli->lli_fid));
410         }
411 }
412
413 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
414                        struct lookup_intent *it, struct obd_client_handle *och)
415 {
416         struct ptlrpc_request *req = it->d.lustre.it_data;
417         struct mdt_body *body;
418
419         LASSERT(och);
420
421         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
422         LASSERT(body != NULL);                      /* reply already checked out */
423
424         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
425         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
426         och->och_fid = lli->lli_fid;
427         och->och_flags = it->it_flags;
428         ll_ioepoch_open(lli, body->ioepoch);
429
430         return md_set_open_replay_data(md_exp, och, req);
431 }
432
433 int ll_local_open(struct file *file, struct lookup_intent *it,
434                   struct ll_file_data *fd, struct obd_client_handle *och)
435 {
436         struct inode *inode = file->f_dentry->d_inode;
437         struct ll_inode_info *lli = ll_i2info(inode);
438         ENTRY;
439
440         LASSERT(!LUSTRE_FPRIVATE(file));
441
442         LASSERT(fd != NULL);
443
444         if (och) {
445                 struct ptlrpc_request *req = it->d.lustre.it_data;
446                 struct mdt_body *body;
447                 int rc;
448
449                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
450                 if (rc)
451                         RETURN(rc);
452
453                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454                 if ((it->it_flags & FMODE_WRITE) &&
455                     (body->valid & OBD_MD_FLSIZE))
456                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
457                                lli->lli_ioepoch, PFID(&lli->lli_fid));
458         }
459
460         LUSTRE_FPRIVATE(file) = fd;
461         ll_readahead_init(inode, &fd->fd_ras);
462         fd->fd_omode = it->it_flags;
463         RETURN(0);
464 }
465
466 /* Open a file, and (for the very first open) create objects on the OSTs at
467  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
468  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
469  * lli_open_sem to ensure no other process will create objects, send the
470  * stripe MD to the MDS, or try to destroy the objects if that fails.
471  *
472  * If we already have the stripe MD locally then we don't request it in
473  * md_open(), by passing a lmm_size = 0.
474  *
475  * It is up to the application to ensure no other processes open this file
476  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
477  * used.  We might be able to avoid races of that sort by getting lli_open_sem
478  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
479  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
480  */
/*
 * NOTE(review): the body below serialises on lli_och_sem; lli_open_sem is
 * not referenced anywhere in this function -- confirm whether the comment
 * above is still accurate.
 *
 * Returns 0 on success or a negative errno.  On every error path the
 * ll_file_data allocated here is freed before leaving.
 */
481 int ll_file_open(struct inode *inode, struct file *file)
482 {
483         struct ll_inode_info *lli = ll_i2info(inode);
484         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
485                                           .it_flags = file->f_flags };
486         struct lov_stripe_md *lsm;
487         struct ptlrpc_request *req = NULL;
488         struct obd_client_handle **och_p;
489         __u64 *och_usecount;
490         struct ll_file_data *fd;
491         int rc = 0, opendir_set = 0;
492         ENTRY;
493
494         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
495                inode->i_generation, inode, file->f_flags);
496
497 #ifdef HAVE_VFS_INTENT_PATCHES
498         it = file->f_it;
499 #else
500         it = file->private_data; /* XXX: compat macro */
501         file->private_data = NULL; /* prevent ll_local_open assertion */
502 #endif
503
504         fd = ll_file_data_get();
505         if (fd == NULL)
506                 RETURN(-ENOMEM);
507
508         fd->fd_file = file;
        /* The first opener of a directory registers its fd and pid as the
         * opendir key/pid; opendir_set remembers that this call did so. */
509         if (S_ISDIR(inode->i_mode)) {
510                 spin_lock(&lli->lli_lock);
511                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
512                         LASSERT(lli->lli_sai == NULL);
513                         lli->lli_opendir_key = fd;
514                         lli->lli_opendir_pid = cfs_curproc_pid();
515                         opendir_set = 1;
516                 }
517                 spin_unlock(&lli->lli_lock);
518         }
519
        /* The filesystem root only gets its private data attached; no open
         * handle processing is done for it. */
520         if (inode->i_sb->s_root == file->f_dentry) {
521                 LUSTRE_FPRIVATE(file) = fd;
522                 RETURN(0);
523         }
524
        /* No usable intent was passed in: build one in the on-stack oit. */
525         if (!it || !it->d.lustre.it_disposition) {
526                 /* Convert f_flags into access mode. We cannot use file->f_mode,
527                  * because everything but O_ACCMODE mask was stripped from
528                  * there */
529                 if ((oit.it_flags + 1) & O_ACCMODE)
530                         oit.it_flags++;
531                 if (file->f_flags & O_TRUNC)
532                         oit.it_flags |= FMODE_WRITE;
533
534                 /* kernel only call f_op->open in dentry_open.  filp_open calls
535                  * dentry_open after call to open_namei that checks permissions.
536                  * Only nfsd_open call dentry_open directly without checking
537                  * permissions and because of that this code below is safe. */
538                 if (oit.it_flags & FMODE_WRITE)
539                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
540
541                 /* We do not want O_EXCL here, presumably we opened the file
542                  * already? XXX - NFS implications? */
543                 oit.it_flags &= ~O_EXCL;
544
545                 it = &oit;
546         }
547
548 restart:
549         /* Let's see if we have file open on MDS already. */
550         if (it->it_flags & FMODE_WRITE) {
551                 och_p = &lli->lli_mds_write_och;
552                 och_usecount = &lli->lli_open_fd_write_count;
553         } else if (it->it_flags & FMODE_EXEC) {
554                 och_p = &lli->lli_mds_exec_och;
555                 och_usecount = &lli->lli_open_fd_exec_count;
556          } else {
557                 och_p = &lli->lli_mds_read_och;
558                 och_usecount = &lli->lli_open_fd_read_count;
559         }
560
561         down(&lli->lli_och_sem);
562         if (*och_p) { /* Open handle is present */
563                 if (it_disposition(it, DISP_OPEN_OPEN)) {
564                         /* Well, there's extra open request that we do not need,
565                            let's close it somehow. This will decref request. */
566                         rc = it_open_error(DISP_OPEN_OPEN, it);
567                         if (rc) {
568                                 up(&lli->lli_och_sem);
569                                 ll_file_data_put(fd);
570                                 GOTO(out_openerr, rc);
571                         }
572                         ll_release_openhandle(file->f_dentry, it);
573                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
574                                              LPROC_LL_OPEN);
575                 }
576                 (*och_usecount)++;
577
                /* NULL och: reuse the existing handle, nothing to fill. */
578                 rc = ll_local_open(file, it, fd, NULL);
579                 if (rc) {
580                         (*och_usecount)--;
581                         up(&lli->lli_och_sem);
582                         ll_file_data_put(fd);
583                         GOTO(out_openerr, rc);
584                 }
585         } else {
586                 LASSERT(*och_usecount == 0);
587                 if (!it->d.lustre.it_disposition) {
588                         /* We cannot just request lock handle now, new ELC code
589                            means that one of other OPEN locks for this file
590                            could be cancelled, and since blocking ast handler
591                            would attempt to grab och_sem as well, that would
592                            result in a deadlock */
593                         up(&lli->lli_och_sem);
594                         it->it_create_mode |= M_CHECK_STALE;
595                         rc = ll_intent_file_open(file, NULL, 0, it);
596                         it->it_create_mode &= ~M_CHECK_STALE;
597                         if (rc) {
598                                 ll_file_data_put(fd);
599                                 GOTO(out_openerr, rc);
600                         }
601
602                         /* Got some error? Release the request */
603                         if (it->d.lustre.it_status < 0) {
604                                 req = it->d.lustre.it_data;
605                                 ptlrpc_req_finished(req);
606                         }
                        /* Re-evaluate everything with the semaphore
                         * re-taken: another opener may have installed a
                         * handle while it was dropped. */
607                         goto restart;
608                 }
609                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
610                 if (!*och_p) {
611                         ll_file_data_put(fd);
612                         GOTO(out_och_free, rc = -ENOMEM);
613                 }
614                 (*och_usecount)++;
615                 req = it->d.lustre.it_data;
616
617                 /* md_intent_lock() didn't get a request ref if there was an
618                  * open error, so don't do cleanup on the request here
619                  * (bug 3430) */
620                 /* XXX (green): Should not we bail out on any error here, not
621                  * just open error? */
622                 rc = it_open_error(DISP_OPEN_OPEN, it);
623                 if (rc) {
624                         ll_file_data_put(fd);
625                         GOTO(out_och_free, rc);
626                 }
627
628                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
629                 rc = ll_local_open(file, it, fd, *och_p);
630                 if (rc) {
631                         ll_file_data_put(fd);
632                         GOTO(out_och_free, rc);
633                 }
634         }
635         up(&lli->lli_och_sem);
636
637         /* Must do this outside lli_och_sem lock to prevent deadlock where
638            different kind of OPEN lock for this same inode gets cancelled
639            by ldlm_cancel_lru */
640         if (!S_ISREG(inode->i_mode))
641                 GOTO(out, rc);
642
643         ll_capa_open(inode);
644
645         lsm = lli->lli_smd;
646         if (lsm == NULL) {
647                 if (file->f_flags & O_LOV_DELAY_CREATE ||
648                     !(file->f_mode & FMODE_WRITE)) {
649                         CDEBUG(D_INODE, "object creation was delayed\n");
650                         GOTO(out, rc);
651                 }
652         }
653         file->f_flags &= ~O_LOV_DELAY_CREATE;
654         GOTO(out, rc);
        /* Every path that reaches "out" has already released lli_och_sem
         * and has rc == 0, so the if (rc) cleanup below is skipped. */
655 out:
656         ptlrpc_req_finished(req);
657         if (req)
658                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
        /* Jumps to out_och_free come from the "no handle yet" branch with
         * lli_och_sem still held and rc != 0. */
659 out_och_free:
660         if (rc) {
661                 if (*och_p) {
662                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
663                         *och_p = NULL; /* OBD_FREE writes some magic there */
664                         (*och_usecount)--;
665                 }
666                 up(&lli->lli_och_sem);
        /* This label sits inside the if (rc) body; it is entered directly
         * by GOTO from error paths that already dropped lli_och_sem.
         * NOTE(review): fd has been freed on those paths while
         * lli_opendir_key may still hold its address -- confirm that
         * ll_stop_statahead() only compares the key and never
         * dereferences it. */
667 out_openerr:
668                 if (opendir_set != 0)
669                         ll_stop_statahead(inode, lli->lli_opendir_key);
670         }
671
672         return rc;
673 }
674
675 /* Fills the obdo with the attributes for the lsm */
676 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
677                           struct obd_capa *capa, struct obdo *obdo)
678 {
679         struct ptlrpc_request_set *set;
680         struct obd_info            oinfo = { { { 0 } } };
681         int                        rc;
682
683         ENTRY;
684
685         LASSERT(lsm != NULL);
686
687         oinfo.oi_md = lsm;
688         oinfo.oi_oa = obdo;
689         oinfo.oi_oa->o_id = lsm->lsm_object_id;
690         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
691         oinfo.oi_oa->o_mode = S_IFREG;
692         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
693                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
694                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
695                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
696                                OBD_MD_FLGROUP;
697         oinfo.oi_capa = capa;
698
699         set = ptlrpc_prep_set();
700         if (set == NULL) {
701                 CERROR("can't allocate ptlrpc set\n");
702                 rc = -ENOMEM;
703         } else {
704                 rc = obd_getattr_async(exp, &oinfo, set);
705                 if (rc == 0)
706                         rc = ptlrpc_set_wait(set);
707                 ptlrpc_set_destroy(set);
708         }
709         if (rc == 0)
710                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
711                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
712                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
713         RETURN(rc);
714 }
715
716 /* Fills the obdo with the attributes for the inode defined by lsm */
717 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
718 {
719         struct ll_inode_info *lli  = ll_i2info(inode);
720         struct obd_capa      *capa = ll_mdscapa_get(inode);
721         int rc;
722         ENTRY;
723
724         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
725         capa_put(capa);
726         if (rc == 0) {
727                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
728                 CDEBUG(D_INODE,
729                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
730                        lli->lli_smd->lsm_object_id, i_size_read(inode),
731                        (unsigned long long)inode->i_blocks,
732                        (unsigned long)ll_inode_blksize(inode));
733         }
734         RETURN(rc);
735 }
736
737 int ll_merge_lvb(struct inode *inode)
738 {
739         struct ll_inode_info *lli = ll_i2info(inode);
740         struct ll_sb_info *sbi = ll_i2sbi(inode);
741         struct ost_lvb lvb;
742         int rc;
743
744         ENTRY;
745
746         ll_inode_size_lock(inode, 1);
747         inode_init_lvb(inode, &lvb);
748         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
749         i_size_write(inode, lvb.lvb_size);
750         inode->i_blocks = lvb.lvb_blocks;
751
752         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
753         LTIME_S(inode->i_atime) = lvb.lvb_atime;
754         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
755         ll_inode_size_unlock(inode, 1);
756
757         RETURN(rc);
758 }
759
760 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
761                      lstat_t *st)
762 {
763         struct obdo obdo = { 0 };
764         int rc;
765
766         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
767         if (rc == 0) {
768                 st->st_size   = obdo.o_size;
769                 st->st_blocks = obdo.o_blocks;
770                 st->st_mtime  = obdo.o_mtime;
771                 st->st_atime  = obdo.o_atime;
772                 st->st_ctime  = obdo.o_ctime;
773         }
774         return rc;
775 }
776
777 void ll_io_init(struct cl_io *io, const struct file *file, int write)
778 {
779         struct inode *inode     = file->f_dentry->d_inode;
780         struct ll_sb_info *sbi  = ll_i2sbi(inode);
781         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
782
783         LASSERT(fd != NULL);
784         memset(io, 0, sizeof *io);
785         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
786         if (write)
787                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
788         io->ci_obj     = ll_i2info(inode)->lli_clob;
789         io->ci_lockreq = CILR_MAYBE;
790         if (fd->fd_flags & LL_FILE_IGNORE_LOCK ||
791             sbi->ll_flags & LL_SBI_NOLCK) {
792                 io->ci_lockreq = CILR_NEVER;
793                 io->ci_no_srvlock = 1;
794         } else if (file->f_flags & O_APPEND) {
795                 io->ci_lockreq = CILR_MANDATORY;
796         }
797 }
798
799 static ssize_t ll_file_io_generic(const struct lu_env *env,
800                 struct ccc_io_args *args, struct file *file,
801                 enum cl_io_type iot, loff_t *ppos, size_t count)
802 {
803         struct cl_io       *io;
804         ssize_t             result;
805         ENTRY;
806
807         io = &ccc_env_info(env)->cti_io;
808         ll_io_init(io, file, iot == CIT_WRITE);
809
810         if (iot == CIT_READ)
811                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
812
813         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
814                 struct vvp_io *vio = vvp_env_io(env);
815                 struct ccc_io *cio = ccc_env_io(env);
816                 if (cl_io_is_sendfile(io)) {
817                         vio->u.read.cui_actor = args->cia_actor;
818                         vio->u.read.cui_target = args->cia_target;
819                 } else {
820                         cio->cui_iov = args->cia_iov;
821                         cio->cui_nrsegs = args->cia_nrsegs;
822 #ifndef HAVE_FILE_WRITEV
823                         cio->cui_iocb = args->cia_iocb;
824 #endif
825                 }
826                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
827                 result = cl_io_loop(env, io);
828         } else
829                 /* cl_io_rw_init() handled IO */
830                 result = io->ci_result;
831         if (io->ci_nob > 0) {
832                 result = io->ci_nob;
833                 *ppos = io->u.ci_wr.wr.crw_pos;
834         }
835         cl_io_fini(env, io);
836         RETURN(result);
837 }
838
839
840 /*
841  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
842  */
843 static int ll_file_get_iov_count(const struct iovec *iov,
844                                  unsigned long *nr_segs, size_t *count)
845 {
846         size_t cnt = 0;
847         unsigned long seg;
848
849         for (seg = 0; seg < *nr_segs; seg++) {
850                 const struct iovec *iv = &iov[seg];
851
852                 /*
853                  * If any segment has a negative length, or the cumulative
854                  * length ever wraps negative then return -EINVAL.
855                  */
856                 cnt += iv->iov_len;
857                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
858                         return -EINVAL;
859                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
860                         continue;
861                 if (seg == 0)
862                         return -EFAULT;
863                 *nr_segs = seg;
864                 cnt -= iv->iov_len;   /* This segment is no good */
865                 break;
866         }
867         *count = cnt;
868         return 0;
869 }
870
871 #ifdef HAVE_FILE_READV
872 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
873                               unsigned long nr_segs, loff_t *ppos)
874 {
875         struct lu_env      *env;
876         struct ccc_io_args *args;
877         size_t              count;
878         ssize_t             result;
879         int                 refcheck;
880         ENTRY;
881
882         result = ll_file_get_iov_count(iov, &nr_segs, &count);
883         if (result)
884                 RETURN(result);
885
886         env = cl_env_get(&refcheck);
887         if (IS_ERR(env))
888                 RETURN(PTR_ERR(env));
889
890         args = &vvp_env_info(env)->vti_args;
891         args->cia_is_sendfile = 0;
892         args->cia_iov = (struct iovec *)iov;
893         args->cia_nrsegs = nr_segs;
894         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
895         cl_env_put(env, &refcheck);
896         RETURN(result);
897 }
898
899 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
900                             loff_t *ppos)
901 {
902         struct lu_env *env;
903         struct iovec  *local_iov;
904         ssize_t        result;
905         int            refcheck;
906         ENTRY;
907
908         env = cl_env_get(&refcheck);
909         if (IS_ERR(env))
910                 RETURN(PTR_ERR(env));
911
912         local_iov = &vvp_env_info(env)->vti_local_iov;
913         local_iov->iov_base = (void __user *)buf;
914         local_iov->iov_len = count;
915         result = ll_file_readv(file, local_iov, 1, ppos);
916         cl_env_put(env, &refcheck);
917         RETURN(result);
918 }
919
920 #else
921 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
922                                 unsigned long nr_segs, loff_t pos)
923 {
924         struct lu_env      *env;
925         struct ccc_io_args *args;
926         size_t              count;
927         ssize_t             result;
928         int                 refcheck;
929         ENTRY;
930
931         result = ll_file_get_iov_count(iov, &nr_segs, &count);
932         if (result)
933                 RETURN(result);
934
935         env = cl_env_get(&refcheck);
936         if (IS_ERR(env))
937                 RETURN(PTR_ERR(env));
938
939         args = &vvp_env_info(env)->vti_args;
940         args->cia_is_sendfile = 0;
941         args->cia_iov = (struct iovec *)iov;
942         args->cia_nrsegs = nr_segs;
943         args->cia_iocb = iocb;
944         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
945                                     &iocb->ki_pos, count);
946         cl_env_put(env, &refcheck);
947         RETURN(result);
948 }
949
950 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
951                             loff_t *ppos)
952 {
953         struct lu_env *env;
954         struct iovec  *local_iov;
955         struct kiocb  *kiocb;
956         ssize_t        result;
957         int            refcheck;
958         ENTRY;
959
960         env = cl_env_get(&refcheck);
961         if (IS_ERR(env))
962                 RETURN(PTR_ERR(env));
963
964         local_iov = &vvp_env_info(env)->vti_local_iov;
965         kiocb = &vvp_env_info(env)->vti_kiocb;
966         local_iov->iov_base = (void __user *)buf;
967         local_iov->iov_len = count;
968         init_sync_kiocb(kiocb, file);
969         kiocb->ki_pos = *ppos;
970         kiocb->ki_left = count;
971
972         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
973         *ppos = kiocb->ki_pos;
974
975         cl_env_put(env, &refcheck);
976         RETURN(result);
977 }
978 #endif
979
980 /*
981  * Write to a file (through the page cache).
982  */
983 #ifdef HAVE_FILE_WRITEV
984 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
985                               unsigned long nr_segs, loff_t *ppos)
986 {
987         struct lu_env      *env;
988         struct ccc_io_args *args;
989         size_t              count;
990         ssize_t             result;
991         int                 refcheck;
992         ENTRY;
993
994         result = ll_file_get_iov_count(iov, &nr_segs, &count);
995         if (result)
996                 RETURN(result);
997
998         env = cl_env_get(&refcheck);
999         if (IS_ERR(env))
1000                 RETURN(PTR_ERR(env));
1001
1002         args = &vvp_env_info(env)->vti_args;
1003         args->cia_iov = (struct iovec *)iov;
1004         args->cia_nrsegs = nr_segs;
1005         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1006         cl_env_put(env, &refcheck);
1007         RETURN(result);
1008 }
1009
1010 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1011                              loff_t *ppos)
1012 {
1013         struct lu_env    *env;
1014         struct iovec     *local_iov;
1015         ssize_t           result;
1016         int               refcheck;
1017         ENTRY;
1018
1019         env = cl_env_get(&refcheck);
1020         if (IS_ERR(env))
1021                 RETURN(PTR_ERR(env));
1022
1023         local_iov = &vvp_env_info(env)->vti_local_iov;
1024         local_iov->iov_base = (void __user *)buf;
1025         local_iov->iov_len = count;
1026
1027         result = ll_file_writev(file, local_iov, 1, ppos);
1028         cl_env_put(env, &refcheck);
1029         RETURN(result);
1030 }
1031
1032 #else /* AIO stuff */
1033 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1034                                  unsigned long nr_segs, loff_t pos)
1035 {
1036         struct lu_env      *env;
1037         struct ccc_io_args *args;
1038         size_t              count;
1039         ssize_t             result;
1040         int                 refcheck;
1041         ENTRY;
1042
1043         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1044         if (result)
1045                 RETURN(result);
1046
1047         env = cl_env_get(&refcheck);
1048         if (IS_ERR(env))
1049                 RETURN(PTR_ERR(env));
1050
1051         args = &vvp_env_info(env)->vti_args;
1052         args->cia_iov = (struct iovec *)iov;
1053         args->cia_nrsegs = nr_segs;
1054         args->cia_iocb = iocb;
1055         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1056                                   &iocb->ki_pos, count);
1057         cl_env_put(env, &refcheck);
1058         RETURN(result);
1059 }
1060
1061 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1062                              loff_t *ppos)
1063 {
1064         struct lu_env *env;
1065         struct iovec  *local_iov;
1066         struct kiocb  *kiocb;
1067         ssize_t        result;
1068         int            refcheck;
1069         ENTRY;
1070
1071         env = cl_env_get(&refcheck);
1072         if (IS_ERR(env))
1073                 RETURN(PTR_ERR(env));
1074
1075         local_iov = &vvp_env_info(env)->vti_local_iov;
1076         kiocb = &vvp_env_info(env)->vti_kiocb;
1077         local_iov->iov_base = (void __user *)buf;
1078         local_iov->iov_len = count;
1079         init_sync_kiocb(kiocb, file);
1080         kiocb->ki_pos = *ppos;
1081         kiocb->ki_left = count;
1082
1083         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1084         *ppos = kiocb->ki_pos;
1085
1086         cl_env_put(env, &refcheck);
1087         RETURN(result);
1088 }
1089 #endif
1090
1091
1092 /*
1093  * Send file content (through pagecache) somewhere with helper
1094  */
1095 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1096                                 read_actor_t actor, void *target)
1097 {
1098         struct lu_env      *env;
1099         struct ccc_io_args *args;
1100         ssize_t             result;
1101         int                 refcheck;
1102         ENTRY;
1103
1104         env = cl_env_get(&refcheck);
1105         if (IS_ERR(env))
1106                 RETURN(PTR_ERR(env));
1107
1108         args = &vvp_env_info(env)->vti_args;
1109         args->cia_is_sendfile = 1;
1110         args->cia_target = target;
1111         args->cia_actor = actor;
1112         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1113         cl_env_put(env, &refcheck);
1114         RETURN(result);
1115 }
1116
1117 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1118                                unsigned long arg)
1119 {
1120         struct obd_export *exp = ll_i2dtexp(inode);
1121         struct ll_recreate_obj ucreatp;
1122         struct obd_trans_info oti = { 0 };
1123         struct obdo *oa = NULL;
1124         int lsm_size;
1125         int rc = 0;
1126         struct lov_stripe_md *lsm, *lsm2;
1127         ENTRY;
1128
1129         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1130                 RETURN(-EPERM);
1131
1132         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1133                            sizeof(struct ll_recreate_obj)))
1134                 RETURN(-EFAULT);
1135
1136         OBDO_ALLOC(oa);
1137         if (oa == NULL)
1138                 RETURN(-ENOMEM);
1139
1140         ll_inode_size_lock(inode, 0);
1141         lsm = ll_i2info(inode)->lli_smd;
1142         if (lsm == NULL)
1143                 GOTO(out, rc = -ENOENT);
1144         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1145                    (lsm->lsm_stripe_count));
1146
1147         OBD_ALLOC(lsm2, lsm_size);
1148         if (lsm2 == NULL)
1149                 GOTO(out, rc = -ENOMEM);
1150
1151         oa->o_id = ucreatp.lrc_id;
1152         oa->o_gr = ucreatp.lrc_group;
1153         oa->o_nlink = ucreatp.lrc_ost_idx;
1154         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1155         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1156         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1157                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1158
1159         memcpy(lsm2, lsm, lsm_size);
1160         rc = obd_create(exp, oa, &lsm2, &oti);
1161
1162         OBD_FREE(lsm2, lsm_size);
1163         GOTO(out, rc);
1164 out:
1165         ll_inode_size_unlock(inode, 0);
1166         OBDO_FREE(oa);
1167         return rc;
1168 }
1169
1170 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1171                              int flags, struct lov_user_md *lum, int lum_size)
1172 {
1173         struct lov_stripe_md *lsm;
1174         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1175         int rc = 0;
1176         ENTRY;
1177
1178         ll_inode_size_lock(inode, 0);
1179         lsm = ll_i2info(inode)->lli_smd;
1180         if (lsm) {
1181                 ll_inode_size_unlock(inode, 0);
1182                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1183                        inode->i_ino);
1184                 RETURN(-EEXIST);
1185         }
1186
1187         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1188         if (rc)
1189                 GOTO(out, rc);
1190         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1191                 GOTO(out_req_free, rc = -ENOENT);
1192         rc = oit.d.lustre.it_status;
1193         if (rc < 0)
1194                 GOTO(out_req_free, rc);
1195
1196         ll_release_openhandle(file->f_dentry, &oit);
1197
1198  out:
1199         ll_inode_size_unlock(inode, 0);
1200         ll_intent_release(&oit);
1201         RETURN(rc);
1202 out_req_free:
1203         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1204         goto out;
1205 }
1206
1207 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1208                              struct lov_mds_md **lmmp, int *lmm_size,
1209                              struct ptlrpc_request **request)
1210 {
1211         struct ll_sb_info *sbi = ll_i2sbi(inode);
1212         struct mdt_body  *body;
1213         struct lov_mds_md *lmm = NULL;
1214         struct ptlrpc_request *req = NULL;
1215         struct obd_capa *oc;
1216         int rc, lmmsize;
1217
1218         rc = ll_get_max_mdsize(sbi, &lmmsize);
1219         if (rc)
1220                 RETURN(rc);
1221
1222         oc = ll_mdscapa_get(inode);
1223         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1224                              oc, filename, strlen(filename) + 1,
1225                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1226                              ll_i2suppgid(inode), &req);
1227         capa_put(oc);
1228         if (rc < 0) {
1229                 CDEBUG(D_INFO, "md_getattr_name failed "
1230                        "on %s: rc %d\n", filename, rc);
1231                 GOTO(out, rc);
1232         }
1233
1234         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1235         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1236
1237         lmmsize = body->eadatasize;
1238
1239         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1240                         lmmsize == 0) {
1241                 GOTO(out, rc = -ENODATA);
1242         }
1243
1244         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1245         LASSERT(lmm != NULL);
1246
1247         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1248             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1249             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1250                 GOTO(out, rc = -EPROTO);
1251         }
1252
1253         /*
1254          * This is coming from the MDS, so is probably in
1255          * little endian.  We convert it to host endian before
1256          * passing it to userspace.
1257          */
1258         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1259                 /* if function called for directory - we should
1260                  * avoid swab not existent lsm objects */
1261                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1262                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1263                         if (S_ISREG(body->mode))
1264                                 lustre_swab_lov_user_md_objects(
1265                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1266                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1267                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1268                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1269                         if (S_ISREG(body->mode))
1270                                 lustre_swab_lov_user_md_objects(
1271                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1272                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1273                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1274                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1275                 }
1276         }
1277
1278         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1279                 struct lov_stripe_md *lsm;
1280                 struct lov_user_md_join *lmj;
1281                 int lmj_size, i, aindex = 0;
1282
1283                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1284                 if (rc < 0)
1285                         GOTO(out, rc = -ENOMEM);
1286                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1287                 if (rc)
1288                         GOTO(out_free_memmd, rc);
1289
1290                 lmj_size = sizeof(struct lov_user_md_join) +
1291                            lsm->lsm_stripe_count *
1292                            sizeof(struct lov_user_ost_data_join);
1293                 OBD_ALLOC(lmj, lmj_size);
1294                 if (!lmj)
1295                         GOTO(out_free_memmd, rc = -ENOMEM);
1296
1297                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1298                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1299                         struct lov_extent *lex =
1300                                 &lsm->lsm_array->lai_ext_array[aindex];
1301
1302                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1303                                 aindex ++;
1304                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1305                                         LPU64" len %d\n", aindex, i,
1306                                         lex->le_start, (int)lex->le_len);
1307                         lmj->lmm_objects[i].l_extent_start =
1308                                 lex->le_start;
1309
1310                         if ((int)lex->le_len == -1)
1311                                 lmj->lmm_objects[i].l_extent_end = -1;
1312                         else
1313                                 lmj->lmm_objects[i].l_extent_end =
1314                                         lex->le_start + lex->le_len;
1315                         lmj->lmm_objects[i].l_object_id =
1316                                 lsm->lsm_oinfo[i]->loi_id;
1317                         lmj->lmm_objects[i].l_object_gr =
1318                                 lsm->lsm_oinfo[i]->loi_gr;
1319                         lmj->lmm_objects[i].l_ost_gen =
1320                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1321                         lmj->lmm_objects[i].l_ost_idx =
1322                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1323                 }
1324                 lmm = (struct lov_mds_md *)lmj;
1325                 lmmsize = lmj_size;
1326 out_free_memmd:
1327                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1328         }
1329 out:
1330         *lmmp = lmm;
1331         *lmm_size = lmmsize;
1332         *request = req;
1333         return rc;
1334 }
1335
1336 static int ll_lov_setea(struct inode *inode, struct file *file,
1337                             unsigned long arg)
1338 {
1339         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1340         struct lov_user_md  *lump;
1341         int lum_size = sizeof(struct lov_user_md) +
1342                        sizeof(struct lov_user_ost_data);
1343         int rc;
1344         ENTRY;
1345
1346         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1347                 RETURN(-EPERM);
1348
1349         OBD_ALLOC(lump, lum_size);
1350         if (lump == NULL) {
1351                 RETURN(-ENOMEM);
1352         }
1353         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1354                 OBD_FREE(lump, lum_size);
1355                 RETURN(-EFAULT);
1356         }
1357
1358         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1359
1360         OBD_FREE(lump, lum_size);
1361         RETURN(rc);
1362 }
1363
1364 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1365                             unsigned long arg)
1366 {
1367         struct lov_user_md_v3 lumv3;
1368         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1369         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1370         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1371         int lum_size;
1372         int rc;
1373         int flags = FMODE_WRITE;
1374         ENTRY;
1375
1376         /* first try with v1 which is smaller than v3 */
1377         lum_size = sizeof(struct lov_user_md_v1);
1378         if (copy_from_user(lumv1, lumv1p, lum_size))
1379                 RETURN(-EFAULT);
1380
1381         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1382                 lum_size = sizeof(struct lov_user_md_v3);
1383                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1384                         RETURN(-EFAULT);
1385         }
1386
1387         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1388         if (rc == 0) {
1389                  put_user(0, &lumv1p->lmm_stripe_count);
1390                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1391                                     0, ll_i2info(inode)->lli_smd,
1392                                     (void *)arg);
1393         }
1394         RETURN(rc);
1395 }
1396
1397 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1398 {
1399         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1400
1401         if (!lsm)
1402                 RETURN(-ENODATA);
1403
1404         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1405                             (void *)arg);
1406 }
1407
1408 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1409 {
1410         struct ll_inode_info   *lli = ll_i2info(inode);
1411         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1412         struct ccc_grouplock    grouplock;
1413         int                     rc;
1414         ENTRY;
1415
1416         spin_lock(&lli->lli_lock);
1417         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1418                 CERROR("group lock already existed with gid %lu\n",
1419                        fd->fd_grouplock.cg_gid);
1420                 spin_unlock(&lli->lli_lock);
1421                 RETURN(-EINVAL);
1422         }
1423         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1424         spin_unlock(&lli->lli_lock);
1425
1426         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1427                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1428         if (rc)
1429                 RETURN(rc);
1430
1431         spin_lock(&lli->lli_lock);
1432         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1433                 spin_unlock(&lli->lli_lock);
1434                 CERROR("another thread just won the race\n");
1435                 cl_put_grouplock(&grouplock);
1436                 RETURN(-EINVAL);
1437         }
1438
1439         fd->fd_flags |= (LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1440         fd->fd_grouplock = grouplock;
1441         spin_unlock(&lli->lli_lock);
1442
1443         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1444         RETURN(0);
1445 }
1446
1447 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1448 {
1449         struct ll_inode_info   *lli = ll_i2info(inode);
1450         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1451         struct ccc_grouplock    grouplock;
1452         ENTRY;
1453
1454         spin_lock(&lli->lli_lock);
1455         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1456                 spin_unlock(&lli->lli_lock);
1457                 CERROR("no group lock held\n");
1458                 RETURN(-EINVAL);
1459         }
1460         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1461
1462         if (fd->fd_grouplock.cg_gid != arg) {
1463                 CERROR("group lock %lu doesn't match current id %lu\n",
1464                        arg, fd->fd_grouplock.cg_gid);
1465                 spin_unlock(&lli->lli_lock);
1466                 RETURN(-EINVAL);
1467         }
1468
1469         grouplock = fd->fd_grouplock;
1470         fd->fd_grouplock.cg_env = NULL;
1471         fd->fd_grouplock.cg_lock = NULL;
1472         fd->fd_grouplock.cg_gid = 0;
1473         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1474         spin_unlock(&lli->lli_lock);
1475
1476         cl_put_grouplock(&grouplock);
1477         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1478         RETURN(0);
1479 }
1480
1481 #if LUSTRE_FIX >= 50
1482 static int join_sanity_check(struct inode *head, struct inode *tail)
1483 {
1484         ENTRY;
1485         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1486                 CERROR("server do not support join \n");
1487                 RETURN(-EINVAL);
1488         }
1489         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1490                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1491                        head->i_ino, tail->i_ino);
1492                 RETURN(-EINVAL);
1493         }
1494         if (head->i_ino == tail->i_ino) {
1495                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1496                 RETURN(-EINVAL);
1497         }
1498         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1499                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1500                 RETURN(-EINVAL);
1501         }
1502         RETURN(0);
1503 }
1504
1505 static int join_file(struct inode *head_inode, struct file *head_filp,
1506                      struct file *tail_filp)
1507 {
1508         struct dentry *tail_dentry = tail_filp->f_dentry;
1509         struct lookup_intent oit = {.it_op = IT_OPEN,
1510                                     .it_flags = head_filp->f_flags,
1511                                     .it_create_mode = M_JOIN_FILE};
1512         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1513                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1514
1515         struct lustre_handle lockh;
1516         struct md_op_data *op_data;
1517         int    rc;
1518         loff_t data;
1519         ENTRY;
1520
1521         tail_dentry = tail_filp->f_dentry;
1522
1523         data = i_size_read(head_inode);
1524         op_data = ll_prep_md_op_data(NULL, head_inode,
1525                                      tail_dentry->d_parent->d_inode,
1526                                      tail_dentry->d_name.name,
1527                                      tail_dentry->d_name.len, 0,
1528                                      LUSTRE_OPC_ANY, &data);
1529         if (IS_ERR(op_data))
1530                 RETURN(PTR_ERR(op_data));
1531
1532         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1533                          op_data, &lockh, NULL, 0, NULL, 0);
1534
1535         ll_finish_md_op_data(op_data);
1536         if (rc < 0)
1537                 GOTO(out, rc);
1538
1539         rc = oit.d.lustre.it_status;
1540
1541         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1542                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1543                 ptlrpc_req_finished((struct ptlrpc_request *)
1544                                     oit.d.lustre.it_data);
1545                 GOTO(out, rc);
1546         }
1547
1548         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1549                                            * away */
1550                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1551                 oit.d.lustre.it_lock_mode = 0;
1552         }
1553         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1554         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1555         ll_release_openhandle(head_filp->f_dentry, &oit);
1556 out:
1557         ll_intent_release(&oit);
1558         RETURN(rc);
1559 }
1560
1561 static int ll_file_join(struct inode *head, struct file *filp,
1562                         char *filename_tail)
1563 {
1564         struct inode *tail = NULL, *first = NULL, *second = NULL;
1565         struct dentry *tail_dentry;
1566         struct file *tail_filp, *first_filp, *second_filp;
1567         struct ll_lock_tree first_tree, second_tree;
1568         struct ll_lock_tree_node *first_node, *second_node;
1569         struct ll_inode_info *hlli = ll_i2info(head);
1570         int rc = 0, cleanup_phase = 0;
1571         ENTRY;
1572
1573         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1574                head->i_ino, head->i_generation, head, filename_tail);
1575
1576         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1577         if (IS_ERR(tail_filp)) {
1578                 CERROR("Can not open tail file %s", filename_tail);
1579                 rc = PTR_ERR(tail_filp);
1580                 GOTO(cleanup, rc);
1581         }
1582         tail = igrab(tail_filp->f_dentry->d_inode);
1583
1584         tail_dentry = tail_filp->f_dentry;
1585         LASSERT(tail_dentry);
1586         cleanup_phase = 1;
1587
1588         /*reorder the inode for lock sequence*/
1589         first = head->i_ino > tail->i_ino ? head : tail;
1590         second = head->i_ino > tail->i_ino ? tail : head;
1591         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1592         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1593
1594         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1595                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1596         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1597         if (IS_ERR(first_node)){
1598                 rc = PTR_ERR(first_node);
1599                 GOTO(cleanup, rc);
1600         }
1601         first_tree.lt_fd = first_filp->private_data;
1602         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1603         if (rc != 0)
1604                 GOTO(cleanup, rc);
1605         cleanup_phase = 2;
1606
1607         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1608         if (IS_ERR(second_node)){
1609                 rc = PTR_ERR(second_node);
1610                 GOTO(cleanup, rc);
1611         }
1612         second_tree.lt_fd = second_filp->private_data;
1613         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1614         if (rc != 0)
1615                 GOTO(cleanup, rc);
1616         cleanup_phase = 3;
1617
1618         rc = join_sanity_check(head, tail);
1619         if (rc)
1620                 GOTO(cleanup, rc);
1621
1622         rc = join_file(head, filp, tail_filp);
1623         if (rc)
1624                 GOTO(cleanup, rc);
1625 cleanup:
1626         switch (cleanup_phase) {
1627         case 3:
1628                 ll_tree_unlock(&second_tree);
1629                 obd_cancel_unused(ll_i2dtexp(second),
1630                                   ll_i2info(second)->lli_smd, 0, NULL);
1631         case 2:
1632                 ll_tree_unlock(&first_tree);
1633                 obd_cancel_unused(ll_i2dtexp(first),
1634                                   ll_i2info(first)->lli_smd, 0, NULL);
1635         case 1:
1636                 filp_close(tail_filp, 0);
1637                 if (tail)
1638                         iput(tail);
1639                 if (head && rc == 0) {
1640                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1641                                        &hlli->lli_smd);
1642                         hlli->lli_smd = NULL;
1643                 }
1644         case 0:
1645                 break;
1646         default:
1647                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1648                 LBUG();
1649         }
1650         RETURN(rc);
1651 }
1652 #endif /* LUSTRE_FIX >= 50 */
1653
1654 /**
1655  * Close inode open handle
1656  *
1657  * \param dentry [in]     dentry which contains the inode
1658  * \param it     [in,out] intent which contains open info and result
1659  *
1660  * \retval 0     success
1661  * \retval <0    failure
1662  */
1663 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1664 {
1665         struct inode *inode = dentry->d_inode;
1666         struct obd_client_handle *och;
1667         int rc;
1668         ENTRY;
1669
1670         LASSERT(inode);
1671
1672         /* Root ? Do nothing. */
1673         if (dentry->d_inode->i_sb->s_root == dentry)
1674                 RETURN(0);
1675
1676         /* No open handle to close? Move away */
1677         if (!it_disposition(it, DISP_OPEN_OPEN))
1678                 RETURN(0);
1679
1680         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1681
1682         OBD_ALLOC(och, sizeof(*och));
1683         if (!och)
1684                 GOTO(out, rc = -ENOMEM);
1685
1686         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1687                     ll_i2info(inode), it, och);
1688
1689         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1690                                        inode, och);
1691  out:
1692         /* this one is in place of ll_file_open */
1693         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1694                 ptlrpc_req_finished(it->d.lustre.it_data);
1695         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1696         RETURN(rc);
1697 }
1698
1699 /**
1700  * Get size for inode for which FIEMAP mapping is requested.
1701  * Make the FIEMAP get_info call and returns the result.
1702  */
1703 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1704               int num_bytes)
1705 {
1706         struct obd_export *exp = ll_i2dtexp(inode);
1707         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1708         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1709         int vallen = num_bytes;
1710         int rc;
1711         ENTRY;
1712
1713         /* If the stripe_count > 1 and the application does not understand
1714          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1715          */
1716         if (lsm->lsm_stripe_count > 1 &&
1717             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1718                 return -EOPNOTSUPP;
1719
1720         fm_key.oa.o_id = lsm->lsm_object_id;
1721         fm_key.oa.o_gr = lsm->lsm_object_gr;
1722         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1723
1724         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1725                         OBD_MD_FLSIZE);
1726
1727         /* If filesize is 0, then there would be no objects for mapping */
1728         if (fm_key.oa.o_size == 0) {
1729                 fiemap->fm_mapped_extents = 0;
1730                 RETURN(0);
1731         }
1732
1733         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1734
1735         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1736         if (rc)
1737                 CERROR("obd_get_info failed: rc = %d\n", rc);
1738
1739         RETURN(rc);
1740 }
1741
1742 int ll_fid2path(struct obd_export *exp, void *arg)
1743 {
1744         struct getinfo_fid2path *gfout, *gfin;
1745         int outsize, rc;
1746         ENTRY;
1747
1748         /* Need to get the buflen */
1749         OBD_ALLOC_PTR(gfin);
1750         if (gfin == NULL)
1751                 RETURN(-ENOMEM);
1752         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1753                 OBD_FREE_PTR(gfin);
1754                 RETURN(-EFAULT);
1755         }
1756
1757         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1758         OBD_ALLOC(gfout, outsize);
1759         if (gfout == NULL) {
1760                 OBD_FREE_PTR(gfin);
1761                 RETURN(-ENOMEM);
1762         }
1763         memcpy(gfout, gfin, sizeof(*gfout));
1764         OBD_FREE_PTR(gfin);
1765
1766         /* Call mdc_iocontrol */
1767         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1768         if (rc)
1769                 GOTO(gf_free, rc);
1770         if (copy_to_user(arg, gfout, outsize))
1771                 rc = -EFAULT;
1772
1773 gf_free:
1774         OBD_FREE(gfout, outsize);
1775         RETURN(rc);
1776 }
1777
1778 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1779                   unsigned long arg)
1780 {
1781         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1782         int flags;
1783         ENTRY;
1784
1785         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1786                inode->i_generation, inode, cmd);
1787         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1788
1789         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1790         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1791                 RETURN(-ENOTTY);
1792
1793         switch(cmd) {
1794         case LL_IOC_GETFLAGS:
1795                 /* Get the current value of the file flags */
1796                 return put_user(fd->fd_flags, (int *)arg);
1797         case LL_IOC_SETFLAGS:
1798         case LL_IOC_CLRFLAGS:
1799                 /* Set or clear specific file flags */
1800                 /* XXX This probably needs checks to ensure the flags are
1801                  *     not abused, and to handle any flag side effects.
1802                  */
1803                 if (get_user(flags, (int *) arg))
1804                         RETURN(-EFAULT);
1805
1806                 if (cmd == LL_IOC_SETFLAGS) {
1807                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1808                             !(file->f_flags & O_DIRECT)) {
1809                                 CERROR("%s: unable to disable locking on "
1810                                        "non-O_DIRECT file\n", current->comm);
1811                                 RETURN(-EINVAL);
1812                         }
1813
1814                         fd->fd_flags |= flags;
1815                 } else {
1816                         fd->fd_flags &= ~flags;
1817                 }
1818                 RETURN(0);
1819         case LL_IOC_LOV_SETSTRIPE:
1820                 RETURN(ll_lov_setstripe(inode, file, arg));
1821         case LL_IOC_LOV_SETEA:
1822                 RETURN(ll_lov_setea(inode, file, arg));
1823         case LL_IOC_LOV_GETSTRIPE:
1824                 RETURN(ll_lov_getstripe(inode, arg));
1825         case LL_IOC_RECREATE_OBJ:
1826                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1827         case FSFILT_IOC_FIEMAP: {
1828                 struct ll_user_fiemap *fiemap_s;
1829                 size_t num_bytes, ret_bytes;
1830                 unsigned int extent_count;
1831                 int rc = 0;
1832
1833                 /* Get the extent count so we can calculate the size of
1834                  * required fiemap buffer */
1835                 if (get_user(extent_count,
1836                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1837                         RETURN(-EFAULT);
1838                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1839                                                  sizeof(struct ll_fiemap_extent));
1840                 OBD_VMALLOC(fiemap_s, num_bytes);
1841                 if (fiemap_s == NULL)
1842                         RETURN(-ENOMEM);
1843
1844                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1845                                    sizeof(*fiemap_s)))
1846                         GOTO(error, rc = -EFAULT);
1847
1848                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1849                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1850                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1851                         if (copy_to_user((char *)arg, fiemap_s,
1852                                          sizeof(*fiemap_s)))
1853                                 GOTO(error, rc = -EFAULT);
1854
1855                         GOTO(error, rc = -EBADR);
1856                 }
1857
1858                 /* If fm_extent_count is non-zero, read the first extent since
1859                  * it is used to calculate end_offset and device from previous
1860                  * fiemap call. */
1861                 if (extent_count) {
1862                         if (copy_from_user(&fiemap_s->fm_extents[0],
1863                             (char __user *)arg + sizeof(*fiemap_s),
1864                             sizeof(struct ll_fiemap_extent)))
1865                                 GOTO(error, rc = -EFAULT);
1866                 }
1867
1868                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1869                         int rc;
1870
1871                         rc = filemap_fdatawrite(inode->i_mapping);
1872                         if (rc)
1873                                 GOTO(error, rc);
1874                 }
1875
1876                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1877                 if (rc)
1878                         GOTO(error, rc);
1879
1880                 ret_bytes = sizeof(struct ll_user_fiemap);
1881
1882                 if (extent_count != 0)
1883                         ret_bytes += (fiemap_s->fm_mapped_extents *
1884                                          sizeof(struct ll_fiemap_extent));
1885
1886                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1887                         rc = -EFAULT;
1888
1889 error:
1890                 OBD_VFREE(fiemap_s, num_bytes);
1891                 RETURN(rc);
1892         }
1893         case FSFILT_IOC_GETFLAGS:
1894         case FSFILT_IOC_SETFLAGS:
1895                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1896         case FSFILT_IOC_GETVERSION_OLD:
1897         case FSFILT_IOC_GETVERSION:
1898                 RETURN(put_user(inode->i_generation, (int *)arg));
1899         case LL_IOC_JOIN: {
1900 #if LUSTRE_FIX >= 50
1901                 /* Allow file join in beta builds to allow debuggging */
1902                 char *ftail;
1903                 int rc;
1904
1905                 ftail = getname((const char *)arg);
1906                 if (IS_ERR(ftail))
1907                         RETURN(PTR_ERR(ftail));
1908                 rc = ll_file_join(inode, file, ftail);
1909                 putname(ftail);
1910                 RETURN(rc);
1911 #else
1912                 CWARN("file join is not supported in this version of Lustre\n");
1913                 RETURN(-ENOTTY);
1914 #endif
1915         }
1916         case LL_IOC_GROUP_LOCK:
1917                 RETURN(ll_get_grouplock(inode, file, arg));
1918         case LL_IOC_GROUP_UNLOCK:
1919                 RETURN(ll_put_grouplock(inode, file, arg));
1920         case IOC_OBD_STATFS:
1921                 RETURN(ll_obd_statfs(inode, (void *)arg));
1922
1923         /* We need to special case any other ioctls we want to handle,
1924          * to send them to the MDS/OST as appropriate and to properly
1925          * network encode the arg field.
1926         case FSFILT_IOC_SETVERSION_OLD:
1927         case FSFILT_IOC_SETVERSION:
1928         */
1929         case LL_IOC_FLUSHCTX:
1930                 RETURN(ll_flush_ctx(inode));
1931         case LL_IOC_PATH2FID: {
1932                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1933                                  sizeof(struct lu_fid)))
1934                         RETURN(-EFAULT);
1935
1936                 RETURN(0);
1937         }
1938         case OBD_IOC_FID2PATH:
1939                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1940
1941         default: {
1942                 int err;
1943
1944                 if (LLIOC_STOP ==
1945                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1946                         RETURN(err);
1947
1948                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1949                                      (void *)arg));
1950         }
1951         }
1952 }
1953
1954 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1955 {
1956         struct inode *inode = file->f_dentry->d_inode;
1957         loff_t retval;
1958         ENTRY;
1959         retval = offset + ((origin == 2) ? i_size_read(inode) :
1960                            (origin == 1) ? file->f_pos : 0);
1961         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1962                inode->i_ino, inode->i_generation, inode, retval, retval,
1963                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1964         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1965
1966         if (origin == 2) { /* SEEK_END */
1967                 int nonblock = 0, rc;
1968
1969                 if (file->f_flags & O_NONBLOCK)
1970                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1971
1972                 rc = cl_glimpse_size(inode);
1973                 if (rc != 0)
1974                         RETURN(rc);
1975
1976                 ll_inode_size_lock(inode, 0);
1977                 offset += i_size_read(inode);
1978                 ll_inode_size_unlock(inode, 0);
1979         } else if (origin == 1) { /* SEEK_CUR */
1980                 offset += file->f_pos;
1981         }
1982
1983         retval = -EINVAL;
1984         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1985                 if (offset != file->f_pos) {
1986                         file->f_pos = offset;
1987                 }
1988                 retval = offset;
1989         }
1990
1991         RETURN(retval);
1992 }
1993
1994 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1995 {
1996         struct inode *inode = dentry->d_inode;
1997         struct ll_inode_info *lli = ll_i2info(inode);
1998         struct lov_stripe_md *lsm = lli->lli_smd;
1999         struct ptlrpc_request *req;
2000         struct obd_capa *oc;
2001         int rc, err;
2002         ENTRY;
2003         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2004                inode->i_generation, inode);
2005         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2006
2007         /* fsync's caller has already called _fdata{sync,write}, we want
2008          * that IO to finish before calling the osc and mdc sync methods */
2009         rc = filemap_fdatawait(inode->i_mapping);
2010
2011         /* catch async errors that were recorded back when async writeback
2012          * failed for pages in this mapping. */
2013         err = lli->lli_async_rc;
2014         lli->lli_async_rc = 0;
2015         if (rc == 0)
2016                 rc = err;
2017         if (lsm) {
2018                 err = lov_test_and_clear_async_rc(lsm);
2019                 if (rc == 0)
2020                         rc = err;
2021         }
2022
2023         oc = ll_mdscapa_get(inode);
2024         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2025                       &req);
2026         capa_put(oc);
2027         if (!rc)
2028                 rc = err;
2029         if (!err)
2030                 ptlrpc_req_finished(req);
2031
2032         if (data && lsm) {
2033                 struct obdo *oa;
2034
2035                 OBDO_ALLOC(oa);
2036                 if (!oa)
2037                         RETURN(rc ? rc : -ENOMEM);
2038
2039                 oa->o_id = lsm->lsm_object_id;
2040                 oa->o_gr = lsm->lsm_object_gr;
2041                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2042                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2043                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2044                                            OBD_MD_FLGROUP);
2045
2046                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2047                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2048                                0, OBD_OBJECT_EOF, oc);
2049                 capa_put(oc);
2050                 if (!rc)
2051                         rc = err;
2052                 OBDO_FREE(oa);
2053         }
2054
2055         RETURN(rc);
2056 }
2057
2058 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2059 {
2060         struct inode *inode = file->f_dentry->d_inode;
2061         struct ll_sb_info *sbi = ll_i2sbi(inode);
2062         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2063                                            .ei_cb_cp =ldlm_flock_completion_ast,
2064                                            .ei_cbdata = file_lock };
2065         struct md_op_data *op_data;
2066         struct lustre_handle lockh = {0};
2067         ldlm_policy_data_t flock;
2068         int flags = 0;
2069         int rc;
2070         ENTRY;
2071
2072         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2073                inode->i_ino, file_lock);
2074
2075         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2076
2077         if (file_lock->fl_flags & FL_FLOCK) {
2078                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2079                 /* set missing params for flock() calls */
2080                 file_lock->fl_end = OFFSET_MAX;
2081                 file_lock->fl_pid = current->tgid;
2082         }
2083         flock.l_flock.pid = file_lock->fl_pid;
2084         flock.l_flock.start = file_lock->fl_start;
2085         flock.l_flock.end = file_lock->fl_end;
2086
2087         switch (file_lock->fl_type) {
2088         case F_RDLCK:
2089                 einfo.ei_mode = LCK_PR;
2090                 break;
2091         case F_UNLCK:
2092                 /* An unlock request may or may not have any relation to
2093                  * existing locks so we may not be able to pass a lock handle
2094                  * via a normal ldlm_lock_cancel() request. The request may even
2095                  * unlock a byte range in the middle of an existing lock. In
2096                  * order to process an unlock request we need all of the same
2097                  * information that is given with a normal read or write record
2098                  * lock request. To avoid creating another ldlm unlock (cancel)
2099                  * message we'll treat a LCK_NL flock request as an unlock. */
2100                 einfo.ei_mode = LCK_NL;
2101                 break;
2102         case F_WRLCK:
2103                 einfo.ei_mode = LCK_PW;
2104                 break;
2105         default:
2106                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2107                 RETURN (-EINVAL);
2108         }
2109
2110         switch (cmd) {
2111         case F_SETLKW:
2112 #ifdef F_SETLKW64
2113         case F_SETLKW64:
2114 #endif
2115                 flags = 0;
2116                 break;
2117         case F_SETLK:
2118 #ifdef F_SETLK64
2119         case F_SETLK64:
2120 #endif
2121                 flags = LDLM_FL_BLOCK_NOWAIT;
2122                 break;
2123         case F_GETLK:
2124 #ifdef F_GETLK64
2125         case F_GETLK64:
2126 #endif
2127                 flags = LDLM_FL_TEST_LOCK;
2128                 /* Save the old mode so that if the mode in the lock changes we
2129                  * can decrement the appropriate reader or writer refcount. */
2130                 file_lock->fl_type = einfo.ei_mode;
2131                 break;
2132         default:
2133                 CERROR("unknown fcntl lock command: %d\n", cmd);
2134                 RETURN (-EINVAL);
2135         }
2136
2137         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2138                                      LUSTRE_OPC_ANY, NULL);
2139         if (IS_ERR(op_data))
2140                 RETURN(PTR_ERR(op_data));
2141
2142         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2143                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2144                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2145
2146         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2147                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2148
2149         ll_finish_md_op_data(op_data);
2150
2151         if ((file_lock->fl_flags & FL_FLOCK) &&
2152             (rc == 0 || file_lock->fl_type == F_UNLCK))
2153                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2154 #ifdef HAVE_F_OP_FLOCK
2155         if ((file_lock->fl_flags & FL_POSIX) &&
2156             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2157             !(flags & LDLM_FL_TEST_LOCK))
2158                 posix_lock_file_wait(file, file_lock);
2159 #endif
2160
2161         RETURN(rc);
2162 }
2163
2164 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2165 {
2166         ENTRY;
2167
2168         RETURN(-ENOSYS);
2169 }
2170
2171 int ll_have_md_lock(struct inode *inode, __u64 bits)
2172 {
2173         struct lustre_handle lockh;
2174         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2175         struct lu_fid *fid;
2176         int flags;
2177         ENTRY;
2178
2179         if (!inode)
2180                RETURN(0);
2181
2182         fid = &ll_i2info(inode)->lli_fid;
2183         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2184
2185         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2186         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2187                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2188                 RETURN(1);
2189         }
2190         RETURN(0);
2191 }
2192
2193 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2194                             struct lustre_handle *lockh)
2195 {
2196         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2197         struct lu_fid *fid;
2198         ldlm_mode_t rc;
2199         int flags;
2200         ENTRY;
2201
2202         fid = &ll_i2info(inode)->lli_fid;
2203         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2204
2205         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2206         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2207                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2208         RETURN(rc);
2209 }
2210
2211 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2212         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2213                               * and return success */
2214                 inode->i_nlink = 0;
2215                 /* This path cannot be hit for regular files unless in
2216                  * case of obscure races, so no need to to validate
2217                  * size. */
2218                 if (!S_ISREG(inode->i_mode) &&
2219                     !S_ISDIR(inode->i_mode))
2220                         return 0;
2221         }
2222
2223         if (rc) {
2224                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2225                 return -abs(rc);
2226
2227         }
2228
2229         return 0;
2230 }
2231
2232 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2233                              __u64 ibits)
2234 {
2235         struct inode *inode = dentry->d_inode;
2236         struct ptlrpc_request *req = NULL;
2237         struct ll_sb_info *sbi;
2238         struct obd_export *exp;
2239         int rc = 0;
2240         ENTRY;
2241
2242         if (!inode) {
2243                 CERROR("REPORT THIS LINE TO PETER\n");
2244                 RETURN(0);
2245         }
2246         sbi = ll_i2sbi(inode);
2247
2248         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2249                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2250
2251         exp = ll_i2mdexp(inode);
2252
2253         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2254                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2255                 struct md_op_data *op_data;
2256
2257                 /* Call getattr by fid, so do not provide name at all. */
2258                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2259                                              dentry->d_inode, NULL, 0, 0,
2260                                              LUSTRE_OPC_ANY, NULL);
2261                 if (IS_ERR(op_data))
2262                         RETURN(PTR_ERR(op_data));
2263
2264                 oit.it_create_mode |= M_CHECK_STALE;
2265                 rc = md_intent_lock(exp, op_data, NULL, 0,
2266                                     /* we are not interested in name
2267                                        based lookup */
2268                                     &oit, 0, &req,
2269                                     ll_md_blocking_ast, 0);
2270                 ll_finish_md_op_data(op_data);
2271                 oit.it_create_mode &= ~M_CHECK_STALE;
2272                 if (rc < 0) {
2273                         rc = ll_inode_revalidate_fini(inode, rc);
2274                         GOTO (out, rc);
2275                 }
2276
2277                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2278                 if (rc != 0) {
2279                         ll_intent_release(&oit);
2280                         GOTO(out, rc);
2281                 }
2282
2283                 /* Unlinked? Unhash dentry, so it is not picked up later by
2284                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2285                    here to preserve get_cwd functionality on 2.6.
2286                    Bug 10503 */
2287                 if (!dentry->d_inode->i_nlink) {
2288                         spin_lock(&ll_lookup_lock);
2289                         spin_lock(&dcache_lock);
2290                         ll_drop_dentry(dentry);
2291                         spin_unlock(&dcache_lock);
2292                         spin_unlock(&ll_lookup_lock);
2293                 }
2294
2295                 ll_lookup_finish_locks(&oit, dentry);
2296         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2297
2298                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2299                 obd_valid valid = OBD_MD_FLGETATTR;
2300                 struct obd_capa *oc;
2301                 int ealen = 0;
2302
2303                 if (S_ISREG(inode->i_mode)) {
2304                         rc = ll_get_max_mdsize(sbi, &ealen);
2305                         if (rc)
2306                                 RETURN(rc);
2307                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2308                 }
2309                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2310                  * capa for this inode. Because we only keep capas of dirs
2311                  * fresh. */
2312                 oc = ll_mdscapa_get(inode);
2313                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2314                                 ealen, &req);
2315                 capa_put(oc);
2316                 if (rc) {
2317                         rc = ll_inode_revalidate_fini(inode, rc);
2318                         RETURN(rc);
2319                 }
2320
2321                 rc = ll_prep_inode(&inode, req, NULL);
2322         }
2323 out:
2324         ptlrpc_req_finished(req);
2325         return rc;
2326 }
2327
2328 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2329 {
2330         int rc;
2331         ENTRY;
2332
2333         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2334                                                   MDS_INODELOCK_LOOKUP);
2335
2336         /* if object not yet allocated, don't validate size */
2337         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2338                 RETURN(0);
2339
2340         /* cl_glimpse_size will prefer locally cached writes if they extend
2341          * the file */
2342
2343         if (rc == 0)
2344                 rc = cl_glimpse_size(dentry->d_inode);
2345
2346         RETURN(rc);
2347 }
2348
2349 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2350                   struct lookup_intent *it, struct kstat *stat)
2351 {
2352         struct inode *inode = de->d_inode;
2353         int res = 0;
2354
2355         res = ll_inode_revalidate_it(de, it);
2356         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2357
2358         if (res)
2359                 return res;
2360
2361         stat->dev = inode->i_sb->s_dev;
2362         stat->ino = inode->i_ino;
2363         stat->mode = inode->i_mode;
2364         stat->nlink = inode->i_nlink;
2365         stat->uid = inode->i_uid;
2366         stat->gid = inode->i_gid;
2367         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2368         stat->atime = inode->i_atime;
2369         stat->mtime = inode->i_mtime;
2370         stat->ctime = inode->i_ctime;
2371 #ifdef HAVE_INODE_BLKSIZE
2372         stat->blksize = inode->i_blksize;
2373 #else
2374         stat->blksize = 1 << inode->i_blkbits;
2375 #endif
2376
2377         ll_inode_size_lock(inode, 0);
2378         stat->size = i_size_read(inode);
2379         stat->blocks = inode->i_blocks;
2380         ll_inode_size_unlock(inode, 0);
2381
2382         return 0;
2383 }
2384 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2385 {
2386         struct lookup_intent it = { .it_op = IT_GETATTR };
2387
2388         return ll_getattr_it(mnt, de, &it, stat);
2389 }
2390
2391 static
2392 int lustre_check_acl(struct inode *inode, int mask)
2393 {
2394 #ifdef CONFIG_FS_POSIX_ACL
2395         struct ll_inode_info *lli = ll_i2info(inode);
2396         struct posix_acl *acl;
2397         int rc;
2398         ENTRY;
2399
2400         spin_lock(&lli->lli_lock);
2401         acl = posix_acl_dup(lli->lli_posix_acl);
2402         spin_unlock(&lli->lli_lock);
2403
2404         if (!acl)
2405                 RETURN(-EAGAIN);
2406
2407         rc = posix_acl_permission(inode, acl, mask);
2408         posix_acl_release(acl);
2409
2410         RETURN(rc);
2411 #else
2412         return -EAGAIN;
2413 #endif
2414 }
2415
2416 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2417 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2418 {
2419         int rc = 0;
2420         ENTRY;
2421
2422        /* as root inode are NOT getting validated in lookup operation,
2423         * need to do it before permission check. */
2424
2425         if (inode == inode->i_sb->s_root->d_inode) {
2426                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2427
2428                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2429                                               MDS_INODELOCK_LOOKUP);
2430                 if (rc)
2431                         RETURN(rc);
2432         }
2433
2434         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2435                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2436
2437         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2438                 return lustre_check_remote_perm(inode, mask);
2439
2440         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2441         rc = generic_permission(inode, mask, lustre_check_acl);
2442
2443         RETURN(rc);
2444 }
2445 #else
2446 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2447 {
2448         int mode = inode->i_mode;
2449         int rc;
2450
2451         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2452                inode->i_ino, inode->i_generation, inode, mask);
2453
2454         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2455                 return lustre_check_remote_perm(inode, mask);
2456
2457         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2458
2459         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2460             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2461                 return -EROFS;
2462         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2463                 return -EACCES;
2464         if (current->fsuid == inode->i_uid) {
2465                 mode >>= 6;
2466         } else if (1) {
2467                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2468                         goto check_groups;
2469                 rc = lustre_check_acl(inode, mask);
2470                 if (rc == -EAGAIN)
2471                         goto check_groups;
2472                 if (rc == -EACCES)
2473                         goto check_capabilities;
2474                 return rc;
2475         } else {
2476 check_groups:
2477                 if (in_group_p(inode->i_gid))
2478                         mode >>= 3;
2479         }
2480         if ((mode & mask & S_IRWXO) == mask)
2481                 return 0;
2482
2483 check_capabilities:
2484         if (!(mask & MAY_EXEC) ||
2485             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2486                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2487                         return 0;
2488
2489         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2490             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2491                 return 0;
2492
2493         return -EACCES;
2494 }
2495 #endif
2496
/*
 * Name of the file_operations member, and of the llite function bound to
 * it, for the vectored/aio read and write paths: readv/writev when
 * HAVE_FILE_READV is defined, aio_read/aio_write otherwise.
 */
#ifndef HAVE_FILE_READV
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write
#else
#define READ_METHOD readv
#define READ_FUNCTION ll_file_readv
#define WRITE_METHOD writev
#define WRITE_FUNCTION ll_file_writev
#endif
2508
2509 /* -o localflock - only provides locally consistent flock locks */
2510 struct file_operations ll_file_operations = {
2511         .read           = ll_file_read,
2512         .READ_METHOD    = READ_FUNCTION,
2513         .write          = ll_file_write,
2514         .WRITE_METHOD   = WRITE_FUNCTION,
2515         .ioctl          = ll_file_ioctl,
2516         .open           = ll_file_open,
2517         .release        = ll_file_release,
2518         .mmap           = ll_file_mmap,
2519         .llseek         = ll_file_seek,
2520         .sendfile       = ll_file_sendfile,
2521         .fsync          = ll_fsync,
2522 };
2523
2524 struct file_operations ll_file_operations_flock = {
2525         .read           = ll_file_read,
2526         .READ_METHOD    = READ_FUNCTION,
2527         .write          = ll_file_write,
2528         .WRITE_METHOD   = WRITE_FUNCTION,
2529         .ioctl          = ll_file_ioctl,
2530         .open           = ll_file_open,
2531         .release        = ll_file_release,
2532         .mmap           = ll_file_mmap,
2533         .llseek         = ll_file_seek,
2534         .sendfile       = ll_file_sendfile,
2535         .fsync          = ll_fsync,
2536 #ifdef HAVE_F_OP_FLOCK
2537         .flock          = ll_file_flock,
2538 #endif
2539         .lock           = ll_file_flock
2540 };
2541
2542 /* These are for -o noflock - to return ENOSYS on flock calls */
2543 struct file_operations ll_file_operations_noflock = {
2544         .read           = ll_file_read,
2545         .READ_METHOD    = READ_FUNCTION,
2546         .write          = ll_file_write,
2547         .WRITE_METHOD   = WRITE_FUNCTION,
2548         .ioctl          = ll_file_ioctl,
2549         .open           = ll_file_open,
2550         .release        = ll_file_release,
2551         .mmap           = ll_file_mmap,
2552         .llseek         = ll_file_seek,
2553         .sendfile       = ll_file_sendfile,
2554         .fsync          = ll_fsync,
2555 #ifdef HAVE_F_OP_FLOCK
2556         .flock          = ll_file_noflock,
2557 #endif
2558         .lock           = ll_file_noflock
2559 };
2560
2561 struct inode_operations ll_file_inode_operations = {
2562 #ifdef HAVE_VFS_INTENT_PATCHES
2563         .setattr_raw    = ll_setattr_raw,
2564 #endif
2565         .setattr        = ll_setattr,
2566         .truncate       = ll_truncate,
2567         .getattr        = ll_getattr,
2568         .permission     = ll_inode_permission,
2569         .setxattr       = ll_setxattr,
2570         .getxattr       = ll_getxattr,
2571         .listxattr      = ll_listxattr,
2572         .removexattr    = ll_removexattr,
2573 };
2574
2575 /* dynamic ioctl number support routins */
2576 static struct llioc_ctl_data {
2577         struct rw_semaphore ioc_sem;
2578         struct list_head    ioc_head;
2579 } llioc = {
2580         __RWSEM_INITIALIZER(llioc.ioc_sem),
2581         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2582 };
2583
2584
2585 struct llioc_data {
2586         struct list_head        iocd_list;
2587         unsigned int            iocd_size;
2588         llioc_callback_t        iocd_cb;
2589         unsigned int            iocd_count;
2590         unsigned int            iocd_cmd[0];
2591 };
2592
2593 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2594 {
2595         unsigned int size;
2596         struct llioc_data *in_data = NULL;
2597         ENTRY;
2598
2599         if (cb == NULL || cmd == NULL ||
2600             count > LLIOC_MAX_CMD || count < 0)
2601                 RETURN(NULL);
2602
2603         size = sizeof(*in_data) + count * sizeof(unsigned int);
2604         OBD_ALLOC(in_data, size);
2605         if (in_data == NULL)
2606                 RETURN(NULL);
2607
2608         memset(in_data, 0, sizeof(*in_data));
2609         in_data->iocd_size = size;
2610         in_data->iocd_cb = cb;
2611         in_data->iocd_count = count;
2612         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2613
2614         down_write(&llioc.ioc_sem);
2615         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2616         up_write(&llioc.ioc_sem);
2617
2618         RETURN(in_data);
2619 }
2620
2621 void ll_iocontrol_unregister(void *magic)
2622 {
2623         struct llioc_data *tmp;
2624
2625         if (magic == NULL)
2626                 return;
2627
2628         down_write(&llioc.ioc_sem);
2629         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2630                 if (tmp == magic) {
2631                         unsigned int size = tmp->iocd_size;
2632
2633                         list_del(&tmp->iocd_list);
2634                         up_write(&llioc.ioc_sem);
2635
2636                         OBD_FREE(tmp, size);
2637                         return;
2638                 }
2639         }
2640         up_write(&llioc.ioc_sem);
2641
2642         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2643 }
2644
2645 EXPORT_SYMBOL(ll_iocontrol_register);
2646 EXPORT_SYMBOL(ll_iocontrol_unregister);
2647
2648 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2649                         unsigned int cmd, unsigned long arg, int *rcp)
2650 {
2651         enum llioc_iter ret = LLIOC_CONT;
2652         struct llioc_data *data;
2653         int rc = -EINVAL, i;
2654
2655         down_read(&llioc.ioc_sem);
2656         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2657                 for (i = 0; i < data->iocd_count; i++) {
2658                         if (cmd != data->iocd_cmd[i])
2659                                 continue;
2660
2661                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2662                         break;
2663                 }
2664
2665                 if (ret == LLIOC_STOP)
2666                         break;
2667         }
2668         up_read(&llioc.ioc_sem);
2669
2670         if (rcp)
2671                 *rcp = rc;
2672         return ret;
2673 }