Whamcloud - gitweb
dd9989c9056800cfae85132b7ccc40272ebaaae4
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(exp_connect_som(ll_i2mdexp(inode))) || !S_ISREG(inode->i_mode))
96                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
97         else
98                 ll_epoch_close(inode, op_data, &och, 0);
99
100 out:
101         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
102         EXIT;
103 }
104
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
106                                      struct inode *inode,
107                                      struct obd_client_handle *och)
108 {
109         struct obd_export *exp = ll_i2mdexp(inode);
110         struct md_op_data *op_data;
111         struct ptlrpc_request *req = NULL;
112         struct obd_device *obd = class_exp2obd(exp);
113         int epoch_close = 1;
114         int rc;
115         ENTRY;
116
117         if (obd == NULL) {
118                 /*
119                  * XXX: in case of LMV, is this correct to access
120                  * ->exp_handle?
121                  */
122                 CERROR("Invalid MDC connection handle "LPX64"\n",
123                        ll_i2mdexp(inode)->exp_handle.h_cookie);
124                 GOTO(out, rc = 0);
125         }
126
127         /*
128          * here we check if this is forced umount. If so this is called on
129          * canceling "open lock" and we do not call md_close() in this case, as
130          * it will not be successful, as import is already deactivated.
131          */
132         if (obd->obd_force)
133                 GOTO(out, rc = 0);
134
135         OBD_ALLOC_PTR(op_data);
136         if (op_data == NULL)
137                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138
139         ll_prepare_close(inode, op_data, och);
140         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141         rc = md_close(md_exp, op_data, och->och_mod, &req);
142         if (rc == -EAGAIN) {
143                 /* This close must have the epoch closed. */
144                 LASSERT(epoch_close);
145                 /* MDS has instructed us to obtain Size-on-MDS attribute from
146                  * OSTs and send setattr to back to MDS. */
147                 rc = ll_sizeonmds_update(inode, &och->och_fh,
148                                          op_data->op_ioepoch);
149                 if (rc) {
150                         CERROR("inode %lu mdc Size-on-MDS update failed: "
151                                "rc = %d\n", inode->i_ino, rc);
152                         rc = 0;
153                 }
154         } else if (rc) {
155                 CERROR("inode %lu mdc close failed: rc = %d\n",
156                        inode->i_ino, rc);
157         }
158         ll_finish_md_op_data(op_data);
159
160         if (rc == 0) {
161                 rc = ll_objects_destroy(req, inode);
162                 if (rc)
163                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
164                                inode->i_ino, rc);
165         }
166
167         EXIT;
168 out:
169
170         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
171             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173         } else {
174                 md_clear_open_replay_data(md_exp, och);
175                 /* Free @och if it is not waiting for DONE_WRITING. */
176                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177                 OBD_FREE_PTR(och);
178         }
179         if (req) /* This is close request */
180                 ptlrpc_req_finished(req);
181         return rc;
182 }
183
184 int ll_md_real_close(struct inode *inode, int flags)
185 {
186         struct ll_inode_info *lli = ll_i2info(inode);
187         struct obd_client_handle **och_p;
188         struct obd_client_handle *och;
189         __u64 *och_usecount;
190         int rc = 0;
191         ENTRY;
192
193         if (flags & FMODE_WRITE) {
194                 och_p = &lli->lli_mds_write_och;
195                 och_usecount = &lli->lli_open_fd_write_count;
196         } else if (flags & FMODE_EXEC) {
197                 och_p = &lli->lli_mds_exec_och;
198                 och_usecount = &lli->lli_open_fd_exec_count;
199         } else {
200                 LASSERT(flags & FMODE_READ);
201                 och_p = &lli->lli_mds_read_och;
202                 och_usecount = &lli->lli_open_fd_read_count;
203         }
204
205         down(&lli->lli_och_sem);
206         if (*och_usecount) { /* There are still users of this handle, so
207                                 skip freeing it. */
208                 up(&lli->lli_och_sem);
209                 RETURN(0);
210         }
211         och=*och_p;
212         *och_p = NULL;
213         up(&lli->lli_och_sem);
214
215         if (och) { /* There might be a race and somebody have freed this och
216                       already */
217                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
218                                                inode, och);
219         }
220
221         RETURN(rc);
222 }
223
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225                 struct file *file)
226 {
227         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228         struct ll_inode_info *lli = ll_i2info(inode);
229         int rc = 0;
230         ENTRY;
231
232         /* clear group lock, if present */
233         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235
236         /* Let's see if we have good enough OPEN lock on the file and if
237            we can skip talking to MDS */
238         if (file->f_dentry->d_inode) { /* Can this ever be false? */
239                 int lockmode;
240                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241                 struct lustre_handle lockh;
242                 struct inode *inode = file->f_dentry->d_inode;
243                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
244
245                 down(&lli->lli_och_sem);
246                 if (fd->fd_omode & FMODE_WRITE) {
247                         lockmode = LCK_CW;
248                         LASSERT(lli->lli_open_fd_write_count);
249                         lli->lli_open_fd_write_count--;
250                 } else if (fd->fd_omode & FMODE_EXEC) {
251                         lockmode = LCK_PR;
252                         LASSERT(lli->lli_open_fd_exec_count);
253                         lli->lli_open_fd_exec_count--;
254                 } else {
255                         lockmode = LCK_CR;
256                         LASSERT(lli->lli_open_fd_read_count);
257                         lli->lli_open_fd_read_count--;
258                 }
259                 up(&lli->lli_och_sem);
260
261                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262                                    LDLM_IBITS, &policy, lockmode,
263                                    &lockh)) {
264                         rc = ll_md_real_close(file->f_dentry->d_inode,
265                                               fd->fd_omode);
266                 }
267         } else {
268                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269                        file, file->f_dentry, file->f_dentry->d_name.name);
270         }
271
272         LUSTRE_FPRIVATE(file) = NULL;
273         ll_file_data_put(fd);
274         ll_capa_close(inode);
275
276         RETURN(rc);
277 }
278
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
280
281 /* While this returns an error code, fput() the caller does not, so we need
282  * to make every effort to clean up all of our state here.  Also, applications
283  * rarely check close errors and even if an error is returned they will not
284  * re-try the close call.
285  */
286 int ll_file_release(struct inode *inode, struct file *file)
287 {
288         struct ll_file_data *fd;
289         struct ll_sb_info *sbi = ll_i2sbi(inode);
290         struct ll_inode_info *lli = ll_i2info(inode);
291         struct lov_stripe_md *lsm = lli->lli_smd;
292         int rc;
293         ENTRY;
294
295         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296                inode->i_generation, inode);
297
298 #ifdef CONFIG_FS_POSIX_ACL
299         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300             inode == inode->i_sb->s_root->d_inode) {
301                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
302
303                 LASSERT(fd != NULL);
304                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305                         fd->fd_flags &= ~LL_FILE_RMTACL;
306                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
307                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
308                 }
309         }
310 #endif
311
312         if (inode->i_sb->s_root != file->f_dentry)
313                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314         fd = LUSTRE_FPRIVATE(file);
315         LASSERT(fd != NULL);
316
317         /* The last ref on @file, maybe not the the owner pid of statahead.
318          * Different processes can open the same dir, "ll_opendir_key" means:
319          * it is me that should stop the statahead thread. */
320         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
321                 ll_stop_statahead(inode, lli->lli_opendir_key);
322
323         if (inode->i_sb->s_root == file->f_dentry) {
324                 LUSTRE_FPRIVATE(file) = NULL;
325                 ll_file_data_put(fd);
326                 RETURN(0);
327         }
328
329         if (lsm)
330                 lov_test_and_clear_async_rc(lsm);
331         lli->lli_async_rc = 0;
332
333         rc = ll_md_close(sbi->ll_md_exp, inode, file);
334         RETURN(rc);
335 }
336
337 static int ll_intent_file_open(struct file *file, void *lmm,
338                                int lmmsize, struct lookup_intent *itp)
339 {
340         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
341         struct dentry *parent = file->f_dentry->d_parent;
342         const char *name = file->f_dentry->d_name.name;
343         const int len = file->f_dentry->d_name.len;
344         struct md_op_data *op_data;
345         struct ptlrpc_request *req;
346         int rc;
347         ENTRY;
348
349         if (!parent)
350                 RETURN(-ENOENT);
351
352         /* Usually we come here only for NFSD, and we want open lock.
353            But we can also get here with pre 2.6.15 patchless kernels, and in
354            that case that lock is also ok */
355         /* We can also get here if there was cached open handle in revalidate_it
356          * but it disappeared while we were getting from there to ll_file_open.
357          * But this means this file was closed and immediatelly opened which
358          * makes a good candidate for using OPEN lock */
359         /* If lmmsize & lmm are not 0, we are just setting stripe info
360          * parameters. No need for the open lock */
361         if (!lmm && !lmmsize)
362                 itp->it_flags |= MDS_OPEN_LOCK;
363
364         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
365                                       file->f_dentry->d_inode, name, len,
366                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
367         if (IS_ERR(op_data))
368                 RETURN(PTR_ERR(op_data));
369
370         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
371                             0 /*unused */, &req, ll_md_blocking_ast, 0);
372         ll_finish_md_op_data(op_data);
373         if (rc == -ESTALE) {
374                 /* reason for keep own exit path - don`t flood log
375                 * with messages with -ESTALE errors.
376                 */
377                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
378                      it_open_error(DISP_OPEN_OPEN, itp))
379                         GOTO(out, rc);
380                 ll_release_openhandle(file->f_dentry, itp);
381                 GOTO(out, rc);
382         }
383
384         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
385                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
386                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
387                 GOTO(out, rc);
388         }
389
390         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
391         if (!rc && itp->d.lustre.it_lock_mode)
392                 md_set_lock_data(sbi->ll_md_exp,
393                                  &itp->d.lustre.it_lock_handle,
394                                  file->f_dentry->d_inode, NULL);
395
396 out:
397         ptlrpc_req_finished(itp->d.lustre.it_data);
398         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
399         ll_intent_drop_lock(itp);
400
401         RETURN(rc);
402 }
403
404 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
405 {
406         if (ioepoch && lli->lli_ioepoch != ioepoch) {
407                 lli->lli_ioepoch = ioepoch;
408                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
409                        ioepoch, PFID(&lli->lli_fid));
410         }
411 }
412
413 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
414                        struct lookup_intent *it, struct obd_client_handle *och)
415 {
416         struct ptlrpc_request *req = it->d.lustre.it_data;
417         struct mdt_body *body;
418
419         LASSERT(och);
420
421         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
422         LASSERT(body != NULL);                      /* reply already checked out */
423
424         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
425         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
426         och->och_fid = lli->lli_fid;
427         och->och_flags = it->it_flags;
428         ll_ioepoch_open(lli, body->ioepoch);
429
430         return md_set_open_replay_data(md_exp, och, req);
431 }
432
433 int ll_local_open(struct file *file, struct lookup_intent *it,
434                   struct ll_file_data *fd, struct obd_client_handle *och)
435 {
436         struct inode *inode = file->f_dentry->d_inode;
437         struct ll_inode_info *lli = ll_i2info(inode);
438         ENTRY;
439
440         LASSERT(!LUSTRE_FPRIVATE(file));
441
442         LASSERT(fd != NULL);
443
444         if (och) {
445                 struct ptlrpc_request *req = it->d.lustre.it_data;
446                 struct mdt_body *body;
447                 int rc;
448
449                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
450                 if (rc)
451                         RETURN(rc);
452
453                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454                 if ((it->it_flags & FMODE_WRITE) &&
455                     (body->valid & OBD_MD_FLSIZE))
456                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
457                                lli->lli_ioepoch, PFID(&lli->lli_fid));
458         }
459
460         LUSTRE_FPRIVATE(file) = fd;
461         ll_readahead_init(inode, &fd->fd_ras);
462         fd->fd_omode = it->it_flags;
463         RETURN(0);
464 }
465
466 /* Open a file, and (for the very first open) create objects on the OSTs at
467  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
468  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
469  * lli_open_sem to ensure no other process will create objects, send the
470  * stripe MD to the MDS, or try to destroy the objects if that fails.
471  *
472  * If we already have the stripe MD locally then we don't request it in
473  * md_open(), by passing a lmm_size = 0.
474  *
475  * It is up to the application to ensure no other processes open this file
476  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
477  * used.  We might be able to avoid races of that sort by getting lli_open_sem
478  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
479  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
     *
     * NOTE(review): the text above talks about lli_open_sem, while the code
     * below serializes on lli_och_sem -- confirm which one is meant.
     *
     * Parameters:
     *   inode - inode of the file being opened
     *   file  - file being opened; on success the newly allocated
     *           ll_file_data becomes its private data
     *
     * Returns 0 on success; -ENOMEM when the file data or the open handle
     * cannot be allocated; otherwise the error reported by
     * ll_intent_file_open(), it_open_error() or ll_local_open().
     *
     * The cached MDS open handles and their use counters are protected by
     * lli_och_sem.  The semaphore is released before an open intent is sent
     * to the MDS and the handle lookup is then repeated from "restart".
480  */
481 int ll_file_open(struct inode *inode, struct file *file)
482 {
483         struct ll_inode_info *lli = ll_i2info(inode);
484         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
485                                           .it_flags = file->f_flags };
486         struct lov_stripe_md *lsm;
487         struct ptlrpc_request *req = NULL;
488         struct obd_client_handle **och_p;
489         __u64 *och_usecount;
490         struct ll_file_data *fd;
491         int rc = 0, opendir_set = 0;
492         ENTRY;
493
494         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
495                inode->i_generation, inode, file->f_flags);
496
497 #ifdef HAVE_VFS_INTENT_PATCHES
498         it = file->f_it;
499 #else
500         it = file->private_data; /* XXX: compat macro */
501         file->private_data = NULL; /* prevent ll_local_open assertion */
502 #endif
503
            /* Per-open private data; each error path below that is taken
             * after this point frees it with ll_file_data_put(). */
504         fd = ll_file_data_get();
505         if (fd == NULL)
506                 RETURN(-ENOMEM);
507
508         fd->fd_file = file;
            /* The first opener of a directory records its fd and pid in the
             * inode info; opendir_set remembers that this call did so, so
             * that the error path can call ll_stop_statahead(). */
509         if (S_ISDIR(inode->i_mode)) {
510                 spin_lock(&lli->lli_lock);
511                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
512                         LASSERT(lli->lli_sai == NULL);
513                         lli->lli_opendir_key = fd;
514                         lli->lli_opendir_pid = cfs_curproc_pid();
515                         opendir_set = 1;
516                 }
517                 spin_unlock(&lli->lli_lock);
518         }
519
            /* For the filesystem root only the private data is attached;
             * no MDS open handle is looked up or created here. */
520         if (inode->i_sb->s_root == file->f_dentry) {
521                 LUSTRE_FPRIVATE(file) = fd;
522                 RETURN(0);
523         }
524
            /* No intent with a disposition was handed over: build the open
             * intent locally in oit from the file flags. */
525         if (!it || !it->d.lustre.it_disposition) {
526                 /* Convert f_flags into access mode. We cannot use file->f_mode,
527                  * because everything but O_ACCMODE mask was stripped from
528                  * there */
529                 if ((oit.it_flags + 1) & O_ACCMODE)
530                         oit.it_flags++;
531                 if (file->f_flags & O_TRUNC)
532                         oit.it_flags |= FMODE_WRITE;
533
534                 /* kernel only call f_op->open in dentry_open.  filp_open calls
535                  * dentry_open after call to open_namei that checks permissions.
536                  * Only nfsd_open call dentry_open directly without checking
537                  * permissions and because of that this code below is safe. */
538                 if (oit.it_flags & FMODE_WRITE)
539                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
540
541                 /* We do not want O_EXCL here, presumably we opened the file
542                  * already? XXX - NFS implications? */
543                 oit.it_flags &= ~O_EXCL;
544
545                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
546                  * created if necessary, then "IT_CREAT" should be set to keep
547                  * consistent with it */
548                 if (oit.it_flags & O_CREAT)
549                         oit.it_op |= IT_CREAT;
550
551                 it = &oit;
552         }
553
554 restart:
555         /* Let's see if we have file open on MDS already. */
            /* Select the handle slot and use counter for the access mode:
             * write is checked first, then exec, otherwise read. */
556         if (it->it_flags & FMODE_WRITE) {
557                 och_p = &lli->lli_mds_write_och;
558                 och_usecount = &lli->lli_open_fd_write_count;
559         } else if (it->it_flags & FMODE_EXEC) {
560                 och_p = &lli->lli_mds_exec_och;
561                 och_usecount = &lli->lli_open_fd_exec_count;
562          } else {
563                 och_p = &lli->lli_mds_read_och;
564                 och_usecount = &lli->lli_open_fd_read_count;
565         }
566
567         down(&lli->lli_och_sem);
568         if (*och_p) { /* Open handle is present */
569                 if (it_disposition(it, DISP_OPEN_OPEN)) {
570                         /* Well, there's extra open request that we do not need,
571                            let's close it somehow. This will decref request. */
572                         rc = it_open_error(DISP_OPEN_OPEN, it);
573                         if (rc) {
574                                 up(&lli->lli_och_sem);
575                                 ll_file_data_put(fd);
576                                 GOTO(out_openerr, rc);
577                         }
578                         ll_release_openhandle(file->f_dentry, it);
579                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
580                                              LPROC_LL_OPEN);
581                 }
582                 (*och_usecount)++;
583
                    /* The cached handle is reused, so no handle is passed
                     * to ll_local_open() to be filled. */
584                 rc = ll_local_open(file, it, fd, NULL);
585                 if (rc) {
586                         (*och_usecount)--;
587                         up(&lli->lli_och_sem);
588                         ll_file_data_put(fd);
589                         GOTO(out_openerr, rc);
590                 }
591         } else {
592                 LASSERT(*och_usecount == 0);
593                 if (!it->d.lustre.it_disposition) {
594                         /* We cannot just request lock handle now, new ELC code
595                            means that one of other OPEN locks for this file
596                            could be cancelled, and since blocking ast handler
597                            would attempt to grab och_sem as well, that would
598                            result in a deadlock */
599                         up(&lli->lli_och_sem);
600                         it->it_create_mode |= M_CHECK_STALE;
601                         rc = ll_intent_file_open(file, NULL, 0, it);
602                         it->it_create_mode &= ~M_CHECK_STALE;
603                         if (rc) {
604                                 ll_file_data_put(fd);
605                                 GOTO(out_openerr, rc);
606                         }
607
608                         /* Got some error? Release the request */
609                         if (it->d.lustre.it_status < 0) {
610                                 req = it->d.lustre.it_data;
611                                 ptlrpc_req_finished(req);
612                         }
                            /* lli_och_sem was dropped above, so the handle
                             * state has to be examined again. */
613                         goto restart;
614                 }
615                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
616                 if (!*och_p) {
617                         ll_file_data_put(fd);
618                         GOTO(out_och_free, rc = -ENOMEM);
619                 }
620                 (*och_usecount)++;
621                 req = it->d.lustre.it_data;
622
623                 /* md_intent_lock() didn't get a request ref if there was an
624                  * open error, so don't do cleanup on the request here
625                  * (bug 3430) */
626                 /* XXX (green): Should not we bail out on any error here, not
627                  * just open error? */
628                 rc = it_open_error(DISP_OPEN_OPEN, it);
629                 if (rc) {
630                         ll_file_data_put(fd);
631                         GOTO(out_och_free, rc);
632                 }
633
634                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
635                 rc = ll_local_open(file, it, fd, *och_p);
636                 if (rc) {
637                         ll_file_data_put(fd);
638                         GOTO(out_och_free, rc);
639                 }
640         }
641         up(&lli->lli_och_sem);
642
643         /* Must do this outside lli_och_sem lock to prevent deadlock where
644            different kind of OPEN lock for this same inode gets cancelled
645            by ldlm_cancel_lru */
646         if (!S_ISREG(inode->i_mode))
647                 GOTO(out, rc);
648
649         ll_capa_open(inode);
650
651         lsm = lli->lli_smd;
652         if (lsm == NULL) {
653                 if (file->f_flags & O_LOV_DELAY_CREATE ||
654                     !(file->f_mode & FMODE_WRITE)) {
655                         CDEBUG(D_INODE, "object creation was delayed\n");
656                         GOTO(out, rc);
657                 }
658         }
659         file->f_flags &= ~O_LOV_DELAY_CREATE;
660         GOTO(out, rc);
661 out:
            /* Every jump to this label happens after lli_och_sem has been
             * released and with rc still 0, so the error block below is
             * skipped on this path. */
662         ptlrpc_req_finished(req);
663         if (req)
664                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
665 out_och_free:
            /* The error jumps to this label are made with lli_och_sem held:
             * undo the handle allocation and its use count, then unlock. */
666         if (rc) {
667                 if (*och_p) {
668                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
669                         *och_p = NULL; /* OBD_FREE writes some magic there */
670                         (*och_usecount)--;
671                 }
672                 up(&lli->lli_och_sem);
673 out_openerr:
                    /* Entered by GOTO with rc != 0 after lli_och_sem has
                     * already been released by the jumping code. */
674                 if (opendir_set != 0)
675                         ll_stop_statahead(inode, lli->lli_opendir_key);
676         }
677
678         return rc;
679 }
680
681 /* Fills the obdo with the attributes for the lsm */
682 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
683                           struct obd_capa *capa, struct obdo *obdo)
684 {
685         struct ptlrpc_request_set *set;
686         struct obd_info            oinfo = { { { 0 } } };
687         int                        rc;
688
689         ENTRY;
690
691         LASSERT(lsm != NULL);
692
693         oinfo.oi_md = lsm;
694         oinfo.oi_oa = obdo;
695         oinfo.oi_oa->o_id = lsm->lsm_object_id;
696         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
697         oinfo.oi_oa->o_mode = S_IFREG;
698         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
699                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
700                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
701                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
702                                OBD_MD_FLGROUP;
703         oinfo.oi_capa = capa;
704
705         set = ptlrpc_prep_set();
706         if (set == NULL) {
707                 CERROR("can't allocate ptlrpc set\n");
708                 rc = -ENOMEM;
709         } else {
710                 rc = obd_getattr_async(exp, &oinfo, set);
711                 if (rc == 0)
712                         rc = ptlrpc_set_wait(set);
713                 ptlrpc_set_destroy(set);
714         }
715         if (rc == 0)
716                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
717                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
718                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
719         RETURN(rc);
720 }
721
722 /* Fills the obdo with the attributes for the inode defined by lsm */
723 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
724 {
725         struct ll_inode_info *lli  = ll_i2info(inode);
726         struct obd_capa      *capa = ll_mdscapa_get(inode);
727         int rc;
728         ENTRY;
729
730         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
731         capa_put(capa);
732         if (rc == 0) {
733                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
734                 CDEBUG(D_INODE,
735                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
736                        lli->lli_smd->lsm_object_id, i_size_read(inode),
737                        (unsigned long long)inode->i_blocks,
738                        (unsigned long)ll_inode_blksize(inode));
739         }
740         RETURN(rc);
741 }
742
743 int ll_merge_lvb(struct inode *inode)
744 {
745         struct ll_inode_info *lli = ll_i2info(inode);
746         struct ll_sb_info *sbi = ll_i2sbi(inode);
747         struct ost_lvb lvb;
748         int rc;
749
750         ENTRY;
751
752         ll_inode_size_lock(inode, 1);
753         inode_init_lvb(inode, &lvb);
754         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
755         i_size_write(inode, lvb.lvb_size);
756         inode->i_blocks = lvb.lvb_blocks;
757
758         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
759         LTIME_S(inode->i_atime) = lvb.lvb_atime;
760         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
761         ll_inode_size_unlock(inode, 1);
762
763         RETURN(rc);
764 }
765
766 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
767                      lstat_t *st)
768 {
769         struct obdo obdo = { 0 };
770         int rc;
771
772         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
773         if (rc == 0) {
774                 st->st_size   = obdo.o_size;
775                 st->st_blocks = obdo.o_blocks;
776                 st->st_mtime  = obdo.o_mtime;
777                 st->st_atime  = obdo.o_atime;
778                 st->st_ctime  = obdo.o_ctime;
779         }
780         return rc;
781 }
782
783 void ll_io_init(struct cl_io *io, const struct file *file, int write)
784 {
785         struct inode *inode = file->f_dentry->d_inode;
786
787         memset(io, 0, sizeof *io);
788         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
789         if (write)
790                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
791         io->ci_obj     = ll_i2info(inode)->lli_clob;
792         io->ci_lockreq = CILR_MAYBE;
793         if (ll_file_nolock(file)) {
794                 io->ci_lockreq = CILR_NEVER;
795                 io->ci_no_srvlock = 1;
796         } else if (file->f_flags & O_APPEND) {
797                 io->ci_lockreq = CILR_MANDATORY;
798         }
799 }
800
801 static ssize_t ll_file_io_generic(const struct lu_env *env,
802                 struct ccc_io_args *args, struct file *file,
803                 enum cl_io_type iot, loff_t *ppos, size_t count)
804 {
805         struct cl_io       *io;
806         ssize_t             result;
807         ENTRY;
808
809         io = &ccc_env_info(env)->cti_io;
810         ll_io_init(io, file, iot == CIT_WRITE);
811
812         if (iot == CIT_READ)
813                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
814
815         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
816                 struct vvp_io *vio = vvp_env_io(env);
817                 struct ccc_io *cio = ccc_env_io(env);
818                 if (cl_io_is_sendfile(io)) {
819                         vio->u.read.cui_actor = args->cia_actor;
820                         vio->u.read.cui_target = args->cia_target;
821                 } else {
822                         cio->cui_iov = args->cia_iov;
823                         cio->cui_nrsegs = args->cia_nrsegs;
824 #ifndef HAVE_FILE_WRITEV
825                         cio->cui_iocb = args->cia_iocb;
826 #endif
827                 }
828                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
829                 result = cl_io_loop(env, io);
830         } else
831                 /* cl_io_rw_init() handled IO */
832                 result = io->ci_result;
833         if (io->ci_nob > 0) {
834                 result = io->ci_nob;
835                 *ppos = io->u.ci_wr.wr.crw_pos;
836         }
837         cl_io_fini(env, io);
838         RETURN(result);
839 }
840
841
842 /*
843  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
844  */
845 static int ll_file_get_iov_count(const struct iovec *iov,
846                                  unsigned long *nr_segs, size_t *count)
847 {
848         size_t cnt = 0;
849         unsigned long seg;
850
851         for (seg = 0; seg < *nr_segs; seg++) {
852                 const struct iovec *iv = &iov[seg];
853
854                 /*
855                  * If any segment has a negative length, or the cumulative
856                  * length ever wraps negative then return -EINVAL.
857                  */
858                 cnt += iv->iov_len;
859                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
860                         return -EINVAL;
861                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
862                         continue;
863                 if (seg == 0)
864                         return -EFAULT;
865                 *nr_segs = seg;
866                 cnt -= iv->iov_len;   /* This segment is no good */
867                 break;
868         }
869         *count = cnt;
870         return 0;
871 }
872
873 #ifdef HAVE_FILE_READV
874 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
875                               unsigned long nr_segs, loff_t *ppos)
876 {
877         struct lu_env      *env;
878         struct ccc_io_args *args;
879         size_t              count;
880         ssize_t             result;
881         int                 refcheck;
882         ENTRY;
883
884         result = ll_file_get_iov_count(iov, &nr_segs, &count);
885         if (result)
886                 RETURN(result);
887
888         env = cl_env_get(&refcheck);
889         if (IS_ERR(env))
890                 RETURN(PTR_ERR(env));
891
892         args = &vvp_env_info(env)->vti_args;
893         args->cia_is_sendfile = 0;
894         args->cia_iov = (struct iovec *)iov;
895         args->cia_nrsegs = nr_segs;
896         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
897         cl_env_put(env, &refcheck);
898         RETURN(result);
899 }
900
901 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
902                             loff_t *ppos)
903 {
904         struct lu_env *env;
905         struct iovec  *local_iov;
906         ssize_t        result;
907         int            refcheck;
908         ENTRY;
909
910         env = cl_env_get(&refcheck);
911         if (IS_ERR(env))
912                 RETURN(PTR_ERR(env));
913
914         local_iov = &vvp_env_info(env)->vti_local_iov;
915         local_iov->iov_base = (void __user *)buf;
916         local_iov->iov_len = count;
917         result = ll_file_readv(file, local_iov, 1, ppos);
918         cl_env_put(env, &refcheck);
919         RETURN(result);
920 }
921
922 #else
923 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
924                                 unsigned long nr_segs, loff_t pos)
925 {
926         struct lu_env      *env;
927         struct ccc_io_args *args;
928         size_t              count;
929         ssize_t             result;
930         int                 refcheck;
931         ENTRY;
932
933         result = ll_file_get_iov_count(iov, &nr_segs, &count);
934         if (result)
935                 RETURN(result);
936
937         env = cl_env_get(&refcheck);
938         if (IS_ERR(env))
939                 RETURN(PTR_ERR(env));
940
941         args = &vvp_env_info(env)->vti_args;
942         args->cia_is_sendfile = 0;
943         args->cia_iov = (struct iovec *)iov;
944         args->cia_nrsegs = nr_segs;
945         args->cia_iocb = iocb;
946         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
947                                     &iocb->ki_pos, count);
948         cl_env_put(env, &refcheck);
949         RETURN(result);
950 }
951
952 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
953                             loff_t *ppos)
954 {
955         struct lu_env *env;
956         struct iovec  *local_iov;
957         struct kiocb  *kiocb;
958         ssize_t        result;
959         int            refcheck;
960         ENTRY;
961
962         env = cl_env_get(&refcheck);
963         if (IS_ERR(env))
964                 RETURN(PTR_ERR(env));
965
966         local_iov = &vvp_env_info(env)->vti_local_iov;
967         kiocb = &vvp_env_info(env)->vti_kiocb;
968         local_iov->iov_base = (void __user *)buf;
969         local_iov->iov_len = count;
970         init_sync_kiocb(kiocb, file);
971         kiocb->ki_pos = *ppos;
972         kiocb->ki_left = count;
973
974         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
975         *ppos = kiocb->ki_pos;
976
977         cl_env_put(env, &refcheck);
978         RETURN(result);
979 }
980 #endif
981
982 /*
983  * Write to a file (through the page cache).
984  */
985 #ifdef HAVE_FILE_WRITEV
986 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
987                               unsigned long nr_segs, loff_t *ppos)
988 {
989         struct lu_env      *env;
990         struct ccc_io_args *args;
991         size_t              count;
992         ssize_t             result;
993         int                 refcheck;
994         ENTRY;
995
996         result = ll_file_get_iov_count(iov, &nr_segs, &count);
997         if (result)
998                 RETURN(result);
999
1000         env = cl_env_get(&refcheck);
1001         if (IS_ERR(env))
1002                 RETURN(PTR_ERR(env));
1003
1004         args = &vvp_env_info(env)->vti_args;
1005         args->cia_iov = (struct iovec *)iov;
1006         args->cia_nrsegs = nr_segs;
1007         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1008         cl_env_put(env, &refcheck);
1009         RETURN(result);
1010 }
1011
1012 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1013                              loff_t *ppos)
1014 {
1015         struct lu_env    *env;
1016         struct iovec     *local_iov;
1017         ssize_t           result;
1018         int               refcheck;
1019         ENTRY;
1020
1021         env = cl_env_get(&refcheck);
1022         if (IS_ERR(env))
1023                 RETURN(PTR_ERR(env));
1024
1025         local_iov = &vvp_env_info(env)->vti_local_iov;
1026         local_iov->iov_base = (void __user *)buf;
1027         local_iov->iov_len = count;
1028
1029         result = ll_file_writev(file, local_iov, 1, ppos);
1030         cl_env_put(env, &refcheck);
1031         RETURN(result);
1032 }
1033
1034 #else /* AIO stuff */
1035 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1036                                  unsigned long nr_segs, loff_t pos)
1037 {
1038         struct lu_env      *env;
1039         struct ccc_io_args *args;
1040         size_t              count;
1041         ssize_t             result;
1042         int                 refcheck;
1043         ENTRY;
1044
1045         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1046         if (result)
1047                 RETURN(result);
1048
1049         env = cl_env_get(&refcheck);
1050         if (IS_ERR(env))
1051                 RETURN(PTR_ERR(env));
1052
1053         args = &vvp_env_info(env)->vti_args;
1054         args->cia_iov = (struct iovec *)iov;
1055         args->cia_nrsegs = nr_segs;
1056         args->cia_iocb = iocb;
1057         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1058                                   &iocb->ki_pos, count);
1059         cl_env_put(env, &refcheck);
1060         RETURN(result);
1061 }
1062
1063 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1064                              loff_t *ppos)
1065 {
1066         struct lu_env *env;
1067         struct iovec  *local_iov;
1068         struct kiocb  *kiocb;
1069         ssize_t        result;
1070         int            refcheck;
1071         ENTRY;
1072
1073         env = cl_env_get(&refcheck);
1074         if (IS_ERR(env))
1075                 RETURN(PTR_ERR(env));
1076
1077         local_iov = &vvp_env_info(env)->vti_local_iov;
1078         kiocb = &vvp_env_info(env)->vti_kiocb;
1079         local_iov->iov_base = (void __user *)buf;
1080         local_iov->iov_len = count;
1081         init_sync_kiocb(kiocb, file);
1082         kiocb->ki_pos = *ppos;
1083         kiocb->ki_left = count;
1084
1085         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1086         *ppos = kiocb->ki_pos;
1087
1088         cl_env_put(env, &refcheck);
1089         RETURN(result);
1090 }
1091 #endif
1092
1093
1094 /*
1095  * Send file content (through pagecache) somewhere with helper
1096  */
1097 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1098                                 read_actor_t actor, void *target)
1099 {
1100         struct lu_env      *env;
1101         struct ccc_io_args *args;
1102         ssize_t             result;
1103         int                 refcheck;
1104         ENTRY;
1105
1106         env = cl_env_get(&refcheck);
1107         if (IS_ERR(env))
1108                 RETURN(PTR_ERR(env));
1109
1110         args = &vvp_env_info(env)->vti_args;
1111         args->cia_is_sendfile = 1;
1112         args->cia_target = target;
1113         args->cia_actor = actor;
1114         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1115         cl_env_put(env, &refcheck);
1116         RETURN(result);
1117 }
1118
1119 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1120                                unsigned long arg)
1121 {
1122         struct obd_export *exp = ll_i2dtexp(inode);
1123         struct ll_recreate_obj ucreatp;
1124         struct obd_trans_info oti = { 0 };
1125         struct obdo *oa = NULL;
1126         int lsm_size;
1127         int rc = 0;
1128         struct lov_stripe_md *lsm, *lsm2;
1129         ENTRY;
1130
1131         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1132                 RETURN(-EPERM);
1133
1134         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1135                            sizeof(struct ll_recreate_obj)))
1136                 RETURN(-EFAULT);
1137
1138         OBDO_ALLOC(oa);
1139         if (oa == NULL)
1140                 RETURN(-ENOMEM);
1141
1142         ll_inode_size_lock(inode, 0);
1143         lsm = ll_i2info(inode)->lli_smd;
1144         if (lsm == NULL)
1145                 GOTO(out, rc = -ENOENT);
1146         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1147                    (lsm->lsm_stripe_count));
1148
1149         OBD_ALLOC(lsm2, lsm_size);
1150         if (lsm2 == NULL)
1151                 GOTO(out, rc = -ENOMEM);
1152
1153         oa->o_id = ucreatp.lrc_id;
1154         oa->o_gr = ucreatp.lrc_group;
1155         oa->o_nlink = ucreatp.lrc_ost_idx;
1156         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1157         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1158         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1159                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1160
1161         memcpy(lsm2, lsm, lsm_size);
1162         rc = obd_create(exp, oa, &lsm2, &oti);
1163
1164         OBD_FREE(lsm2, lsm_size);
1165         GOTO(out, rc);
1166 out:
1167         ll_inode_size_unlock(inode, 0);
1168         OBDO_FREE(oa);
1169         return rc;
1170 }
1171
1172 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1173                              int flags, struct lov_user_md *lum, int lum_size)
1174 {
1175         struct lov_stripe_md *lsm;
1176         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1177         int rc = 0;
1178         ENTRY;
1179
1180         ll_inode_size_lock(inode, 0);
1181         lsm = ll_i2info(inode)->lli_smd;
1182         if (lsm) {
1183                 ll_inode_size_unlock(inode, 0);
1184                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1185                        inode->i_ino);
1186                 RETURN(-EEXIST);
1187         }
1188
1189         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1190         if (rc)
1191                 GOTO(out, rc);
1192         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1193                 GOTO(out_req_free, rc = -ENOENT);
1194         rc = oit.d.lustre.it_status;
1195         if (rc < 0)
1196                 GOTO(out_req_free, rc);
1197
1198         ll_release_openhandle(file->f_dentry, &oit);
1199
1200  out:
1201         ll_inode_size_unlock(inode, 0);
1202         ll_intent_release(&oit);
1203         RETURN(rc);
1204 out_req_free:
1205         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1206         goto out;
1207 }
1208
1209 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1210                              struct lov_mds_md **lmmp, int *lmm_size,
1211                              struct ptlrpc_request **request)
1212 {
1213         struct ll_sb_info *sbi = ll_i2sbi(inode);
1214         struct mdt_body  *body;
1215         struct lov_mds_md *lmm = NULL;
1216         struct ptlrpc_request *req = NULL;
1217         struct obd_capa *oc;
1218         int rc, lmmsize;
1219
1220         rc = ll_get_max_mdsize(sbi, &lmmsize);
1221         if (rc)
1222                 RETURN(rc);
1223
1224         oc = ll_mdscapa_get(inode);
1225         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1226                              oc, filename, strlen(filename) + 1,
1227                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1228                              ll_i2suppgid(inode), &req);
1229         capa_put(oc);
1230         if (rc < 0) {
1231                 CDEBUG(D_INFO, "md_getattr_name failed "
1232                        "on %s: rc %d\n", filename, rc);
1233                 GOTO(out, rc);
1234         }
1235
1236         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1237         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1238
1239         lmmsize = body->eadatasize;
1240
1241         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1242                         lmmsize == 0) {
1243                 GOTO(out, rc = -ENODATA);
1244         }
1245
1246         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1247         LASSERT(lmm != NULL);
1248
1249         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1250             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1251             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1252                 GOTO(out, rc = -EPROTO);
1253         }
1254
1255         /*
1256          * This is coming from the MDS, so is probably in
1257          * little endian.  We convert it to host endian before
1258          * passing it to userspace.
1259          */
1260         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1261                 /* if function called for directory - we should
1262                  * avoid swab not existent lsm objects */
1263                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1264                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1265                         if (S_ISREG(body->mode))
1266                                 lustre_swab_lov_user_md_objects(
1267                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1268                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1269                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1270                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1271                         if (S_ISREG(body->mode))
1272                                 lustre_swab_lov_user_md_objects(
1273                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1274                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1275                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1276                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1277                 }
1278         }
1279
1280         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1281                 struct lov_stripe_md *lsm;
1282                 struct lov_user_md_join *lmj;
1283                 int lmj_size, i, aindex = 0;
1284
1285                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1286                 if (rc < 0)
1287                         GOTO(out, rc = -ENOMEM);
1288                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1289                 if (rc)
1290                         GOTO(out_free_memmd, rc);
1291
1292                 lmj_size = sizeof(struct lov_user_md_join) +
1293                            lsm->lsm_stripe_count *
1294                            sizeof(struct lov_user_ost_data_join);
1295                 OBD_ALLOC(lmj, lmj_size);
1296                 if (!lmj)
1297                         GOTO(out_free_memmd, rc = -ENOMEM);
1298
1299                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1300                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1301                         struct lov_extent *lex =
1302                                 &lsm->lsm_array->lai_ext_array[aindex];
1303
1304                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1305                                 aindex ++;
1306                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1307                                         LPU64" len %d\n", aindex, i,
1308                                         lex->le_start, (int)lex->le_len);
1309                         lmj->lmm_objects[i].l_extent_start =
1310                                 lex->le_start;
1311
1312                         if ((int)lex->le_len == -1)
1313                                 lmj->lmm_objects[i].l_extent_end = -1;
1314                         else
1315                                 lmj->lmm_objects[i].l_extent_end =
1316                                         lex->le_start + lex->le_len;
1317                         lmj->lmm_objects[i].l_object_id =
1318                                 lsm->lsm_oinfo[i]->loi_id;
1319                         lmj->lmm_objects[i].l_object_gr =
1320                                 lsm->lsm_oinfo[i]->loi_gr;
1321                         lmj->lmm_objects[i].l_ost_gen =
1322                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1323                         lmj->lmm_objects[i].l_ost_idx =
1324                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1325                 }
1326                 lmm = (struct lov_mds_md *)lmj;
1327                 lmmsize = lmj_size;
1328 out_free_memmd:
1329                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1330         }
1331 out:
1332         *lmmp = lmm;
1333         *lmm_size = lmmsize;
1334         *request = req;
1335         return rc;
1336 }
1337
1338 static int ll_lov_setea(struct inode *inode, struct file *file,
1339                             unsigned long arg)
1340 {
1341         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1342         struct lov_user_md  *lump;
1343         int lum_size = sizeof(struct lov_user_md) +
1344                        sizeof(struct lov_user_ost_data);
1345         int rc;
1346         ENTRY;
1347
1348         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1349                 RETURN(-EPERM);
1350
1351         OBD_ALLOC(lump, lum_size);
1352         if (lump == NULL) {
1353                 RETURN(-ENOMEM);
1354         }
1355         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1356                 OBD_FREE(lump, lum_size);
1357                 RETURN(-EFAULT);
1358         }
1359
1360         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1361
1362         OBD_FREE(lump, lum_size);
1363         RETURN(rc);
1364 }
1365
1366 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1367                             unsigned long arg)
1368 {
1369         struct lov_user_md_v3 lumv3;
1370         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1371         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1372         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1373         int lum_size;
1374         int rc;
1375         int flags = FMODE_WRITE;
1376         ENTRY;
1377
1378         /* first try with v1 which is smaller than v3 */
1379         lum_size = sizeof(struct lov_user_md_v1);
1380         if (copy_from_user(lumv1, lumv1p, lum_size))
1381                 RETURN(-EFAULT);
1382
1383         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1384                 lum_size = sizeof(struct lov_user_md_v3);
1385                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1386                         RETURN(-EFAULT);
1387         }
1388
1389         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1390         if (rc == 0) {
1391                  put_user(0, &lumv1p->lmm_stripe_count);
1392                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1393                                     0, ll_i2info(inode)->lli_smd,
1394                                     (void *)arg);
1395         }
1396         RETURN(rc);
1397 }
1398
1399 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1400 {
1401         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1402
1403         if (!lsm)
1404                 RETURN(-ENODATA);
1405
1406         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1407                             (void *)arg);
1408 }
1409
1410 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1411 {
1412         struct ll_inode_info   *lli = ll_i2info(inode);
1413         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1414         struct ccc_grouplock    grouplock;
1415         int                     rc;
1416         ENTRY;
1417
1418         if (ll_file_nolock(file))
1419                 RETURN(-EOPNOTSUPP);
1420
1421         spin_lock(&lli->lli_lock);
1422         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1423                 CERROR("group lock already existed with gid %lu\n",
1424                        fd->fd_grouplock.cg_gid);
1425                 spin_unlock(&lli->lli_lock);
1426                 RETURN(-EINVAL);
1427         }
1428         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1429         spin_unlock(&lli->lli_lock);
1430
1431         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1432                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1433         if (rc)
1434                 RETURN(rc);
1435
1436         spin_lock(&lli->lli_lock);
1437         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1438                 spin_unlock(&lli->lli_lock);
1439                 CERROR("another thread just won the race\n");
1440                 cl_put_grouplock(&grouplock);
1441                 RETURN(-EINVAL);
1442         }
1443
1444         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1445         fd->fd_grouplock = grouplock;
1446         spin_unlock(&lli->lli_lock);
1447
1448         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1449         RETURN(0);
1450 }
1451
1452 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1453 {
1454         struct ll_inode_info   *lli = ll_i2info(inode);
1455         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1456         struct ccc_grouplock    grouplock;
1457         ENTRY;
1458
1459         spin_lock(&lli->lli_lock);
1460         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1461                 spin_unlock(&lli->lli_lock);
1462                 CERROR("no group lock held\n");
1463                 RETURN(-EINVAL);
1464         }
1465         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1466
1467         if (fd->fd_grouplock.cg_gid != arg) {
1468                 CERROR("group lock %lu doesn't match current id %lu\n",
1469                        arg, fd->fd_grouplock.cg_gid);
1470                 spin_unlock(&lli->lli_lock);
1471                 RETURN(-EINVAL);
1472         }
1473
1474         grouplock = fd->fd_grouplock;
1475         fd->fd_grouplock.cg_env = NULL;
1476         fd->fd_grouplock.cg_lock = NULL;
1477         fd->fd_grouplock.cg_gid = 0;
1478         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1479         spin_unlock(&lli->lli_lock);
1480
1481         cl_put_grouplock(&grouplock);
1482         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1483         RETURN(0);
1484 }
1485
1486 #if LUSTRE_FIX >= 50
1487 static int join_sanity_check(struct inode *head, struct inode *tail)
1488 {
1489         ENTRY;
1490         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1491                 CERROR("server do not support join \n");
1492                 RETURN(-EINVAL);
1493         }
1494         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1495                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1496                        head->i_ino, tail->i_ino);
1497                 RETURN(-EINVAL);
1498         }
1499         if (head->i_ino == tail->i_ino) {
1500                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1501                 RETURN(-EINVAL);
1502         }
1503         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1504                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1505                 RETURN(-EINVAL);
1506         }
1507         RETURN(0);
1508 }
1509
1510 static int join_file(struct inode *head_inode, struct file *head_filp,
1511                      struct file *tail_filp)
1512 {
1513         struct dentry *tail_dentry = tail_filp->f_dentry;
1514         struct lookup_intent oit = {.it_op = IT_OPEN,
1515                                     .it_flags = head_filp->f_flags,
1516                                     .it_create_mode = M_JOIN_FILE};
1517         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1518                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1519
1520         struct lustre_handle lockh;
1521         struct md_op_data *op_data;
1522         int    rc;
1523         loff_t data;
1524         ENTRY;
1525
1526         tail_dentry = tail_filp->f_dentry;
1527
1528         data = i_size_read(head_inode);
1529         op_data = ll_prep_md_op_data(NULL, head_inode,
1530                                      tail_dentry->d_parent->d_inode,
1531                                      tail_dentry->d_name.name,
1532                                      tail_dentry->d_name.len, 0,
1533                                      LUSTRE_OPC_ANY, &data);
1534         if (IS_ERR(op_data))
1535                 RETURN(PTR_ERR(op_data));
1536
1537         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1538                          op_data, &lockh, NULL, 0, NULL, 0);
1539
1540         ll_finish_md_op_data(op_data);
1541         if (rc < 0)
1542                 GOTO(out, rc);
1543
1544         rc = oit.d.lustre.it_status;
1545
1546         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1547                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1548                 ptlrpc_req_finished((struct ptlrpc_request *)
1549                                     oit.d.lustre.it_data);
1550                 GOTO(out, rc);
1551         }
1552
1553         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1554                                            * away */
1555                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1556                 oit.d.lustre.it_lock_mode = 0;
1557         }
1558         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1559         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1560         ll_release_openhandle(head_filp->f_dentry, &oit);
1561 out:
1562         ll_intent_release(&oit);
1563         RETURN(rc);
1564 }
1565
1566 static int ll_file_join(struct inode *head, struct file *filp,
1567                         char *filename_tail)
1568 {
1569         struct inode *tail = NULL, *first = NULL, *second = NULL;
1570         struct dentry *tail_dentry;
1571         struct file *tail_filp, *first_filp, *second_filp;
1572         struct ll_lock_tree first_tree, second_tree;
1573         struct ll_lock_tree_node *first_node, *second_node;
1574         struct ll_inode_info *hlli = ll_i2info(head);
1575         int rc = 0, cleanup_phase = 0;
1576         ENTRY;
1577
1578         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1579                head->i_ino, head->i_generation, head, filename_tail);
1580
1581         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1582         if (IS_ERR(tail_filp)) {
1583                 CERROR("Can not open tail file %s", filename_tail);
1584                 rc = PTR_ERR(tail_filp);
1585                 GOTO(cleanup, rc);
1586         }
1587         tail = igrab(tail_filp->f_dentry->d_inode);
1588
1589         tail_dentry = tail_filp->f_dentry;
1590         LASSERT(tail_dentry);
1591         cleanup_phase = 1;
1592
1593         /*reorder the inode for lock sequence*/
1594         first = head->i_ino > tail->i_ino ? head : tail;
1595         second = head->i_ino > tail->i_ino ? tail : head;
1596         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1597         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1598
1599         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1600                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1601         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1602         if (IS_ERR(first_node)){
1603                 rc = PTR_ERR(first_node);
1604                 GOTO(cleanup, rc);
1605         }
1606         first_tree.lt_fd = first_filp->private_data;
1607         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1608         if (rc != 0)
1609                 GOTO(cleanup, rc);
1610         cleanup_phase = 2;
1611
1612         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1613         if (IS_ERR(second_node)){
1614                 rc = PTR_ERR(second_node);
1615                 GOTO(cleanup, rc);
1616         }
1617         second_tree.lt_fd = second_filp->private_data;
1618         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1619         if (rc != 0)
1620                 GOTO(cleanup, rc);
1621         cleanup_phase = 3;
1622
1623         rc = join_sanity_check(head, tail);
1624         if (rc)
1625                 GOTO(cleanup, rc);
1626
1627         rc = join_file(head, filp, tail_filp);
1628         if (rc)
1629                 GOTO(cleanup, rc);
1630 cleanup:
1631         switch (cleanup_phase) {
1632         case 3:
1633                 ll_tree_unlock(&second_tree);
1634                 obd_cancel_unused(ll_i2dtexp(second),
1635                                   ll_i2info(second)->lli_smd, 0, NULL);
1636         case 2:
1637                 ll_tree_unlock(&first_tree);
1638                 obd_cancel_unused(ll_i2dtexp(first),
1639                                   ll_i2info(first)->lli_smd, 0, NULL);
1640         case 1:
1641                 filp_close(tail_filp, 0);
1642                 if (tail)
1643                         iput(tail);
1644                 if (head && rc == 0) {
1645                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1646                                        &hlli->lli_smd);
1647                         hlli->lli_smd = NULL;
1648                 }
1649         case 0:
1650                 break;
1651         default:
1652                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1653                 LBUG();
1654         }
1655         RETURN(rc);
1656 }
1657 #endif /* LUSTRE_FIX >= 50 */
1658
1659 /**
1660  * Close inode open handle
1661  *
1662  * \param dentry [in]     dentry which contains the inode
1663  * \param it     [in,out] intent which contains open info and result
1664  *
1665  * \retval 0     success
1666  * \retval <0    failure
1667  */
1668 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1669 {
1670         struct inode *inode = dentry->d_inode;
1671         struct obd_client_handle *och;
1672         int rc;
1673         ENTRY;
1674
1675         LASSERT(inode);
1676
1677         /* Root ? Do nothing. */
1678         if (dentry->d_inode->i_sb->s_root == dentry)
1679                 RETURN(0);
1680
1681         /* No open handle to close? Move away */
1682         if (!it_disposition(it, DISP_OPEN_OPEN))
1683                 RETURN(0);
1684
1685         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1686
1687         OBD_ALLOC(och, sizeof(*och));
1688         if (!och)
1689                 GOTO(out, rc = -ENOMEM);
1690
1691         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1692                     ll_i2info(inode), it, och);
1693
1694         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1695                                        inode, och);
1696  out:
1697         /* this one is in place of ll_file_open */
1698         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1699                 ptlrpc_req_finished(it->d.lustre.it_data);
1700         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1701         RETURN(rc);
1702 }
1703
1704 /**
1705  * Get size for inode for which FIEMAP mapping is requested.
1706  * Make the FIEMAP get_info call and returns the result.
1707  */
1708 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1709               int num_bytes)
1710 {
1711         struct obd_export *exp = ll_i2dtexp(inode);
1712         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1713         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1714         int vallen = num_bytes;
1715         int rc;
1716         ENTRY;
1717
1718         /* If the stripe_count > 1 and the application does not understand
1719          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1720          */
1721         if (lsm->lsm_stripe_count > 1 &&
1722             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1723                 return -EOPNOTSUPP;
1724
1725         fm_key.oa.o_id = lsm->lsm_object_id;
1726         fm_key.oa.o_gr = lsm->lsm_object_gr;
1727         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1728
1729         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1730                         OBD_MD_FLSIZE);
1731
1732         /* If filesize is 0, then there would be no objects for mapping */
1733         if (fm_key.oa.o_size == 0) {
1734                 fiemap->fm_mapped_extents = 0;
1735                 RETURN(0);
1736         }
1737
1738         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1739
1740         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1741         if (rc)
1742                 CERROR("obd_get_info failed: rc = %d\n", rc);
1743
1744         RETURN(rc);
1745 }
1746
1747 int ll_fid2path(struct obd_export *exp, void *arg)
1748 {
1749         struct getinfo_fid2path *gfout, *gfin;
1750         int outsize, rc;
1751         ENTRY;
1752
1753         /* Need to get the buflen */
1754         OBD_ALLOC_PTR(gfin);
1755         if (gfin == NULL)
1756                 RETURN(-ENOMEM);
1757         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1758                 OBD_FREE_PTR(gfin);
1759                 RETURN(-EFAULT);
1760         }
1761
1762         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1763         OBD_ALLOC(gfout, outsize);
1764         if (gfout == NULL) {
1765                 OBD_FREE_PTR(gfin);
1766                 RETURN(-ENOMEM);
1767         }
1768         memcpy(gfout, gfin, sizeof(*gfout));
1769         OBD_FREE_PTR(gfin);
1770
1771         /* Call mdc_iocontrol */
1772         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1773         if (rc)
1774                 GOTO(gf_free, rc);
1775         if (copy_to_user(arg, gfout, outsize))
1776                 rc = -EFAULT;
1777
1778 gf_free:
1779         OBD_FREE(gfout, outsize);
1780         RETURN(rc);
1781 }
1782
1783 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1784                   unsigned long arg)
1785 {
1786         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1787         int flags;
1788         ENTRY;
1789
1790         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1791                inode->i_generation, inode, cmd);
1792         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1793
1794         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1795         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1796                 RETURN(-ENOTTY);
1797
1798         switch(cmd) {
1799         case LL_IOC_GETFLAGS:
1800                 /* Get the current value of the file flags */
1801                 return put_user(fd->fd_flags, (int *)arg);
1802         case LL_IOC_SETFLAGS:
1803         case LL_IOC_CLRFLAGS:
1804                 /* Set or clear specific file flags */
1805                 /* XXX This probably needs checks to ensure the flags are
1806                  *     not abused, and to handle any flag side effects.
1807                  */
1808                 if (get_user(flags, (int *) arg))
1809                         RETURN(-EFAULT);
1810
1811                 if (cmd == LL_IOC_SETFLAGS) {
1812                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1813                             !(file->f_flags & O_DIRECT)) {
1814                                 CERROR("%s: unable to disable locking on "
1815                                        "non-O_DIRECT file\n", current->comm);
1816                                 RETURN(-EINVAL);
1817                         }
1818
1819                         fd->fd_flags |= flags;
1820                 } else {
1821                         fd->fd_flags &= ~flags;
1822                 }
1823                 RETURN(0);
1824         case LL_IOC_LOV_SETSTRIPE:
1825                 RETURN(ll_lov_setstripe(inode, file, arg));
1826         case LL_IOC_LOV_SETEA:
1827                 RETURN(ll_lov_setea(inode, file, arg));
1828         case LL_IOC_LOV_GETSTRIPE:
1829                 RETURN(ll_lov_getstripe(inode, arg));
1830         case LL_IOC_RECREATE_OBJ:
1831                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1832         case FSFILT_IOC_FIEMAP: {
1833                 struct ll_user_fiemap *fiemap_s;
1834                 size_t num_bytes, ret_bytes;
1835                 unsigned int extent_count;
1836                 int rc = 0;
1837
1838                 /* Get the extent count so we can calculate the size of
1839                  * required fiemap buffer */
1840                 if (get_user(extent_count,
1841                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1842                         RETURN(-EFAULT);
1843                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1844                                                  sizeof(struct ll_fiemap_extent));
1845                 OBD_VMALLOC(fiemap_s, num_bytes);
1846                 if (fiemap_s == NULL)
1847                         RETURN(-ENOMEM);
1848
1849                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1850                                    sizeof(*fiemap_s)))
1851                         GOTO(error, rc = -EFAULT);
1852
1853                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1854                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1855                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1856                         if (copy_to_user((char *)arg, fiemap_s,
1857                                          sizeof(*fiemap_s)))
1858                                 GOTO(error, rc = -EFAULT);
1859
1860                         GOTO(error, rc = -EBADR);
1861                 }
1862
1863                 /* If fm_extent_count is non-zero, read the first extent since
1864                  * it is used to calculate end_offset and device from previous
1865                  * fiemap call. */
1866                 if (extent_count) {
1867                         if (copy_from_user(&fiemap_s->fm_extents[0],
1868                             (char __user *)arg + sizeof(*fiemap_s),
1869                             sizeof(struct ll_fiemap_extent)))
1870                                 GOTO(error, rc = -EFAULT);
1871                 }
1872
1873                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1874                         int rc;
1875
1876                         rc = filemap_fdatawrite(inode->i_mapping);
1877                         if (rc)
1878                                 GOTO(error, rc);
1879                 }
1880
1881                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1882                 if (rc)
1883                         GOTO(error, rc);
1884
1885                 ret_bytes = sizeof(struct ll_user_fiemap);
1886
1887                 if (extent_count != 0)
1888                         ret_bytes += (fiemap_s->fm_mapped_extents *
1889                                          sizeof(struct ll_fiemap_extent));
1890
1891                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1892                         rc = -EFAULT;
1893
1894 error:
1895                 OBD_VFREE(fiemap_s, num_bytes);
1896                 RETURN(rc);
1897         }
1898         case FSFILT_IOC_GETFLAGS:
1899         case FSFILT_IOC_SETFLAGS:
1900                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1901         case FSFILT_IOC_GETVERSION_OLD:
1902         case FSFILT_IOC_GETVERSION:
1903                 RETURN(put_user(inode->i_generation, (int *)arg));
1904         case LL_IOC_JOIN: {
1905 #if LUSTRE_FIX >= 50
1906                 /* Allow file join in beta builds to allow debuggging */
1907                 char *ftail;
1908                 int rc;
1909
1910                 ftail = getname((const char *)arg);
1911                 if (IS_ERR(ftail))
1912                         RETURN(PTR_ERR(ftail));
1913                 rc = ll_file_join(inode, file, ftail);
1914                 putname(ftail);
1915                 RETURN(rc);
1916 #else
1917                 CWARN("file join is not supported in this version of Lustre\n");
1918                 RETURN(-ENOTTY);
1919 #endif
1920         }
1921         case LL_IOC_GROUP_LOCK:
1922                 RETURN(ll_get_grouplock(inode, file, arg));
1923         case LL_IOC_GROUP_UNLOCK:
1924                 RETURN(ll_put_grouplock(inode, file, arg));
1925         case IOC_OBD_STATFS:
1926                 RETURN(ll_obd_statfs(inode, (void *)arg));
1927
1928         /* We need to special case any other ioctls we want to handle,
1929          * to send them to the MDS/OST as appropriate and to properly
1930          * network encode the arg field.
1931         case FSFILT_IOC_SETVERSION_OLD:
1932         case FSFILT_IOC_SETVERSION:
1933         */
1934         case LL_IOC_FLUSHCTX:
1935                 RETURN(ll_flush_ctx(inode));
1936         case LL_IOC_PATH2FID: {
1937                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1938                                  sizeof(struct lu_fid)))
1939                         RETURN(-EFAULT);
1940
1941                 RETURN(0);
1942         }
1943         case OBD_IOC_FID2PATH:
1944                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1945
1946         default: {
1947                 int err;
1948
1949                 if (LLIOC_STOP ==
1950                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1951                         RETURN(err);
1952
1953                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1954                                      (void *)arg));
1955         }
1956         }
1957 }
1958
1959 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1960 {
1961         struct inode *inode = file->f_dentry->d_inode;
1962         loff_t retval;
1963         ENTRY;
1964         retval = offset + ((origin == 2) ? i_size_read(inode) :
1965                            (origin == 1) ? file->f_pos : 0);
1966         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1967                inode->i_ino, inode->i_generation, inode, retval, retval,
1968                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1969         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1970
1971         if (origin == 2) { /* SEEK_END */
1972                 int nonblock = 0, rc;
1973
1974                 if (file->f_flags & O_NONBLOCK)
1975                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1976
1977                 rc = cl_glimpse_size(inode);
1978                 if (rc != 0)
1979                         RETURN(rc);
1980
1981                 ll_inode_size_lock(inode, 0);
1982                 offset += i_size_read(inode);
1983                 ll_inode_size_unlock(inode, 0);
1984         } else if (origin == 1) { /* SEEK_CUR */
1985                 offset += file->f_pos;
1986         }
1987
1988         retval = -EINVAL;
1989         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1990                 if (offset != file->f_pos) {
1991                         file->f_pos = offset;
1992                 }
1993                 retval = offset;
1994         }
1995
1996         RETURN(retval);
1997 }
1998
1999 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2000 {
2001         struct inode *inode = dentry->d_inode;
2002         struct ll_inode_info *lli = ll_i2info(inode);
2003         struct lov_stripe_md *lsm = lli->lli_smd;
2004         struct ptlrpc_request *req;
2005         struct obd_capa *oc;
2006         int rc, err;
2007         ENTRY;
2008         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2009                inode->i_generation, inode);
2010         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2011
2012         /* fsync's caller has already called _fdata{sync,write}, we want
2013          * that IO to finish before calling the osc and mdc sync methods */
2014         rc = filemap_fdatawait(inode->i_mapping);
2015
2016         /* catch async errors that were recorded back when async writeback
2017          * failed for pages in this mapping. */
2018         err = lli->lli_async_rc;
2019         lli->lli_async_rc = 0;
2020         if (rc == 0)
2021                 rc = err;
2022         if (lsm) {
2023                 err = lov_test_and_clear_async_rc(lsm);
2024                 if (rc == 0)
2025                         rc = err;
2026         }
2027
2028         oc = ll_mdscapa_get(inode);
2029         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2030                       &req);
2031         capa_put(oc);
2032         if (!rc)
2033                 rc = err;
2034         if (!err)
2035                 ptlrpc_req_finished(req);
2036
2037         if (data && lsm) {
2038                 struct obdo *oa;
2039
2040                 OBDO_ALLOC(oa);
2041                 if (!oa)
2042                         RETURN(rc ? rc : -ENOMEM);
2043
2044                 oa->o_id = lsm->lsm_object_id;
2045                 oa->o_gr = lsm->lsm_object_gr;
2046                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2047                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2048                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2049                                            OBD_MD_FLGROUP);
2050
2051                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2052                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2053                                0, OBD_OBJECT_EOF, oc);
2054                 capa_put(oc);
2055                 if (!rc)
2056                         rc = err;
2057                 OBDO_FREE(oa);
2058         }
2059
2060         RETURN(rc);
2061 }
2062
2063 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2064 {
2065         struct inode *inode = file->f_dentry->d_inode;
2066         struct ll_sb_info *sbi = ll_i2sbi(inode);
2067         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2068                                            .ei_cb_cp =ldlm_flock_completion_ast,
2069                                            .ei_cbdata = file_lock };
2070         struct md_op_data *op_data;
2071         struct lustre_handle lockh = {0};
2072         ldlm_policy_data_t flock;
2073         int flags = 0;
2074         int rc;
2075         ENTRY;
2076
2077         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2078                inode->i_ino, file_lock);
2079
2080         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2081
2082         if (file_lock->fl_flags & FL_FLOCK) {
2083                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2084                 /* set missing params for flock() calls */
2085                 file_lock->fl_end = OFFSET_MAX;
2086                 file_lock->fl_pid = current->tgid;
2087         }
2088         flock.l_flock.pid = file_lock->fl_pid;
2089         flock.l_flock.start = file_lock->fl_start;
2090         flock.l_flock.end = file_lock->fl_end;
2091
2092         switch (file_lock->fl_type) {
2093         case F_RDLCK:
2094                 einfo.ei_mode = LCK_PR;
2095                 break;
2096         case F_UNLCK:
2097                 /* An unlock request may or may not have any relation to
2098                  * existing locks so we may not be able to pass a lock handle
2099                  * via a normal ldlm_lock_cancel() request. The request may even
2100                  * unlock a byte range in the middle of an existing lock. In
2101                  * order to process an unlock request we need all of the same
2102                  * information that is given with a normal read or write record
2103                  * lock request. To avoid creating another ldlm unlock (cancel)
2104                  * message we'll treat a LCK_NL flock request as an unlock. */
2105                 einfo.ei_mode = LCK_NL;
2106                 break;
2107         case F_WRLCK:
2108                 einfo.ei_mode = LCK_PW;
2109                 break;
2110         default:
2111                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2112                 RETURN (-EINVAL);
2113         }
2114
2115         switch (cmd) {
2116         case F_SETLKW:
2117 #ifdef F_SETLKW64
2118         case F_SETLKW64:
2119 #endif
2120                 flags = 0;
2121                 break;
2122         case F_SETLK:
2123 #ifdef F_SETLK64
2124         case F_SETLK64:
2125 #endif
2126                 flags = LDLM_FL_BLOCK_NOWAIT;
2127                 break;
2128         case F_GETLK:
2129 #ifdef F_GETLK64
2130         case F_GETLK64:
2131 #endif
2132                 flags = LDLM_FL_TEST_LOCK;
2133                 /* Save the old mode so that if the mode in the lock changes we
2134                  * can decrement the appropriate reader or writer refcount. */
2135                 file_lock->fl_type = einfo.ei_mode;
2136                 break;
2137         default:
2138                 CERROR("unknown fcntl lock command: %d\n", cmd);
2139                 RETURN (-EINVAL);
2140         }
2141
2142         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2143                                      LUSTRE_OPC_ANY, NULL);
2144         if (IS_ERR(op_data))
2145                 RETURN(PTR_ERR(op_data));
2146
2147         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2148                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2149                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2150
2151         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2152                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2153
2154         ll_finish_md_op_data(op_data);
2155
2156         if ((file_lock->fl_flags & FL_FLOCK) &&
2157             (rc == 0 || file_lock->fl_type == F_UNLCK))
2158                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2159 #ifdef HAVE_F_OP_FLOCK
2160         if ((file_lock->fl_flags & FL_POSIX) &&
2161             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2162             !(flags & LDLM_FL_TEST_LOCK))
2163                 posix_lock_file_wait(file, file_lock);
2164 #endif
2165
2166         RETURN(rc);
2167 }
2168
2169 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2170 {
2171         ENTRY;
2172
2173         RETURN(-ENOSYS);
2174 }
2175
2176 int ll_have_md_lock(struct inode *inode, __u64 bits)
2177 {
2178         struct lustre_handle lockh;
2179         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2180         struct lu_fid *fid;
2181         int flags;
2182         ENTRY;
2183
2184         if (!inode)
2185                RETURN(0);
2186
2187         fid = &ll_i2info(inode)->lli_fid;
2188         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2189
2190         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2191         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2192                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2193                 RETURN(1);
2194         }
2195         RETURN(0);
2196 }
2197
2198 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2199                             struct lustre_handle *lockh)
2200 {
2201         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2202         struct lu_fid *fid;
2203         ldlm_mode_t rc;
2204         int flags;
2205         ENTRY;
2206
2207         fid = &ll_i2info(inode)->lli_fid;
2208         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2209
2210         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2211         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2212                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2213         RETURN(rc);
2214 }
2215
2216 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2217         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2218                               * and return success */
2219                 inode->i_nlink = 0;
2220                 /* This path cannot be hit for regular files unless in
2221                  * case of obscure races, so no need to to validate
2222                  * size. */
2223                 if (!S_ISREG(inode->i_mode) &&
2224                     !S_ISDIR(inode->i_mode))
2225                         return 0;
2226         }
2227
2228         if (rc) {
2229                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2230                 return -abs(rc);
2231
2232         }
2233
2234         return 0;
2235 }
2236
2237 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2238                              __u64 ibits)
2239 {
2240         struct inode *inode = dentry->d_inode;
2241         struct ptlrpc_request *req = NULL;
2242         struct ll_sb_info *sbi;
2243         struct obd_export *exp;
2244         int rc = 0;
2245         ENTRY;
2246
2247         if (!inode) {
2248                 CERROR("REPORT THIS LINE TO PETER\n");
2249                 RETURN(0);
2250         }
2251         sbi = ll_i2sbi(inode);
2252
2253         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2254                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2255
2256         exp = ll_i2mdexp(inode);
2257
2258         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2259                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2260                 struct md_op_data *op_data;
2261
2262                 /* Call getattr by fid, so do not provide name at all. */
2263                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2264                                              dentry->d_inode, NULL, 0, 0,
2265                                              LUSTRE_OPC_ANY, NULL);
2266                 if (IS_ERR(op_data))
2267                         RETURN(PTR_ERR(op_data));
2268
2269                 oit.it_create_mode |= M_CHECK_STALE;
2270                 rc = md_intent_lock(exp, op_data, NULL, 0,
2271                                     /* we are not interested in name
2272                                        based lookup */
2273                                     &oit, 0, &req,
2274                                     ll_md_blocking_ast, 0);
2275                 ll_finish_md_op_data(op_data);
2276                 oit.it_create_mode &= ~M_CHECK_STALE;
2277                 if (rc < 0) {
2278                         rc = ll_inode_revalidate_fini(inode, rc);
2279                         GOTO (out, rc);
2280                 }
2281
2282                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2283                 if (rc != 0) {
2284                         ll_intent_release(&oit);
2285                         GOTO(out, rc);
2286                 }
2287
2288                 /* Unlinked? Unhash dentry, so it is not picked up later by
2289                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2290                    here to preserve get_cwd functionality on 2.6.
2291                    Bug 10503 */
2292                 if (!dentry->d_inode->i_nlink) {
2293                         spin_lock(&ll_lookup_lock);
2294                         spin_lock(&dcache_lock);
2295                         ll_drop_dentry(dentry);
2296                         spin_unlock(&dcache_lock);
2297                         spin_unlock(&ll_lookup_lock);
2298                 }
2299
2300                 ll_lookup_finish_locks(&oit, dentry);
2301         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2302
2303                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2304                 obd_valid valid = OBD_MD_FLGETATTR;
2305                 struct obd_capa *oc;
2306                 int ealen = 0;
2307
2308                 if (S_ISREG(inode->i_mode)) {
2309                         rc = ll_get_max_mdsize(sbi, &ealen);
2310                         if (rc)
2311                                 RETURN(rc);
2312                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2313                 }
2314                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2315                  * capa for this inode. Because we only keep capas of dirs
2316                  * fresh. */
2317                 oc = ll_mdscapa_get(inode);
2318                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2319                                 ealen, &req);
2320                 capa_put(oc);
2321                 if (rc) {
2322                         rc = ll_inode_revalidate_fini(inode, rc);
2323                         RETURN(rc);
2324                 }
2325
2326                 rc = ll_prep_inode(&inode, req, NULL);
2327         }
2328 out:
2329         ptlrpc_req_finished(req);
2330         return rc;
2331 }
2332
2333 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2334 {
2335         int rc;
2336         ENTRY;
2337
2338         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2339                                                   MDS_INODELOCK_LOOKUP);
2340
2341         /* if object not yet allocated, don't validate size */
2342         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2343                 RETURN(0);
2344
2345         /* cl_glimpse_size will prefer locally cached writes if they extend
2346          * the file */
2347
2348         if (rc == 0)
2349                 rc = cl_glimpse_size(dentry->d_inode);
2350
2351         RETURN(rc);
2352 }
2353
2354 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2355                   struct lookup_intent *it, struct kstat *stat)
2356 {
2357         struct inode *inode = de->d_inode;
2358         int res = 0;
2359
2360         res = ll_inode_revalidate_it(de, it);
2361         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2362
2363         if (res)
2364                 return res;
2365
2366         stat->dev = inode->i_sb->s_dev;
2367         stat->ino = inode->i_ino;
2368         stat->mode = inode->i_mode;
2369         stat->nlink = inode->i_nlink;
2370         stat->uid = inode->i_uid;
2371         stat->gid = inode->i_gid;
2372         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2373         stat->atime = inode->i_atime;
2374         stat->mtime = inode->i_mtime;
2375         stat->ctime = inode->i_ctime;
2376 #ifdef HAVE_INODE_BLKSIZE
2377         stat->blksize = inode->i_blksize;
2378 #else
2379         stat->blksize = 1 << inode->i_blkbits;
2380 #endif
2381
2382         ll_inode_size_lock(inode, 0);
2383         stat->size = i_size_read(inode);
2384         stat->blocks = inode->i_blocks;
2385         ll_inode_size_unlock(inode, 0);
2386
2387         return 0;
2388 }
2389 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2390 {
2391         struct lookup_intent it = { .it_op = IT_GETATTR };
2392
2393         return ll_getattr_it(mnt, de, &it, stat);
2394 }
2395
2396 static
2397 int lustre_check_acl(struct inode *inode, int mask)
2398 {
2399 #ifdef CONFIG_FS_POSIX_ACL
2400         struct ll_inode_info *lli = ll_i2info(inode);
2401         struct posix_acl *acl;
2402         int rc;
2403         ENTRY;
2404
2405         spin_lock(&lli->lli_lock);
2406         acl = posix_acl_dup(lli->lli_posix_acl);
2407         spin_unlock(&lli->lli_lock);
2408
2409         if (!acl)
2410                 RETURN(-EAGAIN);
2411
2412         rc = posix_acl_permission(inode, acl, mask);
2413         posix_acl_release(acl);
2414
2415         RETURN(rc);
2416 #else
2417         return -EAGAIN;
2418 #endif
2419 }
2420
2421 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2422 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2423 {
2424         int rc = 0;
2425         ENTRY;
2426
2427        /* as root inode are NOT getting validated in lookup operation,
2428         * need to do it before permission check. */
2429
2430         if (inode == inode->i_sb->s_root->d_inode) {
2431                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2432
2433                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2434                                               MDS_INODELOCK_LOOKUP);
2435                 if (rc)
2436                         RETURN(rc);
2437         }
2438
2439         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2440                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2441
2442         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2443                 return lustre_check_remote_perm(inode, mask);
2444
2445         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2446         rc = generic_permission(inode, mask, lustre_check_acl);
2447
2448         RETURN(rc);
2449 }
2450 #else
2451 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2452 {
2453         int mode = inode->i_mode;
2454         int rc;
2455
2456         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2457                inode->i_ino, inode->i_generation, inode, mask);
2458
2459         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2460                 return lustre_check_remote_perm(inode, mask);
2461
2462         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2463
2464         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2465             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2466                 return -EROFS;
2467         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2468                 return -EACCES;
2469         if (current->fsuid == inode->i_uid) {
2470                 mode >>= 6;
2471         } else if (1) {
2472                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2473                         goto check_groups;
2474                 rc = lustre_check_acl(inode, mask);
2475                 if (rc == -EAGAIN)
2476                         goto check_groups;
2477                 if (rc == -EACCES)
2478                         goto check_capabilities;
2479                 return rc;
2480         } else {
2481 check_groups:
2482                 if (in_group_p(inode->i_gid))
2483                         mode >>= 3;
2484         }
2485         if ((mode & mask & S_IRWXO) == mask)
2486                 return 0;
2487
2488 check_capabilities:
2489         if (!(mask & MAY_EXEC) ||
2490             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2491                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2492                         return 0;
2493
2494         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2495             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2496                 return 0;
2497
2498         return -EACCES;
2499 }
2500 #endif
2501
/*
 * Name the vectored read/write members of struct file_operations, and the
 * llite functions assigned to them, according to HAVE_FILE_READV.  The
 * file_operations tables below use these macros as designators/initialisers.
 */
#if defined(HAVE_FILE_READV)
# define READ_METHOD    readv
# define READ_FUNCTION  ll_file_readv
# define WRITE_METHOD   writev
# define WRITE_FUNCTION ll_file_writev
#else /* !HAVE_FILE_READV */
# define READ_METHOD    aio_read
# define READ_FUNCTION  ll_file_aio_read
# define WRITE_METHOD   aio_write
# define WRITE_FUNCTION ll_file_aio_write
#endif /* HAVE_FILE_READV */
2513
2514 /* -o localflock - only provides locally consistent flock locks */
2515 struct file_operations ll_file_operations = {
2516         .read           = ll_file_read,
2517         .READ_METHOD    = READ_FUNCTION,
2518         .write          = ll_file_write,
2519         .WRITE_METHOD   = WRITE_FUNCTION,
2520         .ioctl          = ll_file_ioctl,
2521         .open           = ll_file_open,
2522         .release        = ll_file_release,
2523         .mmap           = ll_file_mmap,
2524         .llseek         = ll_file_seek,
2525         .sendfile       = ll_file_sendfile,
2526         .fsync          = ll_fsync,
2527 };
2528
2529 struct file_operations ll_file_operations_flock = {
2530         .read           = ll_file_read,
2531         .READ_METHOD    = READ_FUNCTION,
2532         .write          = ll_file_write,
2533         .WRITE_METHOD   = WRITE_FUNCTION,
2534         .ioctl          = ll_file_ioctl,
2535         .open           = ll_file_open,
2536         .release        = ll_file_release,
2537         .mmap           = ll_file_mmap,
2538         .llseek         = ll_file_seek,
2539         .sendfile       = ll_file_sendfile,
2540         .fsync          = ll_fsync,
2541 #ifdef HAVE_F_OP_FLOCK
2542         .flock          = ll_file_flock,
2543 #endif
2544         .lock           = ll_file_flock
2545 };
2546
2547 /* These are for -o noflock - to return ENOSYS on flock calls */
2548 struct file_operations ll_file_operations_noflock = {
2549         .read           = ll_file_read,
2550         .READ_METHOD    = READ_FUNCTION,
2551         .write          = ll_file_write,
2552         .WRITE_METHOD   = WRITE_FUNCTION,
2553         .ioctl          = ll_file_ioctl,
2554         .open           = ll_file_open,
2555         .release        = ll_file_release,
2556         .mmap           = ll_file_mmap,
2557         .llseek         = ll_file_seek,
2558         .sendfile       = ll_file_sendfile,
2559         .fsync          = ll_fsync,
2560 #ifdef HAVE_F_OP_FLOCK
2561         .flock          = ll_file_noflock,
2562 #endif
2563         .lock           = ll_file_noflock
2564 };
2565
2566 struct inode_operations ll_file_inode_operations = {
2567 #ifdef HAVE_VFS_INTENT_PATCHES
2568         .setattr_raw    = ll_setattr_raw,
2569 #endif
2570         .setattr        = ll_setattr,
2571         .truncate       = ll_truncate,
2572         .getattr        = ll_getattr,
2573         .permission     = ll_inode_permission,
2574         .setxattr       = ll_setxattr,
2575         .getxattr       = ll_getxattr,
2576         .listxattr      = ll_listxattr,
2577         .removexattr    = ll_removexattr,
2578 };
2579
2580 /* dynamic ioctl number support routins */
2581 static struct llioc_ctl_data {
2582         struct rw_semaphore ioc_sem;
2583         struct list_head    ioc_head;
2584 } llioc = {
2585         __RWSEM_INITIALIZER(llioc.ioc_sem),
2586         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2587 };
2588
2589
2590 struct llioc_data {
2591         struct list_head        iocd_list;
2592         unsigned int            iocd_size;
2593         llioc_callback_t        iocd_cb;
2594         unsigned int            iocd_count;
2595         unsigned int            iocd_cmd[0];
2596 };
2597
2598 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2599 {
2600         unsigned int size;
2601         struct llioc_data *in_data = NULL;
2602         ENTRY;
2603
2604         if (cb == NULL || cmd == NULL ||
2605             count > LLIOC_MAX_CMD || count < 0)
2606                 RETURN(NULL);
2607
2608         size = sizeof(*in_data) + count * sizeof(unsigned int);
2609         OBD_ALLOC(in_data, size);
2610         if (in_data == NULL)
2611                 RETURN(NULL);
2612
2613         memset(in_data, 0, sizeof(*in_data));
2614         in_data->iocd_size = size;
2615         in_data->iocd_cb = cb;
2616         in_data->iocd_count = count;
2617         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2618
2619         down_write(&llioc.ioc_sem);
2620         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2621         up_write(&llioc.ioc_sem);
2622
2623         RETURN(in_data);
2624 }
2625
2626 void ll_iocontrol_unregister(void *magic)
2627 {
2628         struct llioc_data *tmp;
2629
2630         if (magic == NULL)
2631                 return;
2632
2633         down_write(&llioc.ioc_sem);
2634         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2635                 if (tmp == magic) {
2636                         unsigned int size = tmp->iocd_size;
2637
2638                         list_del(&tmp->iocd_list);
2639                         up_write(&llioc.ioc_sem);
2640
2641                         OBD_FREE(tmp, size);
2642                         return;
2643                 }
2644         }
2645         up_write(&llioc.ioc_sem);
2646
2647         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2648 }
2649
2650 EXPORT_SYMBOL(ll_iocontrol_register);
2651 EXPORT_SYMBOL(ll_iocontrol_unregister);
2652
2653 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2654                         unsigned int cmd, unsigned long arg, int *rcp)
2655 {
2656         enum llioc_iter ret = LLIOC_CONT;
2657         struct llioc_data *data;
2658         int rc = -EINVAL, i;
2659
2660         down_read(&llioc.ioc_sem);
2661         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2662                 for (i = 0; i < data->iocd_count; i++) {
2663                         if (cmd != data->iocd_cmd[i])
2664                                 continue;
2665
2666                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2667                         break;
2668                 }
2669
2670                 if (ret == LLIOC_STOP)
2671                         break;
2672         }
2673         up_read(&llioc.ioc_sem);
2674
2675         if (rcp)
2676                 *rcp = rc;
2677         return ret;
2678 }