Whamcloud - gitweb
b=20529
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(exp_connect_som(ll_i2mdexp(inode))) || !S_ISREG(inode->i_mode))
96                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
97         else
98                 ll_epoch_close(inode, op_data, &och, 0);
99
100 out:
101         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
102         EXIT;
103 }
104
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
106                                      struct inode *inode,
107                                      struct obd_client_handle *och)
108 {
109         struct obd_export *exp = ll_i2mdexp(inode);
110         struct md_op_data *op_data;
111         struct ptlrpc_request *req = NULL;
112         struct obd_device *obd = class_exp2obd(exp);
113         int epoch_close = 1;
114         int rc;
115         ENTRY;
116
117         if (obd == NULL) {
118                 /*
119                  * XXX: in case of LMV, is this correct to access
120                  * ->exp_handle?
121                  */
122                 CERROR("Invalid MDC connection handle "LPX64"\n",
123                        ll_i2mdexp(inode)->exp_handle.h_cookie);
124                 GOTO(out, rc = 0);
125         }
126
127         /*
128          * here we check if this is forced umount. If so this is called on
129          * canceling "open lock" and we do not call md_close() in this case, as
130          * it will not be successful, as import is already deactivated.
131          */
132         if (obd->obd_force)
133                 GOTO(out, rc = 0);
134
135         OBD_ALLOC_PTR(op_data);
136         if (op_data == NULL)
137                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138
139         ll_prepare_close(inode, op_data, och);
140         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141         rc = md_close(md_exp, op_data, och->och_mod, &req);
142         if (rc == -EAGAIN) {
143                 /* This close must have the epoch closed. */
144                 LASSERT(epoch_close);
145                 /* MDS has instructed us to obtain Size-on-MDS attribute from
146                  * OSTs and send setattr to back to MDS. */
147                 rc = ll_sizeonmds_update(inode, &och->och_fh,
148                                          op_data->op_ioepoch);
149                 if (rc) {
150                         CERROR("inode %lu mdc Size-on-MDS update failed: "
151                                "rc = %d\n", inode->i_ino, rc);
152                         rc = 0;
153                 }
154         } else if (rc) {
155                 CERROR("inode %lu mdc close failed: rc = %d\n",
156                        inode->i_ino, rc);
157         }
158         ll_finish_md_op_data(op_data);
159
160         if (rc == 0) {
161                 rc = ll_objects_destroy(req, inode);
162                 if (rc)
163                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
164                                inode->i_ino, rc);
165         }
166
167         EXIT;
168 out:
169
170         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
171             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173         } else {
174                 md_clear_open_replay_data(md_exp, och);
175                 /* Free @och if it is not waiting for DONE_WRITING. */
176                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177                 OBD_FREE_PTR(och);
178         }
179         if (req) /* This is close request */
180                 ptlrpc_req_finished(req);
181         return rc;
182 }
183
184 int ll_md_real_close(struct inode *inode, int flags)
185 {
186         struct ll_inode_info *lli = ll_i2info(inode);
187         struct obd_client_handle **och_p;
188         struct obd_client_handle *och;
189         __u64 *och_usecount;
190         int rc = 0;
191         ENTRY;
192
193         if (flags & FMODE_WRITE) {
194                 och_p = &lli->lli_mds_write_och;
195                 och_usecount = &lli->lli_open_fd_write_count;
196         } else if (flags & FMODE_EXEC) {
197                 och_p = &lli->lli_mds_exec_och;
198                 och_usecount = &lli->lli_open_fd_exec_count;
199         } else {
200                 LASSERT(flags & FMODE_READ);
201                 och_p = &lli->lli_mds_read_och;
202                 och_usecount = &lli->lli_open_fd_read_count;
203         }
204
205         down(&lli->lli_och_sem);
206         if (*och_usecount) { /* There are still users of this handle, so
207                                 skip freeing it. */
208                 up(&lli->lli_och_sem);
209                 RETURN(0);
210         }
211         och=*och_p;
212         *och_p = NULL;
213         up(&lli->lli_och_sem);
214
215         if (och) { /* There might be a race and somebody have freed this och
216                       already */
217                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
218                                                inode, och);
219         }
220
221         RETURN(rc);
222 }
223
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225                 struct file *file)
226 {
227         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228         struct ll_inode_info *lli = ll_i2info(inode);
229         int rc = 0;
230         ENTRY;
231
232         /* clear group lock, if present */
233         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235
236         /* Let's see if we have good enough OPEN lock on the file and if
237            we can skip talking to MDS */
238         if (file->f_dentry->d_inode) { /* Can this ever be false? */
239                 int lockmode;
240                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241                 struct lustre_handle lockh;
242                 struct inode *inode = file->f_dentry->d_inode;
243                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
244
245                 down(&lli->lli_och_sem);
246                 if (fd->fd_omode & FMODE_WRITE) {
247                         lockmode = LCK_CW;
248                         LASSERT(lli->lli_open_fd_write_count);
249                         lli->lli_open_fd_write_count--;
250                 } else if (fd->fd_omode & FMODE_EXEC) {
251                         lockmode = LCK_PR;
252                         LASSERT(lli->lli_open_fd_exec_count);
253                         lli->lli_open_fd_exec_count--;
254                 } else {
255                         lockmode = LCK_CR;
256                         LASSERT(lli->lli_open_fd_read_count);
257                         lli->lli_open_fd_read_count--;
258                 }
259                 up(&lli->lli_och_sem);
260
261                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262                                    LDLM_IBITS, &policy, lockmode,
263                                    &lockh)) {
264                         rc = ll_md_real_close(file->f_dentry->d_inode,
265                                               fd->fd_omode);
266                 }
267         } else {
268                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269                        file, file->f_dentry, file->f_dentry->d_name.name);
270         }
271
272         LUSTRE_FPRIVATE(file) = NULL;
273         ll_file_data_put(fd);
274         ll_capa_close(inode);
275
276         RETURN(rc);
277 }
278
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
280
281 /* While this returns an error code, fput() the caller does not, so we need
282  * to make every effort to clean up all of our state here.  Also, applications
283  * rarely check close errors and even if an error is returned they will not
284  * re-try the close call.
285  */
286 int ll_file_release(struct inode *inode, struct file *file)
287 {
288         struct ll_file_data *fd;
289         struct ll_sb_info *sbi = ll_i2sbi(inode);
290         struct ll_inode_info *lli = ll_i2info(inode);
291         struct lov_stripe_md *lsm = lli->lli_smd;
292         int rc;
293         ENTRY;
294
295         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296                inode->i_generation, inode);
297
298 #ifdef CONFIG_FS_POSIX_ACL
299         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300             inode == inode->i_sb->s_root->d_inode) {
301                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
302
303                 LASSERT(fd != NULL);
304                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305                         fd->fd_flags &= ~LL_FILE_RMTACL;
306                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
307                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
308                 }
309         }
310 #endif
311
312         if (inode->i_sb->s_root != file->f_dentry)
313                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314         fd = LUSTRE_FPRIVATE(file);
315         LASSERT(fd != NULL);
316
317         /* The last ref on @file, maybe not the the owner pid of statahead.
318          * Different processes can open the same dir, "ll_opendir_key" means:
319          * it is me that should stop the statahead thread. */
320         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
321                 ll_stop_statahead(inode, lli->lli_opendir_key);
322
323         if (inode->i_sb->s_root == file->f_dentry) {
324                 LUSTRE_FPRIVATE(file) = NULL;
325                 ll_file_data_put(fd);
326                 RETURN(0);
327         }
328
329         if (lsm)
330                 lov_test_and_clear_async_rc(lsm);
331         lli->lli_async_rc = 0;
332
333         rc = ll_md_close(sbi->ll_md_exp, inode, file);
334         RETURN(rc);
335 }
336
337 static int ll_intent_file_open(struct file *file, void *lmm,
338                                int lmmsize, struct lookup_intent *itp)
339 {
340         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
341         struct dentry *parent = file->f_dentry->d_parent;
342         const char *name = file->f_dentry->d_name.name;
343         const int len = file->f_dentry->d_name.len;
344         struct md_op_data *op_data;
345         struct ptlrpc_request *req;
346         int rc;
347         ENTRY;
348
349         if (!parent)
350                 RETURN(-ENOENT);
351
352         /* Usually we come here only for NFSD, and we want open lock.
353            But we can also get here with pre 2.6.15 patchless kernels, and in
354            that case that lock is also ok */
355         /* We can also get here if there was cached open handle in revalidate_it
356          * but it disappeared while we were getting from there to ll_file_open.
357          * But this means this file was closed and immediatelly opened which
358          * makes a good candidate for using OPEN lock */
359         /* If lmmsize & lmm are not 0, we are just setting stripe info
360          * parameters. No need for the open lock */
361         if (!lmm && !lmmsize)
362                 itp->it_flags |= MDS_OPEN_LOCK;
363
364         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
365                                       file->f_dentry->d_inode, name, len,
366                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
367         if (IS_ERR(op_data))
368                 RETURN(PTR_ERR(op_data));
369
370         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
371                             0 /*unused */, &req, ll_md_blocking_ast, 0);
372         ll_finish_md_op_data(op_data);
373         if (rc == -ESTALE) {
374                 /* reason for keep own exit path - don`t flood log
375                 * with messages with -ESTALE errors.
376                 */
377                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
378                      it_open_error(DISP_OPEN_OPEN, itp))
379                         GOTO(out, rc);
380                 ll_release_openhandle(file->f_dentry, itp);
381                 GOTO(out, rc);
382         }
383
384         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
385                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
386                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
387                 GOTO(out, rc);
388         }
389
390         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
391         if (!rc && itp->d.lustre.it_lock_mode)
392                 md_set_lock_data(sbi->ll_md_exp,
393                                  &itp->d.lustre.it_lock_handle,
394                                  file->f_dentry->d_inode, NULL);
395
396 out:
397         ptlrpc_req_finished(itp->d.lustre.it_data);
398         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
399         ll_intent_drop_lock(itp);
400
401         RETURN(rc);
402 }
403
404 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
405 {
406         if (ioepoch && lli->lli_ioepoch != ioepoch) {
407                 lli->lli_ioepoch = ioepoch;
408                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
409                        ioepoch, PFID(&lli->lli_fid));
410         }
411 }
412
413 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
414                        struct lookup_intent *it, struct obd_client_handle *och)
415 {
416         struct ptlrpc_request *req = it->d.lustre.it_data;
417         struct mdt_body *body;
418
419         LASSERT(och);
420
421         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
422         LASSERT(body != NULL);                      /* reply already checked out */
423
424         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
425         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
426         och->och_fid = lli->lli_fid;
427         och->och_flags = it->it_flags;
428         ll_ioepoch_open(lli, body->ioepoch);
429
430         return md_set_open_replay_data(md_exp, och, req);
431 }
432
433 int ll_local_open(struct file *file, struct lookup_intent *it,
434                   struct ll_file_data *fd, struct obd_client_handle *och)
435 {
436         struct inode *inode = file->f_dentry->d_inode;
437         struct ll_inode_info *lli = ll_i2info(inode);
438         ENTRY;
439
440         LASSERT(!LUSTRE_FPRIVATE(file));
441
442         LASSERT(fd != NULL);
443
444         if (och) {
445                 struct ptlrpc_request *req = it->d.lustre.it_data;
446                 struct mdt_body *body;
447                 int rc;
448
449                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
450                 if (rc)
451                         RETURN(rc);
452
453                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454                 if ((it->it_flags & FMODE_WRITE) &&
455                     (body->valid & OBD_MD_FLSIZE))
456                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
457                                lli->lli_ioepoch, PFID(&lli->lli_fid));
458         }
459
460         LUSTRE_FPRIVATE(file) = fd;
461         ll_readahead_init(inode, &fd->fd_ras);
462         fd->fd_omode = it->it_flags;
463         RETURN(0);
464 }
465
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.  We grab
 * lli_open_sem to ensure no other process will create objects, send the
 * stripe MD to the MDS, or try to destroy the objects if that fails.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * NOTE(review): the body below serializes on lli_och_sem; no lli_open_sem
 * is taken in this function, so the paragraphs above may describe an
 * older design -- confirm before relying on them.
 *
 * Flow as implemented here:
 *  - allocate the per-file ll_file_data (-ENOMEM on failure);
 *  - for a directory with no opendir owner yet, record this fd and the
 *    current pid as the owner under lli_lock;
 *  - for the filesystem root dentry, attach the fd and return 0;
 *  - if no usable intent was handed in, build a local one from f_flags;
 *  - under lli_och_sem, either reuse the cached MDS open handle for the
 *    requested mode (write/exec/read), or, without one, obtain an open via
 *    ll_intent_file_open() and restart, or allocate and fill a new handle.
 *
 * Returns 0 on success, otherwise the error of the failing step.  The
 * ll_file_data is freed before every error exit.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct lov_stripe_md *lsm;
        struct ptlrpc_request *req = NULL;
        struct obd_client_handle **och_p;
        __u64 *och_usecount;
        struct ll_file_data *fd;
        int rc = 0, opendir_set = 0;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
               inode->i_generation, inode, file->f_flags);

#ifdef HAVE_VFS_INTENT_PATCHES
        it = file->f_it;
#else
        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */
#endif

        fd = ll_file_data_get();
        if (fd == NULL)
                RETURN(-ENOMEM);

        fd->fd_file = file;
        if (S_ISDIR(inode->i_mode)) {
                /* Claim opendir ownership for this fd if nobody holds it;
                 * opendir_set remembers that we did, for the error path. */
                spin_lock(&lli->lli_lock);
                if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
                        LASSERT(lli->lli_sai == NULL);
                        lli->lli_opendir_key = fd;
                        lli->lli_opendir_pid = cfs_curproc_pid();
                        opendir_set = 1;
                }
                spin_unlock(&lli->lli_lock);
        }

        /* Opening the root dentry: only the private data is attached. */
        if (inode->i_sb->s_root == file->f_dentry) {
                LUSTRE_FPRIVATE(file) = fd;
                RETURN(0);
        }

        if (!it || !it->d.lustre.it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but O_ACCMODE mask was stripped from
                 * there */
                if ((oit.it_flags + 1) & O_ACCMODE)
                        oit.it_flags++;
                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* kernel only call f_op->open in dentry_open.  filp_open calls
                 * dentry_open after call to open_namei that checks permissions.
                 * Only nfsd_open call dentry_open directly without checking
                 * permissions and because of that this code below is safe. */
                if (oit.it_flags & FMODE_WRITE)
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                it = &oit;
        }

restart:
        /* Let's see if we have file open on MDS already. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
         } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        down(&lli->lli_och_sem);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's extra open request that we do not need,
                           let's close it somehow. This will decref request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                        if (rc) {
                                up(&lli->lli_och_sem);
                                ll_file_data_put(fd);
                                GOTO(out_openerr, rc);
                        }
                        ll_release_openhandle(file->f_dentry, it);
                        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
                                             LPROC_LL_OPEN);
                }
                (*och_usecount)++;

                /* NULL och: the cached handle is reused, nothing to fill. */
                rc = ll_local_open(file, it, fd, NULL);
                if (rc) {
                        (*och_usecount)--;
                        up(&lli->lli_och_sem);
                        ll_file_data_put(fd);
                        GOTO(out_openerr, rc);
                }
        } else {
                LASSERT(*och_usecount == 0);
                if (!it->d.lustre.it_disposition) {
                        /* We cannot just request lock handle now, new ELC code
                           means that one of other OPEN locks for this file
                           could be cancelled, and since blocking ast handler
                           would attempt to grab och_sem as well, that would
                           result in a deadlock */
                        up(&lli->lli_och_sem);
                        it->it_create_mode |= M_CHECK_STALE;
                        rc = ll_intent_file_open(file, NULL, 0, it);
                        it->it_create_mode &= ~M_CHECK_STALE;
                        if (rc) {
                                ll_file_data_put(fd);
                                GOTO(out_openerr, rc);
                        }

                        /* Got some error? Release the request */
                        if (it->d.lustre.it_status < 0) {
                                req = it->d.lustre.it_data;
                                ptlrpc_req_finished(req);
                        }
                        /* The semaphore was dropped above, so the handle
                         * state is re-examined from the top. */
                        goto restart;
                }
                OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
                if (!*och_p) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc = -ENOMEM);
                }
                (*och_usecount)++;
                req = it->d.lustre.it_data;

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here
                 * (bug 3430) */
                /* XXX (green): Should not we bail out on any error here, not
                 * just open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                if (rc) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc);
                }

                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
                rc = ll_local_open(file, it, fd, *och_p);
                if (rc) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc);
                }
        }
        up(&lli->lli_och_sem);

        /* Must do this outside lli_och_sem lock to prevent deadlock where
           different kind of OPEN lock for this same inode gets cancelled
           by ldlm_cancel_lru */
        if (!S_ISREG(inode->i_mode))
                GOTO(out, rc);

        ll_capa_open(inode);

        lsm = lli->lli_smd;
        if (lsm == NULL) {
                if (file->f_flags & O_LOV_DELAY_CREATE ||
                    !(file->f_mode & FMODE_WRITE)) {
                        CDEBUG(D_INODE, "object creation was delayed\n");
                        GOTO(out, rc);
                }
        }
        file->f_flags &= ~O_LOV_DELAY_CREATE;
        GOTO(out, rc);
out:
        /* req is NULL when the open was satisfied by a cached handle. */
        ptlrpc_req_finished(req);
        if (req)
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
out_och_free:
        /* Jumps to this label arrive with lli_och_sem still held and rc
         * set; the success path falls through here with rc == 0 and skips
         * the whole block. */
        if (rc) {
                if (*och_p) {
                        OBD_FREE(*och_p, sizeof (struct obd_client_handle));
                        *och_p = NULL; /* OBD_FREE writes some magic there */
                        (*och_usecount)--;
                }
                up(&lli->lli_och_sem);
                /* out_openerr sits inside the if (rc) body on purpose: it
                 * is entered by GOTO from paths that have already released
                 * lli_och_sem. */
out_openerr:
                if (opendir_set != 0)
                        ll_stop_statahead(inode, lli->lli_opendir_key);
        }

        return rc;
}
674
675 /* Fills the obdo with the attributes for the lsm */
676 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
677                           struct obd_capa *capa, struct obdo *obdo)
678 {
679         struct ptlrpc_request_set *set;
680         struct obd_info            oinfo = { { { 0 } } };
681         int                        rc;
682
683         ENTRY;
684
685         LASSERT(lsm != NULL);
686
687         oinfo.oi_md = lsm;
688         oinfo.oi_oa = obdo;
689         oinfo.oi_oa->o_id = lsm->lsm_object_id;
690         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
691         oinfo.oi_oa->o_mode = S_IFREG;
692         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
693                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
694                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
695                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
696                                OBD_MD_FLGROUP;
697         oinfo.oi_capa = capa;
698
699         set = ptlrpc_prep_set();
700         if (set == NULL) {
701                 CERROR("can't allocate ptlrpc set\n");
702                 rc = -ENOMEM;
703         } else {
704                 rc = obd_getattr_async(exp, &oinfo, set);
705                 if (rc == 0)
706                         rc = ptlrpc_set_wait(set);
707                 ptlrpc_set_destroy(set);
708         }
709         if (rc == 0)
710                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
711                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
712                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
713         RETURN(rc);
714 }
715
716 /* Fills the obdo with the attributes for the inode defined by lsm */
717 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
718 {
719         struct ll_inode_info *lli  = ll_i2info(inode);
720         struct obd_capa      *capa = ll_mdscapa_get(inode);
721         int rc;
722         ENTRY;
723
724         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
725         capa_put(capa);
726         if (rc == 0) {
727                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
728                 CDEBUG(D_INODE,
729                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
730                        lli->lli_smd->lsm_object_id, i_size_read(inode),
731                        (unsigned long long)inode->i_blocks,
732                        (unsigned long)ll_inode_blksize(inode));
733         }
734         RETURN(rc);
735 }
736
737 int ll_merge_lvb(struct inode *inode)
738 {
739         struct ll_inode_info *lli = ll_i2info(inode);
740         struct ll_sb_info *sbi = ll_i2sbi(inode);
741         struct ost_lvb lvb;
742         int rc;
743
744         ENTRY;
745
746         ll_inode_size_lock(inode, 1);
747         inode_init_lvb(inode, &lvb);
748         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
749         i_size_write(inode, lvb.lvb_size);
750         inode->i_blocks = lvb.lvb_blocks;
751
752         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
753         LTIME_S(inode->i_atime) = lvb.lvb_atime;
754         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
755         ll_inode_size_unlock(inode, 1);
756
757         RETURN(rc);
758 }
759
760 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
761                      lstat_t *st)
762 {
763         struct obdo obdo = { 0 };
764         int rc;
765
766         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
767         if (rc == 0) {
768                 st->st_size   = obdo.o_size;
769                 st->st_blocks = obdo.o_blocks;
770                 st->st_mtime  = obdo.o_mtime;
771                 st->st_atime  = obdo.o_atime;
772                 st->st_ctime  = obdo.o_ctime;
773         }
774         return rc;
775 }
776
777 void ll_io_init(struct cl_io *io, const struct file *file, int write)
778 {
779         struct inode *inode = file->f_dentry->d_inode;
780
781         memset(io, 0, sizeof *io);
782         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
783         if (write)
784                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
785         io->ci_obj     = ll_i2info(inode)->lli_clob;
786         io->ci_lockreq = CILR_MAYBE;
787         if (ll_file_nolock(file)) {
788                 io->ci_lockreq = CILR_NEVER;
789                 io->ci_no_srvlock = 1;
790         } else if (file->f_flags & O_APPEND) {
791                 io->ci_lockreq = CILR_MANDATORY;
792         }
793 }
794
795 static ssize_t ll_file_io_generic(const struct lu_env *env,
796                 struct ccc_io_args *args, struct file *file,
797                 enum cl_io_type iot, loff_t *ppos, size_t count)
798 {
799         struct cl_io       *io;
800         ssize_t             result;
801         ENTRY;
802
803         io = &ccc_env_info(env)->cti_io;
804         ll_io_init(io, file, iot == CIT_WRITE);
805
806         if (iot == CIT_READ)
807                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
808
809         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
810                 struct vvp_io *vio = vvp_env_io(env);
811                 struct ccc_io *cio = ccc_env_io(env);
812                 if (cl_io_is_sendfile(io)) {
813                         vio->u.read.cui_actor = args->cia_actor;
814                         vio->u.read.cui_target = args->cia_target;
815                 } else {
816                         cio->cui_iov = args->cia_iov;
817                         cio->cui_nrsegs = args->cia_nrsegs;
818 #ifndef HAVE_FILE_WRITEV
819                         cio->cui_iocb = args->cia_iocb;
820 #endif
821                 }
822                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
823                 result = cl_io_loop(env, io);
824         } else
825                 /* cl_io_rw_init() handled IO */
826                 result = io->ci_result;
827         if (io->ci_nob > 0) {
828                 result = io->ci_nob;
829                 *ppos = io->u.ci_wr.wr.crw_pos;
830         }
831         cl_io_fini(env, io);
832         RETURN(result);
833 }
834
835
836 /*
837  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
838  */
839 static int ll_file_get_iov_count(const struct iovec *iov,
840                                  unsigned long *nr_segs, size_t *count)
841 {
842         size_t cnt = 0;
843         unsigned long seg;
844
845         for (seg = 0; seg < *nr_segs; seg++) {
846                 const struct iovec *iv = &iov[seg];
847
848                 /*
849                  * If any segment has a negative length, or the cumulative
850                  * length ever wraps negative then return -EINVAL.
851                  */
852                 cnt += iv->iov_len;
853                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
854                         return -EINVAL;
855                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
856                         continue;
857                 if (seg == 0)
858                         return -EFAULT;
859                 *nr_segs = seg;
860                 cnt -= iv->iov_len;   /* This segment is no good */
861                 break;
862         }
863         *count = cnt;
864         return 0;
865 }
866
867 #ifdef HAVE_FILE_READV
868 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
869                               unsigned long nr_segs, loff_t *ppos)
870 {
871         struct lu_env      *env;
872         struct ccc_io_args *args;
873         size_t              count;
874         ssize_t             result;
875         int                 refcheck;
876         ENTRY;
877
878         result = ll_file_get_iov_count(iov, &nr_segs, &count);
879         if (result)
880                 RETURN(result);
881
882         env = cl_env_get(&refcheck);
883         if (IS_ERR(env))
884                 RETURN(PTR_ERR(env));
885
886         args = &vvp_env_info(env)->vti_args;
887         args->cia_is_sendfile = 0;
888         args->cia_iov = (struct iovec *)iov;
889         args->cia_nrsegs = nr_segs;
890         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
891         cl_env_put(env, &refcheck);
892         RETURN(result);
893 }
894
895 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
896                             loff_t *ppos)
897 {
898         struct lu_env *env;
899         struct iovec  *local_iov;
900         ssize_t        result;
901         int            refcheck;
902         ENTRY;
903
904         env = cl_env_get(&refcheck);
905         if (IS_ERR(env))
906                 RETURN(PTR_ERR(env));
907
908         local_iov = &vvp_env_info(env)->vti_local_iov;
909         local_iov->iov_base = (void __user *)buf;
910         local_iov->iov_len = count;
911         result = ll_file_readv(file, local_iov, 1, ppos);
912         cl_env_put(env, &refcheck);
913         RETURN(result);
914 }
915
916 #else
917 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
918                                 unsigned long nr_segs, loff_t pos)
919 {
920         struct lu_env      *env;
921         struct ccc_io_args *args;
922         size_t              count;
923         ssize_t             result;
924         int                 refcheck;
925         ENTRY;
926
927         result = ll_file_get_iov_count(iov, &nr_segs, &count);
928         if (result)
929                 RETURN(result);
930
931         env = cl_env_get(&refcheck);
932         if (IS_ERR(env))
933                 RETURN(PTR_ERR(env));
934
935         args = &vvp_env_info(env)->vti_args;
936         args->cia_is_sendfile = 0;
937         args->cia_iov = (struct iovec *)iov;
938         args->cia_nrsegs = nr_segs;
939         args->cia_iocb = iocb;
940         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
941                                     &iocb->ki_pos, count);
942         cl_env_put(env, &refcheck);
943         RETURN(result);
944 }
945
946 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
947                             loff_t *ppos)
948 {
949         struct lu_env *env;
950         struct iovec  *local_iov;
951         struct kiocb  *kiocb;
952         ssize_t        result;
953         int            refcheck;
954         ENTRY;
955
956         env = cl_env_get(&refcheck);
957         if (IS_ERR(env))
958                 RETURN(PTR_ERR(env));
959
960         local_iov = &vvp_env_info(env)->vti_local_iov;
961         kiocb = &vvp_env_info(env)->vti_kiocb;
962         local_iov->iov_base = (void __user *)buf;
963         local_iov->iov_len = count;
964         init_sync_kiocb(kiocb, file);
965         kiocb->ki_pos = *ppos;
966         kiocb->ki_left = count;
967
968         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
969         *ppos = kiocb->ki_pos;
970
971         cl_env_put(env, &refcheck);
972         RETURN(result);
973 }
974 #endif
975
976 /*
977  * Write to a file (through the page cache).
978  */
979 #ifdef HAVE_FILE_WRITEV
980 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
981                               unsigned long nr_segs, loff_t *ppos)
982 {
983         struct lu_env      *env;
984         struct ccc_io_args *args;
985         size_t              count;
986         ssize_t             result;
987         int                 refcheck;
988         ENTRY;
989
990         result = ll_file_get_iov_count(iov, &nr_segs, &count);
991         if (result)
992                 RETURN(result);
993
994         env = cl_env_get(&refcheck);
995         if (IS_ERR(env))
996                 RETURN(PTR_ERR(env));
997
998         args = &vvp_env_info(env)->vti_args;
999         args->cia_iov = (struct iovec *)iov;
1000         args->cia_nrsegs = nr_segs;
1001         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1002         cl_env_put(env, &refcheck);
1003         RETURN(result);
1004 }
1005
1006 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1007                              loff_t *ppos)
1008 {
1009         struct lu_env    *env;
1010         struct iovec     *local_iov;
1011         ssize_t           result;
1012         int               refcheck;
1013         ENTRY;
1014
1015         env = cl_env_get(&refcheck);
1016         if (IS_ERR(env))
1017                 RETURN(PTR_ERR(env));
1018
1019         local_iov = &vvp_env_info(env)->vti_local_iov;
1020         local_iov->iov_base = (void __user *)buf;
1021         local_iov->iov_len = count;
1022
1023         result = ll_file_writev(file, local_iov, 1, ppos);
1024         cl_env_put(env, &refcheck);
1025         RETURN(result);
1026 }
1027
1028 #else /* AIO stuff */
1029 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1030                                  unsigned long nr_segs, loff_t pos)
1031 {
1032         struct lu_env      *env;
1033         struct ccc_io_args *args;
1034         size_t              count;
1035         ssize_t             result;
1036         int                 refcheck;
1037         ENTRY;
1038
1039         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1040         if (result)
1041                 RETURN(result);
1042
1043         env = cl_env_get(&refcheck);
1044         if (IS_ERR(env))
1045                 RETURN(PTR_ERR(env));
1046
1047         args = &vvp_env_info(env)->vti_args;
1048         args->cia_iov = (struct iovec *)iov;
1049         args->cia_nrsegs = nr_segs;
1050         args->cia_iocb = iocb;
1051         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1052                                   &iocb->ki_pos, count);
1053         cl_env_put(env, &refcheck);
1054         RETURN(result);
1055 }
1056
1057 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1058                              loff_t *ppos)
1059 {
1060         struct lu_env *env;
1061         struct iovec  *local_iov;
1062         struct kiocb  *kiocb;
1063         ssize_t        result;
1064         int            refcheck;
1065         ENTRY;
1066
1067         env = cl_env_get(&refcheck);
1068         if (IS_ERR(env))
1069                 RETURN(PTR_ERR(env));
1070
1071         local_iov = &vvp_env_info(env)->vti_local_iov;
1072         kiocb = &vvp_env_info(env)->vti_kiocb;
1073         local_iov->iov_base = (void __user *)buf;
1074         local_iov->iov_len = count;
1075         init_sync_kiocb(kiocb, file);
1076         kiocb->ki_pos = *ppos;
1077         kiocb->ki_left = count;
1078
1079         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1080         *ppos = kiocb->ki_pos;
1081
1082         cl_env_put(env, &refcheck);
1083         RETURN(result);
1084 }
1085 #endif
1086
1087
1088 /*
1089  * Send file content (through pagecache) somewhere with helper
1090  */
1091 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1092                                 read_actor_t actor, void *target)
1093 {
1094         struct lu_env      *env;
1095         struct ccc_io_args *args;
1096         ssize_t             result;
1097         int                 refcheck;
1098         ENTRY;
1099
1100         env = cl_env_get(&refcheck);
1101         if (IS_ERR(env))
1102                 RETURN(PTR_ERR(env));
1103
1104         args = &vvp_env_info(env)->vti_args;
1105         args->cia_is_sendfile = 1;
1106         args->cia_target = target;
1107         args->cia_actor = actor;
1108         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1109         cl_env_put(env, &refcheck);
1110         RETURN(result);
1111 }
1112
1113 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1114                                unsigned long arg)
1115 {
1116         struct obd_export *exp = ll_i2dtexp(inode);
1117         struct ll_recreate_obj ucreatp;
1118         struct obd_trans_info oti = { 0 };
1119         struct obdo *oa = NULL;
1120         int lsm_size;
1121         int rc = 0;
1122         struct lov_stripe_md *lsm, *lsm2;
1123         ENTRY;
1124
1125         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1126                 RETURN(-EPERM);
1127
1128         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1129                            sizeof(struct ll_recreate_obj)))
1130                 RETURN(-EFAULT);
1131
1132         OBDO_ALLOC(oa);
1133         if (oa == NULL)
1134                 RETURN(-ENOMEM);
1135
1136         ll_inode_size_lock(inode, 0);
1137         lsm = ll_i2info(inode)->lli_smd;
1138         if (lsm == NULL)
1139                 GOTO(out, rc = -ENOENT);
1140         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1141                    (lsm->lsm_stripe_count));
1142
1143         OBD_ALLOC(lsm2, lsm_size);
1144         if (lsm2 == NULL)
1145                 GOTO(out, rc = -ENOMEM);
1146
1147         oa->o_id = ucreatp.lrc_id;
1148         oa->o_gr = ucreatp.lrc_group;
1149         oa->o_nlink = ucreatp.lrc_ost_idx;
1150         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1151         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1152         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1153                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1154
1155         memcpy(lsm2, lsm, lsm_size);
1156         rc = obd_create(exp, oa, &lsm2, &oti);
1157
1158         OBD_FREE(lsm2, lsm_size);
1159         GOTO(out, rc);
1160 out:
1161         ll_inode_size_unlock(inode, 0);
1162         OBDO_FREE(oa);
1163         return rc;
1164 }
1165
1166 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1167                              int flags, struct lov_user_md *lum, int lum_size)
1168 {
1169         struct lov_stripe_md *lsm;
1170         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1171         int rc = 0;
1172         ENTRY;
1173
1174         ll_inode_size_lock(inode, 0);
1175         lsm = ll_i2info(inode)->lli_smd;
1176         if (lsm) {
1177                 ll_inode_size_unlock(inode, 0);
1178                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1179                        inode->i_ino);
1180                 RETURN(-EEXIST);
1181         }
1182
1183         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1184         if (rc)
1185                 GOTO(out, rc);
1186         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1187                 GOTO(out_req_free, rc = -ENOENT);
1188         rc = oit.d.lustre.it_status;
1189         if (rc < 0)
1190                 GOTO(out_req_free, rc);
1191
1192         ll_release_openhandle(file->f_dentry, &oit);
1193
1194  out:
1195         ll_inode_size_unlock(inode, 0);
1196         ll_intent_release(&oit);
1197         RETURN(rc);
1198 out_req_free:
1199         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1200         goto out;
1201 }
1202
1203 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1204                              struct lov_mds_md **lmmp, int *lmm_size,
1205                              struct ptlrpc_request **request)
1206 {
1207         struct ll_sb_info *sbi = ll_i2sbi(inode);
1208         struct mdt_body  *body;
1209         struct lov_mds_md *lmm = NULL;
1210         struct ptlrpc_request *req = NULL;
1211         struct obd_capa *oc;
1212         int rc, lmmsize;
1213
1214         rc = ll_get_max_mdsize(sbi, &lmmsize);
1215         if (rc)
1216                 RETURN(rc);
1217
1218         oc = ll_mdscapa_get(inode);
1219         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1220                              oc, filename, strlen(filename) + 1,
1221                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1222                              ll_i2suppgid(inode), &req);
1223         capa_put(oc);
1224         if (rc < 0) {
1225                 CDEBUG(D_INFO, "md_getattr_name failed "
1226                        "on %s: rc %d\n", filename, rc);
1227                 GOTO(out, rc);
1228         }
1229
1230         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1231         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1232
1233         lmmsize = body->eadatasize;
1234
1235         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1236                         lmmsize == 0) {
1237                 GOTO(out, rc = -ENODATA);
1238         }
1239
1240         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1241         LASSERT(lmm != NULL);
1242
1243         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1244             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1245             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1246                 GOTO(out, rc = -EPROTO);
1247         }
1248
1249         /*
1250          * This is coming from the MDS, so is probably in
1251          * little endian.  We convert it to host endian before
1252          * passing it to userspace.
1253          */
1254         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1255                 /* if function called for directory - we should
1256                  * avoid swab not existent lsm objects */
1257                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1258                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1259                         if (S_ISREG(body->mode))
1260                                 lustre_swab_lov_user_md_objects(
1261                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1262                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1263                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1264                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1265                         if (S_ISREG(body->mode))
1266                                 lustre_swab_lov_user_md_objects(
1267                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1268                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1269                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1270                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1271                 }
1272         }
1273
1274         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1275                 struct lov_stripe_md *lsm;
1276                 struct lov_user_md_join *lmj;
1277                 int lmj_size, i, aindex = 0;
1278
1279                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1280                 if (rc < 0)
1281                         GOTO(out, rc = -ENOMEM);
1282                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1283                 if (rc)
1284                         GOTO(out_free_memmd, rc);
1285
1286                 lmj_size = sizeof(struct lov_user_md_join) +
1287                            lsm->lsm_stripe_count *
1288                            sizeof(struct lov_user_ost_data_join);
1289                 OBD_ALLOC(lmj, lmj_size);
1290                 if (!lmj)
1291                         GOTO(out_free_memmd, rc = -ENOMEM);
1292
1293                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1294                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1295                         struct lov_extent *lex =
1296                                 &lsm->lsm_array->lai_ext_array[aindex];
1297
1298                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1299                                 aindex ++;
1300                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1301                                         LPU64" len %d\n", aindex, i,
1302                                         lex->le_start, (int)lex->le_len);
1303                         lmj->lmm_objects[i].l_extent_start =
1304                                 lex->le_start;
1305
1306                         if ((int)lex->le_len == -1)
1307                                 lmj->lmm_objects[i].l_extent_end = -1;
1308                         else
1309                                 lmj->lmm_objects[i].l_extent_end =
1310                                         lex->le_start + lex->le_len;
1311                         lmj->lmm_objects[i].l_object_id =
1312                                 lsm->lsm_oinfo[i]->loi_id;
1313                         lmj->lmm_objects[i].l_object_gr =
1314                                 lsm->lsm_oinfo[i]->loi_gr;
1315                         lmj->lmm_objects[i].l_ost_gen =
1316                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1317                         lmj->lmm_objects[i].l_ost_idx =
1318                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1319                 }
1320                 lmm = (struct lov_mds_md *)lmj;
1321                 lmmsize = lmj_size;
1322 out_free_memmd:
1323                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1324         }
1325 out:
1326         *lmmp = lmm;
1327         *lmm_size = lmmsize;
1328         *request = req;
1329         return rc;
1330 }
1331
1332 static int ll_lov_setea(struct inode *inode, struct file *file,
1333                             unsigned long arg)
1334 {
1335         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1336         struct lov_user_md  *lump;
1337         int lum_size = sizeof(struct lov_user_md) +
1338                        sizeof(struct lov_user_ost_data);
1339         int rc;
1340         ENTRY;
1341
1342         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1343                 RETURN(-EPERM);
1344
1345         OBD_ALLOC(lump, lum_size);
1346         if (lump == NULL) {
1347                 RETURN(-ENOMEM);
1348         }
1349         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1350                 OBD_FREE(lump, lum_size);
1351                 RETURN(-EFAULT);
1352         }
1353
1354         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1355
1356         OBD_FREE(lump, lum_size);
1357         RETURN(rc);
1358 }
1359
1360 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1361                             unsigned long arg)
1362 {
1363         struct lov_user_md_v3 lumv3;
1364         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1365         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1366         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1367         int lum_size;
1368         int rc;
1369         int flags = FMODE_WRITE;
1370         ENTRY;
1371
1372         /* first try with v1 which is smaller than v3 */
1373         lum_size = sizeof(struct lov_user_md_v1);
1374         if (copy_from_user(lumv1, lumv1p, lum_size))
1375                 RETURN(-EFAULT);
1376
1377         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1378                 lum_size = sizeof(struct lov_user_md_v3);
1379                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1380                         RETURN(-EFAULT);
1381         }
1382
1383         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1384         if (rc == 0) {
1385                  put_user(0, &lumv1p->lmm_stripe_count);
1386                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1387                                     0, ll_i2info(inode)->lli_smd,
1388                                     (void *)arg);
1389         }
1390         RETURN(rc);
1391 }
1392
1393 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1394 {
1395         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1396
1397         if (!lsm)
1398                 RETURN(-ENODATA);
1399
1400         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1401                             (void *)arg);
1402 }
1403
1404 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1405 {
1406         struct ll_inode_info   *lli = ll_i2info(inode);
1407         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1408         struct ccc_grouplock    grouplock;
1409         int                     rc;
1410         ENTRY;
1411
1412         if (ll_file_nolock(file))
1413                 RETURN(-EOPNOTSUPP);
1414
1415         spin_lock(&lli->lli_lock);
1416         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1417                 CERROR("group lock already existed with gid %lu\n",
1418                        fd->fd_grouplock.cg_gid);
1419                 spin_unlock(&lli->lli_lock);
1420                 RETURN(-EINVAL);
1421         }
1422         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1423         spin_unlock(&lli->lli_lock);
1424
1425         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1426                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1427         if (rc)
1428                 RETURN(rc);
1429
1430         spin_lock(&lli->lli_lock);
1431         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1432                 spin_unlock(&lli->lli_lock);
1433                 CERROR("another thread just won the race\n");
1434                 cl_put_grouplock(&grouplock);
1435                 RETURN(-EINVAL);
1436         }
1437
1438         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1439         fd->fd_grouplock = grouplock;
1440         spin_unlock(&lli->lli_lock);
1441
1442         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1443         RETURN(0);
1444 }
1445
1446 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1447 {
1448         struct ll_inode_info   *lli = ll_i2info(inode);
1449         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1450         struct ccc_grouplock    grouplock;
1451         ENTRY;
1452
1453         spin_lock(&lli->lli_lock);
1454         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1455                 spin_unlock(&lli->lli_lock);
1456                 CERROR("no group lock held\n");
1457                 RETURN(-EINVAL);
1458         }
1459         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1460
1461         if (fd->fd_grouplock.cg_gid != arg) {
1462                 CERROR("group lock %lu doesn't match current id %lu\n",
1463                        arg, fd->fd_grouplock.cg_gid);
1464                 spin_unlock(&lli->lli_lock);
1465                 RETURN(-EINVAL);
1466         }
1467
1468         grouplock = fd->fd_grouplock;
1469         fd->fd_grouplock.cg_env = NULL;
1470         fd->fd_grouplock.cg_lock = NULL;
1471         fd->fd_grouplock.cg_gid = 0;
1472         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1473         spin_unlock(&lli->lli_lock);
1474
1475         cl_put_grouplock(&grouplock);
1476         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1477         RETURN(0);
1478 }
1479
1480 #if LUSTRE_FIX >= 50
1481 static int join_sanity_check(struct inode *head, struct inode *tail)
1482 {
1483         ENTRY;
1484         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1485                 CERROR("server do not support join \n");
1486                 RETURN(-EINVAL);
1487         }
1488         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1489                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1490                        head->i_ino, tail->i_ino);
1491                 RETURN(-EINVAL);
1492         }
1493         if (head->i_ino == tail->i_ino) {
1494                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1495                 RETURN(-EINVAL);
1496         }
1497         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1498                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1499                 RETURN(-EINVAL);
1500         }
1501         RETURN(0);
1502 }
1503
1504 static int join_file(struct inode *head_inode, struct file *head_filp,
1505                      struct file *tail_filp)
1506 {
1507         struct dentry *tail_dentry = tail_filp->f_dentry;
1508         struct lookup_intent oit = {.it_op = IT_OPEN,
1509                                     .it_flags = head_filp->f_flags,
1510                                     .it_create_mode = M_JOIN_FILE};
1511         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1512                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1513
1514         struct lustre_handle lockh;
1515         struct md_op_data *op_data;
1516         int    rc;
1517         loff_t data;
1518         ENTRY;
1519
1520         tail_dentry = tail_filp->f_dentry;
1521
1522         data = i_size_read(head_inode);
1523         op_data = ll_prep_md_op_data(NULL, head_inode,
1524                                      tail_dentry->d_parent->d_inode,
1525                                      tail_dentry->d_name.name,
1526                                      tail_dentry->d_name.len, 0,
1527                                      LUSTRE_OPC_ANY, &data);
1528         if (IS_ERR(op_data))
1529                 RETURN(PTR_ERR(op_data));
1530
1531         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1532                          op_data, &lockh, NULL, 0, NULL, 0);
1533
1534         ll_finish_md_op_data(op_data);
1535         if (rc < 0)
1536                 GOTO(out, rc);
1537
1538         rc = oit.d.lustre.it_status;
1539
1540         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1541                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1542                 ptlrpc_req_finished((struct ptlrpc_request *)
1543                                     oit.d.lustre.it_data);
1544                 GOTO(out, rc);
1545         }
1546
1547         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1548                                            * away */
1549                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1550                 oit.d.lustre.it_lock_mode = 0;
1551         }
1552         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1553         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1554         ll_release_openhandle(head_filp->f_dentry, &oit);
1555 out:
1556         ll_intent_release(&oit);
1557         RETURN(rc);
1558 }
1559
1560 static int ll_file_join(struct inode *head, struct file *filp,
1561                         char *filename_tail)
1562 {
1563         struct inode *tail = NULL, *first = NULL, *second = NULL;
1564         struct dentry *tail_dentry;
1565         struct file *tail_filp, *first_filp, *second_filp;
1566         struct ll_lock_tree first_tree, second_tree;
1567         struct ll_lock_tree_node *first_node, *second_node;
1568         struct ll_inode_info *hlli = ll_i2info(head);
1569         int rc = 0, cleanup_phase = 0;
1570         ENTRY;
1571
1572         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1573                head->i_ino, head->i_generation, head, filename_tail);
1574
1575         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1576         if (IS_ERR(tail_filp)) {
1577                 CERROR("Can not open tail file %s", filename_tail);
1578                 rc = PTR_ERR(tail_filp);
1579                 GOTO(cleanup, rc);
1580         }
1581         tail = igrab(tail_filp->f_dentry->d_inode);
1582
1583         tail_dentry = tail_filp->f_dentry;
1584         LASSERT(tail_dentry);
1585         cleanup_phase = 1;
1586
1587         /*reorder the inode for lock sequence*/
1588         first = head->i_ino > tail->i_ino ? head : tail;
1589         second = head->i_ino > tail->i_ino ? tail : head;
1590         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1591         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1592
1593         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1594                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1595         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1596         if (IS_ERR(first_node)){
1597                 rc = PTR_ERR(first_node);
1598                 GOTO(cleanup, rc);
1599         }
1600         first_tree.lt_fd = first_filp->private_data;
1601         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1602         if (rc != 0)
1603                 GOTO(cleanup, rc);
1604         cleanup_phase = 2;
1605
1606         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1607         if (IS_ERR(second_node)){
1608                 rc = PTR_ERR(second_node);
1609                 GOTO(cleanup, rc);
1610         }
1611         second_tree.lt_fd = second_filp->private_data;
1612         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1613         if (rc != 0)
1614                 GOTO(cleanup, rc);
1615         cleanup_phase = 3;
1616
1617         rc = join_sanity_check(head, tail);
1618         if (rc)
1619                 GOTO(cleanup, rc);
1620
1621         rc = join_file(head, filp, tail_filp);
1622         if (rc)
1623                 GOTO(cleanup, rc);
1624 cleanup:
1625         switch (cleanup_phase) {
1626         case 3:
1627                 ll_tree_unlock(&second_tree);
1628                 obd_cancel_unused(ll_i2dtexp(second),
1629                                   ll_i2info(second)->lli_smd, 0, NULL);
1630         case 2:
1631                 ll_tree_unlock(&first_tree);
1632                 obd_cancel_unused(ll_i2dtexp(first),
1633                                   ll_i2info(first)->lli_smd, 0, NULL);
1634         case 1:
1635                 filp_close(tail_filp, 0);
1636                 if (tail)
1637                         iput(tail);
1638                 if (head && rc == 0) {
1639                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1640                                        &hlli->lli_smd);
1641                         hlli->lli_smd = NULL;
1642                 }
1643         case 0:
1644                 break;
1645         default:
1646                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1647                 LBUG();
1648         }
1649         RETURN(rc);
1650 }
1651 #endif /* LUSTRE_FIX >= 50 */
1652
1653 /**
1654  * Close inode open handle
1655  *
1656  * \param dentry [in]     dentry which contains the inode
1657  * \param it     [in,out] intent which contains open info and result
1658  *
1659  * \retval 0     success
1660  * \retval <0    failure
1661  */
1662 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1663 {
1664         struct inode *inode = dentry->d_inode;
1665         struct obd_client_handle *och;
1666         int rc;
1667         ENTRY;
1668
1669         LASSERT(inode);
1670
1671         /* Root ? Do nothing. */
1672         if (dentry->d_inode->i_sb->s_root == dentry)
1673                 RETURN(0);
1674
1675         /* No open handle to close? Move away */
1676         if (!it_disposition(it, DISP_OPEN_OPEN))
1677                 RETURN(0);
1678
1679         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1680
1681         OBD_ALLOC(och, sizeof(*och));
1682         if (!och)
1683                 GOTO(out, rc = -ENOMEM);
1684
1685         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1686                     ll_i2info(inode), it, och);
1687
1688         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1689                                        inode, och);
1690  out:
1691         /* this one is in place of ll_file_open */
1692         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1693                 ptlrpc_req_finished(it->d.lustre.it_data);
1694         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1695         RETURN(rc);
1696 }
1697
1698 /**
1699  * Get size for inode for which FIEMAP mapping is requested.
1700  * Make the FIEMAP get_info call and returns the result.
1701  */
1702 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1703               int num_bytes)
1704 {
1705         struct obd_export *exp = ll_i2dtexp(inode);
1706         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1707         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1708         int vallen = num_bytes;
1709         int rc;
1710         ENTRY;
1711
1712         /* If the stripe_count > 1 and the application does not understand
1713          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1714          */
1715         if (lsm->lsm_stripe_count > 1 &&
1716             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1717                 return -EOPNOTSUPP;
1718
1719         fm_key.oa.o_id = lsm->lsm_object_id;
1720         fm_key.oa.o_gr = lsm->lsm_object_gr;
1721         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1722
1723         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1724                         OBD_MD_FLSIZE);
1725
1726         /* If filesize is 0, then there would be no objects for mapping */
1727         if (fm_key.oa.o_size == 0) {
1728                 fiemap->fm_mapped_extents = 0;
1729                 RETURN(0);
1730         }
1731
1732         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1733
1734         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1735         if (rc)
1736                 CERROR("obd_get_info failed: rc = %d\n", rc);
1737
1738         RETURN(rc);
1739 }
1740
1741 int ll_fid2path(struct obd_export *exp, void *arg)
1742 {
1743         struct getinfo_fid2path *gfout, *gfin;
1744         int outsize, rc;
1745         ENTRY;
1746
1747         /* Need to get the buflen */
1748         OBD_ALLOC_PTR(gfin);
1749         if (gfin == NULL)
1750                 RETURN(-ENOMEM);
1751         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1752                 OBD_FREE_PTR(gfin);
1753                 RETURN(-EFAULT);
1754         }
1755
1756         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1757         OBD_ALLOC(gfout, outsize);
1758         if (gfout == NULL) {
1759                 OBD_FREE_PTR(gfin);
1760                 RETURN(-ENOMEM);
1761         }
1762         memcpy(gfout, gfin, sizeof(*gfout));
1763         OBD_FREE_PTR(gfin);
1764
1765         /* Call mdc_iocontrol */
1766         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1767         if (rc)
1768                 GOTO(gf_free, rc);
1769         if (copy_to_user(arg, gfout, outsize))
1770                 rc = -EFAULT;
1771
1772 gf_free:
1773         OBD_FREE(gfout, outsize);
1774         RETURN(rc);
1775 }
1776
1777 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1778                   unsigned long arg)
1779 {
1780         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1781         int flags;
1782         ENTRY;
1783
1784         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1785                inode->i_generation, inode, cmd);
1786         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1787
1788         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1789         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1790                 RETURN(-ENOTTY);
1791
1792         switch(cmd) {
1793         case LL_IOC_GETFLAGS:
1794                 /* Get the current value of the file flags */
1795                 return put_user(fd->fd_flags, (int *)arg);
1796         case LL_IOC_SETFLAGS:
1797         case LL_IOC_CLRFLAGS:
1798                 /* Set or clear specific file flags */
1799                 /* XXX This probably needs checks to ensure the flags are
1800                  *     not abused, and to handle any flag side effects.
1801                  */
1802                 if (get_user(flags, (int *) arg))
1803                         RETURN(-EFAULT);
1804
1805                 if (cmd == LL_IOC_SETFLAGS) {
1806                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1807                             !(file->f_flags & O_DIRECT)) {
1808                                 CERROR("%s: unable to disable locking on "
1809                                        "non-O_DIRECT file\n", current->comm);
1810                                 RETURN(-EINVAL);
1811                         }
1812
1813                         fd->fd_flags |= flags;
1814                 } else {
1815                         fd->fd_flags &= ~flags;
1816                 }
1817                 RETURN(0);
1818         case LL_IOC_LOV_SETSTRIPE:
1819                 RETURN(ll_lov_setstripe(inode, file, arg));
1820         case LL_IOC_LOV_SETEA:
1821                 RETURN(ll_lov_setea(inode, file, arg));
1822         case LL_IOC_LOV_GETSTRIPE:
1823                 RETURN(ll_lov_getstripe(inode, arg));
1824         case LL_IOC_RECREATE_OBJ:
1825                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1826         case FSFILT_IOC_FIEMAP: {
1827                 struct ll_user_fiemap *fiemap_s;
1828                 size_t num_bytes, ret_bytes;
1829                 unsigned int extent_count;
1830                 int rc = 0;
1831
1832                 /* Get the extent count so we can calculate the size of
1833                  * required fiemap buffer */
1834                 if (get_user(extent_count,
1835                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1836                         RETURN(-EFAULT);
1837                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1838                                                  sizeof(struct ll_fiemap_extent));
1839                 OBD_VMALLOC(fiemap_s, num_bytes);
1840                 if (fiemap_s == NULL)
1841                         RETURN(-ENOMEM);
1842
1843                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1844                                    sizeof(*fiemap_s)))
1845                         GOTO(error, rc = -EFAULT);
1846
1847                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1848                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1849                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1850                         if (copy_to_user((char *)arg, fiemap_s,
1851                                          sizeof(*fiemap_s)))
1852                                 GOTO(error, rc = -EFAULT);
1853
1854                         GOTO(error, rc = -EBADR);
1855                 }
1856
1857                 /* If fm_extent_count is non-zero, read the first extent since
1858                  * it is used to calculate end_offset and device from previous
1859                  * fiemap call. */
1860                 if (extent_count) {
1861                         if (copy_from_user(&fiemap_s->fm_extents[0],
1862                             (char __user *)arg + sizeof(*fiemap_s),
1863                             sizeof(struct ll_fiemap_extent)))
1864                                 GOTO(error, rc = -EFAULT);
1865                 }
1866
1867                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1868                         int rc;
1869
1870                         rc = filemap_fdatawrite(inode->i_mapping);
1871                         if (rc)
1872                                 GOTO(error, rc);
1873                 }
1874
1875                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1876                 if (rc)
1877                         GOTO(error, rc);
1878
1879                 ret_bytes = sizeof(struct ll_user_fiemap);
1880
1881                 if (extent_count != 0)
1882                         ret_bytes += (fiemap_s->fm_mapped_extents *
1883                                          sizeof(struct ll_fiemap_extent));
1884
1885                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1886                         rc = -EFAULT;
1887
1888 error:
1889                 OBD_VFREE(fiemap_s, num_bytes);
1890                 RETURN(rc);
1891         }
1892         case FSFILT_IOC_GETFLAGS:
1893         case FSFILT_IOC_SETFLAGS:
1894                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1895         case FSFILT_IOC_GETVERSION_OLD:
1896         case FSFILT_IOC_GETVERSION:
1897                 RETURN(put_user(inode->i_generation, (int *)arg));
1898         case LL_IOC_JOIN: {
1899 #if LUSTRE_FIX >= 50
1900                 /* Allow file join in beta builds to allow debuggging */
1901                 char *ftail;
1902                 int rc;
1903
1904                 ftail = getname((const char *)arg);
1905                 if (IS_ERR(ftail))
1906                         RETURN(PTR_ERR(ftail));
1907                 rc = ll_file_join(inode, file, ftail);
1908                 putname(ftail);
1909                 RETURN(rc);
1910 #else
1911                 CWARN("file join is not supported in this version of Lustre\n");
1912                 RETURN(-ENOTTY);
1913 #endif
1914         }
1915         case LL_IOC_GROUP_LOCK:
1916                 RETURN(ll_get_grouplock(inode, file, arg));
1917         case LL_IOC_GROUP_UNLOCK:
1918                 RETURN(ll_put_grouplock(inode, file, arg));
1919         case IOC_OBD_STATFS:
1920                 RETURN(ll_obd_statfs(inode, (void *)arg));
1921
1922         /* We need to special case any other ioctls we want to handle,
1923          * to send them to the MDS/OST as appropriate and to properly
1924          * network encode the arg field.
1925         case FSFILT_IOC_SETVERSION_OLD:
1926         case FSFILT_IOC_SETVERSION:
1927         */
1928         case LL_IOC_FLUSHCTX:
1929                 RETURN(ll_flush_ctx(inode));
1930         case LL_IOC_PATH2FID: {
1931                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1932                                  sizeof(struct lu_fid)))
1933                         RETURN(-EFAULT);
1934
1935                 RETURN(0);
1936         }
1937         case OBD_IOC_FID2PATH:
1938                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1939
1940         default: {
1941                 int err;
1942
1943                 if (LLIOC_STOP ==
1944                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1945                         RETURN(err);
1946
1947                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1948                                      (void *)arg));
1949         }
1950         }
1951 }
1952
1953 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1954 {
1955         struct inode *inode = file->f_dentry->d_inode;
1956         loff_t retval;
1957         ENTRY;
1958         retval = offset + ((origin == 2) ? i_size_read(inode) :
1959                            (origin == 1) ? file->f_pos : 0);
1960         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1961                inode->i_ino, inode->i_generation, inode, retval, retval,
1962                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1963         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1964
1965         if (origin == 2) { /* SEEK_END */
1966                 int nonblock = 0, rc;
1967
1968                 if (file->f_flags & O_NONBLOCK)
1969                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1970
1971                 rc = cl_glimpse_size(inode);
1972                 if (rc != 0)
1973                         RETURN(rc);
1974
1975                 ll_inode_size_lock(inode, 0);
1976                 offset += i_size_read(inode);
1977                 ll_inode_size_unlock(inode, 0);
1978         } else if (origin == 1) { /* SEEK_CUR */
1979                 offset += file->f_pos;
1980         }
1981
1982         retval = -EINVAL;
1983         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1984                 if (offset != file->f_pos) {
1985                         file->f_pos = offset;
1986                 }
1987                 retval = offset;
1988         }
1989
1990         RETURN(retval);
1991 }
1992
1993 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1994 {
1995         struct inode *inode = dentry->d_inode;
1996         struct ll_inode_info *lli = ll_i2info(inode);
1997         struct lov_stripe_md *lsm = lli->lli_smd;
1998         struct ptlrpc_request *req;
1999         struct obd_capa *oc;
2000         int rc, err;
2001         ENTRY;
2002         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2003                inode->i_generation, inode);
2004         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2005
2006         /* fsync's caller has already called _fdata{sync,write}, we want
2007          * that IO to finish before calling the osc and mdc sync methods */
2008         rc = filemap_fdatawait(inode->i_mapping);
2009
2010         /* catch async errors that were recorded back when async writeback
2011          * failed for pages in this mapping. */
2012         err = lli->lli_async_rc;
2013         lli->lli_async_rc = 0;
2014         if (rc == 0)
2015                 rc = err;
2016         if (lsm) {
2017                 err = lov_test_and_clear_async_rc(lsm);
2018                 if (rc == 0)
2019                         rc = err;
2020         }
2021
2022         oc = ll_mdscapa_get(inode);
2023         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2024                       &req);
2025         capa_put(oc);
2026         if (!rc)
2027                 rc = err;
2028         if (!err)
2029                 ptlrpc_req_finished(req);
2030
2031         if (data && lsm) {
2032                 struct obdo *oa;
2033
2034                 OBDO_ALLOC(oa);
2035                 if (!oa)
2036                         RETURN(rc ? rc : -ENOMEM);
2037
2038                 oa->o_id = lsm->lsm_object_id;
2039                 oa->o_gr = lsm->lsm_object_gr;
2040                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2041                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2042                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2043                                            OBD_MD_FLGROUP);
2044
2045                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2046                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2047                                0, OBD_OBJECT_EOF, oc);
2048                 capa_put(oc);
2049                 if (!rc)
2050                         rc = err;
2051                 OBDO_FREE(oa);
2052         }
2053
2054         RETURN(rc);
2055 }
2056
2057 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2058 {
2059         struct inode *inode = file->f_dentry->d_inode;
2060         struct ll_sb_info *sbi = ll_i2sbi(inode);
2061         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2062                                            .ei_cb_cp =ldlm_flock_completion_ast,
2063                                            .ei_cbdata = file_lock };
2064         struct md_op_data *op_data;
2065         struct lustre_handle lockh = {0};
2066         ldlm_policy_data_t flock;
2067         int flags = 0;
2068         int rc;
2069         ENTRY;
2070
2071         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2072                inode->i_ino, file_lock);
2073
2074         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2075
2076         if (file_lock->fl_flags & FL_FLOCK) {
2077                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2078                 /* set missing params for flock() calls */
2079                 file_lock->fl_end = OFFSET_MAX;
2080                 file_lock->fl_pid = current->tgid;
2081         }
2082         flock.l_flock.pid = file_lock->fl_pid;
2083         flock.l_flock.start = file_lock->fl_start;
2084         flock.l_flock.end = file_lock->fl_end;
2085
2086         switch (file_lock->fl_type) {
2087         case F_RDLCK:
2088                 einfo.ei_mode = LCK_PR;
2089                 break;
2090         case F_UNLCK:
2091                 /* An unlock request may or may not have any relation to
2092                  * existing locks so we may not be able to pass a lock handle
2093                  * via a normal ldlm_lock_cancel() request. The request may even
2094                  * unlock a byte range in the middle of an existing lock. In
2095                  * order to process an unlock request we need all of the same
2096                  * information that is given with a normal read or write record
2097                  * lock request. To avoid creating another ldlm unlock (cancel)
2098                  * message we'll treat a LCK_NL flock request as an unlock. */
2099                 einfo.ei_mode = LCK_NL;
2100                 break;
2101         case F_WRLCK:
2102                 einfo.ei_mode = LCK_PW;
2103                 break;
2104         default:
2105                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2106                 RETURN (-EINVAL);
2107         }
2108
2109         switch (cmd) {
2110         case F_SETLKW:
2111 #ifdef F_SETLKW64
2112         case F_SETLKW64:
2113 #endif
2114                 flags = 0;
2115                 break;
2116         case F_SETLK:
2117 #ifdef F_SETLK64
2118         case F_SETLK64:
2119 #endif
2120                 flags = LDLM_FL_BLOCK_NOWAIT;
2121                 break;
2122         case F_GETLK:
2123 #ifdef F_GETLK64
2124         case F_GETLK64:
2125 #endif
2126                 flags = LDLM_FL_TEST_LOCK;
2127                 /* Save the old mode so that if the mode in the lock changes we
2128                  * can decrement the appropriate reader or writer refcount. */
2129                 file_lock->fl_type = einfo.ei_mode;
2130                 break;
2131         default:
2132                 CERROR("unknown fcntl lock command: %d\n", cmd);
2133                 RETURN (-EINVAL);
2134         }
2135
2136         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2137                                      LUSTRE_OPC_ANY, NULL);
2138         if (IS_ERR(op_data))
2139                 RETURN(PTR_ERR(op_data));
2140
2141         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2142                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2143                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2144
2145         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2146                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2147
2148         ll_finish_md_op_data(op_data);
2149
2150         if ((file_lock->fl_flags & FL_FLOCK) &&
2151             (rc == 0 || file_lock->fl_type == F_UNLCK))
2152                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2153 #ifdef HAVE_F_OP_FLOCK
2154         if ((file_lock->fl_flags & FL_POSIX) &&
2155             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2156             !(flags & LDLM_FL_TEST_LOCK))
2157                 posix_lock_file_wait(file, file_lock);
2158 #endif
2159
2160         RETURN(rc);
2161 }
2162
2163 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2164 {
2165         ENTRY;
2166
2167         RETURN(-ENOSYS);
2168 }
2169
2170 int ll_have_md_lock(struct inode *inode, __u64 bits)
2171 {
2172         struct lustre_handle lockh;
2173         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2174         struct lu_fid *fid;
2175         int flags;
2176         ENTRY;
2177
2178         if (!inode)
2179                RETURN(0);
2180
2181         fid = &ll_i2info(inode)->lli_fid;
2182         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2183
2184         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2185         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2186                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2187                 RETURN(1);
2188         }
2189         RETURN(0);
2190 }
2191
2192 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2193                             struct lustre_handle *lockh)
2194 {
2195         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2196         struct lu_fid *fid;
2197         ldlm_mode_t rc;
2198         int flags;
2199         ENTRY;
2200
2201         fid = &ll_i2info(inode)->lli_fid;
2202         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2203
2204         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2205         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2206                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2207         RETURN(rc);
2208 }
2209
2210 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2211         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2212                               * and return success */
2213                 inode->i_nlink = 0;
2214                 /* This path cannot be hit for regular files unless in
2215                  * case of obscure races, so no need to to validate
2216                  * size. */
2217                 if (!S_ISREG(inode->i_mode) &&
2218                     !S_ISDIR(inode->i_mode))
2219                         return 0;
2220         }
2221
2222         if (rc) {
2223                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2224                 return -abs(rc);
2225
2226         }
2227
2228         return 0;
2229 }
2230
2231 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2232                              __u64 ibits)
2233 {
2234         struct inode *inode = dentry->d_inode;
2235         struct ptlrpc_request *req = NULL;
2236         struct ll_sb_info *sbi;
2237         struct obd_export *exp;
2238         int rc = 0;
2239         ENTRY;
2240
2241         if (!inode) {
2242                 CERROR("REPORT THIS LINE TO PETER\n");
2243                 RETURN(0);
2244         }
2245         sbi = ll_i2sbi(inode);
2246
2247         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2248                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2249
2250         exp = ll_i2mdexp(inode);
2251
2252         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2253                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2254                 struct md_op_data *op_data;
2255
2256                 /* Call getattr by fid, so do not provide name at all. */
2257                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2258                                              dentry->d_inode, NULL, 0, 0,
2259                                              LUSTRE_OPC_ANY, NULL);
2260                 if (IS_ERR(op_data))
2261                         RETURN(PTR_ERR(op_data));
2262
2263                 oit.it_create_mode |= M_CHECK_STALE;
2264                 rc = md_intent_lock(exp, op_data, NULL, 0,
2265                                     /* we are not interested in name
2266                                        based lookup */
2267                                     &oit, 0, &req,
2268                                     ll_md_blocking_ast, 0);
2269                 ll_finish_md_op_data(op_data);
2270                 oit.it_create_mode &= ~M_CHECK_STALE;
2271                 if (rc < 0) {
2272                         rc = ll_inode_revalidate_fini(inode, rc);
2273                         GOTO (out, rc);
2274                 }
2275
2276                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2277                 if (rc != 0) {
2278                         ll_intent_release(&oit);
2279                         GOTO(out, rc);
2280                 }
2281
2282                 /* Unlinked? Unhash dentry, so it is not picked up later by
2283                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2284                    here to preserve get_cwd functionality on 2.6.
2285                    Bug 10503 */
2286                 if (!dentry->d_inode->i_nlink) {
2287                         spin_lock(&ll_lookup_lock);
2288                         spin_lock(&dcache_lock);
2289                         ll_drop_dentry(dentry);
2290                         spin_unlock(&dcache_lock);
2291                         spin_unlock(&ll_lookup_lock);
2292                 }
2293
2294                 ll_lookup_finish_locks(&oit, dentry);
2295         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2296
2297                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2298                 obd_valid valid = OBD_MD_FLGETATTR;
2299                 struct obd_capa *oc;
2300                 int ealen = 0;
2301
2302                 if (S_ISREG(inode->i_mode)) {
2303                         rc = ll_get_max_mdsize(sbi, &ealen);
2304                         if (rc)
2305                                 RETURN(rc);
2306                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2307                 }
2308                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2309                  * capa for this inode. Because we only keep capas of dirs
2310                  * fresh. */
2311                 oc = ll_mdscapa_get(inode);
2312                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2313                                 ealen, &req);
2314                 capa_put(oc);
2315                 if (rc) {
2316                         rc = ll_inode_revalidate_fini(inode, rc);
2317                         RETURN(rc);
2318                 }
2319
2320                 rc = ll_prep_inode(&inode, req, NULL);
2321         }
2322 out:
2323         ptlrpc_req_finished(req);
2324         return rc;
2325 }
2326
2327 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2328 {
2329         int rc;
2330         ENTRY;
2331
2332         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2333                                                   MDS_INODELOCK_LOOKUP);
2334
2335         /* if object not yet allocated, don't validate size */
2336         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2337                 RETURN(0);
2338
2339         /* cl_glimpse_size will prefer locally cached writes if they extend
2340          * the file */
2341
2342         if (rc == 0)
2343                 rc = cl_glimpse_size(dentry->d_inode);
2344
2345         RETURN(rc);
2346 }
2347
2348 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2349                   struct lookup_intent *it, struct kstat *stat)
2350 {
2351         struct inode *inode = de->d_inode;
2352         int res = 0;
2353
2354         res = ll_inode_revalidate_it(de, it);
2355         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2356
2357         if (res)
2358                 return res;
2359
2360         stat->dev = inode->i_sb->s_dev;
2361         stat->ino = inode->i_ino;
2362         stat->mode = inode->i_mode;
2363         stat->nlink = inode->i_nlink;
2364         stat->uid = inode->i_uid;
2365         stat->gid = inode->i_gid;
2366         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2367         stat->atime = inode->i_atime;
2368         stat->mtime = inode->i_mtime;
2369         stat->ctime = inode->i_ctime;
2370 #ifdef HAVE_INODE_BLKSIZE
2371         stat->blksize = inode->i_blksize;
2372 #else
2373         stat->blksize = 1 << inode->i_blkbits;
2374 #endif
2375
2376         ll_inode_size_lock(inode, 0);
2377         stat->size = i_size_read(inode);
2378         stat->blocks = inode->i_blocks;
2379         ll_inode_size_unlock(inode, 0);
2380
2381         return 0;
2382 }
2383 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2384 {
2385         struct lookup_intent it = { .it_op = IT_GETATTR };
2386
2387         return ll_getattr_it(mnt, de, &it, stat);
2388 }
2389
2390 static
2391 int lustre_check_acl(struct inode *inode, int mask)
2392 {
2393 #ifdef CONFIG_FS_POSIX_ACL
2394         struct ll_inode_info *lli = ll_i2info(inode);
2395         struct posix_acl *acl;
2396         int rc;
2397         ENTRY;
2398
2399         spin_lock(&lli->lli_lock);
2400         acl = posix_acl_dup(lli->lli_posix_acl);
2401         spin_unlock(&lli->lli_lock);
2402
2403         if (!acl)
2404                 RETURN(-EAGAIN);
2405
2406         rc = posix_acl_permission(inode, acl, mask);
2407         posix_acl_release(acl);
2408
2409         RETURN(rc);
2410 #else
2411         return -EAGAIN;
2412 #endif
2413 }
2414
2415 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2416 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2417 {
2418         int rc = 0;
2419         ENTRY;
2420
2421        /* as root inode are NOT getting validated in lookup operation,
2422         * need to do it before permission check. */
2423
2424         if (inode == inode->i_sb->s_root->d_inode) {
2425                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2426
2427                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2428                                               MDS_INODELOCK_LOOKUP);
2429                 if (rc)
2430                         RETURN(rc);
2431         }
2432
2433         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2434                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2435
2436         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2437                 return lustre_check_remote_perm(inode, mask);
2438
2439         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2440         rc = generic_permission(inode, mask, lustre_check_acl);
2441
2442         RETURN(rc);
2443 }
2444 #else
2445 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2446 {
2447         int mode = inode->i_mode;
2448         int rc;
2449
2450         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2451                inode->i_ino, inode->i_generation, inode, mask);
2452
2453         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2454                 return lustre_check_remote_perm(inode, mask);
2455
2456         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2457
2458         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2459             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2460                 return -EROFS;
2461         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2462                 return -EACCES;
2463         if (current->fsuid == inode->i_uid) {
2464                 mode >>= 6;
2465         } else if (1) {
2466                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2467                         goto check_groups;
2468                 rc = lustre_check_acl(inode, mask);
2469                 if (rc == -EAGAIN)
2470                         goto check_groups;
2471                 if (rc == -EACCES)
2472                         goto check_capabilities;
2473                 return rc;
2474         } else {
2475 check_groups:
2476                 if (in_group_p(inode->i_gid))
2477                         mode >>= 3;
2478         }
2479         if ((mode & mask & S_IRWXO) == mask)
2480                 return 0;
2481
2482 check_capabilities:
2483         if (!(mask & MAY_EXEC) ||
2484             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2485                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2486                         return 0;
2487
2488         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2489             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2490                 return 0;
2491
2492         return -EACCES;
2493 }
2494 #endif
2495
/*
 * Name the file_operations member used for the second read/write entry
 * (READ_METHOD/WRITE_METHOD) and the function installed there
 * (READ_FUNCTION/WRITE_FUNCTION), depending on HAVE_FILE_READV.
 */
#ifndef HAVE_FILE_READV
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write
#else
#define READ_METHOD readv
#define READ_FUNCTION ll_file_readv
#define WRITE_METHOD writev
#define WRITE_FUNCTION ll_file_writev
#endif
2507
2508 /* -o localflock - only provides locally consistent flock locks */
2509 struct file_operations ll_file_operations = {
2510         .read           = ll_file_read,
2511         .READ_METHOD    = READ_FUNCTION,
2512         .write          = ll_file_write,
2513         .WRITE_METHOD   = WRITE_FUNCTION,
2514         .ioctl          = ll_file_ioctl,
2515         .open           = ll_file_open,
2516         .release        = ll_file_release,
2517         .mmap           = ll_file_mmap,
2518         .llseek         = ll_file_seek,
2519         .sendfile       = ll_file_sendfile,
2520         .fsync          = ll_fsync,
2521 };
2522
2523 struct file_operations ll_file_operations_flock = {
2524         .read           = ll_file_read,
2525         .READ_METHOD    = READ_FUNCTION,
2526         .write          = ll_file_write,
2527         .WRITE_METHOD   = WRITE_FUNCTION,
2528         .ioctl          = ll_file_ioctl,
2529         .open           = ll_file_open,
2530         .release        = ll_file_release,
2531         .mmap           = ll_file_mmap,
2532         .llseek         = ll_file_seek,
2533         .sendfile       = ll_file_sendfile,
2534         .fsync          = ll_fsync,
2535 #ifdef HAVE_F_OP_FLOCK
2536         .flock          = ll_file_flock,
2537 #endif
2538         .lock           = ll_file_flock
2539 };
2540
2541 /* These are for -o noflock - to return ENOSYS on flock calls */
2542 struct file_operations ll_file_operations_noflock = {
2543         .read           = ll_file_read,
2544         .READ_METHOD    = READ_FUNCTION,
2545         .write          = ll_file_write,
2546         .WRITE_METHOD   = WRITE_FUNCTION,
2547         .ioctl          = ll_file_ioctl,
2548         .open           = ll_file_open,
2549         .release        = ll_file_release,
2550         .mmap           = ll_file_mmap,
2551         .llseek         = ll_file_seek,
2552         .sendfile       = ll_file_sendfile,
2553         .fsync          = ll_fsync,
2554 #ifdef HAVE_F_OP_FLOCK
2555         .flock          = ll_file_noflock,
2556 #endif
2557         .lock           = ll_file_noflock
2558 };
2559
2560 struct inode_operations ll_file_inode_operations = {
2561 #ifdef HAVE_VFS_INTENT_PATCHES
2562         .setattr_raw    = ll_setattr_raw,
2563 #endif
2564         .setattr        = ll_setattr,
2565         .truncate       = ll_truncate,
2566         .getattr        = ll_getattr,
2567         .permission     = ll_inode_permission,
2568         .setxattr       = ll_setxattr,
2569         .getxattr       = ll_getxattr,
2570         .listxattr      = ll_listxattr,
2571         .removexattr    = ll_removexattr,
2572 };
2573
2574 /* dynamic ioctl number support routins */
2575 static struct llioc_ctl_data {
2576         struct rw_semaphore ioc_sem;
2577         struct list_head    ioc_head;
2578 } llioc = {
2579         __RWSEM_INITIALIZER(llioc.ioc_sem),
2580         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2581 };
2582
2583
2584 struct llioc_data {
2585         struct list_head        iocd_list;
2586         unsigned int            iocd_size;
2587         llioc_callback_t        iocd_cb;
2588         unsigned int            iocd_count;
2589         unsigned int            iocd_cmd[0];
2590 };
2591
2592 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2593 {
2594         unsigned int size;
2595         struct llioc_data *in_data = NULL;
2596         ENTRY;
2597
2598         if (cb == NULL || cmd == NULL ||
2599             count > LLIOC_MAX_CMD || count < 0)
2600                 RETURN(NULL);
2601
2602         size = sizeof(*in_data) + count * sizeof(unsigned int);
2603         OBD_ALLOC(in_data, size);
2604         if (in_data == NULL)
2605                 RETURN(NULL);
2606
2607         memset(in_data, 0, sizeof(*in_data));
2608         in_data->iocd_size = size;
2609         in_data->iocd_cb = cb;
2610         in_data->iocd_count = count;
2611         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2612
2613         down_write(&llioc.ioc_sem);
2614         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2615         up_write(&llioc.ioc_sem);
2616
2617         RETURN(in_data);
2618 }
2619
2620 void ll_iocontrol_unregister(void *magic)
2621 {
2622         struct llioc_data *tmp;
2623
2624         if (magic == NULL)
2625                 return;
2626
2627         down_write(&llioc.ioc_sem);
2628         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2629                 if (tmp == magic) {
2630                         unsigned int size = tmp->iocd_size;
2631
2632                         list_del(&tmp->iocd_list);
2633                         up_write(&llioc.ioc_sem);
2634
2635                         OBD_FREE(tmp, size);
2636                         return;
2637                 }
2638         }
2639         up_write(&llioc.ioc_sem);
2640
2641         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2642 }
2643
2644 EXPORT_SYMBOL(ll_iocontrol_register);
2645 EXPORT_SYMBOL(ll_iocontrol_unregister);
2646
2647 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2648                         unsigned int cmd, unsigned long arg, int *rcp)
2649 {
2650         enum llioc_iter ret = LLIOC_CONT;
2651         struct llioc_data *data;
2652         int rc = -EINVAL, i;
2653
2654         down_read(&llioc.ioc_sem);
2655         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2656                 for (i = 0; i < data->iocd_count; i++) {
2657                         if (cmd != data->iocd_cmd[i])
2658                                 continue;
2659
2660                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2661                         break;
2662                 }
2663
2664                 if (ret == LLIOC_STOP)
2665                         break;
2666         }
2667         up_read(&llioc.ioc_sem);
2668
2669         if (rcp)
2670                 *rcp = rc;
2671         return ret;
2672 }