Whamcloud - gitweb
b=19053
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(exp_connect_som(ll_i2mdexp(inode))) || !S_ISREG(inode->i_mode))
96                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
97         else
98                 ll_epoch_close(inode, op_data, &och, 0);
99
100 out:
101         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
102         EXIT;
103 }
104
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
106                                      struct inode *inode,
107                                      struct obd_client_handle *och)
108 {
109         struct obd_export *exp = ll_i2mdexp(inode);
110         struct md_op_data *op_data;
111         struct ptlrpc_request *req = NULL;
112         struct obd_device *obd = class_exp2obd(exp);
113         int epoch_close = 1;
114         int rc;
115         ENTRY;
116
117         if (obd == NULL) {
118                 /*
119                  * XXX: in case of LMV, is this correct to access
120                  * ->exp_handle?
121                  */
122                 CERROR("Invalid MDC connection handle "LPX64"\n",
123                        ll_i2mdexp(inode)->exp_handle.h_cookie);
124                 GOTO(out, rc = 0);
125         }
126
127         /*
128          * here we check if this is forced umount. If so this is called on
129          * canceling "open lock" and we do not call md_close() in this case, as
130          * it will not be successful, as import is already deactivated.
131          */
132         if (obd->obd_force)
133                 GOTO(out, rc = 0);
134
135         OBD_ALLOC_PTR(op_data);
136         if (op_data == NULL)
137                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138
139         ll_prepare_close(inode, op_data, och);
140         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141         rc = md_close(md_exp, op_data, och->och_mod, &req);
142         if (rc == -EAGAIN) {
143                 /* This close must have the epoch closed. */
144                 LASSERT(epoch_close);
145                 /* MDS has instructed us to obtain Size-on-MDS attribute from
146                  * OSTs and send setattr to back to MDS. */
147                 rc = ll_sizeonmds_update(inode, &och->och_fh,
148                                          op_data->op_ioepoch);
149                 if (rc) {
150                         CERROR("inode %lu mdc Size-on-MDS update failed: "
151                                "rc = %d\n", inode->i_ino, rc);
152                         rc = 0;
153                 }
154         } else if (rc) {
155                 CERROR("inode %lu mdc close failed: rc = %d\n",
156                        inode->i_ino, rc);
157         }
158         ll_finish_md_op_data(op_data);
159
160         if (rc == 0) {
161                 rc = ll_objects_destroy(req, inode);
162                 if (rc)
163                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
164                                inode->i_ino, rc);
165         }
166
167         EXIT;
168 out:
169
170         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
171             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173         } else {
174                 md_clear_open_replay_data(md_exp, och);
175                 /* Free @och if it is not waiting for DONE_WRITING. */
176                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177                 OBD_FREE_PTR(och);
178         }
179         if (req) /* This is close request */
180                 ptlrpc_req_finished(req);
181         return rc;
182 }
183
184 int ll_md_real_close(struct inode *inode, int flags)
185 {
186         struct ll_inode_info *lli = ll_i2info(inode);
187         struct obd_client_handle **och_p;
188         struct obd_client_handle *och;
189         __u64 *och_usecount;
190         int rc = 0;
191         ENTRY;
192
193         if (flags & FMODE_WRITE) {
194                 och_p = &lli->lli_mds_write_och;
195                 och_usecount = &lli->lli_open_fd_write_count;
196         } else if (flags & FMODE_EXEC) {
197                 och_p = &lli->lli_mds_exec_och;
198                 och_usecount = &lli->lli_open_fd_exec_count;
199         } else {
200                 LASSERT(flags & FMODE_READ);
201                 och_p = &lli->lli_mds_read_och;
202                 och_usecount = &lli->lli_open_fd_read_count;
203         }
204
205         down(&lli->lli_och_sem);
206         if (*och_usecount) { /* There are still users of this handle, so
207                                 skip freeing it. */
208                 up(&lli->lli_och_sem);
209                 RETURN(0);
210         }
211         och=*och_p;
212         *och_p = NULL;
213         up(&lli->lli_och_sem);
214
215         if (och) { /* There might be a race and somebody have freed this och
216                       already */
217                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
218                                                inode, och);
219         }
220
221         RETURN(rc);
222 }
223
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225                 struct file *file)
226 {
227         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228         struct ll_inode_info *lli = ll_i2info(inode);
229         int rc = 0;
230         ENTRY;
231
232         /* clear group lock, if present */
233         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235
236         /* Let's see if we have good enough OPEN lock on the file and if
237            we can skip talking to MDS */
238         if (file->f_dentry->d_inode) { /* Can this ever be false? */
239                 int lockmode;
240                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241                 struct lustre_handle lockh;
242                 struct inode *inode = file->f_dentry->d_inode;
243                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
244
245                 down(&lli->lli_och_sem);
246                 if (fd->fd_omode & FMODE_WRITE) {
247                         lockmode = LCK_CW;
248                         LASSERT(lli->lli_open_fd_write_count);
249                         lli->lli_open_fd_write_count--;
250                 } else if (fd->fd_omode & FMODE_EXEC) {
251                         lockmode = LCK_PR;
252                         LASSERT(lli->lli_open_fd_exec_count);
253                         lli->lli_open_fd_exec_count--;
254                 } else {
255                         lockmode = LCK_CR;
256                         LASSERT(lli->lli_open_fd_read_count);
257                         lli->lli_open_fd_read_count--;
258                 }
259                 up(&lli->lli_och_sem);
260
261                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262                                    LDLM_IBITS, &policy, lockmode,
263                                    &lockh)) {
264                         rc = ll_md_real_close(file->f_dentry->d_inode,
265                                               fd->fd_omode);
266                 }
267         } else {
268                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269                        file, file->f_dentry, file->f_dentry->d_name.name);
270         }
271
272         LUSTRE_FPRIVATE(file) = NULL;
273         ll_file_data_put(fd);
274         ll_capa_close(inode);
275
276         RETURN(rc);
277 }
278
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
280
281 /* While this returns an error code, fput() the caller does not, so we need
282  * to make every effort to clean up all of our state here.  Also, applications
283  * rarely check close errors and even if an error is returned they will not
284  * re-try the close call.
285  */
286 int ll_file_release(struct inode *inode, struct file *file)
287 {
288         struct ll_file_data *fd;
289         struct ll_sb_info *sbi = ll_i2sbi(inode);
290         struct ll_inode_info *lli = ll_i2info(inode);
291         struct lov_stripe_md *lsm = lli->lli_smd;
292         int rc;
293         ENTRY;
294
295         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296                inode->i_generation, inode);
297
298 #ifdef CONFIG_FS_POSIX_ACL
299         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300             inode == inode->i_sb->s_root->d_inode) {
301                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
302
303                 LASSERT(fd != NULL);
304                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305                         fd->fd_flags &= ~LL_FILE_RMTACL;
306                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
307                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
308                 }
309         }
310 #endif
311
312         if (inode->i_sb->s_root != file->f_dentry)
313                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314         fd = LUSTRE_FPRIVATE(file);
315         LASSERT(fd != NULL);
316
317         /* The last ref on @file, maybe not the the owner pid of statahead.
318          * Different processes can open the same dir, "ll_opendir_key" means:
319          * it is me that should stop the statahead thread. */
320         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
321                 ll_stop_statahead(inode, lli->lli_opendir_key);
322
323         if (inode->i_sb->s_root == file->f_dentry) {
324                 LUSTRE_FPRIVATE(file) = NULL;
325                 ll_file_data_put(fd);
326                 RETURN(0);
327         }
328
329         if (lsm)
330                 lov_test_and_clear_async_rc(lsm);
331         lli->lli_async_rc = 0;
332
333         rc = ll_md_close(sbi->ll_md_exp, inode, file);
334         RETURN(rc);
335 }
336
337 static int ll_intent_file_open(struct file *file, void *lmm,
338                                int lmmsize, struct lookup_intent *itp)
339 {
340         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
341         struct dentry *parent = file->f_dentry->d_parent;
342         const char *name = file->f_dentry->d_name.name;
343         const int len = file->f_dentry->d_name.len;
344         struct md_op_data *op_data;
345         struct ptlrpc_request *req;
346         int rc;
347         ENTRY;
348
349         if (!parent)
350                 RETURN(-ENOENT);
351
352         /* Usually we come here only for NFSD, and we want open lock.
353            But we can also get here with pre 2.6.15 patchless kernels, and in
354            that case that lock is also ok */
355         /* We can also get here if there was cached open handle in revalidate_it
356          * but it disappeared while we were getting from there to ll_file_open.
357          * But this means this file was closed and immediatelly opened which
358          * makes a good candidate for using OPEN lock */
359         /* If lmmsize & lmm are not 0, we are just setting stripe info
360          * parameters. No need for the open lock */
361         if (!lmm && !lmmsize)
362                 itp->it_flags |= MDS_OPEN_LOCK;
363
364         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
365                                       file->f_dentry->d_inode, name, len,
366                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
367         if (IS_ERR(op_data))
368                 RETURN(PTR_ERR(op_data));
369
370         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
371                             0 /*unused */, &req, ll_md_blocking_ast, 0);
372         ll_finish_md_op_data(op_data);
373         if (rc == -ESTALE) {
374                 /* reason for keep own exit path - don`t flood log
375                 * with messages with -ESTALE errors.
376                 */
377                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
378                      it_open_error(DISP_OPEN_OPEN, itp))
379                         GOTO(out, rc);
380                 ll_release_openhandle(file->f_dentry, itp);
381                 GOTO(out, rc);
382         }
383
384         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
385                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
386                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
387                 GOTO(out, rc);
388         }
389
390         if (itp->d.lustre.it_lock_mode)
391                 md_set_lock_data(sbi->ll_md_exp,
392                                  &itp->d.lustre.it_lock_handle,
393                                  file->f_dentry->d_inode, NULL);
394
395         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
396 out:
397         ptlrpc_req_finished(itp->d.lustre.it_data);
398         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
399         ll_intent_drop_lock(itp);
400
401         RETURN(rc);
402 }
403
404 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
405 {
406         if (ioepoch && lli->lli_ioepoch != ioepoch) {
407                 lli->lli_ioepoch = ioepoch;
408                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
409                        ioepoch, PFID(&lli->lli_fid));
410         }
411 }
412
413 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
414                        struct lookup_intent *it, struct obd_client_handle *och)
415 {
416         struct ptlrpc_request *req = it->d.lustre.it_data;
417         struct mdt_body *body;
418
419         LASSERT(och);
420
421         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
422         LASSERT(body != NULL);                      /* reply already checked out */
423
424         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
425         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
426         och->och_fid = lli->lli_fid;
427         och->och_flags = it->it_flags;
428         ll_ioepoch_open(lli, body->ioepoch);
429
430         return md_set_open_replay_data(md_exp, och, req);
431 }
432
433 int ll_local_open(struct file *file, struct lookup_intent *it,
434                   struct ll_file_data *fd, struct obd_client_handle *och)
435 {
436         struct inode *inode = file->f_dentry->d_inode;
437         struct ll_inode_info *lli = ll_i2info(inode);
438         ENTRY;
439
440         LASSERT(!LUSTRE_FPRIVATE(file));
441
442         LASSERT(fd != NULL);
443
444         if (och) {
445                 struct ptlrpc_request *req = it->d.lustre.it_data;
446                 struct mdt_body *body;
447                 int rc;
448
449                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
450                 if (rc)
451                         RETURN(rc);
452
453                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454                 if ((it->it_flags & FMODE_WRITE) &&
455                     (body->valid & OBD_MD_FLSIZE))
456                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
457                                lli->lli_ioepoch, PFID(&lli->lli_fid));
458         }
459
460         LUSTRE_FPRIVATE(file) = fd;
461         ll_readahead_init(inode, &fd->fd_ras);
462         fd->fd_omode = it->it_flags;
463         RETURN(0);
464 }
465
466 /* Open a file, and (for the very first open) create objects on the OSTs at
467  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
468  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
469  * lli_open_sem to ensure no other process will create objects, send the
470  * stripe MD to the MDS, or try to destroy the objects if that fails.
471  *
472  * If we already have the stripe MD locally then we don't request it in
473  * md_open(), by passing a lmm_size = 0.
474  *
475  * It is up to the application to ensure no other processes open this file
476  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
477  * used.  We might be able to avoid races of that sort by getting lli_open_sem
478  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
479  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
480  */
481 int ll_file_open(struct inode *inode, struct file *file)
482 {
483         struct ll_inode_info *lli = ll_i2info(inode);
484         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
485                                           .it_flags = file->f_flags };
486         struct lov_stripe_md *lsm;
487         struct ptlrpc_request *req = NULL;
488         struct obd_client_handle **och_p;
489         __u64 *och_usecount;
490         struct ll_file_data *fd;
491         int rc = 0, opendir_set = 0;
492         ENTRY;
493
494         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
495                inode->i_generation, inode, file->f_flags);
496
497 #ifdef HAVE_VFS_INTENT_PATCHES
498         it = file->f_it;
499 #else
500         it = file->private_data; /* XXX: compat macro */
501         file->private_data = NULL; /* prevent ll_local_open assertion */
502 #endif
503
504         fd = ll_file_data_get();
505         if (fd == NULL)
506                 RETURN(-ENOMEM);
507
508         fd->fd_file = file;
509         if (S_ISDIR(inode->i_mode)) {
510                 spin_lock(&lli->lli_lock);
511                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
512                         LASSERT(lli->lli_sai == NULL);
513                         lli->lli_opendir_key = fd;
514                         lli->lli_opendir_pid = cfs_curproc_pid();
515                         opendir_set = 1;
516                 }
517                 spin_unlock(&lli->lli_lock);
518         }
519
520         if (inode->i_sb->s_root == file->f_dentry) {
521                 LUSTRE_FPRIVATE(file) = fd;
522                 RETURN(0);
523         }
524
525         if (!it || !it->d.lustre.it_disposition) {
526                 /* Convert f_flags into access mode. We cannot use file->f_mode,
527                  * because everything but O_ACCMODE mask was stripped from
528                  * there */
529                 if ((oit.it_flags + 1) & O_ACCMODE)
530                         oit.it_flags++;
531                 if (file->f_flags & O_TRUNC)
532                         oit.it_flags |= FMODE_WRITE;
533
534                 /* kernel only call f_op->open in dentry_open.  filp_open calls
535                  * dentry_open after call to open_namei that checks permissions.
536                  * Only nfsd_open call dentry_open directly without checking
537                  * permissions and because of that this code below is safe. */
538                 if (oit.it_flags & FMODE_WRITE)
539                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
540
541                 /* We do not want O_EXCL here, presumably we opened the file
542                  * already? XXX - NFS implications? */
543                 oit.it_flags &= ~O_EXCL;
544
545                 it = &oit;
546         }
547
548 restart:
549         /* Let's see if we have file open on MDS already. */
550         if (it->it_flags & FMODE_WRITE) {
551                 och_p = &lli->lli_mds_write_och;
552                 och_usecount = &lli->lli_open_fd_write_count;
553         } else if (it->it_flags & FMODE_EXEC) {
554                 och_p = &lli->lli_mds_exec_och;
555                 och_usecount = &lli->lli_open_fd_exec_count;
556          } else {
557                 och_p = &lli->lli_mds_read_och;
558                 och_usecount = &lli->lli_open_fd_read_count;
559         }
560
561         down(&lli->lli_och_sem);
562         if (*och_p) { /* Open handle is present */
563                 if (it_disposition(it, DISP_OPEN_OPEN)) {
564                         /* Well, there's extra open request that we do not need,
565                            let's close it somehow. This will decref request. */
566                         rc = it_open_error(DISP_OPEN_OPEN, it);
567                         if (rc) {
568                                 up(&lli->lli_och_sem);
569                                 ll_file_data_put(fd);
570                                 GOTO(out_openerr, rc);
571                         }
572                         ll_release_openhandle(file->f_dentry, it);
573                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
574                                              LPROC_LL_OPEN);
575                 }
576                 (*och_usecount)++;
577
578                 rc = ll_local_open(file, it, fd, NULL);
579                 if (rc) {
580                         (*och_usecount)--;
581                         up(&lli->lli_och_sem);
582                         ll_file_data_put(fd);
583                         GOTO(out_openerr, rc);
584                 }
585         } else {
586                 LASSERT(*och_usecount == 0);
587                 if (!it->d.lustre.it_disposition) {
588                         /* We cannot just request lock handle now, new ELC code
589                            means that one of other OPEN locks for this file
590                            could be cancelled, and since blocking ast handler
591                            would attempt to grab och_sem as well, that would
592                            result in a deadlock */
593                         up(&lli->lli_och_sem);
594                         it->it_create_mode |= M_CHECK_STALE;
595                         rc = ll_intent_file_open(file, NULL, 0, it);
596                         it->it_create_mode &= ~M_CHECK_STALE;
597                         if (rc) {
598                                 ll_file_data_put(fd);
599                                 GOTO(out_openerr, rc);
600                         }
601
602                         /* Got some error? Release the request */
603                         if (it->d.lustre.it_status < 0) {
604                                 req = it->d.lustre.it_data;
605                                 ptlrpc_req_finished(req);
606                         }
607                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
608                                          &it->d.lustre.it_lock_handle,
609                                          file->f_dentry->d_inode, NULL);
610                         goto restart;
611                 }
612                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
613                 if (!*och_p) {
614                         ll_file_data_put(fd);
615                         GOTO(out_och_free, rc = -ENOMEM);
616                 }
617                 (*och_usecount)++;
618                 req = it->d.lustre.it_data;
619
620                 /* md_intent_lock() didn't get a request ref if there was an
621                  * open error, so don't do cleanup on the request here
622                  * (bug 3430) */
623                 /* XXX (green): Should not we bail out on any error here, not
624                  * just open error? */
625                 rc = it_open_error(DISP_OPEN_OPEN, it);
626                 if (rc) {
627                         ll_file_data_put(fd);
628                         GOTO(out_och_free, rc);
629                 }
630
631                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
632                 rc = ll_local_open(file, it, fd, *och_p);
633                 if (rc) {
634                         ll_file_data_put(fd);
635                         GOTO(out_och_free, rc);
636                 }
637         }
638         up(&lli->lli_och_sem);
639
640         /* Must do this outside lli_och_sem lock to prevent deadlock where
641            different kind of OPEN lock for this same inode gets cancelled
642            by ldlm_cancel_lru */
643         if (!S_ISREG(inode->i_mode))
644                 GOTO(out, rc);
645
646         ll_capa_open(inode);
647
648         lsm = lli->lli_smd;
649         if (lsm == NULL) {
650                 if (file->f_flags & O_LOV_DELAY_CREATE ||
651                     !(file->f_mode & FMODE_WRITE)) {
652                         CDEBUG(D_INODE, "object creation was delayed\n");
653                         GOTO(out, rc);
654                 }
655         }
656         file->f_flags &= ~O_LOV_DELAY_CREATE;
657         GOTO(out, rc);
658 out:
659         ptlrpc_req_finished(req);
660         if (req)
661                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
662 out_och_free:
663         if (rc) {
664                 if (*och_p) {
665                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
666                         *och_p = NULL; /* OBD_FREE writes some magic there */
667                         (*och_usecount)--;
668                 }
669                 up(&lli->lli_och_sem);
670 out_openerr:
671                 if (opendir_set != 0)
672                         ll_stop_statahead(inode, lli->lli_opendir_key);
673         }
674
675         return rc;
676 }
677
678 /* Fills the obdo with the attributes for the lsm */
679 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
680                           struct obd_capa *capa, struct obdo *obdo)
681 {
682         struct ptlrpc_request_set *set;
683         struct obd_info            oinfo = { { { 0 } } };
684         int                        rc;
685
686         ENTRY;
687
688         LASSERT(lsm != NULL);
689
690         oinfo.oi_md = lsm;
691         oinfo.oi_oa = obdo;
692         oinfo.oi_oa->o_id = lsm->lsm_object_id;
693         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
694         oinfo.oi_oa->o_mode = S_IFREG;
695         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
696                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
697                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
698                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
699                                OBD_MD_FLGROUP;
700         oinfo.oi_capa = capa;
701
702         set = ptlrpc_prep_set();
703         if (set == NULL) {
704                 CERROR("can't allocate ptlrpc set\n");
705                 rc = -ENOMEM;
706         } else {
707                 rc = obd_getattr_async(exp, &oinfo, set);
708                 if (rc == 0)
709                         rc = ptlrpc_set_wait(set);
710                 ptlrpc_set_destroy(set);
711         }
712         if (rc == 0)
713                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
714                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
715                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
716         RETURN(rc);
717 }
718
719 /* Fills the obdo with the attributes for the inode defined by lsm */
720 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
721 {
722         struct ll_inode_info *lli  = ll_i2info(inode);
723         struct obd_capa      *capa = ll_mdscapa_get(inode);
724         int rc;
725         ENTRY;
726
727         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
728         capa_put(capa);
729         if (rc == 0) {
730                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
731                 CDEBUG(D_INODE,
732                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
733                        lli->lli_smd->lsm_object_id, i_size_read(inode),
734                        (unsigned long long)inode->i_blocks,
735                        (unsigned long)ll_inode_blksize(inode));
736         }
737         RETURN(rc);
738 }
739
740 int ll_merge_lvb(struct inode *inode)
741 {
742         struct ll_inode_info *lli = ll_i2info(inode);
743         struct ll_sb_info *sbi = ll_i2sbi(inode);
744         struct ost_lvb lvb;
745         int rc;
746
747         ENTRY;
748
749         ll_inode_size_lock(inode, 1);
750         inode_init_lvb(inode, &lvb);
751         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
752         i_size_write(inode, lvb.lvb_size);
753         inode->i_blocks = lvb.lvb_blocks;
754
755         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
756         LTIME_S(inode->i_atime) = lvb.lvb_atime;
757         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
758         ll_inode_size_unlock(inode, 1);
759
760         RETURN(rc);
761 }
762
763 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
764                      lstat_t *st)
765 {
766         struct obdo obdo = { 0 };
767         int rc;
768
769         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
770         if (rc == 0) {
771                 st->st_size   = obdo.o_size;
772                 st->st_blocks = obdo.o_blocks;
773                 st->st_mtime  = obdo.o_mtime;
774                 st->st_atime  = obdo.o_atime;
775                 st->st_ctime  = obdo.o_ctime;
776         }
777         return rc;
778 }
779
780 void ll_io_init(struct cl_io *io, const struct file *file, int write)
781 {
782         struct inode *inode     = file->f_dentry->d_inode;
783         struct ll_sb_info *sbi  = ll_i2sbi(inode);
784         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
785
786         LASSERT(fd != NULL);
787         memset(io, 0, sizeof *io);
788         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
789         if (write)
790                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
791         io->ci_obj     = ll_i2info(inode)->lli_clob;
792         io->ci_lockreq = CILR_MAYBE;
793         if (fd->fd_flags & LL_FILE_IGNORE_LOCK ||
794             sbi->ll_flags & LL_SBI_NOLCK) {
795                 io->ci_lockreq = CILR_NEVER;
796                 io->ci_no_srvlock = 1;
797         } else if (file->f_flags & O_APPEND) {
798                 io->ci_lockreq = CILR_MANDATORY;
799         }
800 }
801
802 static ssize_t ll_file_io_generic(const struct lu_env *env,
803                 struct ccc_io_args *args, struct file *file,
804                 enum cl_io_type iot, loff_t *ppos, size_t count)
805 {
806         struct cl_io       *io;
807         ssize_t             result;
808         ENTRY;
809
810         io = &ccc_env_info(env)->cti_io;
811         ll_io_init(io, file, iot == CIT_WRITE);
812
813         if (iot == CIT_READ)
814                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
815
816         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
817                 struct vvp_io *vio = vvp_env_io(env);
818                 struct ccc_io *cio = ccc_env_io(env);
819                 if (cl_io_is_sendfile(io)) {
820                         vio->u.read.cui_actor = args->cia_actor;
821                         vio->u.read.cui_target = args->cia_target;
822                 } else {
823                         cio->cui_iov = args->cia_iov;
824                         cio->cui_nrsegs = args->cia_nrsegs;
825 #ifndef HAVE_FILE_WRITEV
826                         cio->cui_iocb = args->cia_iocb;
827 #endif
828                 }
829                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
830                 result = cl_io_loop(env, io);
831         } else
832                 /* cl_io_rw_init() handled IO */
833                 result = io->ci_result;
834         if (io->ci_nob > 0) {
835                 result = io->ci_nob;
836                 *ppos = io->u.ci_wr.wr.crw_pos;
837         }
838         cl_io_fini(env, io);
839         RETURN(result);
840 }
841
842
843 /*
844  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
845  */
846 static int ll_file_get_iov_count(const struct iovec *iov,
847                                  unsigned long *nr_segs, size_t *count)
848 {
849         size_t cnt = 0;
850         unsigned long seg;
851
852         for (seg = 0; seg < *nr_segs; seg++) {
853                 const struct iovec *iv = &iov[seg];
854
855                 /*
856                  * If any segment has a negative length, or the cumulative
857                  * length ever wraps negative then return -EINVAL.
858                  */
859                 cnt += iv->iov_len;
860                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
861                         return -EINVAL;
862                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
863                         continue;
864                 if (seg == 0)
865                         return -EFAULT;
866                 *nr_segs = seg;
867                 cnt -= iv->iov_len;   /* This segment is no good */
868                 break;
869         }
870         *count = cnt;
871         return 0;
872 }
873
874 #ifdef HAVE_FILE_READV
875 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
876                               unsigned long nr_segs, loff_t *ppos)
877 {
878         struct lu_env      *env;
879         struct ccc_io_args *args;
880         size_t              count;
881         ssize_t             result;
882         int                 refcheck;
883         ENTRY;
884
885         result = ll_file_get_iov_count(iov, &nr_segs, &count);
886         if (result)
887                 RETURN(result);
888
889         env = cl_env_get(&refcheck);
890         if (IS_ERR(env))
891                 RETURN(PTR_ERR(env));
892
893         args = &vvp_env_info(env)->vti_args;
894         args->cia_is_sendfile = 0;
895         args->cia_iov = (struct iovec *)iov;
896         args->cia_nrsegs = nr_segs;
897         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
898         cl_env_put(env, &refcheck);
899         RETURN(result);
900 }
901
902 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
903                             loff_t *ppos)
904 {
905         struct lu_env *env;
906         struct iovec  *local_iov;
907         ssize_t        result;
908         int            refcheck;
909         ENTRY;
910
911         env = cl_env_get(&refcheck);
912         if (IS_ERR(env))
913                 RETURN(PTR_ERR(env));
914
915         local_iov = &vvp_env_info(env)->vti_local_iov;
916         local_iov->iov_base = (void __user *)buf;
917         local_iov->iov_len = count;
918         result = ll_file_readv(file, local_iov, 1, ppos);
919         cl_env_put(env, &refcheck);
920         RETURN(result);
921 }
922
923 #else
924 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
925                                 unsigned long nr_segs, loff_t pos)
926 {
927         struct lu_env      *env;
928         struct ccc_io_args *args;
929         size_t              count;
930         ssize_t             result;
931         int                 refcheck;
932         ENTRY;
933
934         result = ll_file_get_iov_count(iov, &nr_segs, &count);
935         if (result)
936                 RETURN(result);
937
938         env = cl_env_get(&refcheck);
939         if (IS_ERR(env))
940                 RETURN(PTR_ERR(env));
941
942         args = &vvp_env_info(env)->vti_args;
943         args->cia_is_sendfile = 0;
944         args->cia_iov = (struct iovec *)iov;
945         args->cia_nrsegs = nr_segs;
946         args->cia_iocb = iocb;
947         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
948                                     &iocb->ki_pos, count);
949         cl_env_put(env, &refcheck);
950         RETURN(result);
951 }
952
953 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
954                             loff_t *ppos)
955 {
956         struct lu_env *env;
957         struct iovec  *local_iov;
958         struct kiocb  *kiocb;
959         ssize_t        result;
960         int            refcheck;
961         ENTRY;
962
963         env = cl_env_get(&refcheck);
964         if (IS_ERR(env))
965                 RETURN(PTR_ERR(env));
966
967         local_iov = &vvp_env_info(env)->vti_local_iov;
968         kiocb = &vvp_env_info(env)->vti_kiocb;
969         local_iov->iov_base = (void __user *)buf;
970         local_iov->iov_len = count;
971         init_sync_kiocb(kiocb, file);
972         kiocb->ki_pos = *ppos;
973         kiocb->ki_left = count;
974
975         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
976         *ppos = kiocb->ki_pos;
977
978         cl_env_put(env, &refcheck);
979         RETURN(result);
980 }
981 #endif
982
983 /*
984  * Write to a file (through the page cache).
985  */
986 #ifdef HAVE_FILE_WRITEV
987 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
988                               unsigned long nr_segs, loff_t *ppos)
989 {
990         struct lu_env      *env;
991         struct ccc_io_args *args;
992         size_t              count;
993         ssize_t             result;
994         int                 refcheck;
995         ENTRY;
996
997         result = ll_file_get_iov_count(iov, &nr_segs, &count);
998         if (result)
999                 RETURN(result);
1000
1001         env = cl_env_get(&refcheck);
1002         if (IS_ERR(env))
1003                 RETURN(PTR_ERR(env));
1004
1005         args = &vvp_env_info(env)->vti_args;
1006         args->cia_iov = (struct iovec *)iov;
1007         args->cia_nrsegs = nr_segs;
1008         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1009         cl_env_put(env, &refcheck);
1010         RETURN(result);
1011 }
1012
1013 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1014                              loff_t *ppos)
1015 {
1016         struct lu_env    *env;
1017         struct iovec     *local_iov;
1018         ssize_t           result;
1019         int               refcheck;
1020         ENTRY;
1021
1022         env = cl_env_get(&refcheck);
1023         if (IS_ERR(env))
1024                 RETURN(PTR_ERR(env));
1025
1026         local_iov = &vvp_env_info(env)->vti_local_iov;
1027         local_iov->iov_base = (void __user *)buf;
1028         local_iov->iov_len = count;
1029
1030         result = ll_file_writev(file, local_iov, 1, ppos);
1031         cl_env_put(env, &refcheck);
1032         RETURN(result);
1033 }
1034
1035 #else /* AIO stuff */
1036 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1037                                  unsigned long nr_segs, loff_t pos)
1038 {
1039         struct lu_env      *env;
1040         struct ccc_io_args *args;
1041         size_t              count;
1042         ssize_t             result;
1043         int                 refcheck;
1044         ENTRY;
1045
1046         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1047         if (result)
1048                 RETURN(result);
1049
1050         env = cl_env_get(&refcheck);
1051         if (IS_ERR(env))
1052                 RETURN(PTR_ERR(env));
1053
1054         args = &vvp_env_info(env)->vti_args;
1055         args->cia_iov = (struct iovec *)iov;
1056         args->cia_nrsegs = nr_segs;
1057         args->cia_iocb = iocb;
1058         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1059                                   &iocb->ki_pos, count);
1060         cl_env_put(env, &refcheck);
1061         RETURN(result);
1062 }
1063
1064 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1065                              loff_t *ppos)
1066 {
1067         struct lu_env *env;
1068         struct iovec  *local_iov;
1069         struct kiocb  *kiocb;
1070         ssize_t        result;
1071         int            refcheck;
1072         ENTRY;
1073
1074         env = cl_env_get(&refcheck);
1075         if (IS_ERR(env))
1076                 RETURN(PTR_ERR(env));
1077
1078         local_iov = &vvp_env_info(env)->vti_local_iov;
1079         kiocb = &vvp_env_info(env)->vti_kiocb;
1080         local_iov->iov_base = (void __user *)buf;
1081         local_iov->iov_len = count;
1082         init_sync_kiocb(kiocb, file);
1083         kiocb->ki_pos = *ppos;
1084         kiocb->ki_left = count;
1085
1086         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1087         *ppos = kiocb->ki_pos;
1088
1089         cl_env_put(env, &refcheck);
1090         RETURN(result);
1091 }
1092 #endif
1093
1094
1095 /*
1096  * Send file content (through pagecache) somewhere with helper
1097  */
1098 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1099                                 read_actor_t actor, void *target)
1100 {
1101         struct lu_env      *env;
1102         struct ccc_io_args *args;
1103         ssize_t             result;
1104         int                 refcheck;
1105         ENTRY;
1106
1107         env = cl_env_get(&refcheck);
1108         if (IS_ERR(env))
1109                 RETURN(PTR_ERR(env));
1110
1111         args = &vvp_env_info(env)->vti_args;
1112         args->cia_is_sendfile = 1;
1113         args->cia_target = target;
1114         args->cia_actor = actor;
1115         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1116         cl_env_put(env, &refcheck);
1117         RETURN(result);
1118 }
1119
1120 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1121                                unsigned long arg)
1122 {
1123         struct obd_export *exp = ll_i2dtexp(inode);
1124         struct ll_recreate_obj ucreatp;
1125         struct obd_trans_info oti = { 0 };
1126         struct obdo *oa = NULL;
1127         int lsm_size;
1128         int rc = 0;
1129         struct lov_stripe_md *lsm, *lsm2;
1130         ENTRY;
1131
1132         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1133                 RETURN(-EPERM);
1134
1135         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1136                            sizeof(struct ll_recreate_obj)))
1137                 RETURN(-EFAULT);
1138
1139         OBDO_ALLOC(oa);
1140         if (oa == NULL)
1141                 RETURN(-ENOMEM);
1142
1143         ll_inode_size_lock(inode, 0);
1144         lsm = ll_i2info(inode)->lli_smd;
1145         if (lsm == NULL)
1146                 GOTO(out, rc = -ENOENT);
1147         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1148                    (lsm->lsm_stripe_count));
1149
1150         OBD_ALLOC(lsm2, lsm_size);
1151         if (lsm2 == NULL)
1152                 GOTO(out, rc = -ENOMEM);
1153
1154         oa->o_id = ucreatp.lrc_id;
1155         oa->o_gr = ucreatp.lrc_group;
1156         oa->o_nlink = ucreatp.lrc_ost_idx;
1157         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1158         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1159         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1160                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1161
1162         memcpy(lsm2, lsm, lsm_size);
1163         rc = obd_create(exp, oa, &lsm2, &oti);
1164
1165         OBD_FREE(lsm2, lsm_size);
1166         GOTO(out, rc);
1167 out:
1168         ll_inode_size_unlock(inode, 0);
1169         OBDO_FREE(oa);
1170         return rc;
1171 }
1172
1173 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1174                              int flags, struct lov_user_md *lum, int lum_size)
1175 {
1176         struct lov_stripe_md *lsm;
1177         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1178         int rc = 0;
1179         ENTRY;
1180
1181         ll_inode_size_lock(inode, 0);
1182         lsm = ll_i2info(inode)->lli_smd;
1183         if (lsm) {
1184                 ll_inode_size_unlock(inode, 0);
1185                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1186                        inode->i_ino);
1187                 RETURN(-EEXIST);
1188         }
1189
1190         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1191         if (rc)
1192                 GOTO(out, rc);
1193         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1194                 GOTO(out_req_free, rc = -ENOENT);
1195         rc = oit.d.lustre.it_status;
1196         if (rc < 0)
1197                 GOTO(out_req_free, rc);
1198
1199         ll_release_openhandle(file->f_dentry, &oit);
1200
1201  out:
1202         ll_inode_size_unlock(inode, 0);
1203         ll_intent_release(&oit);
1204         RETURN(rc);
1205 out_req_free:
1206         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1207         goto out;
1208 }
1209
1210 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1211                              struct lov_mds_md **lmmp, int *lmm_size,
1212                              struct ptlrpc_request **request)
1213 {
1214         struct ll_sb_info *sbi = ll_i2sbi(inode);
1215         struct mdt_body  *body;
1216         struct lov_mds_md *lmm = NULL;
1217         struct ptlrpc_request *req = NULL;
1218         struct obd_capa *oc;
1219         int rc, lmmsize;
1220
1221         rc = ll_get_max_mdsize(sbi, &lmmsize);
1222         if (rc)
1223                 RETURN(rc);
1224
1225         oc = ll_mdscapa_get(inode);
1226         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1227                              oc, filename, strlen(filename) + 1,
1228                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1229                              ll_i2suppgid(inode), &req);
1230         capa_put(oc);
1231         if (rc < 0) {
1232                 CDEBUG(D_INFO, "md_getattr_name failed "
1233                        "on %s: rc %d\n", filename, rc);
1234                 GOTO(out, rc);
1235         }
1236
1237         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1238         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1239
1240         lmmsize = body->eadatasize;
1241
1242         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1243                         lmmsize == 0) {
1244                 GOTO(out, rc = -ENODATA);
1245         }
1246
1247         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1248         LASSERT(lmm != NULL);
1249
1250         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1251             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1252             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1253                 GOTO(out, rc = -EPROTO);
1254         }
1255
1256         /*
1257          * This is coming from the MDS, so is probably in
1258          * little endian.  We convert it to host endian before
1259          * passing it to userspace.
1260          */
1261         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1262                 /* if function called for directory - we should
1263                  * avoid swab not existent lsm objects */
1264                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1265                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1266                         if (S_ISREG(body->mode))
1267                                 lustre_swab_lov_user_md_objects(
1268                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1269                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1270                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1271                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1272                         if (S_ISREG(body->mode))
1273                                 lustre_swab_lov_user_md_objects(
1274                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1275                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1276                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1277                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1278                 }
1279         }
1280
1281         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1282                 struct lov_stripe_md *lsm;
1283                 struct lov_user_md_join *lmj;
1284                 int lmj_size, i, aindex = 0;
1285
1286                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1287                 if (rc < 0)
1288                         GOTO(out, rc = -ENOMEM);
1289                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1290                 if (rc)
1291                         GOTO(out_free_memmd, rc);
1292
1293                 lmj_size = sizeof(struct lov_user_md_join) +
1294                            lsm->lsm_stripe_count *
1295                            sizeof(struct lov_user_ost_data_join);
1296                 OBD_ALLOC(lmj, lmj_size);
1297                 if (!lmj)
1298                         GOTO(out_free_memmd, rc = -ENOMEM);
1299
1300                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1301                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1302                         struct lov_extent *lex =
1303                                 &lsm->lsm_array->lai_ext_array[aindex];
1304
1305                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1306                                 aindex ++;
1307                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1308                                         LPU64" len %d\n", aindex, i,
1309                                         lex->le_start, (int)lex->le_len);
1310                         lmj->lmm_objects[i].l_extent_start =
1311                                 lex->le_start;
1312
1313                         if ((int)lex->le_len == -1)
1314                                 lmj->lmm_objects[i].l_extent_end = -1;
1315                         else
1316                                 lmj->lmm_objects[i].l_extent_end =
1317                                         lex->le_start + lex->le_len;
1318                         lmj->lmm_objects[i].l_object_id =
1319                                 lsm->lsm_oinfo[i]->loi_id;
1320                         lmj->lmm_objects[i].l_object_gr =
1321                                 lsm->lsm_oinfo[i]->loi_gr;
1322                         lmj->lmm_objects[i].l_ost_gen =
1323                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1324                         lmj->lmm_objects[i].l_ost_idx =
1325                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1326                 }
1327                 lmm = (struct lov_mds_md *)lmj;
1328                 lmmsize = lmj_size;
1329 out_free_memmd:
1330                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1331         }
1332 out:
1333         *lmmp = lmm;
1334         *lmm_size = lmmsize;
1335         *request = req;
1336         return rc;
1337 }
1338
1339 static int ll_lov_setea(struct inode *inode, struct file *file,
1340                             unsigned long arg)
1341 {
1342         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1343         struct lov_user_md  *lump;
1344         int lum_size = sizeof(struct lov_user_md) +
1345                        sizeof(struct lov_user_ost_data);
1346         int rc;
1347         ENTRY;
1348
1349         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1350                 RETURN(-EPERM);
1351
1352         OBD_ALLOC(lump, lum_size);
1353         if (lump == NULL) {
1354                 RETURN(-ENOMEM);
1355         }
1356         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1357                 OBD_FREE(lump, lum_size);
1358                 RETURN(-EFAULT);
1359         }
1360
1361         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1362
1363         OBD_FREE(lump, lum_size);
1364         RETURN(rc);
1365 }
1366
1367 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1368                             unsigned long arg)
1369 {
1370         struct lov_user_md_v3 lumv3;
1371         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1372         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1373         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1374         int lum_size;
1375         int rc;
1376         int flags = FMODE_WRITE;
1377         ENTRY;
1378
1379         /* first try with v1 which is smaller than v3 */
1380         lum_size = sizeof(struct lov_user_md_v1);
1381         if (copy_from_user(lumv1, lumv1p, lum_size))
1382                 RETURN(-EFAULT);
1383
1384         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1385                 lum_size = sizeof(struct lov_user_md_v3);
1386                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1387                         RETURN(-EFAULT);
1388         }
1389
1390         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1391         if (rc == 0) {
1392                  put_user(0, &lumv1p->lmm_stripe_count);
1393                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1394                                     0, ll_i2info(inode)->lli_smd,
1395                                     (void *)arg);
1396         }
1397         RETURN(rc);
1398 }
1399
1400 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1401 {
1402         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1403
1404         if (!lsm)
1405                 RETURN(-ENODATA);
1406
1407         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1408                             (void *)arg);
1409 }
1410
1411 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1412 {
1413         struct ll_inode_info   *lli = ll_i2info(inode);
1414         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1415         struct ccc_grouplock    grouplock;
1416         int                     rc;
1417         ENTRY;
1418
1419         spin_lock(&lli->lli_lock);
1420         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1421                 CERROR("group lock already existed with gid %lu\n",
1422                        fd->fd_grouplock.cg_gid);
1423                 spin_unlock(&lli->lli_lock);
1424                 RETURN(-EINVAL);
1425         }
1426         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1427         spin_unlock(&lli->lli_lock);
1428
1429         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1430                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1431         if (rc)
1432                 RETURN(rc);
1433
1434         spin_lock(&lli->lli_lock);
1435         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1436                 spin_unlock(&lli->lli_lock);
1437                 CERROR("another thread just won the race\n");
1438                 cl_put_grouplock(&grouplock);
1439                 RETURN(-EINVAL);
1440         }
1441
1442         fd->fd_flags |= (LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1443         fd->fd_grouplock = grouplock;
1444         spin_unlock(&lli->lli_lock);
1445
1446         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1447         RETURN(0);
1448 }
1449
1450 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1451 {
1452         struct ll_inode_info   *lli = ll_i2info(inode);
1453         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1454         struct ccc_grouplock    grouplock;
1455         ENTRY;
1456
1457         spin_lock(&lli->lli_lock);
1458         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1459                 spin_unlock(&lli->lli_lock);
1460                 CERROR("no group lock held\n");
1461                 RETURN(-EINVAL);
1462         }
1463         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1464
1465         if (fd->fd_grouplock.cg_gid != arg) {
1466                 CERROR("group lock %lu doesn't match current id %lu\n",
1467                        arg, fd->fd_grouplock.cg_gid);
1468                 spin_unlock(&lli->lli_lock);
1469                 RETURN(-EINVAL);
1470         }
1471
1472         grouplock = fd->fd_grouplock;
1473         fd->fd_grouplock.cg_env = NULL;
1474         fd->fd_grouplock.cg_lock = NULL;
1475         fd->fd_grouplock.cg_gid = 0;
1476         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1477         spin_unlock(&lli->lli_lock);
1478
1479         cl_put_grouplock(&grouplock);
1480         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1481         RETURN(0);
1482 }
1483
1484 #if LUSTRE_FIX >= 50
1485 static int join_sanity_check(struct inode *head, struct inode *tail)
1486 {
1487         ENTRY;
1488         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1489                 CERROR("server do not support join \n");
1490                 RETURN(-EINVAL);
1491         }
1492         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1493                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1494                        head->i_ino, tail->i_ino);
1495                 RETURN(-EINVAL);
1496         }
1497         if (head->i_ino == tail->i_ino) {
1498                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1499                 RETURN(-EINVAL);
1500         }
1501         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1502                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1503                 RETURN(-EINVAL);
1504         }
1505         RETURN(0);
1506 }
1507
1508 static int join_file(struct inode *head_inode, struct file *head_filp,
1509                      struct file *tail_filp)
1510 {
1511         struct dentry *tail_dentry = tail_filp->f_dentry;
1512         struct lookup_intent oit = {.it_op = IT_OPEN,
1513                                     .it_flags = head_filp->f_flags,
1514                                     .it_create_mode = M_JOIN_FILE};
1515         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1516                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1517
1518         struct lustre_handle lockh;
1519         struct md_op_data *op_data;
1520         int    rc;
1521         loff_t data;
1522         ENTRY;
1523
1524         tail_dentry = tail_filp->f_dentry;
1525
1526         data = i_size_read(head_inode);
1527         op_data = ll_prep_md_op_data(NULL, head_inode,
1528                                      tail_dentry->d_parent->d_inode,
1529                                      tail_dentry->d_name.name,
1530                                      tail_dentry->d_name.len, 0,
1531                                      LUSTRE_OPC_ANY, &data);
1532         if (IS_ERR(op_data))
1533                 RETURN(PTR_ERR(op_data));
1534
1535         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1536                          op_data, &lockh, NULL, 0, NULL, 0);
1537
1538         ll_finish_md_op_data(op_data);
1539         if (rc < 0)
1540                 GOTO(out, rc);
1541
1542         rc = oit.d.lustre.it_status;
1543
1544         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1545                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1546                 ptlrpc_req_finished((struct ptlrpc_request *)
1547                                     oit.d.lustre.it_data);
1548                 GOTO(out, rc);
1549         }
1550
1551         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1552                                            * away */
1553                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1554                 oit.d.lustre.it_lock_mode = 0;
1555         }
1556         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1557         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1558         ll_release_openhandle(head_filp->f_dentry, &oit);
1559 out:
1560         ll_intent_release(&oit);
1561         RETURN(rc);
1562 }
1563
1564 static int ll_file_join(struct inode *head, struct file *filp,
1565                         char *filename_tail)
1566 {
1567         struct inode *tail = NULL, *first = NULL, *second = NULL;
1568         struct dentry *tail_dentry;
1569         struct file *tail_filp, *first_filp, *second_filp;
1570         struct ll_lock_tree first_tree, second_tree;
1571         struct ll_lock_tree_node *first_node, *second_node;
1572         struct ll_inode_info *hlli = ll_i2info(head);
1573         int rc = 0, cleanup_phase = 0;
1574         ENTRY;
1575
1576         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1577                head->i_ino, head->i_generation, head, filename_tail);
1578
1579         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1580         if (IS_ERR(tail_filp)) {
1581                 CERROR("Can not open tail file %s", filename_tail);
1582                 rc = PTR_ERR(tail_filp);
1583                 GOTO(cleanup, rc);
1584         }
1585         tail = igrab(tail_filp->f_dentry->d_inode);
1586
1587         tail_dentry = tail_filp->f_dentry;
1588         LASSERT(tail_dentry);
1589         cleanup_phase = 1;
1590
1591         /*reorder the inode for lock sequence*/
1592         first = head->i_ino > tail->i_ino ? head : tail;
1593         second = head->i_ino > tail->i_ino ? tail : head;
1594         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1595         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1596
1597         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1598                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1599         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1600         if (IS_ERR(first_node)){
1601                 rc = PTR_ERR(first_node);
1602                 GOTO(cleanup, rc);
1603         }
1604         first_tree.lt_fd = first_filp->private_data;
1605         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1606         if (rc != 0)
1607                 GOTO(cleanup, rc);
1608         cleanup_phase = 2;
1609
1610         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1611         if (IS_ERR(second_node)){
1612                 rc = PTR_ERR(second_node);
1613                 GOTO(cleanup, rc);
1614         }
1615         second_tree.lt_fd = second_filp->private_data;
1616         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1617         if (rc != 0)
1618                 GOTO(cleanup, rc);
1619         cleanup_phase = 3;
1620
1621         rc = join_sanity_check(head, tail);
1622         if (rc)
1623                 GOTO(cleanup, rc);
1624
1625         rc = join_file(head, filp, tail_filp);
1626         if (rc)
1627                 GOTO(cleanup, rc);
1628 cleanup:
1629         switch (cleanup_phase) {
1630         case 3:
1631                 ll_tree_unlock(&second_tree);
1632                 obd_cancel_unused(ll_i2dtexp(second),
1633                                   ll_i2info(second)->lli_smd, 0, NULL);
1634         case 2:
1635                 ll_tree_unlock(&first_tree);
1636                 obd_cancel_unused(ll_i2dtexp(first),
1637                                   ll_i2info(first)->lli_smd, 0, NULL);
1638         case 1:
1639                 filp_close(tail_filp, 0);
1640                 if (tail)
1641                         iput(tail);
1642                 if (head && rc == 0) {
1643                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1644                                        &hlli->lli_smd);
1645                         hlli->lli_smd = NULL;
1646                 }
1647         case 0:
1648                 break;
1649         default:
1650                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1651                 LBUG();
1652         }
1653         RETURN(rc);
1654 }
1655 #endif /* LUSTRE_FIX >= 50 */
1656
1657 /**
1658  * Close inode open handle
1659  *
1660  * \param dentry [in]     dentry which contains the inode
1661  * \param it     [in,out] intent which contains open info and result
1662  *
1663  * \retval 0     success
1664  * \retval <0    failure
1665  */
1666 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1667 {
1668         struct inode *inode = dentry->d_inode;
1669         struct obd_client_handle *och;
1670         int rc;
1671         ENTRY;
1672
1673         LASSERT(inode);
1674
1675         /* Root ? Do nothing. */
1676         if (dentry->d_inode->i_sb->s_root == dentry)
1677                 RETURN(0);
1678
1679         /* No open handle to close? Move away */
1680         if (!it_disposition(it, DISP_OPEN_OPEN))
1681                 RETURN(0);
1682
1683         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1684
1685         OBD_ALLOC(och, sizeof(*och));
1686         if (!och)
1687                 GOTO(out, rc = -ENOMEM);
1688
1689         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1690                     ll_i2info(inode), it, och);
1691
1692         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1693                                        inode, och);
1694  out:
1695         /* this one is in place of ll_file_open */
1696         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1697                 ptlrpc_req_finished(it->d.lustre.it_data);
1698         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1699         RETURN(rc);
1700 }
1701
1702 /**
1703  * Get size for inode for which FIEMAP mapping is requested.
1704  * Make the FIEMAP get_info call and returns the result.
1705  */
1706 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1707               int num_bytes)
1708 {
1709         struct obd_export *exp = ll_i2dtexp(inode);
1710         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1711         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1712         int vallen = num_bytes;
1713         int rc;
1714         ENTRY;
1715
1716         /* If the stripe_count > 1 and the application does not understand
1717          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1718          */
1719         if (lsm->lsm_stripe_count > 1 &&
1720             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1721                 return -EOPNOTSUPP;
1722
1723         fm_key.oa.o_id = lsm->lsm_object_id;
1724         fm_key.oa.o_gr = lsm->lsm_object_gr;
1725         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1726
1727         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1728                         OBD_MD_FLSIZE);
1729
1730         /* If filesize is 0, then there would be no objects for mapping */
1731         if (fm_key.oa.o_size == 0) {
1732                 fiemap->fm_mapped_extents = 0;
1733                 RETURN(0);
1734         }
1735
1736         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1737
1738         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1739         if (rc)
1740                 CERROR("obd_get_info failed: rc = %d\n", rc);
1741
1742         RETURN(rc);
1743 }
1744
1745 int ll_fid2path(struct obd_export *exp, void *arg)
1746 {
1747         struct getinfo_fid2path *gfout, *gfin;
1748         int outsize, rc;
1749         ENTRY;
1750
1751         /* Need to get the buflen */
1752         OBD_ALLOC_PTR(gfin);
1753         if (gfin == NULL)
1754                 RETURN(-ENOMEM);
1755         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1756                 OBD_FREE_PTR(gfin);
1757                 RETURN(-EFAULT);
1758         }
1759
1760         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1761         OBD_ALLOC(gfout, outsize);
1762         if (gfout == NULL) {
1763                 OBD_FREE_PTR(gfin);
1764                 RETURN(-ENOMEM);
1765         }
1766         memcpy(gfout, gfin, sizeof(*gfout));
1767         OBD_FREE_PTR(gfin);
1768
1769         /* Call mdc_iocontrol */
1770         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1771         if (rc)
1772                 GOTO(gf_free, rc);
1773         if (copy_to_user(arg, gfout, outsize))
1774                 rc = -EFAULT;
1775
1776 gf_free:
1777         OBD_FREE(gfout, outsize);
1778         RETURN(rc);
1779 }
1780
1781 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1782                   unsigned long arg)
1783 {
1784         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1785         int flags;
1786         ENTRY;
1787
1788         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1789                inode->i_generation, inode, cmd);
1790         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1791
1792         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1793         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1794                 RETURN(-ENOTTY);
1795
1796         switch(cmd) {
1797         case LL_IOC_GETFLAGS:
1798                 /* Get the current value of the file flags */
1799                 return put_user(fd->fd_flags, (int *)arg);
1800         case LL_IOC_SETFLAGS:
1801         case LL_IOC_CLRFLAGS:
1802                 /* Set or clear specific file flags */
1803                 /* XXX This probably needs checks to ensure the flags are
1804                  *     not abused, and to handle any flag side effects.
1805                  */
1806                 if (get_user(flags, (int *) arg))
1807                         RETURN(-EFAULT);
1808
1809                 if (cmd == LL_IOC_SETFLAGS) {
1810                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1811                             !(file->f_flags & O_DIRECT)) {
1812                                 CERROR("%s: unable to disable locking on "
1813                                        "non-O_DIRECT file\n", current->comm);
1814                                 RETURN(-EINVAL);
1815                         }
1816
1817                         fd->fd_flags |= flags;
1818                 } else {
1819                         fd->fd_flags &= ~flags;
1820                 }
1821                 RETURN(0);
1822         case LL_IOC_LOV_SETSTRIPE:
1823                 RETURN(ll_lov_setstripe(inode, file, arg));
1824         case LL_IOC_LOV_SETEA:
1825                 RETURN(ll_lov_setea(inode, file, arg));
1826         case LL_IOC_LOV_GETSTRIPE:
1827                 RETURN(ll_lov_getstripe(inode, arg));
1828         case LL_IOC_RECREATE_OBJ:
1829                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1830         case FSFILT_IOC_FIEMAP: {
1831                 struct ll_user_fiemap *fiemap_s;
1832                 size_t num_bytes, ret_bytes;
1833                 unsigned int extent_count;
1834                 int rc = 0;
1835
1836                 /* Get the extent count so we can calculate the size of
1837                  * required fiemap buffer */
1838                 if (get_user(extent_count,
1839                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1840                         RETURN(-EFAULT);
1841                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1842                                                  sizeof(struct ll_fiemap_extent));
1843                 OBD_VMALLOC(fiemap_s, num_bytes);
1844                 if (fiemap_s == NULL)
1845                         RETURN(-ENOMEM);
1846
1847                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1848                                    sizeof(*fiemap_s)))
1849                         GOTO(error, rc = -EFAULT);
1850
1851                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1852                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1853                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1854                         if (copy_to_user((char *)arg, fiemap_s,
1855                                          sizeof(*fiemap_s)))
1856                                 GOTO(error, rc = -EFAULT);
1857
1858                         GOTO(error, rc = -EBADR);
1859                 }
1860
1861                 /* If fm_extent_count is non-zero, read the first extent since
1862                  * it is used to calculate end_offset and device from previous
1863                  * fiemap call. */
1864                 if (extent_count) {
1865                         if (copy_from_user(&fiemap_s->fm_extents[0],
1866                             (char __user *)arg + sizeof(*fiemap_s),
1867                             sizeof(struct ll_fiemap_extent)))
1868                                 GOTO(error, rc = -EFAULT);
1869                 }
1870
1871                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1872                         int rc;
1873
1874                         rc = filemap_fdatawrite(inode->i_mapping);
1875                         if (rc)
1876                                 GOTO(error, rc);
1877                 }
1878
1879                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1880                 if (rc)
1881                         GOTO(error, rc);
1882
1883                 ret_bytes = sizeof(struct ll_user_fiemap);
1884
1885                 if (extent_count != 0)
1886                         ret_bytes += (fiemap_s->fm_mapped_extents *
1887                                          sizeof(struct ll_fiemap_extent));
1888
1889                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1890                         rc = -EFAULT;
1891
1892 error:
1893                 OBD_VFREE(fiemap_s, num_bytes);
1894                 RETURN(rc);
1895         }
1896         case FSFILT_IOC_GETFLAGS:
1897         case FSFILT_IOC_SETFLAGS:
1898                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1899         case FSFILT_IOC_GETVERSION_OLD:
1900         case FSFILT_IOC_GETVERSION:
1901                 RETURN(put_user(inode->i_generation, (int *)arg));
1902         case LL_IOC_JOIN: {
1903 #if LUSTRE_FIX >= 50
1904                 /* Allow file join in beta builds to allow debuggging */
1905                 char *ftail;
1906                 int rc;
1907
1908                 ftail = getname((const char *)arg);
1909                 if (IS_ERR(ftail))
1910                         RETURN(PTR_ERR(ftail));
1911                 rc = ll_file_join(inode, file, ftail);
1912                 putname(ftail);
1913                 RETURN(rc);
1914 #else
1915                 CWARN("file join is not supported in this version of Lustre\n");
1916                 RETURN(-ENOTTY);
1917 #endif
1918         }
1919         case LL_IOC_GROUP_LOCK:
1920                 RETURN(ll_get_grouplock(inode, file, arg));
1921         case LL_IOC_GROUP_UNLOCK:
1922                 RETURN(ll_put_grouplock(inode, file, arg));
1923         case IOC_OBD_STATFS:
1924                 RETURN(ll_obd_statfs(inode, (void *)arg));
1925
1926         /* We need to special case any other ioctls we want to handle,
1927          * to send them to the MDS/OST as appropriate and to properly
1928          * network encode the arg field.
1929         case FSFILT_IOC_SETVERSION_OLD:
1930         case FSFILT_IOC_SETVERSION:
1931         */
1932         case LL_IOC_FLUSHCTX:
1933                 RETURN(ll_flush_ctx(inode));
1934         case LL_IOC_PATH2FID: {
1935                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1936                                  sizeof(struct lu_fid)))
1937                         RETURN(-EFAULT);
1938
1939                 RETURN(0);
1940         }
1941         case OBD_IOC_FID2PATH:
1942                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1943
1944         default: {
1945                 int err;
1946
1947                 if (LLIOC_STOP ==
1948                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1949                         RETURN(err);
1950
1951                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1952                                      (void *)arg));
1953         }
1954         }
1955 }
1956
1957 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1958 {
1959         struct inode *inode = file->f_dentry->d_inode;
1960         loff_t retval;
1961         ENTRY;
1962         retval = offset + ((origin == 2) ? i_size_read(inode) :
1963                            (origin == 1) ? file->f_pos : 0);
1964         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1965                inode->i_ino, inode->i_generation, inode, retval, retval,
1966                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1967         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1968
1969         if (origin == 2) { /* SEEK_END */
1970                 int nonblock = 0, rc;
1971
1972                 if (file->f_flags & O_NONBLOCK)
1973                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1974
1975                 rc = cl_glimpse_size(inode);
1976                 if (rc != 0)
1977                         RETURN(rc);
1978
1979                 ll_inode_size_lock(inode, 0);
1980                 offset += i_size_read(inode);
1981                 ll_inode_size_unlock(inode, 0);
1982         } else if (origin == 1) { /* SEEK_CUR */
1983                 offset += file->f_pos;
1984         }
1985
1986         retval = -EINVAL;
1987         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1988                 if (offset != file->f_pos) {
1989                         file->f_pos = offset;
1990                 }
1991                 retval = offset;
1992         }
1993
1994         RETURN(retval);
1995 }
1996
1997 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1998 {
1999         struct inode *inode = dentry->d_inode;
2000         struct ll_inode_info *lli = ll_i2info(inode);
2001         struct lov_stripe_md *lsm = lli->lli_smd;
2002         struct ptlrpc_request *req;
2003         struct obd_capa *oc;
2004         int rc, err;
2005         ENTRY;
2006         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2007                inode->i_generation, inode);
2008         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2009
2010         /* fsync's caller has already called _fdata{sync,write}, we want
2011          * that IO to finish before calling the osc and mdc sync methods */
2012         rc = filemap_fdatawait(inode->i_mapping);
2013
2014         /* catch async errors that were recorded back when async writeback
2015          * failed for pages in this mapping. */
2016         err = lli->lli_async_rc;
2017         lli->lli_async_rc = 0;
2018         if (rc == 0)
2019                 rc = err;
2020         if (lsm) {
2021                 err = lov_test_and_clear_async_rc(lsm);
2022                 if (rc == 0)
2023                         rc = err;
2024         }
2025
2026         oc = ll_mdscapa_get(inode);
2027         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2028                       &req);
2029         capa_put(oc);
2030         if (!rc)
2031                 rc = err;
2032         if (!err)
2033                 ptlrpc_req_finished(req);
2034
2035         if (data && lsm) {
2036                 struct obdo *oa;
2037
2038                 OBDO_ALLOC(oa);
2039                 if (!oa)
2040                         RETURN(rc ? rc : -ENOMEM);
2041
2042                 oa->o_id = lsm->lsm_object_id;
2043                 oa->o_gr = lsm->lsm_object_gr;
2044                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2045                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2046                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2047                                            OBD_MD_FLGROUP);
2048
2049                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2050                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2051                                0, OBD_OBJECT_EOF, oc);
2052                 capa_put(oc);
2053                 if (!rc)
2054                         rc = err;
2055                 OBDO_FREE(oa);
2056         }
2057
2058         RETURN(rc);
2059 }
2060
2061 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2062 {
2063         struct inode *inode = file->f_dentry->d_inode;
2064         struct ll_sb_info *sbi = ll_i2sbi(inode);
2065         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2066                                            .ei_cb_cp =ldlm_flock_completion_ast,
2067                                            .ei_cbdata = file_lock };
2068         struct md_op_data *op_data;
2069         struct lustre_handle lockh = {0};
2070         ldlm_policy_data_t flock;
2071         int flags = 0;
2072         int rc;
2073         ENTRY;
2074
2075         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2076                inode->i_ino, file_lock);
2077
2078         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2079
2080         if (file_lock->fl_flags & FL_FLOCK) {
2081                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2082                 /* set missing params for flock() calls */
2083                 file_lock->fl_end = OFFSET_MAX;
2084                 file_lock->fl_pid = current->tgid;
2085         }
2086         flock.l_flock.pid = file_lock->fl_pid;
2087         flock.l_flock.start = file_lock->fl_start;
2088         flock.l_flock.end = file_lock->fl_end;
2089
2090         switch (file_lock->fl_type) {
2091         case F_RDLCK:
2092                 einfo.ei_mode = LCK_PR;
2093                 break;
2094         case F_UNLCK:
2095                 /* An unlock request may or may not have any relation to
2096                  * existing locks so we may not be able to pass a lock handle
2097                  * via a normal ldlm_lock_cancel() request. The request may even
2098                  * unlock a byte range in the middle of an existing lock. In
2099                  * order to process an unlock request we need all of the same
2100                  * information that is given with a normal read or write record
2101                  * lock request. To avoid creating another ldlm unlock (cancel)
2102                  * message we'll treat a LCK_NL flock request as an unlock. */
2103                 einfo.ei_mode = LCK_NL;
2104                 break;
2105         case F_WRLCK:
2106                 einfo.ei_mode = LCK_PW;
2107                 break;
2108         default:
2109                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2110                 RETURN (-EINVAL);
2111         }
2112
2113         switch (cmd) {
2114         case F_SETLKW:
2115 #ifdef F_SETLKW64
2116         case F_SETLKW64:
2117 #endif
2118                 flags = 0;
2119                 break;
2120         case F_SETLK:
2121 #ifdef F_SETLK64
2122         case F_SETLK64:
2123 #endif
2124                 flags = LDLM_FL_BLOCK_NOWAIT;
2125                 break;
2126         case F_GETLK:
2127 #ifdef F_GETLK64
2128         case F_GETLK64:
2129 #endif
2130                 flags = LDLM_FL_TEST_LOCK;
2131                 /* Save the old mode so that if the mode in the lock changes we
2132                  * can decrement the appropriate reader or writer refcount. */
2133                 file_lock->fl_type = einfo.ei_mode;
2134                 break;
2135         default:
2136                 CERROR("unknown fcntl lock command: %d\n", cmd);
2137                 RETURN (-EINVAL);
2138         }
2139
2140         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2141                                      LUSTRE_OPC_ANY, NULL);
2142         if (IS_ERR(op_data))
2143                 RETURN(PTR_ERR(op_data));
2144
2145         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2146                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2147                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2148
2149         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2150                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2151
2152         ll_finish_md_op_data(op_data);
2153
2154         if ((file_lock->fl_flags & FL_FLOCK) &&
2155             (rc == 0 || file_lock->fl_type == F_UNLCK))
2156                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2157 #ifdef HAVE_F_OP_FLOCK
2158         if ((file_lock->fl_flags & FL_POSIX) &&
2159             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2160             !(flags & LDLM_FL_TEST_LOCK))
2161                 posix_lock_file_wait(file, file_lock);
2162 #endif
2163
2164         RETURN(rc);
2165 }
2166
2167 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2168 {
2169         ENTRY;
2170
2171         RETURN(-ENOSYS);
2172 }
2173
2174 int ll_have_md_lock(struct inode *inode, __u64 bits)
2175 {
2176         struct lustre_handle lockh;
2177         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2178         struct lu_fid *fid;
2179         int flags;
2180         ENTRY;
2181
2182         if (!inode)
2183                RETURN(0);
2184
2185         fid = &ll_i2info(inode)->lli_fid;
2186         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2187
2188         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2189         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2190                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2191                 RETURN(1);
2192         }
2193         RETURN(0);
2194 }
2195
2196 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2197                             struct lustre_handle *lockh)
2198 {
2199         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2200         struct lu_fid *fid;
2201         ldlm_mode_t rc;
2202         int flags;
2203         ENTRY;
2204
2205         fid = &ll_i2info(inode)->lli_fid;
2206         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2207
2208         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2209         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2210                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2211         RETURN(rc);
2212 }
2213
2214 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2215         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2216                               * and return success */
2217                 inode->i_nlink = 0;
2218                 /* This path cannot be hit for regular files unless in
2219                  * case of obscure races, so no need to to validate
2220                  * size. */
2221                 if (!S_ISREG(inode->i_mode) &&
2222                     !S_ISDIR(inode->i_mode))
2223                         return 0;
2224         }
2225
2226         if (rc) {
2227                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2228                 return -abs(rc);
2229
2230         }
2231
2232         return 0;
2233 }
2234
2235 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2236                              __u64 ibits)
2237 {
2238         struct inode *inode = dentry->d_inode;
2239         struct ptlrpc_request *req = NULL;
2240         struct ll_sb_info *sbi;
2241         struct obd_export *exp;
2242         int rc = 0;
2243         ENTRY;
2244
2245         if (!inode) {
2246                 CERROR("REPORT THIS LINE TO PETER\n");
2247                 RETURN(0);
2248         }
2249         sbi = ll_i2sbi(inode);
2250
2251         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2252                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2253
2254         exp = ll_i2mdexp(inode);
2255
2256         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2257                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2258                 struct md_op_data *op_data;
2259
2260                 /* Call getattr by fid, so do not provide name at all. */
2261                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2262                                              dentry->d_inode, NULL, 0, 0,
2263                                              LUSTRE_OPC_ANY, NULL);
2264                 if (IS_ERR(op_data))
2265                         RETURN(PTR_ERR(op_data));
2266
2267                 oit.it_create_mode |= M_CHECK_STALE;
2268                 rc = md_intent_lock(exp, op_data, NULL, 0,
2269                                     /* we are not interested in name
2270                                        based lookup */
2271                                     &oit, 0, &req,
2272                                     ll_md_blocking_ast, 0);
2273                 ll_finish_md_op_data(op_data);
2274                 oit.it_create_mode &= ~M_CHECK_STALE;
2275                 if (rc < 0) {
2276                         rc = ll_inode_revalidate_fini(inode, rc);
2277                         GOTO (out, rc);
2278                 }
2279
2280                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2281                 if (rc != 0) {
2282                         ll_intent_release(&oit);
2283                         GOTO(out, rc);
2284                 }
2285
2286                 /* Unlinked? Unhash dentry, so it is not picked up later by
2287                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2288                    here to preserve get_cwd functionality on 2.6.
2289                    Bug 10503 */
2290                 if (!dentry->d_inode->i_nlink) {
2291                         spin_lock(&ll_lookup_lock);
2292                         spin_lock(&dcache_lock);
2293                         ll_drop_dentry(dentry);
2294                         spin_unlock(&dcache_lock);
2295                         spin_unlock(&ll_lookup_lock);
2296                 }
2297
2298                 ll_lookup_finish_locks(&oit, dentry);
2299         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2300
2301                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2302                 obd_valid valid = OBD_MD_FLGETATTR;
2303                 struct obd_capa *oc;
2304                 int ealen = 0;
2305
2306                 if (S_ISREG(inode->i_mode)) {
2307                         rc = ll_get_max_mdsize(sbi, &ealen);
2308                         if (rc)
2309                                 RETURN(rc);
2310                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2311                 }
2312                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2313                  * capa for this inode. Because we only keep capas of dirs
2314                  * fresh. */
2315                 oc = ll_mdscapa_get(inode);
2316                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2317                                 ealen, &req);
2318                 capa_put(oc);
2319                 if (rc) {
2320                         rc = ll_inode_revalidate_fini(inode, rc);
2321                         RETURN(rc);
2322                 }
2323
2324                 rc = ll_prep_inode(&inode, req, NULL);
2325         }
2326 out:
2327         ptlrpc_req_finished(req);
2328         return rc;
2329 }
2330
2331 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2332 {
2333         int rc;
2334         ENTRY;
2335
2336         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2337                                                   MDS_INODELOCK_LOOKUP);
2338
2339         /* if object not yet allocated, don't validate size */
2340         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2341                 RETURN(0);
2342
2343         /* cl_glimpse_size will prefer locally cached writes if they extend
2344          * the file */
2345
2346         if (rc == 0)
2347                 rc = cl_glimpse_size(dentry->d_inode);
2348
2349         RETURN(rc);
2350 }
2351
2352 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2353                   struct lookup_intent *it, struct kstat *stat)
2354 {
2355         struct inode *inode = de->d_inode;
2356         int res = 0;
2357
2358         res = ll_inode_revalidate_it(de, it);
2359         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2360
2361         if (res)
2362                 return res;
2363
2364         stat->dev = inode->i_sb->s_dev;
2365         stat->ino = inode->i_ino;
2366         stat->mode = inode->i_mode;
2367         stat->nlink = inode->i_nlink;
2368         stat->uid = inode->i_uid;
2369         stat->gid = inode->i_gid;
2370         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2371         stat->atime = inode->i_atime;
2372         stat->mtime = inode->i_mtime;
2373         stat->ctime = inode->i_ctime;
2374 #ifdef HAVE_INODE_BLKSIZE
2375         stat->blksize = inode->i_blksize;
2376 #else
2377         stat->blksize = 1 << inode->i_blkbits;
2378 #endif
2379
2380         ll_inode_size_lock(inode, 0);
2381         stat->size = i_size_read(inode);
2382         stat->blocks = inode->i_blocks;
2383         ll_inode_size_unlock(inode, 0);
2384
2385         return 0;
2386 }
2387 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2388 {
2389         struct lookup_intent it = { .it_op = IT_GETATTR };
2390
2391         return ll_getattr_it(mnt, de, &it, stat);
2392 }
2393
2394 static
2395 int lustre_check_acl(struct inode *inode, int mask)
2396 {
2397 #ifdef CONFIG_FS_POSIX_ACL
2398         struct ll_inode_info *lli = ll_i2info(inode);
2399         struct posix_acl *acl;
2400         int rc;
2401         ENTRY;
2402
2403         spin_lock(&lli->lli_lock);
2404         acl = posix_acl_dup(lli->lli_posix_acl);
2405         spin_unlock(&lli->lli_lock);
2406
2407         if (!acl)
2408                 RETURN(-EAGAIN);
2409
2410         rc = posix_acl_permission(inode, acl, mask);
2411         posix_acl_release(acl);
2412
2413         RETURN(rc);
2414 #else
2415         return -EAGAIN;
2416 #endif
2417 }
2418
2419 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2420 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2421 {
2422         int rc = 0;
2423         ENTRY;
2424
2425        /* as root inode are NOT getting validated in lookup operation,
2426         * need to do it before permission check. */
2427
2428         if (inode == inode->i_sb->s_root->d_inode) {
2429                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2430
2431                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2432                                               MDS_INODELOCK_LOOKUP);
2433                 if (rc)
2434                         RETURN(rc);
2435         }
2436
2437         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2438                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2439
2440         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2441                 return lustre_check_remote_perm(inode, mask);
2442
2443         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2444         rc = generic_permission(inode, mask, lustre_check_acl);
2445
2446         RETURN(rc);
2447 }
2448 #else
/*
 * Permission check used on kernels older than 2.6.10, where the decision is
 * made by hand instead of being delegated to generic_permission().
 *
 * @inode: inode being accessed
 * @mask:  requested access bits (MAY_READ / MAY_WRITE / MAY_EXEC)
 * @nd:    nameidata passed in by the VFS; not used here
 *
 * Returns 0 when access is granted, -EROFS for a write to a read-only
 * regular file, directory or symlink, -EACCES when access is denied, or
 * whatever lustre_check_remote_perm() / lustre_check_acl() returned.
 */
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
{
        int mode = inode->i_mode;
        int rc;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
               inode->i_ino, inode->i_generation, inode, mask);

        /* Remote clients are checked by a separate routine; note that this
         * path returns before the stats counter below is bumped. */
        if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
                return lustre_check_remote_perm(inode, mask);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);

        /* Writes are refused outright on read-only and immutable inodes. */
        if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
            (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
                return -EROFS;
        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
                return -EACCES;
        if (current->fsuid == inode->i_uid) {
                /* Owner: move the owner permission bits into the low three
                 * bits, which are the ones tested against S_IRWXO below. */
                mode >>= 6;
        } else if (1) {
                /* Always taken for non-owners.  The ACL is consulted only
                 * when the group bits already cover the whole mask;
                 * otherwise fall through to the plain group/other test. */
                if (((mode >> 3) & mask & S_IRWXO) != mask)
                        goto check_groups;
                rc = lustre_check_acl(inode, mask);
                /* -EAGAIN: no ACL answer, use the mode bits instead. */
                if (rc == -EAGAIN)
                        goto check_groups;
                /* -EACCES: denied by the ACL, capabilities may still
                 * override. */
                if (rc == -EACCES)
                        goto check_capabilities;
                return rc;
        } else {
                /* This branch is reachable only through the check_groups
                 * label, since the condition above is constant. */
check_groups:
                if (in_group_p(inode->i_gid))
                        mode >>= 3;
        }
        /* Low three bits now hold the owner, group or other permissions,
         * depending on the path taken above. */
        if ((mode & mask & S_IRWXO) == mask)
                return 0;

check_capabilities:
        /* CFS_CAP_DAC_OVERRIDE grants everything except executing a
         * non-directory that has no execute bit set at all. */
        if (!(mask & MAY_EXEC) ||
            (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
                if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
                        return 0;

        /* CFS_CAP_DAC_READ_SEARCH grants pure reads, and any non-write
         * access to directories. */
        if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
            (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
                return 0;

        return -EACCES;
}
2498 #endif
2499
/*
 * Compile-time choice of the second read/write entry point.  With
 * HAVE_FILE_READV the readv/writev members are filled with
 * ll_file_readv/ll_file_writev; otherwise the aio_read/aio_write members
 * are filled with ll_file_aio_read/ll_file_aio_write.  The *_METHOD macros
 * expand to the member name and the *_FUNCTION macros to the handler, so
 * the file_operations tables can be written once for both cases.
 */
#ifdef HAVE_FILE_READV
#define READ_METHOD readv
#define READ_FUNCTION ll_file_readv
#define WRITE_METHOD writev
#define WRITE_FUNCTION ll_file_writev
#else
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write
#endif
2511
2512 /* -o localflock - only provides locally consistent flock locks */
2513 struct file_operations ll_file_operations = {
2514         .read           = ll_file_read,
2515         .READ_METHOD    = READ_FUNCTION,
2516         .write          = ll_file_write,
2517         .WRITE_METHOD   = WRITE_FUNCTION,
2518         .ioctl          = ll_file_ioctl,
2519         .open           = ll_file_open,
2520         .release        = ll_file_release,
2521         .mmap           = ll_file_mmap,
2522         .llseek         = ll_file_seek,
2523         .sendfile       = ll_file_sendfile,
2524         .fsync          = ll_fsync,
2525 };
2526
2527 struct file_operations ll_file_operations_flock = {
2528         .read           = ll_file_read,
2529         .READ_METHOD    = READ_FUNCTION,
2530         .write          = ll_file_write,
2531         .WRITE_METHOD   = WRITE_FUNCTION,
2532         .ioctl          = ll_file_ioctl,
2533         .open           = ll_file_open,
2534         .release        = ll_file_release,
2535         .mmap           = ll_file_mmap,
2536         .llseek         = ll_file_seek,
2537         .sendfile       = ll_file_sendfile,
2538         .fsync          = ll_fsync,
2539 #ifdef HAVE_F_OP_FLOCK
2540         .flock          = ll_file_flock,
2541 #endif
2542         .lock           = ll_file_flock
2543 };
2544
2545 /* These are for -o noflock - to return ENOSYS on flock calls */
2546 struct file_operations ll_file_operations_noflock = {
2547         .read           = ll_file_read,
2548         .READ_METHOD    = READ_FUNCTION,
2549         .write          = ll_file_write,
2550         .WRITE_METHOD   = WRITE_FUNCTION,
2551         .ioctl          = ll_file_ioctl,
2552         .open           = ll_file_open,
2553         .release        = ll_file_release,
2554         .mmap           = ll_file_mmap,
2555         .llseek         = ll_file_seek,
2556         .sendfile       = ll_file_sendfile,
2557         .fsync          = ll_fsync,
2558 #ifdef HAVE_F_OP_FLOCK
2559         .flock          = ll_file_noflock,
2560 #endif
2561         .lock           = ll_file_noflock
2562 };
2563
2564 struct inode_operations ll_file_inode_operations = {
2565 #ifdef HAVE_VFS_INTENT_PATCHES
2566         .setattr_raw    = ll_setattr_raw,
2567 #endif
2568         .setattr        = ll_setattr,
2569         .truncate       = ll_truncate,
2570         .getattr        = ll_getattr,
2571         .permission     = ll_inode_permission,
2572         .setxattr       = ll_setxattr,
2573         .getxattr       = ll_getxattr,
2574         .listxattr      = ll_listxattr,
2575         .removexattr    = ll_removexattr,
2576 };
2577
2578 /* dynamic ioctl number support routins */
2579 static struct llioc_ctl_data {
2580         struct rw_semaphore ioc_sem;
2581         struct list_head    ioc_head;
2582 } llioc = {
2583         __RWSEM_INITIALIZER(llioc.ioc_sem),
2584         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2585 };
2586
2587
/*
 * One registered dynamic ioctl handler.  Allocated with room for
 * iocd_count command numbers after the fixed part.
 */
struct llioc_data {
        struct list_head        iocd_list;      /* link in llioc.ioc_head */
        unsigned int            iocd_size;      /* allocation size in bytes,
                                                 * needed again when freeing */
        llioc_callback_t        iocd_cb;        /* handler called on a match */
        unsigned int            iocd_count;     /* entries in iocd_cmd[] */
        unsigned int            iocd_cmd[0];    /* ioctl numbers handled;
                                                 * must stay the last member */
};
2595
2596 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2597 {
2598         unsigned int size;
2599         struct llioc_data *in_data = NULL;
2600         ENTRY;
2601
2602         if (cb == NULL || cmd == NULL ||
2603             count > LLIOC_MAX_CMD || count < 0)
2604                 RETURN(NULL);
2605
2606         size = sizeof(*in_data) + count * sizeof(unsigned int);
2607         OBD_ALLOC(in_data, size);
2608         if (in_data == NULL)
2609                 RETURN(NULL);
2610
2611         memset(in_data, 0, sizeof(*in_data));
2612         in_data->iocd_size = size;
2613         in_data->iocd_cb = cb;
2614         in_data->iocd_count = count;
2615         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2616
2617         down_write(&llioc.ioc_sem);
2618         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2619         up_write(&llioc.ioc_sem);
2620
2621         RETURN(in_data);
2622 }
2623
2624 void ll_iocontrol_unregister(void *magic)
2625 {
2626         struct llioc_data *tmp;
2627
2628         if (magic == NULL)
2629                 return;
2630
2631         down_write(&llioc.ioc_sem);
2632         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2633                 if (tmp == magic) {
2634                         unsigned int size = tmp->iocd_size;
2635
2636                         list_del(&tmp->iocd_list);
2637                         up_write(&llioc.ioc_sem);
2638
2639                         OBD_FREE(tmp, size);
2640                         return;
2641                 }
2642         }
2643         up_write(&llioc.ioc_sem);
2644
2645         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2646 }
2647
/* Make the dynamic ioctl registration API available to other modules. */
EXPORT_SYMBOL(ll_iocontrol_register);
EXPORT_SYMBOL(ll_iocontrol_unregister);
2650
2651 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2652                         unsigned int cmd, unsigned long arg, int *rcp)
2653 {
2654         enum llioc_iter ret = LLIOC_CONT;
2655         struct llioc_data *data;
2656         int rc = -EINVAL, i;
2657
2658         down_read(&llioc.ioc_sem);
2659         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2660                 for (i = 0; i < data->iocd_count; i++) {
2661                         if (cmd != data->iocd_cmd[i])
2662                                 continue;
2663
2664                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2665                         break;
2666                 }
2667
2668                 if (ret == LLIOC_STOP)
2669                         break;
2670         }
2671         up_read(&llioc.ioc_sem);
2672
2673         if (rcp)
2674                 *rcp = rc;
2675         return ret;
2676 }