Whamcloud - gitweb
add 2.6.27 kernel support
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(exp_connect_som(ll_i2mdexp(inode))) || !S_ISREG(inode->i_mode))
96                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
97         else
98                 ll_epoch_close(inode, op_data, &och, 0);
99
100 out:
101         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
102         EXIT;
103 }
104
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
106                                      struct inode *inode,
107                                      struct obd_client_handle *och)
108 {
109         struct obd_export *exp = ll_i2mdexp(inode);
110         struct md_op_data *op_data;
111         struct ptlrpc_request *req = NULL;
112         struct obd_device *obd = class_exp2obd(exp);
113         int epoch_close = 1;
114         int rc;
115         ENTRY;
116
117         if (obd == NULL) {
118                 /*
119                  * XXX: in case of LMV, is this correct to access
120                  * ->exp_handle?
121                  */
122                 CERROR("Invalid MDC connection handle "LPX64"\n",
123                        ll_i2mdexp(inode)->exp_handle.h_cookie);
124                 GOTO(out, rc = 0);
125         }
126
127         /*
128          * here we check if this is forced umount. If so this is called on
129          * canceling "open lock" and we do not call md_close() in this case, as
130          * it will not be successful, as import is already deactivated.
131          */
132         if (obd->obd_force)
133                 GOTO(out, rc = 0);
134
135         OBD_ALLOC_PTR(op_data);
136         if (op_data == NULL)
137                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138
139         ll_prepare_close(inode, op_data, och);
140         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141         rc = md_close(md_exp, op_data, och->och_mod, &req);
142         if (rc == -EAGAIN) {
143                 /* This close must have the epoch closed. */
144                 LASSERT(epoch_close);
145                 /* MDS has instructed us to obtain Size-on-MDS attribute from
146                  * OSTs and send setattr to back to MDS. */
147                 rc = ll_sizeonmds_update(inode, &och->och_fh,
148                                          op_data->op_ioepoch);
149                 if (rc) {
150                         CERROR("inode %lu mdc Size-on-MDS update failed: "
151                                "rc = %d\n", inode->i_ino, rc);
152                         rc = 0;
153                 }
154         } else if (rc) {
155                 CERROR("inode %lu mdc close failed: rc = %d\n",
156                        inode->i_ino, rc);
157         }
158         ll_finish_md_op_data(op_data);
159
160         if (rc == 0) {
161                 rc = ll_objects_destroy(req, inode);
162                 if (rc)
163                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
164                                inode->i_ino, rc);
165         }
166
167         EXIT;
168 out:
169
170         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
171             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173         } else {
174                 md_clear_open_replay_data(md_exp, och);
175                 /* Free @och if it is not waiting for DONE_WRITING. */
176                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177                 OBD_FREE_PTR(och);
178         }
179         if (req) /* This is close request */
180                 ptlrpc_req_finished(req);
181         return rc;
182 }
183
184 int ll_md_real_close(struct inode *inode, int flags)
185 {
186         struct ll_inode_info *lli = ll_i2info(inode);
187         struct obd_client_handle **och_p;
188         struct obd_client_handle *och;
189         __u64 *och_usecount;
190         int rc = 0;
191         ENTRY;
192
193         if (flags & FMODE_WRITE) {
194                 och_p = &lli->lli_mds_write_och;
195                 och_usecount = &lli->lli_open_fd_write_count;
196         } else if (flags & FMODE_EXEC) {
197                 och_p = &lli->lli_mds_exec_och;
198                 och_usecount = &lli->lli_open_fd_exec_count;
199         } else {
200                 LASSERT(flags & FMODE_READ);
201                 och_p = &lli->lli_mds_read_och;
202                 och_usecount = &lli->lli_open_fd_read_count;
203         }
204
205         down(&lli->lli_och_sem);
206         if (*och_usecount) { /* There are still users of this handle, so
207                                 skip freeing it. */
208                 up(&lli->lli_och_sem);
209                 RETURN(0);
210         }
211         och=*och_p;
212         *och_p = NULL;
213         up(&lli->lli_och_sem);
214
215         if (och) { /* There might be a race and somebody have freed this och
216                       already */
217                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
218                                                inode, och);
219         }
220
221         RETURN(rc);
222 }
223
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225                 struct file *file)
226 {
227         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228         struct ll_inode_info *lli = ll_i2info(inode);
229         int rc = 0;
230         ENTRY;
231
232         /* clear group lock, if present */
233         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235
236         /* Let's see if we have good enough OPEN lock on the file and if
237            we can skip talking to MDS */
238         if (file->f_dentry->d_inode) { /* Can this ever be false? */
239                 int lockmode;
240                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241                 struct lustre_handle lockh;
242                 struct inode *inode = file->f_dentry->d_inode;
243                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
244
245                 down(&lli->lli_och_sem);
246                 if (fd->fd_omode & FMODE_WRITE) {
247                         lockmode = LCK_CW;
248                         LASSERT(lli->lli_open_fd_write_count);
249                         lli->lli_open_fd_write_count--;
250                 } else if (fd->fd_omode & FMODE_EXEC) {
251                         lockmode = LCK_PR;
252                         LASSERT(lli->lli_open_fd_exec_count);
253                         lli->lli_open_fd_exec_count--;
254                 } else {
255                         lockmode = LCK_CR;
256                         LASSERT(lli->lli_open_fd_read_count);
257                         lli->lli_open_fd_read_count--;
258                 }
259                 up(&lli->lli_och_sem);
260
261                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262                                    LDLM_IBITS, &policy, lockmode,
263                                    &lockh)) {
264                         rc = ll_md_real_close(file->f_dentry->d_inode,
265                                               fd->fd_omode);
266                 }
267         } else {
268                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269                        file, file->f_dentry, file->f_dentry->d_name.name);
270         }
271
272         LUSTRE_FPRIVATE(file) = NULL;
273         ll_file_data_put(fd);
274         ll_capa_close(inode);
275
276         RETURN(rc);
277 }
278
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
280
281 /* While this returns an error code, fput() the caller does not, so we need
282  * to make every effort to clean up all of our state here.  Also, applications
283  * rarely check close errors and even if an error is returned they will not
284  * re-try the close call.
285  */
286 int ll_file_release(struct inode *inode, struct file *file)
287 {
288         struct ll_file_data *fd;
289         struct ll_sb_info *sbi = ll_i2sbi(inode);
290         struct ll_inode_info *lli = ll_i2info(inode);
291         struct lov_stripe_md *lsm = lli->lli_smd;
292         int rc;
293         ENTRY;
294
295         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296                inode->i_generation, inode);
297
298 #ifdef CONFIG_FS_POSIX_ACL
299         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300             inode == inode->i_sb->s_root->d_inode) {
301                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
302
303                 LASSERT(fd != NULL);
304                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305                         fd->fd_flags &= ~LL_FILE_RMTACL;
306                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
307                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
308                 }
309         }
310 #endif
311
312         if (inode->i_sb->s_root != file->f_dentry)
313                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314         fd = LUSTRE_FPRIVATE(file);
315         LASSERT(fd != NULL);
316
317         /* The last ref on @file, maybe not the the owner pid of statahead.
318          * Different processes can open the same dir, "ll_opendir_key" means:
319          * it is me that should stop the statahead thread. */
320         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
321                 ll_stop_statahead(inode, lli->lli_opendir_key);
322
323         if (inode->i_sb->s_root == file->f_dentry) {
324                 LUSTRE_FPRIVATE(file) = NULL;
325                 ll_file_data_put(fd);
326                 RETURN(0);
327         }
328
329         if (lsm)
330                 lov_test_and_clear_async_rc(lsm);
331         lli->lli_async_rc = 0;
332
333         rc = ll_md_close(sbi->ll_md_exp, inode, file);
334         RETURN(rc);
335 }
336
337 static int ll_intent_file_open(struct file *file, void *lmm,
338                                int lmmsize, struct lookup_intent *itp)
339 {
340         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
341         struct dentry *parent = file->f_dentry->d_parent;
342         const char *name = file->f_dentry->d_name.name;
343         const int len = file->f_dentry->d_name.len;
344         struct md_op_data *op_data;
345         struct ptlrpc_request *req;
346         int rc;
347         ENTRY;
348
349         if (!parent)
350                 RETURN(-ENOENT);
351
352         /* Usually we come here only for NFSD, and we want open lock.
353            But we can also get here with pre 2.6.15 patchless kernels, and in
354            that case that lock is also ok */
355         /* We can also get here if there was cached open handle in revalidate_it
356          * but it disappeared while we were getting from there to ll_file_open.
357          * But this means this file was closed and immediatelly opened which
358          * makes a good candidate for using OPEN lock */
359         /* If lmmsize & lmm are not 0, we are just setting stripe info
360          * parameters. No need for the open lock */
361         if (!lmm && !lmmsize)
362                 itp->it_flags |= MDS_OPEN_LOCK;
363
364         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
365                                       file->f_dentry->d_inode, name, len,
366                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
367         if (IS_ERR(op_data))
368                 RETURN(PTR_ERR(op_data));
369
370         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
371                             0 /*unused */, &req, ll_md_blocking_ast, 0);
372         ll_finish_md_op_data(op_data);
373         if (rc == -ESTALE) {
374                 /* reason for keep own exit path - don`t flood log
375                 * with messages with -ESTALE errors.
376                 */
377                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
378                      it_open_error(DISP_OPEN_OPEN, itp))
379                         GOTO(out, rc);
380                 ll_release_openhandle(file->f_dentry, itp);
381                 GOTO(out, rc);
382         }
383
384         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
385                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
386                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
387                 GOTO(out, rc);
388         }
389
390         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
391         if (!rc && itp->d.lustre.it_lock_mode)
392                 md_set_lock_data(sbi->ll_md_exp,
393                                  &itp->d.lustre.it_lock_handle,
394                                  file->f_dentry->d_inode, NULL);
395
396 out:
397         ptlrpc_req_finished(itp->d.lustre.it_data);
398         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
399         ll_intent_drop_lock(itp);
400
401         RETURN(rc);
402 }
403
404 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
405 {
406         if (ioepoch && lli->lli_ioepoch != ioepoch) {
407                 lli->lli_ioepoch = ioepoch;
408                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
409                        ioepoch, PFID(&lli->lli_fid));
410         }
411 }
412
413 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
414                        struct lookup_intent *it, struct obd_client_handle *och)
415 {
416         struct ptlrpc_request *req = it->d.lustre.it_data;
417         struct mdt_body *body;
418
419         LASSERT(och);
420
421         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
422         LASSERT(body != NULL);                      /* reply already checked out */
423
424         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
425         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
426         och->och_fid = lli->lli_fid;
427         och->och_flags = it->it_flags;
428         ll_ioepoch_open(lli, body->ioepoch);
429
430         return md_set_open_replay_data(md_exp, och, req);
431 }
432
433 int ll_local_open(struct file *file, struct lookup_intent *it,
434                   struct ll_file_data *fd, struct obd_client_handle *och)
435 {
436         struct inode *inode = file->f_dentry->d_inode;
437         struct ll_inode_info *lli = ll_i2info(inode);
438         ENTRY;
439
440         LASSERT(!LUSTRE_FPRIVATE(file));
441
442         LASSERT(fd != NULL);
443
444         if (och) {
445                 struct ptlrpc_request *req = it->d.lustre.it_data;
446                 struct mdt_body *body;
447                 int rc;
448
449                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
450                 if (rc)
451                         RETURN(rc);
452
453                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454                 if ((it->it_flags & FMODE_WRITE) &&
455                     (body->valid & OBD_MD_FLSIZE))
456                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
457                                lli->lli_ioepoch, PFID(&lli->lli_fid));
458         }
459
460         LUSTRE_FPRIVATE(file) = fd;
461         ll_readahead_init(inode, &fd->fd_ras);
462         fd->fd_omode = it->it_flags;
463         RETURN(0);
464 }
465
466 /* Open a file, and (for the very first open) create objects on the OSTs at
467  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
468  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
469  * lli_open_sem to ensure no other process will create objects, send the
470  * stripe MD to the MDS, or try to destroy the objects if that fails.
471  *
472  * If we already have the stripe MD locally then we don't request it in
473  * md_open(), by passing a lmm_size = 0.
474  *
475  * It is up to the application to ensure no other processes open this file
476  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
477  * used.  We might be able to avoid races of that sort by getting lli_open_sem
478  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
479  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
     *
     * Returns 0 on success, otherwise the error collected in rc (for example
     * -ENOMEM).  Every failure path after the ll_file_data allocation frees
     * that ll_file_data again, and if this call registered itself as the
     * directory's opendir key, ll_stop_statahead() is called before returning.
     *
     * NOTE(review): the text above names lli_open_sem, but the code in this
     * function only takes lli->lli_och_sem -- confirm which lock is meant.
480  */
481 int ll_file_open(struct inode *inode, struct file *file)
482 {
483         struct ll_inode_info *lli = ll_i2info(inode);
484         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
485                                           .it_flags = file->f_flags };
486         struct lov_stripe_md *lsm;
487         struct ptlrpc_request *req = NULL;
488         struct obd_client_handle **och_p;
489         __u64 *och_usecount;
490         struct ll_file_data *fd;
491         int rc = 0, opendir_set = 0;
492         ENTRY;
493
494         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
495                inode->i_generation, inode, file->f_flags);
496
497 #ifdef HAVE_VFS_INTENT_PATCHES
498         it = file->f_it;
499 #else
500         it = file->private_data; /* XXX: compat macro */
501         file->private_data = NULL; /* prevent ll_local_open assertion */
502 #endif
503
504         fd = ll_file_data_get();
505         if (fd == NULL)
506                 RETURN(-ENOMEM);
507
508         fd->fd_file = file;
            /* The first opener of a directory records its fd and pid in lli
             * under lli_lock; opendir_set remembers that this call did so. */
509         if (S_ISDIR(inode->i_mode)) {
510                 spin_lock(&lli->lli_lock);
511                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
512                         LASSERT(lli->lli_sai == NULL);
513                         lli->lli_opendir_key = fd;
514                         lli->lli_opendir_pid = cfs_curproc_pid();
515                         opendir_set = 1;
516                 }
517                 spin_unlock(&lli->lli_lock);
518         }
519
            /* Opening the filesystem root: fd is attached and nothing else
             * below is done. */
520         if (inode->i_sb->s_root == file->f_dentry) {
521                 LUSTRE_FPRIVATE(file) = fd;
522                 RETURN(0);
523         }
524
            /* No intent, or one without any disposition, was handed in: the
             * local intent oit is set up from f_flags and used instead. */
525         if (!it || !it->d.lustre.it_disposition) {
526                 /* Convert f_flags into access mode. We cannot use file->f_mode,
527                  * because everything but O_ACCMODE mask was stripped from
528                  * there */
529                 if ((oit.it_flags + 1) & O_ACCMODE)
530                         oit.it_flags++;
531                 if (file->f_flags & O_TRUNC)
532                         oit.it_flags |= FMODE_WRITE;
533
534                 /* kernel only call f_op->open in dentry_open.  filp_open calls
535                  * dentry_open after call to open_namei that checks permissions.
536                  * Only nfsd_open call dentry_open directly without checking
537                  * permissions and because of that this code below is safe. */
538                 if (oit.it_flags & FMODE_WRITE)
539                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
540
541                 /* We do not want O_EXCL here, presumably we opened the file
542                  * already? XXX - NFS implications? */
543                 oit.it_flags &= ~O_EXCL;
544
545                 /* bug20584, if "it_flags" contains O_CREAT, the file will be
546                  * created if necessary, then "IT_CREAT" should be set to keep
547                  * consistent with it */
548                 if (oit.it_flags & O_CREAT)
549                         oit.it_op |= IT_CREAT;
550
551                 it = &oit;
552         }
553
554 restart:
555         /* Let's see if we have file open on MDS already. */
            /* The handle slot and its use counter are chosen by access mode:
             * write, then exec, otherwise read. */
556         if (it->it_flags & FMODE_WRITE) {
557                 och_p = &lli->lli_mds_write_och;
558                 och_usecount = &lli->lli_open_fd_write_count;
559         } else if (it->it_flags & FMODE_EXEC) {
560                 och_p = &lli->lli_mds_exec_och;
561                 och_usecount = &lli->lli_open_fd_exec_count;
562          } else {
563                 och_p = &lli->lli_mds_read_och;
564                 och_usecount = &lli->lli_open_fd_read_count;
565         }
566
567         down(&lli->lli_och_sem);
568         if (*och_p) { /* Open handle is present */
569                 if (it_disposition(it, DISP_OPEN_OPEN)) {
570                         /* Well, there's extra open request that we do not need,
571                            let's close it somehow. This will decref request. */
572                         rc = it_open_error(DISP_OPEN_OPEN, it);
573                         if (rc) {
574                                 up(&lli->lli_och_sem);
575                                 ll_file_data_put(fd);
576                                 GOTO(out_openerr, rc);
577                         }
578                         ll_release_openhandle(file->f_dentry, it);
579                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
580                                              LPROC_LL_OPEN);
581                 }
582                 (*och_usecount)++;
583
                    /* och == NULL: the existing handle is shared, nothing is
                     * filled in from the intent. */
584                 rc = ll_local_open(file, it, fd, NULL);
585                 if (rc) {
586                         (*och_usecount)--;
587                         up(&lli->lli_och_sem);
588                         ll_file_data_put(fd);
589                         GOTO(out_openerr, rc);
590                 }
591         } else {
592                 LASSERT(*och_usecount == 0);
593                 if (!it->d.lustre.it_disposition) {
594                         /* We cannot just request lock handle now, new ELC code
595                            means that one of other OPEN locks for this file
596                            could be cancelled, and since blocking ast handler
597                            would attempt to grab och_sem as well, that would
598                            result in a deadlock */
599                         up(&lli->lli_och_sem);
600                         it->it_create_mode |= M_CHECK_STALE;
601                         rc = ll_intent_file_open(file, NULL, 0, it);
602                         it->it_create_mode &= ~M_CHECK_STALE;
603                         if (rc) {
604                                 ll_file_data_put(fd);
605                                 GOTO(out_openerr, rc);
606                         }
607
608                         /* Got some error? Release the request */
609                         if (it->d.lustre.it_status < 0) {
610                                 req = it->d.lustre.it_data;
611                                 ptlrpc_req_finished(req);
612                         }
                            /* lli_och_sem was dropped above, so the handle
                             * state is re-checked from the top. */
613                         goto restart;
614                 }
615                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
616                 if (!*och_p) {
617                         ll_file_data_put(fd);
618                         GOTO(out_och_free, rc = -ENOMEM);
619                 }
620                 (*och_usecount)++;
621                 req = it->d.lustre.it_data;
622
623                 /* md_intent_lock() didn't get a request ref if there was an
624                  * open error, so don't do cleanup on the request here
625                  * (bug 3430) */
626                 /* XXX (green): Should not we bail out on any error here, not
627                  * just open error? */
628                 rc = it_open_error(DISP_OPEN_OPEN, it);
629                 if (rc) {
630                         ll_file_data_put(fd);
631                         GOTO(out_och_free, rc);
632                 }
633
634                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
635                 rc = ll_local_open(file, it, fd, *och_p);
636                 if (rc) {
637                         ll_file_data_put(fd);
638                         GOTO(out_och_free, rc);
639                 }
640         }
641         up(&lli->lli_och_sem);
642
643         /* Must do this outside lli_och_sem lock to prevent deadlock where
644            different kind of OPEN lock for this same inode gets cancelled
645            by ldlm_cancel_lru */
646         if (!S_ISREG(inode->i_mode))
647                 GOTO(out, rc);
648
649         ll_capa_open(inode);
650
651         lsm = lli->lli_smd;
652         if (lsm == NULL) {
653                 if (file->f_flags & O_LOV_DELAY_CREATE ||
654                     !(file->f_mode & FMODE_WRITE)) {
655                         CDEBUG(D_INODE, "object creation was delayed\n");
656                         GOTO(out, rc);
657                 }
658         }
659         file->f_flags &= ~O_LOV_DELAY_CREATE;
660         GOTO(out, rc);
661 out:
662         ptlrpc_req_finished(req);
663         if (req)
664                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
665 out_och_free:
            /* The GOTOs to this label are all issued with lli_och_sem held:
             * the handle allocation and its use count are undone and the
             * semaphore is dropped. */
666         if (rc) {
667                 if (*och_p) {
668                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
669                         *och_p = NULL; /* OBD_FREE writes some magic there */
670                         (*och_usecount)--;
671                 }
672                 up(&lli->lli_och_sem);
                    /* This label sits inside the if (rc) block; the paths
                     * jumping here have already released lli_och_sem. */
673 out_openerr:
674                 if (opendir_set != 0)
675                         ll_stop_statahead(inode, lli->lli_opendir_key);
676         }
677
678         return rc;
679 }
680
681 /* Fills the obdo with the attributes for the lsm */
682 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
683                           struct obd_capa *capa, struct obdo *obdo)
684 {
685         struct ptlrpc_request_set *set;
686         struct obd_info            oinfo = { { { 0 } } };
687         int                        rc;
688
689         ENTRY;
690
691         LASSERT(lsm != NULL);
692
693         oinfo.oi_md = lsm;
694         oinfo.oi_oa = obdo;
695         oinfo.oi_oa->o_id = lsm->lsm_object_id;
696         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
697         oinfo.oi_oa->o_mode = S_IFREG;
698         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
699                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
700                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
701                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
702                                OBD_MD_FLGROUP;
703         oinfo.oi_capa = capa;
704
705         set = ptlrpc_prep_set();
706         if (set == NULL) {
707                 CERROR("can't allocate ptlrpc set\n");
708                 rc = -ENOMEM;
709         } else {
710                 rc = obd_getattr_async(exp, &oinfo, set);
711                 if (rc == 0)
712                         rc = ptlrpc_set_wait(set);
713                 ptlrpc_set_destroy(set);
714         }
715         if (rc == 0)
716                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
717                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
718                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
719         RETURN(rc);
720 }
721
722 /* Fills the obdo with the attributes for the inode defined by lsm */
723 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
724 {
725         struct ll_inode_info *lli  = ll_i2info(inode);
726         struct obd_capa      *capa = ll_mdscapa_get(inode);
727         int rc;
728         ENTRY;
729
730         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
731         capa_put(capa);
732         if (rc == 0) {
733                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
734                 CDEBUG(D_INODE,
735                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
736                        lli->lli_smd->lsm_object_id, i_size_read(inode),
737                        (unsigned long long)inode->i_blocks,
738                        (unsigned long)ll_inode_blksize(inode));
739         }
740         RETURN(rc);
741 }
742
743 int ll_merge_lvb(struct inode *inode)
744 {
745         struct ll_inode_info *lli = ll_i2info(inode);
746         struct ll_sb_info *sbi = ll_i2sbi(inode);
747         struct ost_lvb lvb;
748         int rc;
749
750         ENTRY;
751
752         ll_inode_size_lock(inode, 1);
753         inode_init_lvb(inode, &lvb);
754         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
755         i_size_write(inode, lvb.lvb_size);
756         inode->i_blocks = lvb.lvb_blocks;
757
758         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
759         LTIME_S(inode->i_atime) = lvb.lvb_atime;
760         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
761         ll_inode_size_unlock(inode, 1);
762
763         RETURN(rc);
764 }
765
766 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
767                      lstat_t *st)
768 {
769         struct obdo obdo = { 0 };
770         int rc;
771
772         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
773         if (rc == 0) {
774                 st->st_size   = obdo.o_size;
775                 st->st_blocks = obdo.o_blocks;
776                 st->st_mtime  = obdo.o_mtime;
777                 st->st_atime  = obdo.o_atime;
778                 st->st_ctime  = obdo.o_ctime;
779         }
780         return rc;
781 }
782
783 void ll_io_init(struct cl_io *io, const struct file *file, int write)
784 {
785         struct inode *inode = file->f_dentry->d_inode;
786
787         memset(io, 0, sizeof *io);
788         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
789         if (write)
790                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
791         io->ci_obj     = ll_i2info(inode)->lli_clob;
792         io->ci_lockreq = CILR_MAYBE;
793         if (ll_file_nolock(file)) {
794                 io->ci_lockreq = CILR_NEVER;
795                 io->ci_no_srvlock = 1;
796         } else if (file->f_flags & O_APPEND) {
797                 io->ci_lockreq = CILR_MANDATORY;
798         }
799 }
800
801 static ssize_t ll_file_io_generic(const struct lu_env *env,
802                 struct vvp_io_args *args, struct file *file,
803                 enum cl_io_type iot, loff_t *ppos, size_t count)
804 {
805         struct cl_io       *io;
806         ssize_t             result;
807         ENTRY;
808
809         io = &ccc_env_info(env)->cti_io;
810         ll_io_init(io, file, iot == CIT_WRITE);
811
812         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
813                 struct vvp_io *vio = vvp_env_io(env);
814                 struct ccc_io *cio = ccc_env_io(env);
815
816                 vio->cui_io_subtype = args->via_io_subtype;
817
818                 switch (vio->cui_io_subtype) {
819                 case IO_NORMAL:
820                         cio->cui_iov = args->u.normal.via_iov;
821                         cio->cui_nrsegs = args->u.normal.via_nrsegs;
822 #ifndef HAVE_FILE_WRITEV
823                         cio->cui_iocb = args->u.normal.via_iocb;
824 #endif
825                         break;
826                 case IO_SENDFILE:
827                         vio->u.sendfile.cui_actor = args->u.sendfile.via_actor;
828                         vio->u.sendfile.cui_target = args->u.sendfile.via_target;
829                         break;
830                 case IO_SPLICE:
831                         vio->u.splice.cui_pipe = args->u.splice.via_pipe;
832                         vio->u.splice.cui_flags = args->u.splice.via_flags;
833                         break;
834                 default:
835                         CERROR("Unknow IO type - %u\n", vio->cui_io_subtype);
836                         LBUG();
837                 }
838                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
839                 result = cl_io_loop(env, io);
840         } else {
841                 /* cl_io_rw_init() handled IO */
842                 result = io->ci_result;
843         }
844
845         if (io->ci_nob > 0) {
846                 result = io->ci_nob;
847                 *ppos = io->u.ci_wr.wr.crw_pos;
848         }
849         cl_io_fini(env, io);
850         RETURN(result);
851 }
852
853
854 /*
855  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
856  */
857 static int ll_file_get_iov_count(const struct iovec *iov,
858                                  unsigned long *nr_segs, size_t *count)
859 {
860         size_t cnt = 0;
861         unsigned long seg;
862
863         for (seg = 0; seg < *nr_segs; seg++) {
864                 const struct iovec *iv = &iov[seg];
865
866                 /*
867                  * If any segment has a negative length, or the cumulative
868                  * length ever wraps negative then return -EINVAL.
869                  */
870                 cnt += iv->iov_len;
871                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
872                         return -EINVAL;
873                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
874                         continue;
875                 if (seg == 0)
876                         return -EFAULT;
877                 *nr_segs = seg;
878                 cnt -= iv->iov_len;   /* This segment is no good */
879                 break;
880         }
881         *count = cnt;
882         return 0;
883 }
884
885 #ifdef HAVE_FILE_READV
886 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
887                               unsigned long nr_segs, loff_t *ppos)
888 {
889         struct lu_env      *env;
890         struct vvp_io_args *args;
891         size_t              count;
892         ssize_t             result;
893         int                 refcheck;
894         ENTRY;
895
896         result = ll_file_get_iov_count(iov, &nr_segs, &count);
897         if (result)
898                 RETURN(result);
899
900         env = cl_env_get(&refcheck);
901         if (IS_ERR(env))
902                 RETURN(PTR_ERR(env));
903
904         args = vvp_env_args(env, IO_NORMAL);
905         args->u.normal.via_iov = (struct iovec *)iov;
906         args->u.normal.via_nrsegs = nr_segs;
907
908         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
909         cl_env_put(env, &refcheck);
910         RETURN(result);
911 }
912
913 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
914                             loff_t *ppos)
915 {
916         struct lu_env *env;
917         struct iovec  *local_iov;
918         ssize_t        result;
919         int            refcheck;
920         ENTRY;
921
922         env = cl_env_get(&refcheck);
923         if (IS_ERR(env))
924                 RETURN(PTR_ERR(env));
925
926         local_iov = &vvp_env_info(env)->vti_local_iov;
927         local_iov->iov_base = (void __user *)buf;
928         local_iov->iov_len = count;
929         result = ll_file_readv(file, local_iov, 1, ppos);
930         cl_env_put(env, &refcheck);
931         RETURN(result);
932 }
933
934 #else
935 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
936                                 unsigned long nr_segs, loff_t pos)
937 {
938         struct lu_env      *env;
939         struct vvp_io_args *args;
940         size_t              count;
941         ssize_t             result;
942         int                 refcheck;
943         ENTRY;
944
945         result = ll_file_get_iov_count(iov, &nr_segs, &count);
946         if (result)
947                 RETURN(result);
948
949         env = cl_env_get(&refcheck);
950         if (IS_ERR(env))
951                 RETURN(PTR_ERR(env));
952
953         args = vvp_env_args(env, IO_NORMAL);
954         args->u.normal.via_iov = (struct iovec *)iov;
955         args->u.normal.via_nrsegs = nr_segs;
956         args->u.normal.via_iocb = iocb;
957
958         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
959                                     &iocb->ki_pos, count);
960         cl_env_put(env, &refcheck);
961         RETURN(result);
962 }
963
964 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
965                             loff_t *ppos)
966 {
967         struct lu_env *env;
968         struct iovec  *local_iov;
969         struct kiocb  *kiocb;
970         ssize_t        result;
971         int            refcheck;
972         ENTRY;
973
974         env = cl_env_get(&refcheck);
975         if (IS_ERR(env))
976                 RETURN(PTR_ERR(env));
977
978         local_iov = &vvp_env_info(env)->vti_local_iov;
979         kiocb = &vvp_env_info(env)->vti_kiocb;
980         local_iov->iov_base = (void __user *)buf;
981         local_iov->iov_len = count;
982         init_sync_kiocb(kiocb, file);
983         kiocb->ki_pos = *ppos;
984         kiocb->ki_left = count;
985
986         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
987         *ppos = kiocb->ki_pos;
988
989         cl_env_put(env, &refcheck);
990         RETURN(result);
991 }
992 #endif
993
994 /*
995  * Write to a file (through the page cache).
996  */
997 #ifdef HAVE_FILE_WRITEV
998 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
999                               unsigned long nr_segs, loff_t *ppos)
1000 {
1001         struct lu_env      *env;
1002         struct vvp_io_args *args;
1003         size_t              count;
1004         ssize_t             result;
1005         int                 refcheck;
1006         ENTRY;
1007
1008         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1009         if (result)
1010                 RETURN(result);
1011
1012         env = cl_env_get(&refcheck);
1013         if (IS_ERR(env))
1014                 RETURN(PTR_ERR(env));
1015
1016         args = vvp_env_args(env, IO_NORMAL);
1017         args->u.normal.via_iov = (struct iovec *)iov;
1018         args->u.normal.via_nrsegs = nr_segs;
1019
1020         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1021         cl_env_put(env, &refcheck);
1022         RETURN(result);
1023 }
1024
1025 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1026                              loff_t *ppos)
1027 {
1028         struct lu_env    *env;
1029         struct iovec     *local_iov;
1030         ssize_t           result;
1031         int               refcheck;
1032         ENTRY;
1033
1034         env = cl_env_get(&refcheck);
1035         if (IS_ERR(env))
1036                 RETURN(PTR_ERR(env));
1037
1038         local_iov = &vvp_env_info(env)->vti_local_iov;
1039         local_iov->iov_base = (void __user *)buf;
1040         local_iov->iov_len = count;
1041
1042         result = ll_file_writev(file, local_iov, 1, ppos);
1043         cl_env_put(env, &refcheck);
1044         RETURN(result);
1045 }
1046
1047 #else /* AIO stuff */
1048 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1049                                  unsigned long nr_segs, loff_t pos)
1050 {
1051         struct lu_env      *env;
1052         struct vvp_io_args *args;
1053         size_t              count;
1054         ssize_t             result;
1055         int                 refcheck;
1056         ENTRY;
1057
1058         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1059         if (result)
1060                 RETURN(result);
1061
1062         env = cl_env_get(&refcheck);
1063         if (IS_ERR(env))
1064                 RETURN(PTR_ERR(env));
1065
1066         args = vvp_env_args(env, IO_NORMAL);
1067         args->u.normal.via_iov = (struct iovec *)iov;
1068         args->u.normal.via_nrsegs = nr_segs;
1069         args->u.normal.via_iocb = iocb;
1070
1071         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1072                                   &iocb->ki_pos, count);
1073         cl_env_put(env, &refcheck);
1074         RETURN(result);
1075 }
1076
1077 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1078                              loff_t *ppos)
1079 {
1080         struct lu_env *env;
1081         struct iovec  *local_iov;
1082         struct kiocb  *kiocb;
1083         ssize_t        result;
1084         int            refcheck;
1085         ENTRY;
1086
1087         env = cl_env_get(&refcheck);
1088         if (IS_ERR(env))
1089                 RETURN(PTR_ERR(env));
1090
1091         local_iov = &vvp_env_info(env)->vti_local_iov;
1092         kiocb = &vvp_env_info(env)->vti_kiocb;
1093         local_iov->iov_base = (void __user *)buf;
1094         local_iov->iov_len = count;
1095         init_sync_kiocb(kiocb, file);
1096         kiocb->ki_pos = *ppos;
1097         kiocb->ki_left = count;
1098
1099         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1100         *ppos = kiocb->ki_pos;
1101
1102         cl_env_put(env, &refcheck);
1103         RETURN(result);
1104 }
1105 #endif
1106
1107
1108 #ifdef HAVE_KERNEL_SENDFILE
1109 /*
1110  * Send file content (through pagecache) somewhere with helper
1111  */
1112 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1113                                 read_actor_t actor, void *target)
1114 {
1115         struct lu_env      *env;
1116         struct vvp_io_args *args;
1117         ssize_t             result;
1118         int                 refcheck;
1119         ENTRY;
1120
1121         env = cl_env_get(&refcheck);
1122         if (IS_ERR(env))
1123                 RETURN(PTR_ERR(env));
1124
1125         args = vvp_env_args(env, IO_SENDFILE);
1126         args->u.sendfile.via_target = target;
1127         args->u.sendfile.via_actor = actor;
1128
1129         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1130         cl_env_put(env, &refcheck);
1131         RETURN(result);
1132 }
1133 #endif
1134
1135 #ifdef HAVE_KERNEL_SPLICE_READ
1136 /*
1137  * Send file content (through pagecache) somewhere with helper
1138  */
1139 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1140                                    struct pipe_inode_info *pipe, size_t count,
1141                                    unsigned int flags)
1142 {
1143         struct lu_env      *env;
1144         struct vvp_io_args *args;
1145         ssize_t             result;
1146         int                 refcheck;
1147         ENTRY;
1148
1149         env = cl_env_get(&refcheck);
1150         if (IS_ERR(env))
1151                 RETURN(PTR_ERR(env));
1152
1153         args = vvp_env_args(env, IO_SPLICE);
1154         args->u.splice.via_pipe = pipe;
1155         args->u.splice.via_flags = flags;
1156
1157         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1158         cl_env_put(env, &refcheck);
1159         RETURN(result);
1160 }
1161 #endif
1162
1163 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1164                                unsigned long arg)
1165 {
1166         struct obd_export *exp = ll_i2dtexp(inode);
1167         struct ll_recreate_obj ucreatp;
1168         struct obd_trans_info oti = { 0 };
1169         struct obdo *oa = NULL;
1170         int lsm_size;
1171         int rc = 0;
1172         struct lov_stripe_md *lsm, *lsm2;
1173         ENTRY;
1174
1175         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1176                 RETURN(-EPERM);
1177
1178         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1179                            sizeof(struct ll_recreate_obj)))
1180                 RETURN(-EFAULT);
1181
1182         OBDO_ALLOC(oa);
1183         if (oa == NULL)
1184                 RETURN(-ENOMEM);
1185
1186         ll_inode_size_lock(inode, 0);
1187         lsm = ll_i2info(inode)->lli_smd;
1188         if (lsm == NULL)
1189                 GOTO(out, rc = -ENOENT);
1190         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1191                    (lsm->lsm_stripe_count));
1192
1193         OBD_ALLOC(lsm2, lsm_size);
1194         if (lsm2 == NULL)
1195                 GOTO(out, rc = -ENOMEM);
1196
1197         oa->o_id = ucreatp.lrc_id;
1198         oa->o_gr = ucreatp.lrc_group;
1199         oa->o_nlink = ucreatp.lrc_ost_idx;
1200         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1201         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1202         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1203                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1204
1205         memcpy(lsm2, lsm, lsm_size);
1206         rc = obd_create(exp, oa, &lsm2, &oti);
1207
1208         OBD_FREE(lsm2, lsm_size);
1209         GOTO(out, rc);
1210 out:
1211         ll_inode_size_unlock(inode, 0);
1212         OBDO_FREE(oa);
1213         return rc;
1214 }
1215
1216 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1217                              int flags, struct lov_user_md *lum, int lum_size)
1218 {
1219         struct lov_stripe_md *lsm;
1220         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1221         int rc = 0;
1222         ENTRY;
1223
1224         ll_inode_size_lock(inode, 0);
1225         lsm = ll_i2info(inode)->lli_smd;
1226         if (lsm) {
1227                 ll_inode_size_unlock(inode, 0);
1228                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1229                        inode->i_ino);
1230                 RETURN(-EEXIST);
1231         }
1232
1233         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1234         if (rc)
1235                 GOTO(out, rc);
1236         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1237                 GOTO(out_req_free, rc = -ENOENT);
1238         rc = oit.d.lustre.it_status;
1239         if (rc < 0)
1240                 GOTO(out_req_free, rc);
1241
1242         ll_release_openhandle(file->f_dentry, &oit);
1243
1244  out:
1245         ll_inode_size_unlock(inode, 0);
1246         ll_intent_release(&oit);
1247         RETURN(rc);
1248 out_req_free:
1249         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1250         goto out;
1251 }
1252
1253 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1254                              struct lov_mds_md **lmmp, int *lmm_size,
1255                              struct ptlrpc_request **request)
1256 {
1257         struct ll_sb_info *sbi = ll_i2sbi(inode);
1258         struct mdt_body  *body;
1259         struct lov_mds_md *lmm = NULL;
1260         struct ptlrpc_request *req = NULL;
1261         struct obd_capa *oc;
1262         int rc, lmmsize;
1263
1264         rc = ll_get_max_mdsize(sbi, &lmmsize);
1265         if (rc)
1266                 RETURN(rc);
1267
1268         oc = ll_mdscapa_get(inode);
1269         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1270                              oc, filename, strlen(filename) + 1,
1271                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1272                              ll_i2suppgid(inode), &req);
1273         capa_put(oc);
1274         if (rc < 0) {
1275                 CDEBUG(D_INFO, "md_getattr_name failed "
1276                        "on %s: rc %d\n", filename, rc);
1277                 GOTO(out, rc);
1278         }
1279
1280         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1281         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1282
1283         lmmsize = body->eadatasize;
1284
1285         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1286                         lmmsize == 0) {
1287                 GOTO(out, rc = -ENODATA);
1288         }
1289
1290         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1291         LASSERT(lmm != NULL);
1292
1293         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1294             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1295             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1296                 GOTO(out, rc = -EPROTO);
1297         }
1298
1299         /*
1300          * This is coming from the MDS, so is probably in
1301          * little endian.  We convert it to host endian before
1302          * passing it to userspace.
1303          */
1304         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1305                 /* if function called for directory - we should
1306                  * avoid swab not existent lsm objects */
1307                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1308                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1309                         if (S_ISREG(body->mode))
1310                                 lustre_swab_lov_user_md_objects(
1311                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1312                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1313                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1314                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1315                         if (S_ISREG(body->mode))
1316                                 lustre_swab_lov_user_md_objects(
1317                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1318                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1319                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1320                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1321                 }
1322         }
1323
1324         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1325                 struct lov_stripe_md *lsm;
1326                 struct lov_user_md_join *lmj;
1327                 int lmj_size, i, aindex = 0;
1328
1329                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1330                 if (rc < 0)
1331                         GOTO(out, rc = -ENOMEM);
1332                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1333                 if (rc)
1334                         GOTO(out_free_memmd, rc);
1335
1336                 lmj_size = sizeof(struct lov_user_md_join) +
1337                            lsm->lsm_stripe_count *
1338                            sizeof(struct lov_user_ost_data_join);
1339                 OBD_ALLOC(lmj, lmj_size);
1340                 if (!lmj)
1341                         GOTO(out_free_memmd, rc = -ENOMEM);
1342
1343                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1344                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1345                         struct lov_extent *lex =
1346                                 &lsm->lsm_array->lai_ext_array[aindex];
1347
1348                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1349                                 aindex ++;
1350                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1351                                         LPU64" len %d\n", aindex, i,
1352                                         lex->le_start, (int)lex->le_len);
1353                         lmj->lmm_objects[i].l_extent_start =
1354                                 lex->le_start;
1355
1356                         if ((int)lex->le_len == -1)
1357                                 lmj->lmm_objects[i].l_extent_end = -1;
1358                         else
1359                                 lmj->lmm_objects[i].l_extent_end =
1360                                         lex->le_start + lex->le_len;
1361                         lmj->lmm_objects[i].l_object_id =
1362                                 lsm->lsm_oinfo[i]->loi_id;
1363                         lmj->lmm_objects[i].l_object_gr =
1364                                 lsm->lsm_oinfo[i]->loi_gr;
1365                         lmj->lmm_objects[i].l_ost_gen =
1366                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1367                         lmj->lmm_objects[i].l_ost_idx =
1368                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1369                 }
1370                 lmm = (struct lov_mds_md *)lmj;
1371                 lmmsize = lmj_size;
1372 out_free_memmd:
1373                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1374         }
1375 out:
1376         *lmmp = lmm;
1377         *lmm_size = lmmsize;
1378         *request = req;
1379         return rc;
1380 }
1381
1382 static int ll_lov_setea(struct inode *inode, struct file *file,
1383                             unsigned long arg)
1384 {
1385         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1386         struct lov_user_md  *lump;
1387         int lum_size = sizeof(struct lov_user_md) +
1388                        sizeof(struct lov_user_ost_data);
1389         int rc;
1390         ENTRY;
1391
1392         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1393                 RETURN(-EPERM);
1394
1395         OBD_ALLOC(lump, lum_size);
1396         if (lump == NULL) {
1397                 RETURN(-ENOMEM);
1398         }
1399         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1400                 OBD_FREE(lump, lum_size);
1401                 RETURN(-EFAULT);
1402         }
1403
1404         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1405
1406         OBD_FREE(lump, lum_size);
1407         RETURN(rc);
1408 }
1409
1410 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1411                             unsigned long arg)
1412 {
1413         struct lov_user_md_v3 lumv3;
1414         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1415         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1416         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1417         int lum_size;
1418         int rc;
1419         int flags = FMODE_WRITE;
1420         ENTRY;
1421
1422         /* first try with v1 which is smaller than v3 */
1423         lum_size = sizeof(struct lov_user_md_v1);
1424         if (copy_from_user(lumv1, lumv1p, lum_size))
1425                 RETURN(-EFAULT);
1426
1427         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1428                 lum_size = sizeof(struct lov_user_md_v3);
1429                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1430                         RETURN(-EFAULT);
1431         }
1432
1433         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1434         if (rc == 0) {
1435                  put_user(0, &lumv1p->lmm_stripe_count);
1436                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1437                                     0, ll_i2info(inode)->lli_smd,
1438                                     (void *)arg);
1439         }
1440         RETURN(rc);
1441 }
1442
1443 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1444 {
1445         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1446
1447         if (!lsm)
1448                 RETURN(-ENODATA);
1449
1450         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1451                             (void *)arg);
1452 }
1453
1454 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1455 {
1456         struct ll_inode_info   *lli = ll_i2info(inode);
1457         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1458         struct ccc_grouplock    grouplock;
1459         int                     rc;
1460         ENTRY;
1461
1462         if (ll_file_nolock(file))
1463                 RETURN(-EOPNOTSUPP);
1464
1465         spin_lock(&lli->lli_lock);
1466         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1467                 CERROR("group lock already existed with gid %lu\n",
1468                        fd->fd_grouplock.cg_gid);
1469                 spin_unlock(&lli->lli_lock);
1470                 RETURN(-EINVAL);
1471         }
1472         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1473         spin_unlock(&lli->lli_lock);
1474
1475         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1476                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1477         if (rc)
1478                 RETURN(rc);
1479
1480         spin_lock(&lli->lli_lock);
1481         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1482                 spin_unlock(&lli->lli_lock);
1483                 CERROR("another thread just won the race\n");
1484                 cl_put_grouplock(&grouplock);
1485                 RETURN(-EINVAL);
1486         }
1487
1488         fd->fd_flags |= LL_FILE_GROUP_LOCKED;
1489         fd->fd_grouplock = grouplock;
1490         spin_unlock(&lli->lli_lock);
1491
1492         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1493         RETURN(0);
1494 }
1495
1496 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1497 {
1498         struct ll_inode_info   *lli = ll_i2info(inode);
1499         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1500         struct ccc_grouplock    grouplock;
1501         ENTRY;
1502
1503         spin_lock(&lli->lli_lock);
1504         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1505                 spin_unlock(&lli->lli_lock);
1506                 CERROR("no group lock held\n");
1507                 RETURN(-EINVAL);
1508         }
1509         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1510
1511         if (fd->fd_grouplock.cg_gid != arg) {
1512                 CERROR("group lock %lu doesn't match current id %lu\n",
1513                        arg, fd->fd_grouplock.cg_gid);
1514                 spin_unlock(&lli->lli_lock);
1515                 RETURN(-EINVAL);
1516         }
1517
1518         grouplock = fd->fd_grouplock;
1519         fd->fd_grouplock.cg_env = NULL;
1520         fd->fd_grouplock.cg_lock = NULL;
1521         fd->fd_grouplock.cg_gid = 0;
1522         fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
1523         spin_unlock(&lli->lli_lock);
1524
1525         cl_put_grouplock(&grouplock);
1526         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1527         RETURN(0);
1528 }
1529
1530 #if LUSTRE_FIX >= 50
1531 static int join_sanity_check(struct inode *head, struct inode *tail)
1532 {
1533         ENTRY;
1534         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1535                 CERROR("server do not support join \n");
1536                 RETURN(-EINVAL);
1537         }
1538         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1539                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1540                        head->i_ino, tail->i_ino);
1541                 RETURN(-EINVAL);
1542         }
1543         if (head->i_ino == tail->i_ino) {
1544                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1545                 RETURN(-EINVAL);
1546         }
1547         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1548                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1549                 RETURN(-EINVAL);
1550         }
1551         RETURN(0);
1552 }
1553
1554 static int join_file(struct inode *head_inode, struct file *head_filp,
1555                      struct file *tail_filp)
1556 {
1557         struct dentry *tail_dentry = tail_filp->f_dentry;
1558         struct lookup_intent oit = {.it_op = IT_OPEN,
1559                                     .it_flags = head_filp->f_flags,
1560                                     .it_create_mode = M_JOIN_FILE};
1561         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1562                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1563
1564         struct lustre_handle lockh;
1565         struct md_op_data *op_data;
1566         int    rc;
1567         loff_t data;
1568         ENTRY;
1569
1570         tail_dentry = tail_filp->f_dentry;
1571
1572         data = i_size_read(head_inode);
1573         op_data = ll_prep_md_op_data(NULL, head_inode,
1574                                      tail_dentry->d_parent->d_inode,
1575                                      tail_dentry->d_name.name,
1576                                      tail_dentry->d_name.len, 0,
1577                                      LUSTRE_OPC_ANY, &data);
1578         if (IS_ERR(op_data))
1579                 RETURN(PTR_ERR(op_data));
1580
1581         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1582                          op_data, &lockh, NULL, 0, NULL, 0);
1583
1584         ll_finish_md_op_data(op_data);
1585         if (rc < 0)
1586                 GOTO(out, rc);
1587
1588         rc = oit.d.lustre.it_status;
1589
1590         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1591                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1592                 ptlrpc_req_finished((struct ptlrpc_request *)
1593                                     oit.d.lustre.it_data);
1594                 GOTO(out, rc);
1595         }
1596
1597         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1598                                            * away */
1599                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1600                 oit.d.lustre.it_lock_mode = 0;
1601         }
1602         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1603         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1604         ll_release_openhandle(head_filp->f_dentry, &oit);
1605 out:
1606         ll_intent_release(&oit);
1607         RETURN(rc);
1608 }
1609
1610 static int ll_file_join(struct inode *head, struct file *filp,
1611                         char *filename_tail)
1612 {
1613         struct inode *tail = NULL, *first = NULL, *second = NULL;
1614         struct dentry *tail_dentry;
1615         struct file *tail_filp, *first_filp, *second_filp;
1616         struct ll_lock_tree first_tree, second_tree;
1617         struct ll_lock_tree_node *first_node, *second_node;
1618         struct ll_inode_info *hlli = ll_i2info(head);
1619         int rc = 0, cleanup_phase = 0;
1620         ENTRY;
1621
1622         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1623                head->i_ino, head->i_generation, head, filename_tail);
1624
1625         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1626         if (IS_ERR(tail_filp)) {
1627                 CERROR("Can not open tail file %s", filename_tail);
1628                 rc = PTR_ERR(tail_filp);
1629                 GOTO(cleanup, rc);
1630         }
1631         tail = igrab(tail_filp->f_dentry->d_inode);
1632
1633         tail_dentry = tail_filp->f_dentry;
1634         LASSERT(tail_dentry);
1635         cleanup_phase = 1;
1636
1637         /*reorder the inode for lock sequence*/
1638         first = head->i_ino > tail->i_ino ? head : tail;
1639         second = head->i_ino > tail->i_ino ? tail : head;
1640         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1641         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1642
1643         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1644                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1645         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1646         if (IS_ERR(first_node)){
1647                 rc = PTR_ERR(first_node);
1648                 GOTO(cleanup, rc);
1649         }
1650         first_tree.lt_fd = first_filp->private_data;
1651         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1652         if (rc != 0)
1653                 GOTO(cleanup, rc);
1654         cleanup_phase = 2;
1655
1656         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1657         if (IS_ERR(second_node)){
1658                 rc = PTR_ERR(second_node);
1659                 GOTO(cleanup, rc);
1660         }
1661         second_tree.lt_fd = second_filp->private_data;
1662         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1663         if (rc != 0)
1664                 GOTO(cleanup, rc);
1665         cleanup_phase = 3;
1666
1667         rc = join_sanity_check(head, tail);
1668         if (rc)
1669                 GOTO(cleanup, rc);
1670
1671         rc = join_file(head, filp, tail_filp);
1672         if (rc)
1673                 GOTO(cleanup, rc);
1674 cleanup:
1675         switch (cleanup_phase) {
1676         case 3:
1677                 ll_tree_unlock(&second_tree);
1678                 obd_cancel_unused(ll_i2dtexp(second),
1679                                   ll_i2info(second)->lli_smd, 0, NULL);
1680         case 2:
1681                 ll_tree_unlock(&first_tree);
1682                 obd_cancel_unused(ll_i2dtexp(first),
1683                                   ll_i2info(first)->lli_smd, 0, NULL);
1684         case 1:
1685                 filp_close(tail_filp, 0);
1686                 if (tail)
1687                         iput(tail);
1688                 if (head && rc == 0) {
1689                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1690                                        &hlli->lli_smd);
1691                         hlli->lli_smd = NULL;
1692                 }
1693         case 0:
1694                 break;
1695         default:
1696                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1697                 LBUG();
1698         }
1699         RETURN(rc);
1700 }
1701 #endif /* LUSTRE_FIX >= 50 */
1702
1703 /**
1704  * Close inode open handle
1705  *
1706  * \param dentry [in]     dentry which contains the inode
1707  * \param it     [in,out] intent which contains open info and result
1708  *
1709  * \retval 0     success
1710  * \retval <0    failure
1711  */
1712 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1713 {
1714         struct inode *inode = dentry->d_inode;
1715         struct obd_client_handle *och;
1716         int rc;
1717         ENTRY;
1718
1719         LASSERT(inode);
1720
1721         /* Root ? Do nothing. */
1722         if (dentry->d_inode->i_sb->s_root == dentry)
1723                 RETURN(0);
1724
1725         /* No open handle to close? Move away */
1726         if (!it_disposition(it, DISP_OPEN_OPEN))
1727                 RETURN(0);
1728
1729         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1730
1731         OBD_ALLOC(och, sizeof(*och));
1732         if (!och)
1733                 GOTO(out, rc = -ENOMEM);
1734
1735         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1736                     ll_i2info(inode), it, och);
1737
1738         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1739                                        inode, och);
1740  out:
1741         /* this one is in place of ll_file_open */
1742         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1743                 ptlrpc_req_finished(it->d.lustre.it_data);
1744         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1745         RETURN(rc);
1746 }
1747
1748 /**
1749  * Get size for inode for which FIEMAP mapping is requested.
1750  * Make the FIEMAP get_info call and returns the result.
1751  */
1752 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1753               int num_bytes)
1754 {
1755         struct obd_export *exp = ll_i2dtexp(inode);
1756         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1757         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1758         int vallen = num_bytes;
1759         int rc;
1760         ENTRY;
1761
1762         /* If the stripe_count > 1 and the application does not understand
1763          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1764          */
1765         if (lsm->lsm_stripe_count > 1 &&
1766             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1767                 return -EOPNOTSUPP;
1768
1769         fm_key.oa.o_id = lsm->lsm_object_id;
1770         fm_key.oa.o_gr = lsm->lsm_object_gr;
1771         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1772
1773         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1774                         OBD_MD_FLSIZE);
1775
1776         /* If filesize is 0, then there would be no objects for mapping */
1777         if (fm_key.oa.o_size == 0) {
1778                 fiemap->fm_mapped_extents = 0;
1779                 RETURN(0);
1780         }
1781
1782         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1783
1784         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1785         if (rc)
1786                 CERROR("obd_get_info failed: rc = %d\n", rc);
1787
1788         RETURN(rc);
1789 }
1790
1791 int ll_fid2path(struct obd_export *exp, void *arg)
1792 {
1793         struct getinfo_fid2path *gfout, *gfin;
1794         int outsize, rc;
1795         ENTRY;
1796
1797         /* Need to get the buflen */
1798         OBD_ALLOC_PTR(gfin);
1799         if (gfin == NULL)
1800                 RETURN(-ENOMEM);
1801         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1802                 OBD_FREE_PTR(gfin);
1803                 RETURN(-EFAULT);
1804         }
1805
1806         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1807         OBD_ALLOC(gfout, outsize);
1808         if (gfout == NULL) {
1809                 OBD_FREE_PTR(gfin);
1810                 RETURN(-ENOMEM);
1811         }
1812         memcpy(gfout, gfin, sizeof(*gfout));
1813         OBD_FREE_PTR(gfin);
1814
1815         /* Call mdc_iocontrol */
1816         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1817         if (rc)
1818                 GOTO(gf_free, rc);
1819         if (copy_to_user(arg, gfout, outsize))
1820                 rc = -EFAULT;
1821
1822 gf_free:
1823         OBD_FREE(gfout, outsize);
1824         RETURN(rc);
1825 }
1826
1827 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1828                   unsigned long arg)
1829 {
1830         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1831         int flags;
1832         ENTRY;
1833
1834         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1835                inode->i_generation, inode, cmd);
1836         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1837
1838         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1839         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1840                 RETURN(-ENOTTY);
1841
1842         switch(cmd) {
1843         case LL_IOC_GETFLAGS:
1844                 /* Get the current value of the file flags */
1845                 return put_user(fd->fd_flags, (int *)arg);
1846         case LL_IOC_SETFLAGS:
1847         case LL_IOC_CLRFLAGS:
1848                 /* Set or clear specific file flags */
1849                 /* XXX This probably needs checks to ensure the flags are
1850                  *     not abused, and to handle any flag side effects.
1851                  */
1852                 if (get_user(flags, (int *) arg))
1853                         RETURN(-EFAULT);
1854
1855                 if (cmd == LL_IOC_SETFLAGS) {
1856                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1857                             !(file->f_flags & O_DIRECT)) {
1858                                 CERROR("%s: unable to disable locking on "
1859                                        "non-O_DIRECT file\n", current->comm);
1860                                 RETURN(-EINVAL);
1861                         }
1862
1863                         fd->fd_flags |= flags;
1864                 } else {
1865                         fd->fd_flags &= ~flags;
1866                 }
1867                 RETURN(0);
1868         case LL_IOC_LOV_SETSTRIPE:
1869                 RETURN(ll_lov_setstripe(inode, file, arg));
1870         case LL_IOC_LOV_SETEA:
1871                 RETURN(ll_lov_setea(inode, file, arg));
1872         case LL_IOC_LOV_GETSTRIPE:
1873                 RETURN(ll_lov_getstripe(inode, arg));
1874         case LL_IOC_RECREATE_OBJ:
1875                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1876         case FSFILT_IOC_FIEMAP: {
1877                 struct ll_user_fiemap *fiemap_s;
1878                 size_t num_bytes, ret_bytes;
1879                 unsigned int extent_count;
1880                 int rc = 0;
1881
1882                 /* Get the extent count so we can calculate the size of
1883                  * required fiemap buffer */
1884                 if (get_user(extent_count,
1885                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1886                         RETURN(-EFAULT);
1887                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1888                                                  sizeof(struct ll_fiemap_extent));
1889                 OBD_VMALLOC(fiemap_s, num_bytes);
1890                 if (fiemap_s == NULL)
1891                         RETURN(-ENOMEM);
1892
1893                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1894                                    sizeof(*fiemap_s)))
1895                         GOTO(error, rc = -EFAULT);
1896
1897                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1898                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1899                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1900                         if (copy_to_user((char *)arg, fiemap_s,
1901                                          sizeof(*fiemap_s)))
1902                                 GOTO(error, rc = -EFAULT);
1903
1904                         GOTO(error, rc = -EBADR);
1905                 }
1906
1907                 /* If fm_extent_count is non-zero, read the first extent since
1908                  * it is used to calculate end_offset and device from previous
1909                  * fiemap call. */
1910                 if (extent_count) {
1911                         if (copy_from_user(&fiemap_s->fm_extents[0],
1912                             (char __user *)arg + sizeof(*fiemap_s),
1913                             sizeof(struct ll_fiemap_extent)))
1914                                 GOTO(error, rc = -EFAULT);
1915                 }
1916
1917                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1918                         int rc;
1919
1920                         rc = filemap_fdatawrite(inode->i_mapping);
1921                         if (rc)
1922                                 GOTO(error, rc);
1923                 }
1924
1925                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1926                 if (rc)
1927                         GOTO(error, rc);
1928
1929                 ret_bytes = sizeof(struct ll_user_fiemap);
1930
1931                 if (extent_count != 0)
1932                         ret_bytes += (fiemap_s->fm_mapped_extents *
1933                                          sizeof(struct ll_fiemap_extent));
1934
1935                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1936                         rc = -EFAULT;
1937
1938 error:
1939                 OBD_VFREE(fiemap_s, num_bytes);
1940                 RETURN(rc);
1941         }
1942         case FSFILT_IOC_GETFLAGS:
1943         case FSFILT_IOC_SETFLAGS:
1944                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1945         case FSFILT_IOC_GETVERSION_OLD:
1946         case FSFILT_IOC_GETVERSION:
1947                 RETURN(put_user(inode->i_generation, (int *)arg));
1948         case LL_IOC_JOIN: {
1949 #if LUSTRE_FIX >= 50
1950                 /* Allow file join in beta builds to allow debuggging */
1951                 char *ftail;
1952                 int rc;
1953
1954                 ftail = getname((const char *)arg);
1955                 if (IS_ERR(ftail))
1956                         RETURN(PTR_ERR(ftail));
1957                 rc = ll_file_join(inode, file, ftail);
1958                 putname(ftail);
1959                 RETURN(rc);
1960 #else
1961                 CWARN("file join is not supported in this version of Lustre\n");
1962                 RETURN(-ENOTTY);
1963 #endif
1964         }
1965         case LL_IOC_GROUP_LOCK:
1966                 RETURN(ll_get_grouplock(inode, file, arg));
1967         case LL_IOC_GROUP_UNLOCK:
1968                 RETURN(ll_put_grouplock(inode, file, arg));
1969         case IOC_OBD_STATFS:
1970                 RETURN(ll_obd_statfs(inode, (void *)arg));
1971
1972         /* We need to special case any other ioctls we want to handle,
1973          * to send them to the MDS/OST as appropriate and to properly
1974          * network encode the arg field.
1975         case FSFILT_IOC_SETVERSION_OLD:
1976         case FSFILT_IOC_SETVERSION:
1977         */
1978         case LL_IOC_FLUSHCTX:
1979                 RETURN(ll_flush_ctx(inode));
1980         case LL_IOC_PATH2FID: {
1981                 if (copy_to_user((void *)arg, ll_inode2fid(inode),
1982                                  sizeof(struct lu_fid)))
1983                         RETURN(-EFAULT);
1984
1985                 RETURN(0);
1986         }
1987         case OBD_IOC_FID2PATH:
1988                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1989
1990         default: {
1991                 int err;
1992
1993                 if (LLIOC_STOP ==
1994                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1995                         RETURN(err);
1996
1997                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1998                                      (void *)arg));
1999         }
2000         }
2001 }
2002
2003 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2004 {
2005         struct inode *inode = file->f_dentry->d_inode;
2006         loff_t retval;
2007         ENTRY;
2008         retval = offset + ((origin == 2) ? i_size_read(inode) :
2009                            (origin == 1) ? file->f_pos : 0);
2010         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2011                inode->i_ino, inode->i_generation, inode, retval, retval,
2012                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2013         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2014
2015         if (origin == 2) { /* SEEK_END */
2016                 int nonblock = 0, rc;
2017
2018                 if (file->f_flags & O_NONBLOCK)
2019                         nonblock = LDLM_FL_BLOCK_NOWAIT;
2020
2021                 rc = cl_glimpse_size(inode);
2022                 if (rc != 0)
2023                         RETURN(rc);
2024
2025                 ll_inode_size_lock(inode, 0);
2026                 offset += i_size_read(inode);
2027                 ll_inode_size_unlock(inode, 0);
2028         } else if (origin == 1) { /* SEEK_CUR */
2029                 offset += file->f_pos;
2030         }
2031
2032         retval = -EINVAL;
2033         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2034                 if (offset != file->f_pos) {
2035                         file->f_pos = offset;
2036                 }
2037                 retval = offset;
2038         }
2039
2040         RETURN(retval);
2041 }
2042
2043 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2044 {
2045         struct inode *inode = dentry->d_inode;
2046         struct ll_inode_info *lli = ll_i2info(inode);
2047         struct lov_stripe_md *lsm = lli->lli_smd;
2048         struct ptlrpc_request *req;
2049         struct obd_capa *oc;
2050         int rc, err;
2051         ENTRY;
2052         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2053                inode->i_generation, inode);
2054         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2055
2056         /* fsync's caller has already called _fdata{sync,write}, we want
2057          * that IO to finish before calling the osc and mdc sync methods */
2058         rc = filemap_fdatawait(inode->i_mapping);
2059
2060         /* catch async errors that were recorded back when async writeback
2061          * failed for pages in this mapping. */
2062         err = lli->lli_async_rc;
2063         lli->lli_async_rc = 0;
2064         if (rc == 0)
2065                 rc = err;
2066         if (lsm) {
2067                 err = lov_test_and_clear_async_rc(lsm);
2068                 if (rc == 0)
2069                         rc = err;
2070         }
2071
2072         oc = ll_mdscapa_get(inode);
2073         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2074                       &req);
2075         capa_put(oc);
2076         if (!rc)
2077                 rc = err;
2078         if (!err)
2079                 ptlrpc_req_finished(req);
2080
2081         if (data && lsm) {
2082                 struct obdo *oa;
2083
2084                 OBDO_ALLOC(oa);
2085                 if (!oa)
2086                         RETURN(rc ? rc : -ENOMEM);
2087
2088                 oa->o_id = lsm->lsm_object_id;
2089                 oa->o_gr = lsm->lsm_object_gr;
2090                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2091                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2092                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2093                                            OBD_MD_FLGROUP);
2094
2095                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2096                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2097                                0, OBD_OBJECT_EOF, oc);
2098                 capa_put(oc);
2099                 if (!rc)
2100                         rc = err;
2101                 OBDO_FREE(oa);
2102         }
2103
2104         RETURN(rc);
2105 }
2106
2107 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2108 {
2109         struct inode *inode = file->f_dentry->d_inode;
2110         struct ll_sb_info *sbi = ll_i2sbi(inode);
2111         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2112                                            .ei_cb_cp =ldlm_flock_completion_ast,
2113                                            .ei_cbdata = file_lock };
2114         struct md_op_data *op_data;
2115         struct lustre_handle lockh = {0};
2116         ldlm_policy_data_t flock;
2117         int flags = 0;
2118         int rc;
2119         ENTRY;
2120
2121         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2122                inode->i_ino, file_lock);
2123
2124         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2125
2126         if (file_lock->fl_flags & FL_FLOCK) {
2127                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2128                 /* set missing params for flock() calls */
2129                 file_lock->fl_end = OFFSET_MAX;
2130                 file_lock->fl_pid = current->tgid;
2131         }
2132         flock.l_flock.pid = file_lock->fl_pid;
2133         flock.l_flock.start = file_lock->fl_start;
2134         flock.l_flock.end = file_lock->fl_end;
2135
2136         switch (file_lock->fl_type) {
2137         case F_RDLCK:
2138                 einfo.ei_mode = LCK_PR;
2139                 break;
2140         case F_UNLCK:
2141                 /* An unlock request may or may not have any relation to
2142                  * existing locks so we may not be able to pass a lock handle
2143                  * via a normal ldlm_lock_cancel() request. The request may even
2144                  * unlock a byte range in the middle of an existing lock. In
2145                  * order to process an unlock request we need all of the same
2146                  * information that is given with a normal read or write record
2147                  * lock request. To avoid creating another ldlm unlock (cancel)
2148                  * message we'll treat a LCK_NL flock request as an unlock. */
2149                 einfo.ei_mode = LCK_NL;
2150                 break;
2151         case F_WRLCK:
2152                 einfo.ei_mode = LCK_PW;
2153                 break;
2154         default:
2155                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2156                 RETURN (-EINVAL);
2157         }
2158
2159         switch (cmd) {
2160         case F_SETLKW:
2161 #ifdef F_SETLKW64
2162         case F_SETLKW64:
2163 #endif
2164                 flags = 0;
2165                 break;
2166         case F_SETLK:
2167 #ifdef F_SETLK64
2168         case F_SETLK64:
2169 #endif
2170                 flags = LDLM_FL_BLOCK_NOWAIT;
2171                 break;
2172         case F_GETLK:
2173 #ifdef F_GETLK64
2174         case F_GETLK64:
2175 #endif
2176                 flags = LDLM_FL_TEST_LOCK;
2177                 /* Save the old mode so that if the mode in the lock changes we
2178                  * can decrement the appropriate reader or writer refcount. */
2179                 file_lock->fl_type = einfo.ei_mode;
2180                 break;
2181         default:
2182                 CERROR("unknown fcntl lock command: %d\n", cmd);
2183                 RETURN (-EINVAL);
2184         }
2185
2186         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2187                                      LUSTRE_OPC_ANY, NULL);
2188         if (IS_ERR(op_data))
2189                 RETURN(PTR_ERR(op_data));
2190
2191         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2192                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2193                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2194
2195         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2196                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2197
2198         ll_finish_md_op_data(op_data);
2199
2200         if ((file_lock->fl_flags & FL_FLOCK) &&
2201             (rc == 0 || file_lock->fl_type == F_UNLCK))
2202                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2203 #ifdef HAVE_F_OP_FLOCK
2204         if ((file_lock->fl_flags & FL_POSIX) &&
2205             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2206             !(flags & LDLM_FL_TEST_LOCK))
2207                 posix_lock_file_wait(file, file_lock);
2208 #endif
2209
2210         RETURN(rc);
2211 }
2212
2213 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2214 {
2215         ENTRY;
2216
2217         RETURN(-ENOSYS);
2218 }
2219
2220 int ll_have_md_lock(struct inode *inode, __u64 bits)
2221 {
2222         struct lustre_handle lockh;
2223         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2224         struct lu_fid *fid;
2225         int flags;
2226         ENTRY;
2227
2228         if (!inode)
2229                RETURN(0);
2230
2231         fid = &ll_i2info(inode)->lli_fid;
2232         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2233
2234         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2235         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2236                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2237                 RETURN(1);
2238         }
2239         RETURN(0);
2240 }
2241
2242 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2243                             struct lustre_handle *lockh)
2244 {
2245         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2246         struct lu_fid *fid;
2247         ldlm_mode_t rc;
2248         int flags;
2249         ENTRY;
2250
2251         fid = &ll_i2info(inode)->lli_fid;
2252         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2253
2254         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2255         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2256                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2257         RETURN(rc);
2258 }
2259
2260 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2261         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2262                               * and return success */
2263                 inode->i_nlink = 0;
2264                 /* This path cannot be hit for regular files unless in
2265                  * case of obscure races, so no need to to validate
2266                  * size. */
2267                 if (!S_ISREG(inode->i_mode) &&
2268                     !S_ISDIR(inode->i_mode))
2269                         return 0;
2270         }
2271
2272         if (rc) {
2273                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2274                 return -abs(rc);
2275
2276         }
2277
2278         return 0;
2279 }
2280
2281 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2282                              __u64 ibits)
2283 {
2284         struct inode *inode = dentry->d_inode;
2285         struct ptlrpc_request *req = NULL;
2286         struct ll_sb_info *sbi;
2287         struct obd_export *exp;
2288         int rc = 0;
2289         ENTRY;
2290
2291         if (!inode) {
2292                 CERROR("REPORT THIS LINE TO PETER\n");
2293                 RETURN(0);
2294         }
2295         sbi = ll_i2sbi(inode);
2296
2297         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2298                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2299
2300         exp = ll_i2mdexp(inode);
2301
2302         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2303                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2304                 struct md_op_data *op_data;
2305
2306                 /* Call getattr by fid, so do not provide name at all. */
2307                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2308                                              dentry->d_inode, NULL, 0, 0,
2309                                              LUSTRE_OPC_ANY, NULL);
2310                 if (IS_ERR(op_data))
2311                         RETURN(PTR_ERR(op_data));
2312
2313                 oit.it_create_mode |= M_CHECK_STALE;
2314                 rc = md_intent_lock(exp, op_data, NULL, 0,
2315                                     /* we are not interested in name
2316                                        based lookup */
2317                                     &oit, 0, &req,
2318                                     ll_md_blocking_ast, 0);
2319                 ll_finish_md_op_data(op_data);
2320                 oit.it_create_mode &= ~M_CHECK_STALE;
2321                 if (rc < 0) {
2322                         rc = ll_inode_revalidate_fini(inode, rc);
2323                         GOTO (out, rc);
2324                 }
2325
2326                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2327                 if (rc != 0) {
2328                         ll_intent_release(&oit);
2329                         GOTO(out, rc);
2330                 }
2331
2332                 /* Unlinked? Unhash dentry, so it is not picked up later by
2333                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2334                    here to preserve get_cwd functionality on 2.6.
2335                    Bug 10503 */
2336                 if (!dentry->d_inode->i_nlink) {
2337                         spin_lock(&ll_lookup_lock);
2338                         spin_lock(&dcache_lock);
2339                         ll_drop_dentry(dentry);
2340                         spin_unlock(&dcache_lock);
2341                         spin_unlock(&ll_lookup_lock);
2342                 }
2343
2344                 ll_lookup_finish_locks(&oit, dentry);
2345         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2346
2347                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2348                 obd_valid valid = OBD_MD_FLGETATTR;
2349                 struct obd_capa *oc;
2350                 int ealen = 0;
2351
2352                 if (S_ISREG(inode->i_mode)) {
2353                         rc = ll_get_max_mdsize(sbi, &ealen);
2354                         if (rc)
2355                                 RETURN(rc);
2356                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2357                 }
2358                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2359                  * capa for this inode. Because we only keep capas of dirs
2360                  * fresh. */
2361                 oc = ll_mdscapa_get(inode);
2362                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2363                                 ealen, &req);
2364                 capa_put(oc);
2365                 if (rc) {
2366                         rc = ll_inode_revalidate_fini(inode, rc);
2367                         RETURN(rc);
2368                 }
2369
2370                 rc = ll_prep_inode(&inode, req, NULL);
2371         }
2372 out:
2373         ptlrpc_req_finished(req);
2374         return rc;
2375 }
2376
2377 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2378 {
2379         int rc;
2380         ENTRY;
2381
2382         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2383                                                   MDS_INODELOCK_LOOKUP);
2384
2385         /* if object not yet allocated, don't validate size */
2386         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2387                 RETURN(0);
2388
2389         /* cl_glimpse_size will prefer locally cached writes if they extend
2390          * the file */
2391
2392         if (rc == 0)
2393                 rc = cl_glimpse_size(dentry->d_inode);
2394
2395         RETURN(rc);
2396 }
2397
2398 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2399                   struct lookup_intent *it, struct kstat *stat)
2400 {
2401         struct inode *inode = de->d_inode;
2402         int res = 0;
2403
2404         res = ll_inode_revalidate_it(de, it);
2405         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2406
2407         if (res)
2408                 return res;
2409
2410         stat->dev = inode->i_sb->s_dev;
2411         stat->ino = inode->i_ino;
2412         stat->mode = inode->i_mode;
2413         stat->nlink = inode->i_nlink;
2414         stat->uid = inode->i_uid;
2415         stat->gid = inode->i_gid;
2416         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2417         stat->atime = inode->i_atime;
2418         stat->mtime = inode->i_mtime;
2419         stat->ctime = inode->i_ctime;
2420 #ifdef HAVE_INODE_BLKSIZE
2421         stat->blksize = inode->i_blksize;
2422 #else
2423         stat->blksize = 1 << inode->i_blkbits;
2424 #endif
2425
2426         ll_inode_size_lock(inode, 0);
2427         stat->size = i_size_read(inode);
2428         stat->blocks = inode->i_blocks;
2429         ll_inode_size_unlock(inode, 0);
2430
2431         return 0;
2432 }
2433 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2434 {
2435         struct lookup_intent it = { .it_op = IT_GETATTR };
2436
2437         return ll_getattr_it(mnt, de, &it, stat);
2438 }
2439
2440 static
2441 int lustre_check_acl(struct inode *inode, int mask)
2442 {
2443 #ifdef CONFIG_FS_POSIX_ACL
2444         struct ll_inode_info *lli = ll_i2info(inode);
2445         struct posix_acl *acl;
2446         int rc;
2447         ENTRY;
2448
2449         spin_lock(&lli->lli_lock);
2450         acl = posix_acl_dup(lli->lli_posix_acl);
2451         spin_unlock(&lli->lli_lock);
2452
2453         if (!acl)
2454                 RETURN(-EAGAIN);
2455
2456         rc = posix_acl_permission(inode, acl, mask);
2457         posix_acl_release(acl);
2458
2459         RETURN(rc);
2460 #else
2461         return -EAGAIN;
2462 #endif
2463 }
2464
2465 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2466 #ifndef HAVE_INODE_PERMISION_2ARGS
2467 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2468 #else
2469 int ll_inode_permission(struct inode *inode, int mask)
2470 #endif
2471 {
2472         int rc = 0;
2473         ENTRY;
2474
2475        /* as root inode are NOT getting validated in lookup operation,
2476         * need to do it before permission check. */
2477
2478         if (inode == inode->i_sb->s_root->d_inode) {
2479                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2480
2481                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2482                                               MDS_INODELOCK_LOOKUP);
2483                 if (rc)
2484                         RETURN(rc);
2485         }
2486
2487         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2488                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2489
2490         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2491                 return lustre_check_remote_perm(inode, mask);
2492
2493         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2494         rc = generic_permission(inode, mask, lustre_check_acl);
2495
2496         RETURN(rc);
2497 }
2498 #else
2499 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2500 {
2501         int mode = inode->i_mode;
2502         int rc;
2503
2504         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2505                inode->i_ino, inode->i_generation, inode, mask);
2506
2507         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2508                 return lustre_check_remote_perm(inode, mask);
2509
2510         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2511
2512         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2513             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2514                 return -EROFS;
2515         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2516                 return -EACCES;
2517         if (current->fsuid == inode->i_uid) {
2518                 mode >>= 6;
2519         } else if (1) {
2520                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2521                         goto check_groups;
2522                 rc = lustre_check_acl(inode, mask);
2523                 if (rc == -EAGAIN)
2524                         goto check_groups;
2525                 if (rc == -EACCES)
2526                         goto check_capabilities;
2527                 return rc;
2528         } else {
2529 check_groups:
2530                 if (in_group_p(inode->i_gid))
2531                         mode >>= 3;
2532         }
2533         if ((mode & mask & S_IRWXO) == mask)
2534                 return 0;
2535
2536 check_capabilities:
2537         if (!(mask & MAY_EXEC) ||
2538             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2539                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2540                         return 0;
2541
2542         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2543             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2544                 return 0;
2545
2546         return -EACCES;
2547 }
2548 #endif
2549
/*
 * READ_METHOD/WRITE_METHOD name the file_operations members used for
 * vectored I/O and READ_FUNCTION/WRITE_FUNCTION the handlers assigned to
 * them: readv/writev when HAVE_FILE_READV is defined, aio_read/aio_write
 * otherwise.
 */
#ifndef HAVE_FILE_READV
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write
#else
#define READ_METHOD readv
#define READ_FUNCTION ll_file_readv
#define WRITE_METHOD writev
#define WRITE_FUNCTION ll_file_writev
#endif
2561
2562 /* -o localflock - only provides locally consistent flock locks */
2563 struct file_operations ll_file_operations = {
2564         .read           = ll_file_read,
2565         .READ_METHOD    = READ_FUNCTION,
2566         .write          = ll_file_write,
2567         .WRITE_METHOD   = WRITE_FUNCTION,
2568         .ioctl          = ll_file_ioctl,
2569         .open           = ll_file_open,
2570         .release        = ll_file_release,
2571         .mmap           = ll_file_mmap,
2572         .llseek         = ll_file_seek,
2573 #ifdef HAVE_KERNEL_SENDFILE
2574         .sendfile       = ll_file_sendfile,
2575 #endif
2576 #ifdef HAVE_KERNEL_SPLICE_READ
2577         .splice_read    = ll_file_splice_read,
2578 #endif
2579         .fsync          = ll_fsync,
2580 };
2581
2582 struct file_operations ll_file_operations_flock = {
2583         .read           = ll_file_read,
2584         .READ_METHOD    = READ_FUNCTION,
2585         .write          = ll_file_write,
2586         .WRITE_METHOD   = WRITE_FUNCTION,
2587         .ioctl          = ll_file_ioctl,
2588         .open           = ll_file_open,
2589         .release        = ll_file_release,
2590         .mmap           = ll_file_mmap,
2591         .llseek         = ll_file_seek,
2592 #ifdef HAVE_KERNEL_SENDFILE
2593         .sendfile       = ll_file_sendfile,
2594 #endif
2595 #ifdef HAVE_KERNEL_SPLICE_READ
2596         .splice_read    = ll_file_splice_read,
2597 #endif
2598         .fsync          = ll_fsync,
2599 #ifdef HAVE_F_OP_FLOCK
2600         .flock          = ll_file_flock,
2601 #endif
2602         .lock           = ll_file_flock
2603 };
2604
2605 /* These are for -o noflock - to return ENOSYS on flock calls */
2606 struct file_operations ll_file_operations_noflock = {
2607         .read           = ll_file_read,
2608         .READ_METHOD    = READ_FUNCTION,
2609         .write          = ll_file_write,
2610         .WRITE_METHOD   = WRITE_FUNCTION,
2611         .ioctl          = ll_file_ioctl,
2612         .open           = ll_file_open,
2613         .release        = ll_file_release,
2614         .mmap           = ll_file_mmap,
2615         .llseek         = ll_file_seek,
2616 #ifdef HAVE_KERNEL_SENDFILE
2617         .sendfile       = ll_file_sendfile,
2618 #endif
2619 #ifdef HAVE_KERNEL_SPLICE_READ
2620         .splice_read    = ll_file_splice_read,
2621 #endif
2622         .fsync          = ll_fsync,
2623 #ifdef HAVE_F_OP_FLOCK
2624         .flock          = ll_file_noflock,
2625 #endif
2626         .lock           = ll_file_noflock
2627 };
2628
2629 struct inode_operations ll_file_inode_operations = {
2630 #ifdef HAVE_VFS_INTENT_PATCHES
2631         .setattr_raw    = ll_setattr_raw,
2632 #endif
2633         .setattr        = ll_setattr,
2634         .truncate       = ll_truncate,
2635         .getattr        = ll_getattr,
2636         .permission     = ll_inode_permission,
2637         .setxattr       = ll_setxattr,
2638         .getxattr       = ll_getxattr,
2639         .listxattr      = ll_listxattr,
2640         .removexattr    = ll_removexattr,
2641 };
2642
2643 /* dynamic ioctl number support routins */
2644 static struct llioc_ctl_data {
2645         struct rw_semaphore ioc_sem;
2646         struct list_head    ioc_head;
2647 } llioc = {
2648         __RWSEM_INITIALIZER(llioc.ioc_sem),
2649         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2650 };
2651
2652
2653 struct llioc_data {
2654         struct list_head        iocd_list;
2655         unsigned int            iocd_size;
2656         llioc_callback_t        iocd_cb;
2657         unsigned int            iocd_count;
2658         unsigned int            iocd_cmd[0];
2659 };
2660
2661 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2662 {
2663         unsigned int size;
2664         struct llioc_data *in_data = NULL;
2665         ENTRY;
2666
2667         if (cb == NULL || cmd == NULL ||
2668             count > LLIOC_MAX_CMD || count < 0)
2669                 RETURN(NULL);
2670
2671         size = sizeof(*in_data) + count * sizeof(unsigned int);
2672         OBD_ALLOC(in_data, size);
2673         if (in_data == NULL)
2674                 RETURN(NULL);
2675
2676         memset(in_data, 0, sizeof(*in_data));
2677         in_data->iocd_size = size;
2678         in_data->iocd_cb = cb;
2679         in_data->iocd_count = count;
2680         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2681
2682         down_write(&llioc.ioc_sem);
2683         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2684         up_write(&llioc.ioc_sem);
2685
2686         RETURN(in_data);
2687 }
2688
2689 void ll_iocontrol_unregister(void *magic)
2690 {
2691         struct llioc_data *tmp;
2692
2693         if (magic == NULL)
2694                 return;
2695
2696         down_write(&llioc.ioc_sem);
2697         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2698                 if (tmp == magic) {
2699                         unsigned int size = tmp->iocd_size;
2700
2701                         list_del(&tmp->iocd_list);
2702                         up_write(&llioc.ioc_sem);
2703
2704                         OBD_FREE(tmp, size);
2705                         return;
2706                 }
2707         }
2708         up_write(&llioc.ioc_sem);
2709
2710         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2711 }
2712
/* export the dynamic ioctl registration interface */
EXPORT_SYMBOL(ll_iocontrol_unregister);
EXPORT_SYMBOL(ll_iocontrol_register);
2715
2716 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2717                         unsigned int cmd, unsigned long arg, int *rcp)
2718 {
2719         enum llioc_iter ret = LLIOC_CONT;
2720         struct llioc_data *data;
2721         int rc = -EINVAL, i;
2722
2723         down_read(&llioc.ioc_sem);
2724         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2725                 for (i = 0; i < data->iocd_count; i++) {
2726                         if (cmd != data->iocd_cmd[i])
2727                                 continue;
2728
2729                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2730                         break;
2731                 }
2732
2733                 if (ret == LLIOC_STOP)
2734                         break;
2735         }
2736         up_read(&llioc.ioc_sem);
2737
2738         if (rcp)
2739                 *rcp = rc;
2740         return ret;
2741 }