Whamcloud - gitweb
b=18144
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(exp_connect_som(ll_i2mdexp(inode))) || !S_ISREG(inode->i_mode))
96                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
97         else
98                 ll_epoch_close(inode, op_data, &och, 0);
99
100 out:
101         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
102         EXIT;
103 }
104
105 static int ll_close_inode_openhandle(struct obd_export *md_exp,
106                                      struct inode *inode,
107                                      struct obd_client_handle *och)
108 {
109         struct obd_export *exp = ll_i2mdexp(inode);
110         struct md_op_data *op_data;
111         struct ptlrpc_request *req = NULL;
112         struct obd_device *obd = class_exp2obd(exp);
113         int epoch_close = 1;
114         int rc;
115         ENTRY;
116
117         if (obd == NULL) {
118                 /*
119                  * XXX: in case of LMV, is this correct to access
120                  * ->exp_handle?
121                  */
122                 CERROR("Invalid MDC connection handle "LPX64"\n",
123                        ll_i2mdexp(inode)->exp_handle.h_cookie);
124                 GOTO(out, rc = 0);
125         }
126
127         /*
128          * here we check if this is forced umount. If so this is called on
129          * canceling "open lock" and we do not call md_close() in this case, as
130          * it will not be successful, as import is already deactivated.
131          */
132         if (obd->obd_force)
133                 GOTO(out, rc = 0);
134
135         OBD_ALLOC_PTR(op_data);
136         if (op_data == NULL)
137                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
138
139         ll_prepare_close(inode, op_data, och);
140         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
141         rc = md_close(md_exp, op_data, och->och_mod, &req);
142         if (rc == -EAGAIN) {
143                 /* This close must have the epoch closed. */
144                 LASSERT(epoch_close);
145                 /* MDS has instructed us to obtain Size-on-MDS attribute from
146                  * OSTs and send setattr to back to MDS. */
147                 rc = ll_sizeonmds_update(inode, &och->och_fh,
148                                          op_data->op_ioepoch);
149                 if (rc) {
150                         CERROR("inode %lu mdc Size-on-MDS update failed: "
151                                "rc = %d\n", inode->i_ino, rc);
152                         rc = 0;
153                 }
154         } else if (rc) {
155                 CERROR("inode %lu mdc close failed: rc = %d\n",
156                        inode->i_ino, rc);
157         }
158         ll_finish_md_op_data(op_data);
159
160         if (rc == 0) {
161                 rc = ll_objects_destroy(req, inode);
162                 if (rc)
163                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
164                                inode->i_ino, rc);
165         }
166
167         EXIT;
168 out:
169
170         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
171             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
172                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
173         } else {
174                 md_clear_open_replay_data(md_exp, och);
175                 /* Free @och if it is not waiting for DONE_WRITING. */
176                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
177                 OBD_FREE_PTR(och);
178         }
179         if (req) /* This is close request */
180                 ptlrpc_req_finished(req);
181         return rc;
182 }
183
184 int ll_md_real_close(struct inode *inode, int flags)
185 {
186         struct ll_inode_info *lli = ll_i2info(inode);
187         struct obd_client_handle **och_p;
188         struct obd_client_handle *och;
189         __u64 *och_usecount;
190         int rc = 0;
191         ENTRY;
192
193         if (flags & FMODE_WRITE) {
194                 och_p = &lli->lli_mds_write_och;
195                 och_usecount = &lli->lli_open_fd_write_count;
196         } else if (flags & FMODE_EXEC) {
197                 och_p = &lli->lli_mds_exec_och;
198                 och_usecount = &lli->lli_open_fd_exec_count;
199         } else {
200                 LASSERT(flags & FMODE_READ);
201                 och_p = &lli->lli_mds_read_och;
202                 och_usecount = &lli->lli_open_fd_read_count;
203         }
204
205         down(&lli->lli_och_sem);
206         if (*och_usecount) { /* There are still users of this handle, so
207                                 skip freeing it. */
208                 up(&lli->lli_och_sem);
209                 RETURN(0);
210         }
211         och=*och_p;
212         *och_p = NULL;
213         up(&lli->lli_och_sem);
214
215         if (och) { /* There might be a race and somebody have freed this och
216                       already */
217                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
218                                                inode, och);
219         }
220
221         RETURN(rc);
222 }
223
224 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
225                 struct file *file)
226 {
227         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
228         struct ll_inode_info *lli = ll_i2info(inode);
229         int rc = 0;
230         ENTRY;
231
232         /* clear group lock, if present */
233         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
234                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
235
236         /* Let's see if we have good enough OPEN lock on the file and if
237            we can skip talking to MDS */
238         if (file->f_dentry->d_inode) { /* Can this ever be false? */
239                 int lockmode;
240                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
241                 struct lustre_handle lockh;
242                 struct inode *inode = file->f_dentry->d_inode;
243                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
244
245                 down(&lli->lli_och_sem);
246                 if (fd->fd_omode & FMODE_WRITE) {
247                         lockmode = LCK_CW;
248                         LASSERT(lli->lli_open_fd_write_count);
249                         lli->lli_open_fd_write_count--;
250                 } else if (fd->fd_omode & FMODE_EXEC) {
251                         lockmode = LCK_PR;
252                         LASSERT(lli->lli_open_fd_exec_count);
253                         lli->lli_open_fd_exec_count--;
254                 } else {
255                         lockmode = LCK_CR;
256                         LASSERT(lli->lli_open_fd_read_count);
257                         lli->lli_open_fd_read_count--;
258                 }
259                 up(&lli->lli_och_sem);
260
261                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
262                                    LDLM_IBITS, &policy, lockmode,
263                                    &lockh)) {
264                         rc = ll_md_real_close(file->f_dentry->d_inode,
265                                               fd->fd_omode);
266                 }
267         } else {
268                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
269                        file, file->f_dentry, file->f_dentry->d_name.name);
270         }
271
272         LUSTRE_FPRIVATE(file) = NULL;
273         ll_file_data_put(fd);
274         ll_capa_close(inode);
275
276         RETURN(rc);
277 }
278
279 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
280
281 /* While this returns an error code, fput() the caller does not, so we need
282  * to make every effort to clean up all of our state here.  Also, applications
283  * rarely check close errors and even if an error is returned they will not
284  * re-try the close call.
285  */
286 int ll_file_release(struct inode *inode, struct file *file)
287 {
288         struct ll_file_data *fd;
289         struct ll_sb_info *sbi = ll_i2sbi(inode);
290         struct ll_inode_info *lli = ll_i2info(inode);
291         struct lov_stripe_md *lsm = lli->lli_smd;
292         int rc;
293         ENTRY;
294
295         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
296                inode->i_generation, inode);
297
298 #ifdef CONFIG_FS_POSIX_ACL
299         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
300             inode == inode->i_sb->s_root->d_inode) {
301                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
302
303                 LASSERT(fd != NULL);
304                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
305                         fd->fd_flags &= ~LL_FILE_RMTACL;
306                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
307                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
308                 }
309         }
310 #endif
311
312         if (inode->i_sb->s_root != file->f_dentry)
313                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
314         fd = LUSTRE_FPRIVATE(file);
315         LASSERT(fd != NULL);
316
317         /* The last ref on @file, maybe not the the owner pid of statahead.
318          * Different processes can open the same dir, "ll_opendir_key" means:
319          * it is me that should stop the statahead thread. */
320         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
321                 ll_stop_statahead(inode, lli->lli_opendir_key);
322
323         if (inode->i_sb->s_root == file->f_dentry) {
324                 LUSTRE_FPRIVATE(file) = NULL;
325                 ll_file_data_put(fd);
326                 RETURN(0);
327         }
328
329         if (lsm)
330                 lov_test_and_clear_async_rc(lsm);
331         lli->lli_async_rc = 0;
332
333         rc = ll_md_close(sbi->ll_md_exp, inode, file);
334         RETURN(rc);
335 }
336
337 static int ll_intent_file_open(struct file *file, void *lmm,
338                                int lmmsize, struct lookup_intent *itp)
339 {
340         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
341         struct dentry *parent = file->f_dentry->d_parent;
342         const char *name = file->f_dentry->d_name.name;
343         const int len = file->f_dentry->d_name.len;
344         struct md_op_data *op_data;
345         struct ptlrpc_request *req;
346         int rc;
347         ENTRY;
348
349         if (!parent)
350                 RETURN(-ENOENT);
351
352         /* Usually we come here only for NFSD, and we want open lock.
353            But we can also get here with pre 2.6.15 patchless kernels, and in
354            that case that lock is also ok */
355         /* We can also get here if there was cached open handle in revalidate_it
356          * but it disappeared while we were getting from there to ll_file_open.
357          * But this means this file was closed and immediatelly opened which
358          * makes a good candidate for using OPEN lock */
359         /* If lmmsize & lmm are not 0, we are just setting stripe info
360          * parameters. No need for the open lock */
361         if (!lmm && !lmmsize)
362                 itp->it_flags |= MDS_OPEN_LOCK;
363
364         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
365                                       file->f_dentry->d_inode, name, len,
366                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
367         if (IS_ERR(op_data))
368                 RETURN(PTR_ERR(op_data));
369
370         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
371                             0 /*unused */, &req, ll_md_blocking_ast, 0);
372         ll_finish_md_op_data(op_data);
373         if (rc == -ESTALE) {
374                 /* reason for keep own exit path - don`t flood log
375                 * with messages with -ESTALE errors.
376                 */
377                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
378                      it_open_error(DISP_OPEN_OPEN, itp))
379                         GOTO(out, rc);
380                 ll_release_openhandle(file->f_dentry, itp);
381                 GOTO(out, rc);
382         }
383
384         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
385                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
386                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
387                 GOTO(out, rc);
388         }
389
390         if (itp->d.lustre.it_lock_mode)
391                 md_set_lock_data(sbi->ll_md_exp,
392                                  &itp->d.lustre.it_lock_handle,
393                                  file->f_dentry->d_inode);
394
395         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
396 out:
397         ptlrpc_req_finished(itp->d.lustre.it_data);
398         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
399         ll_intent_drop_lock(itp);
400
401         RETURN(rc);
402 }
403
404 void ll_ioepoch_open(struct ll_inode_info *lli, __u64 ioepoch)
405 {
406         if (ioepoch && lli->lli_ioepoch != ioepoch) {
407                 lli->lli_ioepoch = ioepoch;
408                 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
409                        ioepoch, PFID(&lli->lli_fid));
410         }
411 }
412
413 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
414                        struct lookup_intent *it, struct obd_client_handle *och)
415 {
416         struct ptlrpc_request *req = it->d.lustre.it_data;
417         struct mdt_body *body;
418
419         LASSERT(och);
420
421         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
422         LASSERT(body != NULL);                      /* reply already checked out */
423
424         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
425         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
426         och->och_fid = lli->lli_fid;
427         och->och_flags = it->it_flags;
428         ll_ioepoch_open(lli, body->ioepoch);
429
430         return md_set_open_replay_data(md_exp, och, req);
431 }
432
433 int ll_local_open(struct file *file, struct lookup_intent *it,
434                   struct ll_file_data *fd, struct obd_client_handle *och)
435 {
436         struct inode *inode = file->f_dentry->d_inode;
437         struct ll_inode_info *lli = ll_i2info(inode);
438         ENTRY;
439
440         LASSERT(!LUSTRE_FPRIVATE(file));
441
442         LASSERT(fd != NULL);
443
444         if (och) {
445                 struct ptlrpc_request *req = it->d.lustre.it_data;
446                 struct mdt_body *body;
447                 int rc;
448
449                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
450                 if (rc)
451                         RETURN(rc);
452
453                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
454                 if ((it->it_flags & FMODE_WRITE) &&
455                     (body->valid & OBD_MD_FLSIZE))
456                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
457                                lli->lli_ioepoch, PFID(&lli->lli_fid));
458         }
459
460         LUSTRE_FPRIVATE(file) = fd;
461         ll_readahead_init(inode, &fd->fd_ras);
462         fd->fd_omode = it->it_flags;
463         RETURN(0);
464 }
465
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.  We grab
 * lli_open_sem to ensure no other process will create objects, send the
 * stripe MD to the MDS, or try to destroy the objects if that fails.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * NOTE(review): the text above refers to lli_open_sem, but the semaphore
 * actually taken in this function is lli_och_sem -- confirm whether the
 * comment is stale.
 *
 * Outline of the code below:
 *  1. Allocate the per-file ll_file_data (-ENOMEM on failure).
 *  2. For a directory, register this open as the statahead owner when no
 *     owner is recorded; if the recorded owner has the current pid, stop
 *     that statahead and retry.
 *  3. The filesystem root dentry only gets the ll_file_data attached.
 *  4. If no intent with a disposition was passed in, build a local one
 *     ("oit") from file->f_flags.
 *  5. Under lli_och_sem, either reuse the MDS open handle already cached on
 *     the inode for this access mode, or obtain one: send an intent open
 *     (with the semaphore dropped) and restart, then allocate and fill a
 *     new handle.
 *  6. For regular files call ll_capa_open() and handle O_LOV_DELAY_CREATE.
 *
 * Returns 0 on success or a negative errno.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct lov_stripe_md *lsm;
        struct ptlrpc_request *req = NULL;
        struct obd_client_handle **och_p;
        __u64 *och_usecount;
        struct ll_file_data *fd;
        int rc = 0, opendir_set = 0;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
               inode->i_generation, inode, file->f_flags);

        /* Pick up the intent handed over by the lookup/revalidate path. */
#ifdef HAVE_VFS_INTENT_PATCHES
        it = file->f_it;
#else
        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */
#endif

        fd = ll_file_data_get();
        if (fd == NULL)
                RETURN(-ENOMEM);

        fd->fd_file = file;
        if (S_ISDIR(inode->i_mode)) {
again:
                spin_lock(&lli->lli_lock);
                if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
                        /* No statahead owner yet: this open becomes it. */
                        LASSERT(lli->lli_sai == NULL);
                        lli->lli_opendir_key = fd;
                        lli->lli_opendir_pid = cfs_curproc_pid();
                        opendir_set = 1;
                } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
                                    lli->lli_opendir_key != NULL)) {
                        /* Two cases for this:
                         * (1) The same process open such directory many times.
                         * (2) The old process opened the directory, and exited
                         *     before its children processes. Then new process
                         *     with the same pid opens such directory before the
                         *     old process's children processes exit.
                         * reset stat ahead for such cases. */
                        spin_unlock(&lli->lli_lock);
                        CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
                               " reset it.\n", file->f_dentry->d_name.len,
                               file->f_dentry->d_name.name,
                               PFID(&lli->lli_fid));
                        ll_stop_statahead(inode, lli->lli_opendir_key);
                        goto again;
                }
                spin_unlock(&lli->lli_lock);
        }

        /* The root dentry needs no MDS open handle: attach fd and finish. */
        if (inode->i_sb->s_root == file->f_dentry) {
                LUSTRE_FPRIVATE(file) = fd;
                RETURN(0);
        }

        if (!it || !it->d.lustre.it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but O_ACCMODE mask was stripped from
                 * there */
                if ((oit.it_flags + 1) & O_ACCMODE)
                        oit.it_flags++;
                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* kernel only call f_op->open in dentry_open.  filp_open calls
                 * dentry_open after call to open_namei that checks permissions.
                 * Only nfsd_open call dentry_open directly without checking
                 * permissions and because of that this code below is safe. */
                if (oit.it_flags & FMODE_WRITE)
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                it = &oit;
        }

restart:
        /* Let's see if we have file open on MDS already. */
        /* Select the cached handle slot and its user counter by access mode:
         * write first, then exec, otherwise read. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
         } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        down(&lli->lli_och_sem);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's extra open request that we do not need,
                           let's close it somehow. This will decref request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                        if (rc) {
                                /* Semaphore and fd are released here because
                                 * out_openerr does neither. */
                                up(&lli->lli_och_sem);
                                ll_file_data_put(fd);
                                GOTO(out_openerr, rc);
                        }
                        ll_release_openhandle(file->f_dentry, it);
                        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
                                             LPROC_LL_OPEN);
                }
                (*och_usecount)++;

                /* NULL och: reuse the cached handle, nothing to fill in. */
                rc = ll_local_open(file, it, fd, NULL);
                if (rc) {
                        (*och_usecount)--;
                        up(&lli->lli_och_sem);
                        ll_file_data_put(fd);
                        GOTO(out_openerr, rc);
                }
        } else {
                LASSERT(*och_usecount == 0);
                if (!it->d.lustre.it_disposition) {
                        /* We cannot just request lock handle now, new ELC code
                           means that one of other OPEN locks for this file
                           could be cancelled, and since blocking ast handler
                           would attempt to grab och_sem as well, that would
                           result in a deadlock */
                        up(&lli->lli_och_sem);
                        it->it_create_mode |= M_CHECK_STALE;
                        rc = ll_intent_file_open(file, NULL, 0, it);
                        it->it_create_mode &= ~M_CHECK_STALE;
                        if (rc) {
                                ll_file_data_put(fd);
                                GOTO(out_openerr, rc);
                        }

                        /* Got some error? Release the request */
                        if (it->d.lustre.it_status < 0) {
                                req = it->d.lustre.it_data;
                                ptlrpc_req_finished(req);
                        }
                        md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
                                         &it->d.lustre.it_lock_handle,
                                         file->f_dentry->d_inode);
                        /* The semaphore was dropped, so the cached handle
                         * state has to be looked at again from the top. */
                        goto restart;
                }
                OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
                if (!*och_p) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc = -ENOMEM);
                }
                (*och_usecount)++;
                req = it->d.lustre.it_data;

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here
                 * (bug 3430) */
                /* XXX (green): Should not we bail out on any error here, not
                 * just open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                if (rc) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc);
                }

                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
                rc = ll_local_open(file, it, fd, *och_p);
                if (rc) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc);
                }
        }
        up(&lli->lli_och_sem);

        /* Must do this outside lli_och_sem lock to prevent deadlock where
           different kind of OPEN lock for this same inode gets cancelled
           by ldlm_cancel_lru */
        if (!S_ISREG(inode->i_mode))
                GOTO(out, rc);

        ll_capa_open(inode);

        lsm = lli->lli_smd;
        if (lsm == NULL) {
                /* No stripe metadata yet: leave O_LOV_DELAY_CREATE in place
                 * when it was requested or the file is not open for write. */
                if (file->f_flags & O_LOV_DELAY_CREATE ||
                    !(file->f_mode & FMODE_WRITE)) {
                        CDEBUG(D_INODE, "object creation was delayed\n");
                        GOTO(out, rc);
                }
        }
        file->f_flags &= ~O_LOV_DELAY_CREATE;
        GOTO(out, rc);
out:
        /* Every jump to "out" above happens with rc == 0 and with
         * lli_och_sem already released, so the error block below is skipped
         * on this path. */
        ptlrpc_req_finished(req);
        if (req)
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
out_och_free:
        /* Jumps to out_och_free come from the new-handle branch, still
         * holding lli_och_sem and with fd already freed. */
        if (rc) {
                if (*och_p) {
                        OBD_FREE(*och_p, sizeof (struct obd_client_handle));
                        *och_p = NULL; /* OBD_FREE writes some magic there */
                        (*och_usecount)--;
                }
                up(&lli->lli_och_sem);
                /* out_openerr sits inside this block on purpose: paths that
                 * jump to it have already released lli_och_sem and freed fd,
                 * so only the statahead registration is left to undo. */
out_openerr:
                if (opendir_set != 0)
                        ll_stop_statahead(inode, lli->lli_opendir_key);
        }

        return rc;
}
694
695 /* Fills the obdo with the attributes for the lsm */
696 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
697                           struct obd_capa *capa, struct obdo *obdo)
698 {
699         struct ptlrpc_request_set *set;
700         struct obd_info            oinfo = { { { 0 } } };
701         int                        rc;
702
703         ENTRY;
704
705         LASSERT(lsm != NULL);
706
707         oinfo.oi_md = lsm;
708         oinfo.oi_oa = obdo;
709         oinfo.oi_oa->o_id = lsm->lsm_object_id;
710         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
711         oinfo.oi_oa->o_mode = S_IFREG;
712         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
713                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
714                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
715                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
716                                OBD_MD_FLGROUP;
717         oinfo.oi_capa = capa;
718
719         set = ptlrpc_prep_set();
720         if (set == NULL) {
721                 CERROR("can't allocate ptlrpc set\n");
722                 rc = -ENOMEM;
723         } else {
724                 rc = obd_getattr_async(exp, &oinfo, set);
725                 if (rc == 0)
726                         rc = ptlrpc_set_wait(set);
727                 ptlrpc_set_destroy(set);
728         }
729         if (rc == 0)
730                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
731                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
732                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
733         RETURN(rc);
734 }
735
736 /* Fills the obdo with the attributes for the inode defined by lsm */
737 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
738 {
739         struct ll_inode_info *lli  = ll_i2info(inode);
740         struct obd_capa      *capa = ll_mdscapa_get(inode);
741         int rc;
742         ENTRY;
743
744         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
745         capa_put(capa);
746         if (rc == 0) {
747                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
748                 CDEBUG(D_INODE,
749                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
750                        lli->lli_smd->lsm_object_id, i_size_read(inode),
751                        (unsigned long long)inode->i_blocks,
752                        (unsigned long)ll_inode_blksize(inode));
753         }
754         RETURN(rc);
755 }
756
757 int ll_merge_lvb(struct inode *inode)
758 {
759         struct ll_inode_info *lli = ll_i2info(inode);
760         struct ll_sb_info *sbi = ll_i2sbi(inode);
761         struct ost_lvb lvb;
762         int rc;
763
764         ENTRY;
765
766         ll_inode_size_lock(inode, 1);
767         inode_init_lvb(inode, &lvb);
768         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
769         i_size_write(inode, lvb.lvb_size);
770         inode->i_blocks = lvb.lvb_blocks;
771
772         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
773         LTIME_S(inode->i_atime) = lvb.lvb_atime;
774         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
775         ll_inode_size_unlock(inode, 1);
776
777         RETURN(rc);
778 }
779
780 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
781                      lstat_t *st)
782 {
783         struct obdo obdo = { 0 };
784         int rc;
785
786         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
787         if (rc == 0) {
788                 st->st_size   = obdo.o_size;
789                 st->st_blocks = obdo.o_blocks;
790                 st->st_mtime  = obdo.o_mtime;
791                 st->st_atime  = obdo.o_atime;
792                 st->st_ctime  = obdo.o_ctime;
793         }
794         return rc;
795 }
796
797 void ll_io_init(struct cl_io *io, const struct file *file, int write)
798 {
799         struct inode *inode     = file->f_dentry->d_inode;
800         struct ll_sb_info *sbi  = ll_i2sbi(inode);
801         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
802
803         LASSERT(fd != NULL);
804         memset(io, 0, sizeof *io);
805         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
806         if (write)
807                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
808         io->ci_obj     = ll_i2info(inode)->lli_clob;
809         io->ci_lockreq = CILR_MAYBE;
810         if (fd->fd_flags & LL_FILE_IGNORE_LOCK ||
811             sbi->ll_flags & LL_SBI_NOLCK) {
812                 io->ci_lockreq = CILR_NEVER;
813                 io->ci_no_srvlock = 1;
814         } else if (file->f_flags & O_APPEND) {
815                 io->ci_lockreq = CILR_MANDATORY;
816         }
817 }
818
819 static ssize_t ll_file_io_generic(const struct lu_env *env,
820                 struct ccc_io_args *args, struct file *file,
821                 enum cl_io_type iot, loff_t *ppos, size_t count)
822 {
823         struct cl_io       *io;
824         ssize_t             result;
825         ENTRY;
826
827         io = &ccc_env_info(env)->cti_io;
828         ll_io_init(io, file, iot == CIT_WRITE);
829
830         if (iot == CIT_READ)
831                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
832
833         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
834                 struct vvp_io *vio = vvp_env_io(env);
835                 struct ccc_io *cio = ccc_env_io(env);
836                 if (cl_io_is_sendfile(io)) {
837                         vio->u.read.cui_actor = args->cia_actor;
838                         vio->u.read.cui_target = args->cia_target;
839                 } else {
840                         cio->cui_iov = args->cia_iov;
841                         cio->cui_nrsegs = args->cia_nrsegs;
842 #ifndef HAVE_FILE_WRITEV
843                         cio->cui_iocb = args->cia_iocb;
844 #endif
845                 }
846                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
847                 result = cl_io_loop(env, io);
848         } else
849                 /* cl_io_rw_init() handled IO */
850                 result = io->ci_result;
851         if (io->ci_nob > 0) {
852                 result = io->ci_nob;
853                 *ppos = io->u.ci_wr.wr.crw_pos;
854         }
855         cl_io_fini(env, io);
856         RETURN(result);
857 }
858
859
860 /*
861  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
862  */
863 static int ll_file_get_iov_count(const struct iovec *iov,
864                                  unsigned long *nr_segs, size_t *count)
865 {
866         size_t cnt = 0;
867         unsigned long seg;
868
869         for (seg = 0; seg < *nr_segs; seg++) {
870                 const struct iovec *iv = &iov[seg];
871
872                 /*
873                  * If any segment has a negative length, or the cumulative
874                  * length ever wraps negative then return -EINVAL.
875                  */
876                 cnt += iv->iov_len;
877                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
878                         return -EINVAL;
879                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
880                         continue;
881                 if (seg == 0)
882                         return -EFAULT;
883                 *nr_segs = seg;
884                 cnt -= iv->iov_len;   /* This segment is no good */
885                 break;
886         }
887         *count = cnt;
888         return 0;
889 }
890
891 #ifdef HAVE_FILE_READV
892 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
893                               unsigned long nr_segs, loff_t *ppos)
894 {
895         struct lu_env      *env;
896         struct ccc_io_args *args;
897         size_t              count;
898         ssize_t             result;
899         int                 refcheck;
900         ENTRY;
901
902         result = ll_file_get_iov_count(iov, &nr_segs, &count);
903         if (result)
904                 RETURN(result);
905
906         env = cl_env_get(&refcheck);
907         if (IS_ERR(env))
908                 RETURN(PTR_ERR(env));
909
910         args = &vvp_env_info(env)->vti_args;
911         args->cia_is_sendfile = 0;
912         args->cia_iov = (struct iovec *)iov;
913         args->cia_nrsegs = nr_segs;
914         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
915         cl_env_put(env, &refcheck);
916         RETURN(result);
917 }
918
919 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
920                             loff_t *ppos)
921 {
922         struct lu_env *env;
923         struct iovec  *local_iov;
924         ssize_t        result;
925         int            refcheck;
926         ENTRY;
927
928         env = cl_env_get(&refcheck);
929         if (IS_ERR(env))
930                 RETURN(PTR_ERR(env));
931
932         local_iov = &vvp_env_info(env)->vti_local_iov;
933         local_iov->iov_base = (void __user *)buf;
934         local_iov->iov_len = count;
935         result = ll_file_readv(file, local_iov, 1, ppos);
936         cl_env_put(env, &refcheck);
937         RETURN(result);
938 }
939
940 #else
941 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
942                                 unsigned long nr_segs, loff_t pos)
943 {
944         struct lu_env      *env;
945         struct ccc_io_args *args;
946         size_t              count;
947         ssize_t             result;
948         int                 refcheck;
949         ENTRY;
950
951         result = ll_file_get_iov_count(iov, &nr_segs, &count);
952         if (result)
953                 RETURN(result);
954
955         env = cl_env_get(&refcheck);
956         if (IS_ERR(env))
957                 RETURN(PTR_ERR(env));
958
959         args = &vvp_env_info(env)->vti_args;
960         args->cia_is_sendfile = 0;
961         args->cia_iov = (struct iovec *)iov;
962         args->cia_nrsegs = nr_segs;
963         args->cia_iocb = iocb;
964         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
965                                     &iocb->ki_pos, count);
966         cl_env_put(env, &refcheck);
967         RETURN(result);
968 }
969
970 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
971                             loff_t *ppos)
972 {
973         struct lu_env *env;
974         struct iovec  *local_iov;
975         struct kiocb  *kiocb;
976         ssize_t        result;
977         int            refcheck;
978         ENTRY;
979
980         env = cl_env_get(&refcheck);
981         if (IS_ERR(env))
982                 RETURN(PTR_ERR(env));
983
984         local_iov = &vvp_env_info(env)->vti_local_iov;
985         kiocb = &vvp_env_info(env)->vti_kiocb;
986         local_iov->iov_base = (void __user *)buf;
987         local_iov->iov_len = count;
988         init_sync_kiocb(kiocb, file);
989         kiocb->ki_pos = *ppos;
990         kiocb->ki_left = count;
991
992         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
993         *ppos = kiocb->ki_pos;
994
995         cl_env_put(env, &refcheck);
996         RETURN(result);
997 }
998 #endif
999
1000 /*
1001  * Write to a file (through the page cache).
1002  */
1003 #ifdef HAVE_FILE_WRITEV
1004 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1005                               unsigned long nr_segs, loff_t *ppos)
1006 {
1007         struct lu_env      *env;
1008         struct ccc_io_args *args;
1009         size_t              count;
1010         ssize_t             result;
1011         int                 refcheck;
1012         ENTRY;
1013
1014         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1015         if (result)
1016                 RETURN(result);
1017
1018         env = cl_env_get(&refcheck);
1019         if (IS_ERR(env))
1020                 RETURN(PTR_ERR(env));
1021
1022         args = &vvp_env_info(env)->vti_args;
1023         args->cia_iov = (struct iovec *)iov;
1024         args->cia_nrsegs = nr_segs;
1025         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1026         cl_env_put(env, &refcheck);
1027         RETURN(result);
1028 }
1029
1030 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1031                              loff_t *ppos)
1032 {
1033         struct lu_env    *env;
1034         struct iovec     *local_iov;
1035         ssize_t           result;
1036         int               refcheck;
1037         ENTRY;
1038
1039         env = cl_env_get(&refcheck);
1040         if (IS_ERR(env))
1041                 RETURN(PTR_ERR(env));
1042
1043         local_iov = &vvp_env_info(env)->vti_local_iov;
1044         local_iov->iov_base = (void __user *)buf;
1045         local_iov->iov_len = count;
1046
1047         result = ll_file_writev(file, local_iov, 1, ppos);
1048         cl_env_put(env, &refcheck);
1049         RETURN(result);
1050 }
1051
1052 #else /* AIO stuff */
1053 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1054                                  unsigned long nr_segs, loff_t pos)
1055 {
1056         struct lu_env      *env;
1057         struct ccc_io_args *args;
1058         size_t              count;
1059         ssize_t             result;
1060         int                 refcheck;
1061         ENTRY;
1062
1063         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1064         if (result)
1065                 RETURN(result);
1066
1067         env = cl_env_get(&refcheck);
1068         if (IS_ERR(env))
1069                 RETURN(PTR_ERR(env));
1070
1071         args = &vvp_env_info(env)->vti_args;
1072         args->cia_iov = (struct iovec *)iov;
1073         args->cia_nrsegs = nr_segs;
1074         args->cia_iocb = iocb;
1075         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1076                                   &iocb->ki_pos, count);
1077         cl_env_put(env, &refcheck);
1078         RETURN(result);
1079 }
1080
1081 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1082                              loff_t *ppos)
1083 {
1084         struct lu_env *env;
1085         struct iovec  *local_iov;
1086         struct kiocb  *kiocb;
1087         ssize_t        result;
1088         int            refcheck;
1089         ENTRY;
1090
1091         env = cl_env_get(&refcheck);
1092         if (IS_ERR(env))
1093                 RETURN(PTR_ERR(env));
1094
1095         local_iov = &vvp_env_info(env)->vti_local_iov;
1096         kiocb = &vvp_env_info(env)->vti_kiocb;
1097         local_iov->iov_base = (void __user *)buf;
1098         local_iov->iov_len = count;
1099         init_sync_kiocb(kiocb, file);
1100         kiocb->ki_pos = *ppos;
1101         kiocb->ki_left = count;
1102
1103         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1104         *ppos = kiocb->ki_pos;
1105
1106         cl_env_put(env, &refcheck);
1107         RETURN(result);
1108 }
1109 #endif
1110
1111
1112 /*
1113  * Send file content (through pagecache) somewhere with helper
1114  */
1115 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1116                                 read_actor_t actor, void *target)
1117 {
1118         struct lu_env      *env;
1119         struct ccc_io_args *args;
1120         ssize_t             result;
1121         int                 refcheck;
1122         ENTRY;
1123
1124         env = cl_env_get(&refcheck);
1125         if (IS_ERR(env))
1126                 RETURN(PTR_ERR(env));
1127
1128         args = &vvp_env_info(env)->vti_args;
1129         args->cia_is_sendfile = 1;
1130         args->cia_target = target;
1131         args->cia_actor = actor;
1132         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1133         cl_env_put(env, &refcheck);
1134         RETURN(result);
1135 }
1136
1137 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1138                                unsigned long arg)
1139 {
1140         struct obd_export *exp = ll_i2dtexp(inode);
1141         struct ll_recreate_obj ucreatp;
1142         struct obd_trans_info oti = { 0 };
1143         struct obdo *oa = NULL;
1144         int lsm_size;
1145         int rc = 0;
1146         struct lov_stripe_md *lsm, *lsm2;
1147         ENTRY;
1148
1149         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1150                 RETURN(-EPERM);
1151
1152         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1153                            sizeof(struct ll_recreate_obj)))
1154                 RETURN(-EFAULT);
1155
1156         OBDO_ALLOC(oa);
1157         if (oa == NULL)
1158                 RETURN(-ENOMEM);
1159
1160         ll_inode_size_lock(inode, 0);
1161         lsm = ll_i2info(inode)->lli_smd;
1162         if (lsm == NULL)
1163                 GOTO(out, rc = -ENOENT);
1164         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1165                    (lsm->lsm_stripe_count));
1166
1167         OBD_ALLOC(lsm2, lsm_size);
1168         if (lsm2 == NULL)
1169                 GOTO(out, rc = -ENOMEM);
1170
1171         oa->o_id = ucreatp.lrc_id;
1172         oa->o_gr = ucreatp.lrc_group;
1173         oa->o_nlink = ucreatp.lrc_ost_idx;
1174         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1175         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1176         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1177                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1178
1179         memcpy(lsm2, lsm, lsm_size);
1180         rc = obd_create(exp, oa, &lsm2, &oti);
1181
1182         OBD_FREE(lsm2, lsm_size);
1183         GOTO(out, rc);
1184 out:
1185         ll_inode_size_unlock(inode, 0);
1186         OBDO_FREE(oa);
1187         return rc;
1188 }
1189
1190 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1191                              int flags, struct lov_user_md *lum, int lum_size)
1192 {
1193         struct lov_stripe_md *lsm;
1194         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1195         int rc = 0;
1196         ENTRY;
1197
1198         ll_inode_size_lock(inode, 0);
1199         lsm = ll_i2info(inode)->lli_smd;
1200         if (lsm) {
1201                 ll_inode_size_unlock(inode, 0);
1202                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1203                        inode->i_ino);
1204                 RETURN(-EEXIST);
1205         }
1206
1207         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1208         if (rc)
1209                 GOTO(out, rc);
1210         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1211                 GOTO(out_req_free, rc = -ENOENT);
1212         rc = oit.d.lustre.it_status;
1213         if (rc < 0)
1214                 GOTO(out_req_free, rc);
1215
1216         ll_release_openhandle(file->f_dentry, &oit);
1217
1218  out:
1219         ll_inode_size_unlock(inode, 0);
1220         ll_intent_release(&oit);
1221         RETURN(rc);
1222 out_req_free:
1223         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1224         goto out;
1225 }
1226
/*
 * Fetch the striping EA of \a filename from the metadata server.
 *
 * The lookup is done with md_getattr_name(), passing the fid of \a inode
 * and \a filename, and asking for OBD_MD_FLEASIZE | OBD_MD_FLDIREA.
 *
 * Outputs (written at the "out" label, i.e. on every path except the
 * early return when ll_get_max_mdsize() fails):
 *   *lmmp     - the layout; points into the reply buffer, or at a newly
 *               allocated lov_user_md_join for LOV_MAGIC_JOIN layouts
 *               (not freed here; presumably the caller's job - confirm).
 *               NULL if no layout was obtained.
 *   *lmm_size - size of the layout in bytes.
 *   *request  - the request whose reply holds the layout (not released
 *               here).
 *
 * Returns a negative errno on failure: the md_getattr_name() error,
 * -ENODATA if the reply carries no EA, -EPROTO for an unknown layout
 * magic, -ENOMEM or the obd_checkmd() error in the JOIN case.
 */
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body  *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct obd_capa *oc;
        int rc, lmmsize;

        /* lmmsize starts out as the value reported by ll_get_max_mdsize()
         * and is passed to md_getattr_name() below */
        rc = ll_get_max_mdsize(sbi, &lmmsize);
        if (rc)
                RETURN(rc);

        oc = ll_mdscapa_get(inode);
        rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
                             oc, filename, strlen(filename) + 1,
                             OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
                             ll_i2suppgid(inode), &req);
        capa_put(oc);
        if (rc < 0) {
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);
                GOTO(out, rc);
        }

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */

        /* from here on lmmsize is the actual EA size from the reply */
        lmmsize = body->eadatasize;

        if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
                        lmmsize == 0) {
                GOTO(out, rc = -ENODATA);
        }

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);

        /* only V1, V3 and JOIN layouts are understood; the magic is
         * compared in little-endian form here */
        if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
            (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
                GOTO(out, rc = -EPROTO);
        }

        /*
         * This is coming from the MDS, so is probably in
         * little endian.  We convert it to host endian before
         * passing it to userspace.
         */
        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
                /* When called for a directory there are no lsm objects,
                 * so the per-object swab is done for regular files only */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
                        lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
                        if (S_ISREG(body->mode))
                                lustre_swab_lov_user_md_objects(
                                 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                                 ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
                        if (S_ISREG(body->mode))
                                lustre_swab_lov_user_md_objects(
                                 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                                 ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
                        lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
                }
        }

        /* Joined files: rebuild the layout as a lov_user_md_join with one
         * lov_user_ost_data_join entry per stripe, filled from the
         * unpacked in-memory stripe metadata */
        if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
                struct lov_stripe_md *lsm;
                struct lov_user_md_join *lmj;
                int lmj_size, i, aindex = 0;

                /* any unpack failure is reported as -ENOMEM */
                rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
                if (rc < 0)
                        GOTO(out, rc = -ENOMEM);
                rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
                if (rc)
                        GOTO(out_free_memmd, rc);

                lmj_size = sizeof(struct lov_user_md_join) +
                           lsm->lsm_stripe_count *
                           sizeof(struct lov_user_ost_data_join);
                OBD_ALLOC(lmj, lmj_size);
                if (!lmj)
                        GOTO(out_free_memmd, rc = -ENOMEM);

                /* header comes from the reply, objects are filled below */
                memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
                for (i = 0; i < lsm->lsm_stripe_count; i++) {
                        struct lov_extent *lex =
                                &lsm->lsm_array->lai_ext_array[aindex];

                        /* NOTE(review): lex was fetched before aindex is
                         * advanced, so the stripe that triggers the
                         * advance still uses the previous extent -
                         * confirm this is intended */
                        if (lex->le_loi_idx + lex->le_stripe_count <= i)
                                aindex ++;
                        CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
                                        LPU64" len %d\n", aindex, i,
                                        lex->le_start, (int)lex->le_len);
                        lmj->lmm_objects[i].l_extent_start =
                                lex->le_start;

                        /* a length of -1 is passed through as an extent
                         * end of -1 rather than start + len */
                        if ((int)lex->le_len == -1)
                                lmj->lmm_objects[i].l_extent_end = -1;
                        else
                                lmj->lmm_objects[i].l_extent_end =
                                        lex->le_start + lex->le_len;
                        lmj->lmm_objects[i].l_object_id =
                                lsm->lsm_oinfo[i]->loi_id;
                        lmj->lmm_objects[i].l_object_gr =
                                lsm->lsm_oinfo[i]->loi_gr;
                        lmj->lmm_objects[i].l_ost_gen =
                                lsm->lsm_oinfo[i]->loi_ost_gen;
                        lmj->lmm_objects[i].l_ost_idx =
                                lsm->lsm_oinfo[i]->loi_ost_idx;
                }
                /* hand the rebuilt layout to the caller instead of the
                 * one in the reply buffer */
                lmm = (struct lov_mds_md *)lmj;
                lmmsize = lmj_size;
out_free_memmd:
                obd_free_memmd(sbi->ll_dt_exp, &lsm);
        }
out:
        *lmmp = lmm;
        *lmm_size = lmmsize;
        *request = req;
        return rc;
}
1355
1356 static int ll_lov_setea(struct inode *inode, struct file *file,
1357                             unsigned long arg)
1358 {
1359         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1360         struct lov_user_md  *lump;
1361         int lum_size = sizeof(struct lov_user_md) +
1362                        sizeof(struct lov_user_ost_data);
1363         int rc;
1364         ENTRY;
1365
1366         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1367                 RETURN(-EPERM);
1368
1369         OBD_ALLOC(lump, lum_size);
1370         if (lump == NULL) {
1371                 RETURN(-ENOMEM);
1372         }
1373         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1374                 OBD_FREE(lump, lum_size);
1375                 RETURN(-EFAULT);
1376         }
1377
1378         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1379
1380         OBD_FREE(lump, lum_size);
1381         RETURN(rc);
1382 }
1383
1384 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1385                             unsigned long arg)
1386 {
1387         struct lov_user_md_v3 lumv3;
1388         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1389         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1390         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1391         int lum_size;
1392         int rc;
1393         int flags = FMODE_WRITE;
1394         ENTRY;
1395
1396         /* first try with v1 which is smaller than v3 */
1397         lum_size = sizeof(struct lov_user_md_v1);
1398         if (copy_from_user(lumv1, lumv1p, lum_size))
1399                 RETURN(-EFAULT);
1400
1401         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1402                 lum_size = sizeof(struct lov_user_md_v3);
1403                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1404                         RETURN(-EFAULT);
1405         }
1406
1407         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1408         if (rc == 0) {
1409                  put_user(0, &lumv1p->lmm_stripe_count);
1410                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1411                                     0, ll_i2info(inode)->lli_smd,
1412                                     (void *)arg);
1413         }
1414         RETURN(rc);
1415 }
1416
1417 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1418 {
1419         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1420
1421         if (!lsm)
1422                 RETURN(-ENODATA);
1423
1424         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1425                             (void *)arg);
1426 }
1427
1428 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1429 {
1430         struct ll_inode_info   *lli = ll_i2info(inode);
1431         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1432         struct ccc_grouplock    grouplock;
1433         int                     rc;
1434         ENTRY;
1435
1436         spin_lock(&lli->lli_lock);
1437         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1438                 CERROR("group lock already existed with gid %lu\n",
1439                        fd->fd_grouplock.cg_gid);
1440                 spin_unlock(&lli->lli_lock);
1441                 RETURN(-EINVAL);
1442         }
1443         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1444         spin_unlock(&lli->lli_lock);
1445
1446         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1447                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1448         if (rc)
1449                 RETURN(rc);
1450
1451         spin_lock(&lli->lli_lock);
1452         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1453                 spin_unlock(&lli->lli_lock);
1454                 CERROR("another thread just won the race\n");
1455                 cl_put_grouplock(&grouplock);
1456                 RETURN(-EINVAL);
1457         }
1458
1459         fd->fd_flags |= (LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1460         fd->fd_grouplock = grouplock;
1461         spin_unlock(&lli->lli_lock);
1462
1463         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1464         RETURN(0);
1465 }
1466
1467 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1468 {
1469         struct ll_inode_info   *lli = ll_i2info(inode);
1470         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1471         struct ccc_grouplock    grouplock;
1472         ENTRY;
1473
1474         spin_lock(&lli->lli_lock);
1475         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1476                 spin_unlock(&lli->lli_lock);
1477                 CERROR("no group lock held\n");
1478                 RETURN(-EINVAL);
1479         }
1480         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1481
1482         if (fd->fd_grouplock.cg_gid != arg) {
1483                 CERROR("group lock %lu doesn't match current id %lu\n",
1484                        arg, fd->fd_grouplock.cg_gid);
1485                 spin_unlock(&lli->lli_lock);
1486                 RETURN(-EINVAL);
1487         }
1488
1489         grouplock = fd->fd_grouplock;
1490         fd->fd_grouplock.cg_env = NULL;
1491         fd->fd_grouplock.cg_lock = NULL;
1492         fd->fd_grouplock.cg_gid = 0;
1493         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1494         spin_unlock(&lli->lli_lock);
1495
1496         cl_put_grouplock(&grouplock);
1497         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1498         RETURN(0);
1499 }
1500
1501 #if LUSTRE_FIX >= 50
1502 static int join_sanity_check(struct inode *head, struct inode *tail)
1503 {
1504         ENTRY;
1505         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1506                 CERROR("server do not support join \n");
1507                 RETURN(-EINVAL);
1508         }
1509         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1510                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1511                        head->i_ino, tail->i_ino);
1512                 RETURN(-EINVAL);
1513         }
1514         if (head->i_ino == tail->i_ino) {
1515                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1516                 RETURN(-EINVAL);
1517         }
1518         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1519                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1520                 RETURN(-EINVAL);
1521         }
1522         RETURN(0);
1523 }
1524
1525 static int join_file(struct inode *head_inode, struct file *head_filp,
1526                      struct file *tail_filp)
1527 {
1528         struct dentry *tail_dentry = tail_filp->f_dentry;
1529         struct lookup_intent oit = {.it_op = IT_OPEN,
1530                                     .it_flags = head_filp->f_flags,
1531                                     .it_create_mode = M_JOIN_FILE};
1532         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1533                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1534
1535         struct lustre_handle lockh;
1536         struct md_op_data *op_data;
1537         int    rc;
1538         loff_t data;
1539         ENTRY;
1540
1541         tail_dentry = tail_filp->f_dentry;
1542
1543         data = i_size_read(head_inode);
1544         op_data = ll_prep_md_op_data(NULL, head_inode,
1545                                      tail_dentry->d_parent->d_inode,
1546                                      tail_dentry->d_name.name,
1547                                      tail_dentry->d_name.len, 0,
1548                                      LUSTRE_OPC_ANY, &data);
1549         if (IS_ERR(op_data))
1550                 RETURN(PTR_ERR(op_data));
1551
1552         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1553                          op_data, &lockh, NULL, 0, NULL, 0);
1554
1555         ll_finish_md_op_data(op_data);
1556         if (rc < 0)
1557                 GOTO(out, rc);
1558
1559         rc = oit.d.lustre.it_status;
1560
1561         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1562                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1563                 ptlrpc_req_finished((struct ptlrpc_request *)
1564                                     oit.d.lustre.it_data);
1565                 GOTO(out, rc);
1566         }
1567
1568         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1569                                            * away */
1570                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1571                 oit.d.lustre.it_lock_mode = 0;
1572         }
1573         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1574         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1575         ll_release_openhandle(head_filp->f_dentry, &oit);
1576 out:
1577         ll_intent_release(&oit);
1578         RETURN(rc);
1579 }
1580
1581 static int ll_file_join(struct inode *head, struct file *filp,
1582                         char *filename_tail)
1583 {
1584         struct inode *tail = NULL, *first = NULL, *second = NULL;
1585         struct dentry *tail_dentry;
1586         struct file *tail_filp, *first_filp, *second_filp;
1587         struct ll_lock_tree first_tree, second_tree;
1588         struct ll_lock_tree_node *first_node, *second_node;
1589         struct ll_inode_info *hlli = ll_i2info(head);
1590         int rc = 0, cleanup_phase = 0;
1591         ENTRY;
1592
1593         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1594                head->i_ino, head->i_generation, head, filename_tail);
1595
1596         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1597         if (IS_ERR(tail_filp)) {
1598                 CERROR("Can not open tail file %s", filename_tail);
1599                 rc = PTR_ERR(tail_filp);
1600                 GOTO(cleanup, rc);
1601         }
1602         tail = igrab(tail_filp->f_dentry->d_inode);
1603
1604         tail_dentry = tail_filp->f_dentry;
1605         LASSERT(tail_dentry);
1606         cleanup_phase = 1;
1607
1608         /*reorder the inode for lock sequence*/
1609         first = head->i_ino > tail->i_ino ? head : tail;
1610         second = head->i_ino > tail->i_ino ? tail : head;
1611         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1612         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1613
1614         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1615                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1616         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1617         if (IS_ERR(first_node)){
1618                 rc = PTR_ERR(first_node);
1619                 GOTO(cleanup, rc);
1620         }
1621         first_tree.lt_fd = first_filp->private_data;
1622         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1623         if (rc != 0)
1624                 GOTO(cleanup, rc);
1625         cleanup_phase = 2;
1626
1627         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1628         if (IS_ERR(second_node)){
1629                 rc = PTR_ERR(second_node);
1630                 GOTO(cleanup, rc);
1631         }
1632         second_tree.lt_fd = second_filp->private_data;
1633         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1634         if (rc != 0)
1635                 GOTO(cleanup, rc);
1636         cleanup_phase = 3;
1637
1638         rc = join_sanity_check(head, tail);
1639         if (rc)
1640                 GOTO(cleanup, rc);
1641
1642         rc = join_file(head, filp, tail_filp);
1643         if (rc)
1644                 GOTO(cleanup, rc);
1645 cleanup:
1646         switch (cleanup_phase) {
1647         case 3:
1648                 ll_tree_unlock(&second_tree);
1649                 obd_cancel_unused(ll_i2dtexp(second),
1650                                   ll_i2info(second)->lli_smd, 0, NULL);
1651         case 2:
1652                 ll_tree_unlock(&first_tree);
1653                 obd_cancel_unused(ll_i2dtexp(first),
1654                                   ll_i2info(first)->lli_smd, 0, NULL);
1655         case 1:
1656                 filp_close(tail_filp, 0);
1657                 if (tail)
1658                         iput(tail);
1659                 if (head && rc == 0) {
1660                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1661                                        &hlli->lli_smd);
1662                         hlli->lli_smd = NULL;
1663                 }
1664         case 0:
1665                 break;
1666         default:
1667                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1668                 LBUG();
1669         }
1670         RETURN(rc);
1671 }
1672 #endif /* LUSTRE_FIX >= 50 */
1673
1674 /**
1675  * Close inode open handle
1676  *
1677  * \param dentry [in]     dentry which contains the inode
1678  * \param it     [in,out] intent which contains open info and result
1679  *
1680  * \retval 0     success
1681  * \retval <0    failure
1682  */
1683 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1684 {
1685         struct inode *inode = dentry->d_inode;
1686         struct obd_client_handle *och;
1687         int rc;
1688         ENTRY;
1689
1690         LASSERT(inode);
1691
1692         /* Root ? Do nothing. */
1693         if (dentry->d_inode->i_sb->s_root == dentry)
1694                 RETURN(0);
1695
1696         /* No open handle to close? Move away */
1697         if (!it_disposition(it, DISP_OPEN_OPEN))
1698                 RETURN(0);
1699
1700         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1701
1702         OBD_ALLOC(och, sizeof(*och));
1703         if (!och)
1704                 GOTO(out, rc = -ENOMEM);
1705
1706         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1707                     ll_i2info(inode), it, och);
1708
1709         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1710                                        inode, och);
1711  out:
1712         /* this one is in place of ll_file_open */
1713         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1714                 ptlrpc_req_finished(it->d.lustre.it_data);
1715         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1716         RETURN(rc);
1717 }
1718
1719 /**
1720  * Get size for inode for which FIEMAP mapping is requested.
1721  * Make the FIEMAP get_info call and returns the result.
1722  */
1723 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1724               int num_bytes)
1725 {
1726         struct obd_export *exp = ll_i2dtexp(inode);
1727         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1728         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1729         int vallen = num_bytes;
1730         int rc;
1731         ENTRY;
1732
1733         /* If the stripe_count > 1 and the application does not understand
1734          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1735          */
1736         if (lsm->lsm_stripe_count > 1 &&
1737             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1738                 return -EOPNOTSUPP;
1739
1740         fm_key.oa.o_id = lsm->lsm_object_id;
1741         fm_key.oa.o_gr = lsm->lsm_object_gr;
1742         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1743
1744         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1745                         OBD_MD_FLSIZE);
1746
1747         /* If filesize is 0, then there would be no objects for mapping */
1748         if (fm_key.oa.o_size == 0) {
1749                 fiemap->fm_mapped_extents = 0;
1750                 RETURN(0);
1751         }
1752
1753         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1754
1755         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1756         if (rc)
1757                 CERROR("obd_get_info failed: rc = %d\n", rc);
1758
1759         RETURN(rc);
1760 }
1761
1762 int ll_fid2path(struct obd_export *exp, void *arg)
1763 {
1764         struct getinfo_fid2path *gfout, *gfin;
1765         int outsize, rc;
1766         ENTRY;
1767
1768         /* Need to get the buflen */
1769         OBD_ALLOC_PTR(gfin);
1770         if (gfin == NULL)
1771                 RETURN(-ENOMEM);
1772         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1773                 OBD_FREE_PTR(gfin);
1774                 RETURN(-EFAULT);
1775         }
1776
1777         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1778         OBD_ALLOC(gfout, outsize);
1779         if (gfout == NULL) {
1780                 OBD_FREE_PTR(gfin);
1781                 RETURN(-ENOMEM);
1782         }
1783         memcpy(gfout, gfin, sizeof(*gfout));
1784         OBD_FREE_PTR(gfin);
1785
1786         /* Call mdc_iocontrol */
1787         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1788         if (rc)
1789                 GOTO(gf_free, rc);
1790         if (copy_to_user(arg, gfout, outsize))
1791                 rc = -EFAULT;
1792
1793 gf_free:
1794         OBD_FREE(gfout, outsize);
1795         RETURN(rc);
1796 }
1797
1798 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1799                   unsigned long arg)
1800 {
1801         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1802         int flags;
1803         ENTRY;
1804
1805         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1806                inode->i_generation, inode, cmd);
1807         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1808
1809         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1810         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1811                 RETURN(-ENOTTY);
1812
1813         switch(cmd) {
1814         case LL_IOC_GETFLAGS:
1815                 /* Get the current value of the file flags */
1816                 return put_user(fd->fd_flags, (int *)arg);
1817         case LL_IOC_SETFLAGS:
1818         case LL_IOC_CLRFLAGS:
1819                 /* Set or clear specific file flags */
1820                 /* XXX This probably needs checks to ensure the flags are
1821                  *     not abused, and to handle any flag side effects.
1822                  */
1823                 if (get_user(flags, (int *) arg))
1824                         RETURN(-EFAULT);
1825
1826                 if (cmd == LL_IOC_SETFLAGS) {
1827                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1828                             !(file->f_flags & O_DIRECT)) {
1829                                 CERROR("%s: unable to disable locking on "
1830                                        "non-O_DIRECT file\n", current->comm);
1831                                 RETURN(-EINVAL);
1832                         }
1833
1834                         fd->fd_flags |= flags;
1835                 } else {
1836                         fd->fd_flags &= ~flags;
1837                 }
1838                 RETURN(0);
1839         case LL_IOC_LOV_SETSTRIPE:
1840                 RETURN(ll_lov_setstripe(inode, file, arg));
1841         case LL_IOC_LOV_SETEA:
1842                 RETURN(ll_lov_setea(inode, file, arg));
1843         case LL_IOC_LOV_GETSTRIPE:
1844                 RETURN(ll_lov_getstripe(inode, arg));
1845         case LL_IOC_RECREATE_OBJ:
1846                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1847         case EXT3_IOC_FIEMAP: {
1848                 struct ll_user_fiemap *fiemap_s;
1849                 size_t num_bytes, ret_bytes;
1850                 unsigned int extent_count;
1851                 int rc = 0;
1852
1853                 /* Get the extent count so we can calculate the size of
1854                  * required fiemap buffer */
1855                 if (get_user(extent_count,
1856                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1857                         RETURN(-EFAULT);
1858                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1859                                                  sizeof(struct ll_fiemap_extent));
1860                 OBD_VMALLOC(fiemap_s, num_bytes);
1861                 if (fiemap_s == NULL)
1862                         RETURN(-ENOMEM);
1863
1864                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1865                                    sizeof(*fiemap_s)))
1866                         GOTO(error, rc = -EFAULT);
1867
1868                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1869                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1870                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1871                         if (copy_to_user((char *)arg, fiemap_s,
1872                                          sizeof(*fiemap_s)))
1873                                 GOTO(error, rc = -EFAULT);
1874
1875                         GOTO(error, rc = -EBADR);
1876                 }
1877
1878                 /* If fm_extent_count is non-zero, read the first extent since
1879                  * it is used to calculate end_offset and device from previous
1880                  * fiemap call. */
1881                 if (extent_count) {
1882                         if (copy_from_user(&fiemap_s->fm_extents[0],
1883                             (char __user *)arg + sizeof(*fiemap_s),
1884                             sizeof(struct ll_fiemap_extent)))
1885                                 GOTO(error, rc = -EFAULT);
1886                 }
1887
1888                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1889                         int rc;
1890
1891                         rc = filemap_fdatawrite(inode->i_mapping);
1892                         if (rc)
1893                                 GOTO(error, rc);
1894                 }
1895
1896                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1897                 if (rc)
1898                         GOTO(error, rc);
1899
1900                 ret_bytes = sizeof(struct ll_user_fiemap);
1901
1902                 if (extent_count != 0)
1903                         ret_bytes += (fiemap_s->fm_mapped_extents *
1904                                          sizeof(struct ll_fiemap_extent));
1905
1906                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1907                         rc = -EFAULT;
1908
1909 error:
1910                 OBD_VFREE(fiemap_s, num_bytes);
1911                 RETURN(rc);
1912         }
1913         case EXT3_IOC_GETFLAGS:
1914         case EXT3_IOC_SETFLAGS:
1915                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1916         case EXT3_IOC_GETVERSION_OLD:
1917         case EXT3_IOC_GETVERSION:
1918                 RETURN(put_user(inode->i_generation, (int *)arg));
1919         case LL_IOC_JOIN: {
1920 #if LUSTRE_FIX >= 50
1921                 /* Allow file join in beta builds to allow debuggging */
1922                 char *ftail;
1923                 int rc;
1924
1925                 ftail = getname((const char *)arg);
1926                 if (IS_ERR(ftail))
1927                         RETURN(PTR_ERR(ftail));
1928                 rc = ll_file_join(inode, file, ftail);
1929                 putname(ftail);
1930                 RETURN(rc);
1931 #else
1932                 CWARN("file join is not supported in this version of Lustre\n");
1933                 RETURN(-ENOTTY);
1934 #endif
1935         }
1936         case LL_IOC_GROUP_LOCK:
1937                 RETURN(ll_get_grouplock(inode, file, arg));
1938         case LL_IOC_GROUP_UNLOCK:
1939                 RETURN(ll_put_grouplock(inode, file, arg));
1940         case IOC_OBD_STATFS:
1941                 RETURN(ll_obd_statfs(inode, (void *)arg));
1942
1943         /* We need to special case any other ioctls we want to handle,
1944          * to send them to the MDS/OST as appropriate and to properly
1945          * network encode the arg field.
1946         case EXT3_IOC_SETVERSION_OLD:
1947         case EXT3_IOC_SETVERSION:
1948         */
1949         case LL_IOC_FLUSHCTX:
1950                 RETURN(ll_flush_ctx(inode));
1951         case LL_IOC_PATH2FID: {
1952                 if (copy_to_user((void *)arg, &ll_i2info(inode)->lli_fid,
1953                                  sizeof(struct lu_fid)))
1954                         RETURN(-EFAULT);
1955
1956                 RETURN(0);
1957         }
1958         case OBD_IOC_FID2PATH:
1959                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1960
1961         default: {
1962                 int err;
1963
1964                 if (LLIOC_STOP ==
1965                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1966                         RETURN(err);
1967
1968                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1969                                      (void *)arg));
1970         }
1971         }
1972 }
1973
1974 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1975 {
1976         struct inode *inode = file->f_dentry->d_inode;
1977         loff_t retval;
1978         ENTRY;
1979         retval = offset + ((origin == 2) ? i_size_read(inode) :
1980                            (origin == 1) ? file->f_pos : 0);
1981         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1982                inode->i_ino, inode->i_generation, inode, retval, retval,
1983                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1984         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1985
1986         if (origin == 2) { /* SEEK_END */
1987                 int nonblock = 0, rc;
1988
1989                 if (file->f_flags & O_NONBLOCK)
1990                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1991
1992                 rc = cl_glimpse_size(inode);
1993                 if (rc != 0)
1994                         RETURN(rc);
1995
1996                 ll_inode_size_lock(inode, 0);
1997                 offset += i_size_read(inode);
1998                 ll_inode_size_unlock(inode, 0);
1999         } else if (origin == 1) { /* SEEK_CUR */
2000                 offset += file->f_pos;
2001         }
2002
2003         retval = -EINVAL;
2004         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2005                 if (offset != file->f_pos) {
2006                         file->f_pos = offset;
2007                 }
2008                 retval = offset;
2009         }
2010
2011         RETURN(retval);
2012 }
2013
2014 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2015 {
2016         struct inode *inode = dentry->d_inode;
2017         struct ll_inode_info *lli = ll_i2info(inode);
2018         struct lov_stripe_md *lsm = lli->lli_smd;
2019         struct ptlrpc_request *req;
2020         struct obd_capa *oc;
2021         int rc, err;
2022         ENTRY;
2023         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2024                inode->i_generation, inode);
2025         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2026
2027         /* fsync's caller has already called _fdata{sync,write}, we want
2028          * that IO to finish before calling the osc and mdc sync methods */
2029         rc = filemap_fdatawait(inode->i_mapping);
2030
2031         /* catch async errors that were recorded back when async writeback
2032          * failed for pages in this mapping. */
2033         err = lli->lli_async_rc;
2034         lli->lli_async_rc = 0;
2035         if (rc == 0)
2036                 rc = err;
2037         if (lsm) {
2038                 err = lov_test_and_clear_async_rc(lsm);
2039                 if (rc == 0)
2040                         rc = err;
2041         }
2042
2043         oc = ll_mdscapa_get(inode);
2044         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2045                       &req);
2046         capa_put(oc);
2047         if (!rc)
2048                 rc = err;
2049         if (!err)
2050                 ptlrpc_req_finished(req);
2051
2052         if (data && lsm) {
2053                 struct obdo *oa;
2054
2055                 OBDO_ALLOC(oa);
2056                 if (!oa)
2057                         RETURN(rc ? rc : -ENOMEM);
2058
2059                 oa->o_id = lsm->lsm_object_id;
2060                 oa->o_gr = lsm->lsm_object_gr;
2061                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2062                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2063                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2064                                            OBD_MD_FLGROUP);
2065
2066                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2067                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2068                                0, OBD_OBJECT_EOF, oc);
2069                 capa_put(oc);
2070                 if (!rc)
2071                         rc = err;
2072                 OBDO_FREE(oa);
2073         }
2074
2075         RETURN(rc);
2076 }
2077
2078 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2079 {
2080         struct inode *inode = file->f_dentry->d_inode;
2081         struct ll_sb_info *sbi = ll_i2sbi(inode);
2082         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2083                                            .ei_cb_cp =ldlm_flock_completion_ast,
2084                                            .ei_cbdata = file_lock };
2085         struct md_op_data *op_data;
2086         struct lustre_handle lockh = {0};
2087         ldlm_policy_data_t flock;
2088         int flags = 0;
2089         int rc;
2090         ENTRY;
2091
2092         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2093                inode->i_ino, file_lock);
2094
2095         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2096
2097         if (file_lock->fl_flags & FL_FLOCK) {
2098                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2099                 /* set missing params for flock() calls */
2100                 file_lock->fl_end = OFFSET_MAX;
2101                 file_lock->fl_pid = current->tgid;
2102         }
2103         flock.l_flock.pid = file_lock->fl_pid;
2104         flock.l_flock.start = file_lock->fl_start;
2105         flock.l_flock.end = file_lock->fl_end;
2106
2107         switch (file_lock->fl_type) {
2108         case F_RDLCK:
2109                 einfo.ei_mode = LCK_PR;
2110                 break;
2111         case F_UNLCK:
2112                 /* An unlock request may or may not have any relation to
2113                  * existing locks so we may not be able to pass a lock handle
2114                  * via a normal ldlm_lock_cancel() request. The request may even
2115                  * unlock a byte range in the middle of an existing lock. In
2116                  * order to process an unlock request we need all of the same
2117                  * information that is given with a normal read or write record
2118                  * lock request. To avoid creating another ldlm unlock (cancel)
2119                  * message we'll treat a LCK_NL flock request as an unlock. */
2120                 einfo.ei_mode = LCK_NL;
2121                 break;
2122         case F_WRLCK:
2123                 einfo.ei_mode = LCK_PW;
2124                 break;
2125         default:
2126                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2127                 RETURN (-EINVAL);
2128         }
2129
2130         switch (cmd) {
2131         case F_SETLKW:
2132 #ifdef F_SETLKW64
2133         case F_SETLKW64:
2134 #endif
2135                 flags = 0;
2136                 break;
2137         case F_SETLK:
2138 #ifdef F_SETLK64
2139         case F_SETLK64:
2140 #endif
2141                 flags = LDLM_FL_BLOCK_NOWAIT;
2142                 break;
2143         case F_GETLK:
2144 #ifdef F_GETLK64
2145         case F_GETLK64:
2146 #endif
2147                 flags = LDLM_FL_TEST_LOCK;
2148                 /* Save the old mode so that if the mode in the lock changes we
2149                  * can decrement the appropriate reader or writer refcount. */
2150                 file_lock->fl_type = einfo.ei_mode;
2151                 break;
2152         default:
2153                 CERROR("unknown fcntl lock command: %d\n", cmd);
2154                 RETURN (-EINVAL);
2155         }
2156
2157         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2158                                      LUSTRE_OPC_ANY, NULL);
2159         if (IS_ERR(op_data))
2160                 RETURN(PTR_ERR(op_data));
2161
2162         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2163                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2164                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2165
2166         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2167                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2168
2169         ll_finish_md_op_data(op_data);
2170
2171         if ((file_lock->fl_flags & FL_FLOCK) &&
2172             (rc == 0 || file_lock->fl_type == F_UNLCK))
2173                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2174 #ifdef HAVE_F_OP_FLOCK
2175         if ((file_lock->fl_flags & FL_POSIX) &&
2176             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2177             !(flags & LDLM_FL_TEST_LOCK))
2178                 posix_lock_file_wait(file, file_lock);
2179 #endif
2180
2181         RETURN(rc);
2182 }
2183
2184 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2185 {
2186         ENTRY;
2187
2188         RETURN(-ENOSYS);
2189 }
2190
2191 int ll_have_md_lock(struct inode *inode, __u64 bits)
2192 {
2193         struct lustre_handle lockh;
2194         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2195         struct lu_fid *fid;
2196         int flags;
2197         ENTRY;
2198
2199         if (!inode)
2200                RETURN(0);
2201
2202         fid = &ll_i2info(inode)->lli_fid;
2203         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2204
2205         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2206         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2207                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2208                 RETURN(1);
2209         }
2210         RETURN(0);
2211 }
2212
2213 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2214                             struct lustre_handle *lockh)
2215 {
2216         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2217         struct lu_fid *fid;
2218         ldlm_mode_t rc;
2219         int flags;
2220         ENTRY;
2221
2222         fid = &ll_i2info(inode)->lli_fid;
2223         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2224
2225         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2226         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2227                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2228         RETURN(rc);
2229 }
2230
2231 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2232         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2233                               * and return success */
2234                 inode->i_nlink = 0;
2235                 /* This path cannot be hit for regular files unless in
2236                  * case of obscure races, so no need to to validate
2237                  * size. */
2238                 if (!S_ISREG(inode->i_mode) &&
2239                     !S_ISDIR(inode->i_mode))
2240                         return 0;
2241         }
2242
2243         if (rc) {
2244                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2245                 return -abs(rc);
2246
2247         }
2248
2249         return 0;
2250 }
2251
2252 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2253                              __u64 ibits)
2254 {
2255         struct inode *inode = dentry->d_inode;
2256         struct ptlrpc_request *req = NULL;
2257         struct ll_sb_info *sbi;
2258         struct obd_export *exp;
2259         int rc = 0;
2260         ENTRY;
2261
2262         if (!inode) {
2263                 CERROR("REPORT THIS LINE TO PETER\n");
2264                 RETURN(0);
2265         }
2266         sbi = ll_i2sbi(inode);
2267
2268         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2269                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2270
2271         exp = ll_i2mdexp(inode);
2272
2273         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2274                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2275                 struct md_op_data *op_data;
2276
2277                 /* Call getattr by fid, so do not provide name at all. */
2278                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2279                                              dentry->d_inode, NULL, 0, 0,
2280                                              LUSTRE_OPC_ANY, NULL);
2281                 if (IS_ERR(op_data))
2282                         RETURN(PTR_ERR(op_data));
2283
2284                 oit.it_create_mode |= M_CHECK_STALE;
2285                 rc = md_intent_lock(exp, op_data, NULL, 0,
2286                                     /* we are not interested in name
2287                                        based lookup */
2288                                     &oit, 0, &req,
2289                                     ll_md_blocking_ast, 0);
2290                 ll_finish_md_op_data(op_data);
2291                 oit.it_create_mode &= ~M_CHECK_STALE;
2292                 if (rc < 0) {
2293                         rc = ll_inode_revalidate_fini(inode, rc);
2294                         GOTO (out, rc);
2295                 }
2296
2297                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2298                 if (rc != 0) {
2299                         ll_intent_release(&oit);
2300                         GOTO(out, rc);
2301                 }
2302
2303                 /* Unlinked? Unhash dentry, so it is not picked up later by
2304                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2305                    here to preserve get_cwd functionality on 2.6.
2306                    Bug 10503 */
2307                 if (!dentry->d_inode->i_nlink) {
2308                         spin_lock(&ll_lookup_lock);
2309                         spin_lock(&dcache_lock);
2310                         ll_drop_dentry(dentry);
2311                         spin_unlock(&dcache_lock);
2312                         spin_unlock(&ll_lookup_lock);
2313                 }
2314
2315                 ll_lookup_finish_locks(&oit, dentry);
2316         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2317
2318                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2319                 obd_valid valid = OBD_MD_FLGETATTR;
2320                 struct obd_capa *oc;
2321                 int ealen = 0;
2322
2323                 if (S_ISREG(inode->i_mode)) {
2324                         rc = ll_get_max_mdsize(sbi, &ealen);
2325                         if (rc)
2326                                 RETURN(rc);
2327                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2328                 }
2329                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2330                  * capa for this inode. Because we only keep capas of dirs
2331                  * fresh. */
2332                 oc = ll_mdscapa_get(inode);
2333                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2334                                 ealen, &req);
2335                 capa_put(oc);
2336                 if (rc) {
2337                         rc = ll_inode_revalidate_fini(inode, rc);
2338                         RETURN(rc);
2339                 }
2340
2341                 rc = ll_prep_inode(&inode, req, NULL);
2342         }
2343 out:
2344         ptlrpc_req_finished(req);
2345         return rc;
2346 }
2347
2348 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2349 {
2350         int rc;
2351         ENTRY;
2352
2353         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2354                                                   MDS_INODELOCK_LOOKUP);
2355
2356         /* if object not yet allocated, don't validate size */
2357         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2358                 RETURN(0);
2359
2360         /* cl_glimpse_size will prefer locally cached writes if they extend
2361          * the file */
2362
2363         if (rc == 0)
2364                 rc = cl_glimpse_size(dentry->d_inode);
2365
2366         RETURN(rc);
2367 }
2368
2369 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2370                   struct lookup_intent *it, struct kstat *stat)
2371 {
2372         struct inode *inode = de->d_inode;
2373         int res = 0;
2374
2375         res = ll_inode_revalidate_it(de, it);
2376         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2377
2378         if (res)
2379                 return res;
2380
2381         stat->dev = inode->i_sb->s_dev;
2382         stat->ino = inode->i_ino;
2383         stat->mode = inode->i_mode;
2384         stat->nlink = inode->i_nlink;
2385         stat->uid = inode->i_uid;
2386         stat->gid = inode->i_gid;
2387         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2388         stat->atime = inode->i_atime;
2389         stat->mtime = inode->i_mtime;
2390         stat->ctime = inode->i_ctime;
2391 #ifdef HAVE_INODE_BLKSIZE
2392         stat->blksize = inode->i_blksize;
2393 #else
2394         stat->blksize = 1 << inode->i_blkbits;
2395 #endif
2396
2397         ll_inode_size_lock(inode, 0);
2398         stat->size = i_size_read(inode);
2399         stat->blocks = inode->i_blocks;
2400         ll_inode_size_unlock(inode, 0);
2401
2402         return 0;
2403 }
2404 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2405 {
2406         struct lookup_intent it = { .it_op = IT_GETATTR };
2407
2408         return ll_getattr_it(mnt, de, &it, stat);
2409 }
2410
/*
 * ACL callback used by the permission checks in this file.
 *
 * With POSIX ACL support compiled in, a reference to the ACL cached in the
 * inode's ll_inode_info is taken under lli_lock, evaluated against mask with
 * posix_acl_permission() and released again.
 *
 * Returns -EAGAIN when no ACL is cached or ACL support is compiled out,
 * otherwise the result of posix_acl_permission().
 */
static int lustre_check_acl(struct inode *inode, int mask)
{
#ifndef CONFIG_FS_POSIX_ACL
        return -EAGAIN;
#else
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *acl_ref;
        int rc;
        ENTRY;

        /* grab our own reference so the ACL can be used outside the lock */
        spin_lock(&lli->lli_lock);
        acl_ref = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        if (acl_ref == NULL)
                RETURN(-EAGAIN);

        rc = posix_acl_permission(inode, acl_ref, mask);
        posix_acl_release(acl_ref);

        RETURN(rc);
#endif
}
2435
2436 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2437 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2438 {
2439         int rc = 0;
2440         ENTRY;
2441
2442        /* as root inode are NOT getting validated in lookup operation,
2443         * need to do it before permission check. */
2444
2445         if (inode == inode->i_sb->s_root->d_inode) {
2446                 struct lookup_intent it = { .it_op = IT_LOOKUP };
2447
2448                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2449                                               MDS_INODELOCK_LOOKUP);
2450                 if (rc)
2451                         RETURN(rc);
2452         }
2453
2454         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2455                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2456
2457         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2458                 return lustre_check_remote_perm(inode, mask);
2459
2460         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2461         rc = generic_permission(inode, mask, lustre_check_acl);
2462
2463         RETURN(rc);
2464 }
2465 #else
2466 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2467 {
2468         int mode = inode->i_mode;
2469         int rc;
2470
2471         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2472                inode->i_ino, inode->i_generation, inode, mask);
2473
2474         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2475                 return lustre_check_remote_perm(inode, mask);
2476
2477         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2478
2479         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2480             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2481                 return -EROFS;
2482         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2483                 return -EACCES;
2484         if (current->fsuid == inode->i_uid) {
2485                 mode >>= 6;
2486         } else if (1) {
2487                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2488                         goto check_groups;
2489                 rc = lustre_check_acl(inode, mask);
2490                 if (rc == -EAGAIN)
2491                         goto check_groups;
2492                 if (rc == -EACCES)
2493                         goto check_capabilities;
2494                 return rc;
2495         } else {
2496 check_groups:
2497                 if (in_group_p(inode->i_gid))
2498                         mode >>= 3;
2499         }
2500         if ((mode & mask & S_IRWXO) == mask)
2501                 return 0;
2502
2503 check_capabilities:
2504         if (!(mask & MAY_EXEC) ||
2505             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2506                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2507                         return 0;
2508
2509         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2510             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2511                 return 0;
2512
2513         return -EACCES;
2514 }
2515 #endif
2516
/*
 * Name of the vectored/asynchronous read and write members of
 * struct file_operations, and of the llite functions that implement them.
 * Builds with HAVE_FILE_READV use readv/writev, all others use
 * aio_read/aio_write.
 */
#ifndef HAVE_FILE_READV
#define READ_METHOD aio_read
#define READ_FUNCTION ll_file_aio_read
#define WRITE_METHOD aio_write
#define WRITE_FUNCTION ll_file_aio_write
#else /* HAVE_FILE_READV */
#define READ_METHOD readv
#define READ_FUNCTION ll_file_readv
#define WRITE_METHOD writev
#define WRITE_FUNCTION ll_file_writev
#endif /* HAVE_FILE_READV */
2528
2529 /* -o localflock - only provides locally consistent flock locks */
2530 struct file_operations ll_file_operations = {
2531         .read           = ll_file_read,
2532         .READ_METHOD    = READ_FUNCTION,
2533         .write          = ll_file_write,
2534         .WRITE_METHOD   = WRITE_FUNCTION,
2535         .ioctl          = ll_file_ioctl,
2536         .open           = ll_file_open,
2537         .release        = ll_file_release,
2538         .mmap           = ll_file_mmap,
2539         .llseek         = ll_file_seek,
2540         .sendfile       = ll_file_sendfile,
2541         .fsync          = ll_fsync,
2542 };
2543
2544 struct file_operations ll_file_operations_flock = {
2545         .read           = ll_file_read,
2546         .READ_METHOD    = READ_FUNCTION,
2547         .write          = ll_file_write,
2548         .WRITE_METHOD   = WRITE_FUNCTION,
2549         .ioctl          = ll_file_ioctl,
2550         .open           = ll_file_open,
2551         .release        = ll_file_release,
2552         .mmap           = ll_file_mmap,
2553         .llseek         = ll_file_seek,
2554         .sendfile       = ll_file_sendfile,
2555         .fsync          = ll_fsync,
2556 #ifdef HAVE_F_OP_FLOCK
2557         .flock          = ll_file_flock,
2558 #endif
2559         .lock           = ll_file_flock
2560 };
2561
2562 /* These are for -o noflock - to return ENOSYS on flock calls */
2563 struct file_operations ll_file_operations_noflock = {
2564         .read           = ll_file_read,
2565         .READ_METHOD    = READ_FUNCTION,
2566         .write          = ll_file_write,
2567         .WRITE_METHOD   = WRITE_FUNCTION,
2568         .ioctl          = ll_file_ioctl,
2569         .open           = ll_file_open,
2570         .release        = ll_file_release,
2571         .mmap           = ll_file_mmap,
2572         .llseek         = ll_file_seek,
2573         .sendfile       = ll_file_sendfile,
2574         .fsync          = ll_fsync,
2575 #ifdef HAVE_F_OP_FLOCK
2576         .flock          = ll_file_noflock,
2577 #endif
2578         .lock           = ll_file_noflock
2579 };
2580
2581 struct inode_operations ll_file_inode_operations = {
2582 #ifdef HAVE_VFS_INTENT_PATCHES
2583         .setattr_raw    = ll_setattr_raw,
2584 #endif
2585         .setattr        = ll_setattr,
2586         .truncate       = ll_truncate,
2587         .getattr        = ll_getattr,
2588         .permission     = ll_inode_permission,
2589         .setxattr       = ll_setxattr,
2590         .getxattr       = ll_getxattr,
2591         .listxattr      = ll_listxattr,
2592         .removexattr    = ll_removexattr,
2593 };
2594
2595 /* dynamic ioctl number support routins */
2596 static struct llioc_ctl_data {
2597         struct rw_semaphore ioc_sem;
2598         struct list_head    ioc_head;
2599 } llioc = {
2600         __RWSEM_INITIALIZER(llioc.ioc_sem),
2601         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2602 };
2603
2604
2605 struct llioc_data {
2606         struct list_head        iocd_list;
2607         unsigned int            iocd_size;
2608         llioc_callback_t        iocd_cb;
2609         unsigned int            iocd_count;
2610         unsigned int            iocd_cmd[0];
2611 };
2612
2613 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2614 {
2615         unsigned int size;
2616         struct llioc_data *in_data = NULL;
2617         ENTRY;
2618
2619         if (cb == NULL || cmd == NULL ||
2620             count > LLIOC_MAX_CMD || count < 0)
2621                 RETURN(NULL);
2622
2623         size = sizeof(*in_data) + count * sizeof(unsigned int);
2624         OBD_ALLOC(in_data, size);
2625         if (in_data == NULL)
2626                 RETURN(NULL);
2627
2628         memset(in_data, 0, sizeof(*in_data));
2629         in_data->iocd_size = size;
2630         in_data->iocd_cb = cb;
2631         in_data->iocd_count = count;
2632         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2633
2634         down_write(&llioc.ioc_sem);
2635         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2636         up_write(&llioc.ioc_sem);
2637
2638         RETURN(in_data);
2639 }
2640
2641 void ll_iocontrol_unregister(void *magic)
2642 {
2643         struct llioc_data *tmp;
2644
2645         if (magic == NULL)
2646                 return;
2647
2648         down_write(&llioc.ioc_sem);
2649         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2650                 if (tmp == magic) {
2651                         unsigned int size = tmp->iocd_size;
2652
2653                         list_del(&tmp->iocd_list);
2654                         up_write(&llioc.ioc_sem);
2655
2656                         OBD_FREE(tmp, size);
2657                         return;
2658                 }
2659         }
2660         up_write(&llioc.ioc_sem);
2661
2662         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2663 }
2664
2665 EXPORT_SYMBOL(ll_iocontrol_register);
2666 EXPORT_SYMBOL(ll_iocontrol_unregister);
2667
2668 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2669                         unsigned int cmd, unsigned long arg, int *rcp)
2670 {
2671         enum llioc_iter ret = LLIOC_CONT;
2672         struct llioc_data *data;
2673         int rc = -EINVAL, i;
2674
2675         down_read(&llioc.ioc_sem);
2676         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2677                 for (i = 0; i < data->iocd_count; i++) {
2678                         if (cmd != data->iocd_cmd[i])
2679                                 continue;
2680
2681                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2682                         break;
2683                 }
2684
2685                 if (ret == LLIOC_STOP)
2686                         break;
2687         }
2688         up_read(&llioc.ioc_sem);
2689
2690         if (rcp)
2691                 *rcp = rc;
2692         return ret;
2693 }