Whamcloud - gitweb
79203b8d60b83a13d115fab60720c58652e36c0c
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
96             !S_ISREG(inode->i_mode))
97                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
98         else
99                 ll_epoch_close(inode, op_data, &och, 0);
100
101 out:
102         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
103         EXIT;
104 }
105
/*
 * Close the MDS open handle @och of @inode.
 *
 * Unless the MD obd device is missing or a forced umount is in progress, a
 * close is sent with md_close().  An -EAGAIN reply triggers a Size-on-MDS
 * update through ll_sizeonmds_update(); a failure of that update is logged
 * and turned into rc = 0.  When rc is 0 after that, ll_objects_destroy() is
 * called with the close request.
 *
 * On every path (including the early exits) @och is consumed: it is either
 * left for DONE_WRITING via ll_queue_done_writing(), or its replay data is
 * cleared and it is freed here.  Callers must not touch @och afterwards.
 *
 * \param md_exp  export used for md_close() and for clearing replay data
 * \param inode   inode the handle belongs to
 * \param och     open handle to close
 *
 * \retval 0        closed, or close skipped (no obd device / forced umount)
 * \retval -ENOMEM  op_data could not be allocated
 * \retval other    error from md_close() or ll_objects_destroy()
 */
static int ll_close_inode_openhandle(struct obd_export *md_exp,
                                     struct inode *inode,
                                     struct obd_client_handle *och)
{
        struct obd_export *exp = ll_i2mdexp(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct obd_device *obd = class_exp2obd(exp);
        /* Defaults to "closed" so that the early exits below free @och. */
        int epoch_close = 1;
        int rc;
        ENTRY;

        if (obd == NULL) {
                /*
                 * XXX: in case of LMV, is this correct to access
                 * ->exp_handle?
                 */
                CERROR("Invalid MDC connection handle "LPX64"\n",
                       ll_i2mdexp(inode)->exp_handle.h_cookie);
                GOTO(out, rc = 0);
        }

        /*
         * Here we check if this is a forced umount. If so, this is called on
         * canceling the "open lock" and we do not call md_close() in this
         * case, as it will not be successful: the import is already
         * deactivated.
         */
        if (obd->obd_force)
                GOTO(out, rc = 0);

        OBD_ALLOC_PTR(op_data);
        if (op_data == NULL)
                /* XXX We leak openhandle and request here.
                 * NOTE(review): the local @och is still freed at "out"
                 * (epoch_close is 1) and req is NULL here, so presumably
                 * the leak meant is the handle on the MDS, which never
                 * receives a close -- confirm. */
                GOTO(out, rc = -ENOMEM);

        ll_prepare_close(inode, op_data, och);
        /* Remember whether ll_prepare_close() flagged this close as an
         * epoch close; it decides the fate of @och at "out". */
        epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc == -EAGAIN) {
                /* This close must have the epoch closed. */
                LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
                LASSERT(epoch_close);
                /* MDS has instructed us to obtain the Size-on-MDS attribute
                 * from the OSTs and send a setattr back to the MDS. */
                rc = ll_sizeonmds_update(inode, &och->och_fh,
                                         op_data->op_ioepoch);
                if (rc) {
                        CERROR("inode %lu mdc Size-on-MDS update failed: "
                               "rc = %d\n", inode->i_ino, rc);
                        /* Deliberately not reported to the caller. */
                        rc = 0;
                }
        } else if (rc) {
                CERROR("inode %lu mdc close failed: rc = %d\n",
                       inode->i_ino, rc);
        }
        ll_finish_md_op_data(op_data);

        if (rc == 0) {
                rc = ll_objects_destroy(req, inode);
                if (rc)
                        CERROR("inode %lu ll_objects destroy: rc = %d\n",
                               inode->i_ino, rc);
        }

        EXIT;
out:

        /* Keep @och only for a SOM-enabled, regular file opened for write
         * whose epoch was not closed by this request; it is then handed to
         * ll_queue_done_writing().  In all other cases it is released. */
        if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
            S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
                ll_queue_done_writing(inode, LLIF_DONE_WRITING);
        } else {
                md_clear_open_replay_data(md_exp, och);
                /* Free @och if it is not waiting for DONE_WRITING. */
                och->och_fh.cookie = DEAD_HANDLE_MAGIC;
                OBD_FREE_PTR(och);
        }
        if (req) /* This is close request */
                ptlrpc_req_finished(req);
        return rc;
}
185
186 int ll_md_real_close(struct inode *inode, int flags)
187 {
188         struct ll_inode_info *lli = ll_i2info(inode);
189         struct obd_client_handle **och_p;
190         struct obd_client_handle *och;
191         __u64 *och_usecount;
192         int rc = 0;
193         ENTRY;
194
195         if (flags & FMODE_WRITE) {
196                 och_p = &lli->lli_mds_write_och;
197                 och_usecount = &lli->lli_open_fd_write_count;
198         } else if (flags & FMODE_EXEC) {
199                 och_p = &lli->lli_mds_exec_och;
200                 och_usecount = &lli->lli_open_fd_exec_count;
201         } else {
202                 LASSERT(flags & FMODE_READ);
203                 och_p = &lli->lli_mds_read_och;
204                 och_usecount = &lli->lli_open_fd_read_count;
205         }
206
207         down(&lli->lli_och_sem);
208         if (*och_usecount) { /* There are still users of this handle, so
209                                 skip freeing it. */
210                 up(&lli->lli_och_sem);
211                 RETURN(0);
212         }
213         och=*och_p;
214         *och_p = NULL;
215         up(&lli->lli_och_sem);
216
217         if (och) { /* There might be a race and somebody have freed this och
218                       already */
219                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
220                                                inode, och);
221         }
222
223         RETURN(rc);
224 }
225
226 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
227                 struct file *file)
228 {
229         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
230         struct ll_inode_info *lli = ll_i2info(inode);
231         int rc = 0;
232         ENTRY;
233
234         /* clear group lock, if present */
235         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
236 #if 0 /* XXX */
237                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
238                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
239                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
240                                       &fd->fd_cwlockh);
241 #endif
242         }
243
244         /* Let's see if we have good enough OPEN lock on the file and if
245            we can skip talking to MDS */
246         if (file->f_dentry->d_inode) { /* Can this ever be false? */
247                 int lockmode;
248                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
249                 struct lustre_handle lockh;
250                 struct inode *inode = file->f_dentry->d_inode;
251                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
252
253                 down(&lli->lli_och_sem);
254                 if (fd->fd_omode & FMODE_WRITE) {
255                         lockmode = LCK_CW;
256                         LASSERT(lli->lli_open_fd_write_count);
257                         lli->lli_open_fd_write_count--;
258                 } else if (fd->fd_omode & FMODE_EXEC) {
259                         lockmode = LCK_PR;
260                         LASSERT(lli->lli_open_fd_exec_count);
261                         lli->lli_open_fd_exec_count--;
262                 } else {
263                         lockmode = LCK_CR;
264                         LASSERT(lli->lli_open_fd_read_count);
265                         lli->lli_open_fd_read_count--;
266                 }
267                 up(&lli->lli_och_sem);
268
269                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
270                                    LDLM_IBITS, &policy, lockmode,
271                                    &lockh)) {
272                         rc = ll_md_real_close(file->f_dentry->d_inode,
273                                               fd->fd_omode);
274                 }
275         } else {
276                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
277                        file, file->f_dentry, file->f_dentry->d_name.name);
278         }
279
280         LUSTRE_FPRIVATE(file) = NULL;
281         ll_file_data_put(fd);
282         ll_capa_close(inode);
283
284         RETURN(rc);
285 }
286
/* Forward declaration only; the definition is not visible in this part of
 * the file.  ll_file_release() calls it on the inode's stripe MD. */
int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
288
289 /* While this returns an error code, fput() the caller does not, so we need
290  * to make every effort to clean up all of our state here.  Also, applications
291  * rarely check close errors and even if an error is returned they will not
292  * re-try the close call.
293  */
294 int ll_file_release(struct inode *inode, struct file *file)
295 {
296         struct ll_file_data *fd;
297         struct ll_sb_info *sbi = ll_i2sbi(inode);
298         struct ll_inode_info *lli = ll_i2info(inode);
299         struct lov_stripe_md *lsm = lli->lli_smd;
300         int rc;
301         ENTRY;
302
303         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
304                inode->i_generation, inode);
305
306 #ifdef CONFIG_FS_POSIX_ACL
307         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
308             inode == inode->i_sb->s_root->d_inode) {
309                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310
311                 LASSERT(fd != NULL);
312                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
313                         fd->fd_flags &= ~LL_FILE_RMTACL;
314                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
315                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
316                 }
317         }
318 #endif
319
320         if (inode->i_sb->s_root != file->f_dentry)
321                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
322         fd = LUSTRE_FPRIVATE(file);
323         LASSERT(fd != NULL);
324
325         /* The last ref on @file, maybe not the the owner pid of statahead.
326          * Different processes can open the same dir, "ll_opendir_key" means:
327          * it is me that should stop the statahead thread. */
328         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
329                 ll_stop_statahead(inode, lli->lli_opendir_key);
330
331         if (inode->i_sb->s_root == file->f_dentry) {
332                 LUSTRE_FPRIVATE(file) = NULL;
333                 ll_file_data_put(fd);
334                 RETURN(0);
335         }
336
337         if (lsm)
338                 lov_test_and_clear_async_rc(lsm);
339         lli->lli_async_rc = 0;
340
341         rc = ll_md_close(sbi->ll_md_exp, inode, file);
342         RETURN(rc);
343 }
344
345 static int ll_intent_file_open(struct file *file, void *lmm,
346                                int lmmsize, struct lookup_intent *itp)
347 {
348         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
349         struct dentry *parent = file->f_dentry->d_parent;
350         const char *name = file->f_dentry->d_name.name;
351         const int len = file->f_dentry->d_name.len;
352         struct md_op_data *op_data;
353         struct ptlrpc_request *req;
354         int rc;
355         ENTRY;
356
357         if (!parent)
358                 RETURN(-ENOENT);
359
360         /* Usually we come here only for NFSD, and we want open lock.
361            But we can also get here with pre 2.6.15 patchless kernels, and in
362            that case that lock is also ok */
363         /* We can also get here if there was cached open handle in revalidate_it
364          * but it disappeared while we were getting from there to ll_file_open.
365          * But this means this file was closed and immediatelly opened which
366          * makes a good candidate for using OPEN lock */
367         /* If lmmsize & lmm are not 0, we are just setting stripe info
368          * parameters. No need for the open lock */
369         if (!lmm && !lmmsize)
370                 itp->it_flags |= MDS_OPEN_LOCK;
371
372         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
373                                       file->f_dentry->d_inode, name, len,
374                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
375         if (IS_ERR(op_data))
376                 RETURN(PTR_ERR(op_data));
377
378         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
379                             0 /*unused */, &req, ll_md_blocking_ast, 0);
380         ll_finish_md_op_data(op_data);
381         if (rc == -ESTALE) {
382                 /* reason for keep own exit path - don`t flood log
383                 * with messages with -ESTALE errors.
384                 */
385                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
386                      it_open_error(DISP_OPEN_OPEN, itp))
387                         GOTO(out, rc);
388                 ll_release_openhandle(file->f_dentry, itp);
389                 GOTO(out, rc);
390         }
391
392         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
393                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
394                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
395                 GOTO(out, rc);
396         }
397
398         if (itp->d.lustre.it_lock_mode)
399                 md_set_lock_data(sbi->ll_md_exp,
400                                  &itp->d.lustre.it_lock_handle,
401                                  file->f_dentry->d_inode);
402
403         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
404 out:
405         ptlrpc_req_finished(itp->d.lustre.it_data);
406         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
407         ll_intent_drop_lock(itp);
408
409         RETURN(rc);
410 }
411
412 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
413                        struct lookup_intent *it, struct obd_client_handle *och)
414 {
415         struct ptlrpc_request *req = it->d.lustre.it_data;
416         struct mdt_body *body;
417
418         LASSERT(och);
419
420         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
421         LASSERT(body != NULL);                      /* reply already checked out */
422
423         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
424         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
425         och->och_fid = lli->lli_fid;
426         och->och_flags = it->it_flags;
427         lli->lli_ioepoch = body->ioepoch;
428
429         return md_set_open_replay_data(md_exp, och, req);
430 }
431
432 int ll_local_open(struct file *file, struct lookup_intent *it,
433                   struct ll_file_data *fd, struct obd_client_handle *och)
434 {
435         struct inode *inode = file->f_dentry->d_inode;
436         struct ll_inode_info *lli = ll_i2info(inode);
437         ENTRY;
438
439         LASSERT(!LUSTRE_FPRIVATE(file));
440
441         LASSERT(fd != NULL);
442
443         if (och) {
444                 struct ptlrpc_request *req = it->d.lustre.it_data;
445                 struct mdt_body *body;
446                 int rc;
447
448                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
449                 if (rc)
450                         RETURN(rc);
451
452                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
453                 if ((it->it_flags & FMODE_WRITE) &&
454                     (body->valid & OBD_MD_FLSIZE))
455                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
456                                lli->lli_ioepoch, PFID(&lli->lli_fid));
457         }
458
459         LUSTRE_FPRIVATE(file) = fd;
460         ll_readahead_init(inode, &fd->fd_ras);
461         fd->fd_omode = it->it_flags;
462         RETURN(0);
463 }
464
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until ll_lov_setstripe() ioctl is called.  We grab
 * lli_open_sem to ensure no other process will create objects, send the
 * stripe MD to the MDS, or try to destroy the objects if that fails.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 *
 * NOTE(review): the body below serializes on lli_och_sem and contains no
 * object creation call; lli_open_sem is not referenced in this function.
 * The description above may be stale -- confirm.
 *
 * What the code does: allocate the per-open ll_file_data; for directories
 * claim the statahead "opendir" key/pid; for the root dentry just attach
 * the data and return.  Otherwise pick the cached MDS open handle matching
 * the access mode (write, exec, read) and either reuse it (bumping its use
 * count) or create a new one from the open intent, issuing the intent
 * request first if the VFS did not supply a completed one.
 *
 * \retval 0        success
 * \retval -ENOMEM  ll_file_data or open handle allocation failed
 * \retval other    error from it_open_error(), ll_intent_file_open() or
 *                  ll_local_open()
 */
int ll_file_open(struct inode *inode, struct file *file)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct lov_stripe_md *lsm;
        struct ptlrpc_request *req = NULL;
        struct obd_client_handle **och_p;
        __u64 *och_usecount;
        struct ll_file_data *fd;
        int rc = 0, opendir_set = 0;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
               inode->i_generation, inode, file->f_flags);

        /* Fetch the intent handed over by the lookup path, if any. */
#ifdef HAVE_VFS_INTENT_PATCHES
        it = file->f_it;
#else
        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */
#endif

        fd = ll_file_data_get();
        if (fd == NULL)
                RETURN(-ENOMEM);

        fd->fd_file = file;
        if (S_ISDIR(inode->i_mode)) {
again:
                spin_lock(&lli->lli_lock);
                if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
                        /* Nobody owns statahead for this directory yet:
                         * this open becomes the owner. */
                        LASSERT(lli->lli_sai == NULL);
                        lli->lli_opendir_key = fd;
                        lli->lli_opendir_pid = cfs_curproc_pid();
                        opendir_set = 1;
                } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
                                    lli->lli_opendir_key != NULL)) {
                        /* Two cases for this:
                         * (1) The same process opens this directory many
                         *     times.
                         * (2) The old process opened the directory, and
                         *     exited before its children processes. Then a
                         *     new process with the same pid opens this
                         *     directory before the old process's children
                         *     processes exit.
                         * Reset statahead for such cases. */
                        spin_unlock(&lli->lli_lock);
                        CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
                               " reset it.\n", file->f_dentry->d_name.len,
                               file->f_dentry->d_name.name,
                               PFID(&lli->lli_fid));
                        ll_stop_statahead(inode, lli->lli_opendir_key);
                        /* Retry with the lock re-taken. */
                        goto again;
                }
                spin_unlock(&lli->lli_lock);
        }

        /* Opening the filesystem root: attach the private data and return
         * without touching the MDS open handles. */
        if (inode->i_sb->s_root == file->f_dentry) {
                LUSTRE_FPRIVATE(file) = fd;
                RETURN(0);
        }

        /* No intent, or an intent that has not been executed: build our own
         * from the file flags. */
        if (!it || !it->d.lustre.it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but O_ACCMODE mask was stripped from
                 * there.
                 * NOTE(review): the increment presumably maps the O_ACCMODE
                 * value onto FMODE_READ/FMODE_WRITE bits and so relies on
                 * the kernel's numeric encoding of both -- confirm. */
                if ((oit.it_flags + 1) & O_ACCMODE)
                        oit.it_flags++;
                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* kernel only call f_op->open in dentry_open.  filp_open calls
                 * dentry_open after call to open_namei that checks permissions.
                 * Only nfsd_open call dentry_open directly without checking
                 * permissions and because of that this code below is safe. */
                if (oit.it_flags & FMODE_WRITE)
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                it = &oit;
        }

restart:
        /* Let's see if we have file open on MDS already. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
         } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        down(&lli->lli_och_sem);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's extra open request that we do not need,
                           let's close it somehow. This will decref request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                        if (rc) {
                                /* out_openerr expects the semaphore to be
                                 * released and fd to be freed already. */
                                up(&lli->lli_och_sem);
                                ll_file_data_put(fd);
                                GOTO(out_openerr, rc);
                        }
                        ll_release_openhandle(file->f_dentry, it);
                        lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
                                             LPROC_LL_OPEN);
                }
                (*och_usecount)++;

                /* Reuse the cached handle: no new och is passed. */
                rc = ll_local_open(file, it, fd, NULL);
                if (rc) {
                        (*och_usecount)--;
                        up(&lli->lli_och_sem);
                        ll_file_data_put(fd);
                        GOTO(out_openerr, rc);
                }
        } else {
                LASSERT(*och_usecount == 0);
                if (!it->d.lustre.it_disposition) {
                        /* We cannot just request lock handle now, new ELC code
                           means that one of other OPEN locks for this file
                           could be cancelled, and since blocking ast handler
                           would attempt to grab och_sem as well, that would
                           result in a deadlock */
                        up(&lli->lli_och_sem);
                        it->it_create_mode |= M_CHECK_STALE;
                        rc = ll_intent_file_open(file, NULL, 0, it);
                        it->it_create_mode &= ~M_CHECK_STALE;
                        if (rc) {
                                ll_file_data_put(fd);
                                GOTO(out_openerr, rc);
                        }

                        /* Got some error? Release the request */
                        if (it->d.lustre.it_status < 0) {
                                req = it->d.lustre.it_data;
                                ptlrpc_req_finished(req);
                        }
                        md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
                                         &it->d.lustre.it_lock_handle,
                                         file->f_dentry->d_inode);
                        /* The intent now has a disposition; redo the handle
                         * selection with the semaphore re-taken. */
                        goto restart;
                }
                OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
                if (!*och_p) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc = -ENOMEM);
                }
                (*och_usecount)++;
                req = it->d.lustre.it_data;

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here
                 * (bug 3430) */
                /* XXX (green): Should not we bail out on any error here, not
                 * just open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                if (rc) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc);
                }

                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
                rc = ll_local_open(file, it, fd, *och_p);
                if (rc) {
                        ll_file_data_put(fd);
                        GOTO(out_och_free, rc);
                }
        }
        up(&lli->lli_och_sem);

        /* Must do this outside lli_och_sem lock to prevent deadlock where
           different kind of OPEN lock for this same inode gets cancelled
           by ldlm_cancel_lru */
        if (!S_ISREG(inode->i_mode))
                GOTO(out, rc);

        ll_capa_open(inode);

        lsm = lli->lli_smd;
        if (lsm == NULL) {
                /* No stripe MD yet: leave O_LOV_DELAY_CREATE in f_flags when
                 * creation was explicitly delayed or the file is not open
                 * for write. */
                if (file->f_flags & O_LOV_DELAY_CREATE ||
                    !(file->f_mode & FMODE_WRITE)) {
                        CDEBUG(D_INODE, "object creation was delayed\n");
                        GOTO(out, rc);
                }
        }
        file->f_flags &= ~O_LOV_DELAY_CREATE;
        GOTO(out, rc);
out:
        /* rc is 0 on every path that jumps to "out" and lli_och_sem has
         * already been released there, so the error block below is skipped
         * on those paths. */
        ptlrpc_req_finished(req);
        if (req)
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
out_och_free:
        /* Entered directly with lli_och_sem held and rc != 0. */
        if (rc) {
                if (*och_p) {
                        OBD_FREE(*och_p, sizeof (struct obd_client_handle));
                        *och_p = NULL; /* OBD_FREE writes some magic there */
                        (*och_usecount)--;
                }
                up(&lli->lli_och_sem);
                /* Jumped to from outside this block with rc != 0 and the
                 * semaphore already released. */
out_openerr:
                if (opendir_set != 0)
                        ll_stop_statahead(inode, lli->lli_opendir_key);
        }

        return rc;
}
693
694 /* Fills the obdo with the attributes for the lsm */
695 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
696                           struct obd_capa *capa, struct obdo *obdo)
697 {
698         struct ptlrpc_request_set *set;
699         struct obd_info            oinfo = { { { 0 } } };
700         int                        rc;
701
702         ENTRY;
703
704         LASSERT(lsm != NULL);
705
706         oinfo.oi_md = lsm;
707         oinfo.oi_oa = obdo;
708         oinfo.oi_oa->o_id = lsm->lsm_object_id;
709         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
710         oinfo.oi_oa->o_mode = S_IFREG;
711         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
712                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
713                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
714                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
715                                OBD_MD_FLGROUP;
716         oinfo.oi_capa = capa;
717
718         set = ptlrpc_prep_set();
719         if (set == NULL) {
720                 CERROR("can't allocate ptlrpc set\n");
721                 rc = -ENOMEM;
722         } else {
723                 rc = obd_getattr_async(exp, &oinfo, set);
724                 if (rc == 0)
725                         rc = ptlrpc_set_wait(set);
726                 ptlrpc_set_destroy(set);
727         }
728         if (rc == 0)
729                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
730                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
731                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
732         RETURN(rc);
733 }
734
735 /* Fills the obdo with the attributes for the inode defined by lsm */
736 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
737 {
738         struct ll_inode_info *lli  = ll_i2info(inode);
739         struct obd_capa      *capa = ll_mdscapa_get(inode);
740         int rc;
741         ENTRY;
742
743         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
744         capa_put(capa);
745         if (rc == 0) {
746                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
747                 CDEBUG(D_INODE,
748                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
749                        lli->lli_smd->lsm_object_id, i_size_read(inode),
750                        (unsigned long long)inode->i_blocks,
751                        (unsigned long)ll_inode_blksize(inode));
752         }
753         RETURN(rc);
754 }
755
756 int ll_merge_lvb(struct inode *inode)
757 {
758         struct ll_inode_info *lli = ll_i2info(inode);
759         struct ll_sb_info *sbi = ll_i2sbi(inode);
760         struct ost_lvb lvb;
761         int rc;
762
763         ENTRY;
764
765         ll_inode_size_lock(inode, 1);
766         inode_init_lvb(inode, &lvb);
767         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
768         i_size_write(inode, lvb.lvb_size);
769         inode->i_blocks = lvb.lvb_blocks;
770
771         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
772         LTIME_S(inode->i_atime) = lvb.lvb_atime;
773         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
774         ll_inode_size_unlock(inode, 1);
775
776         RETURN(rc);
777 }
778
779 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
780                      lstat_t *st)
781 {
782         struct obdo obdo = { 0 };
783         int rc;
784
785         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
786         if (rc == 0) {
787                 st->st_size   = obdo.o_size;
788                 st->st_blocks = obdo.o_blocks;
789                 st->st_mtime  = obdo.o_mtime;
790                 st->st_atime  = obdo.o_atime;
791                 st->st_ctime  = obdo.o_ctime;
792         }
793         return rc;
794 }
795
796 void ll_io_init(struct cl_io *io, const struct file *file, int write)
797 {
798         struct inode *inode     = file->f_dentry->d_inode;
799         struct ll_sb_info *sbi  = ll_i2sbi(inode);
800         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
801
802         LASSERT(fd != NULL);
803         memset(io, 0, sizeof *io);
804         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
805         if (write)
806                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
807         io->ci_obj     = ll_i2info(inode)->lli_clob;
808         io->ci_lockreq = CILR_MAYBE;
809         if (fd->fd_flags & LL_FILE_IGNORE_LOCK || sbi->ll_flags & LL_SBI_NOLCK)
810                 io->ci_lockreq = CILR_NEVER;
811         else if (file->f_flags & O_APPEND)
812                 io->ci_lockreq = CILR_MANDATORY;
813 }
814
815 static ssize_t ll_file_io_generic(const struct lu_env *env,
816                 struct ccc_io_args *args, struct file *file,
817                 enum cl_io_type iot, loff_t *ppos, size_t count)
818 {
819         struct cl_io       *io;
820         ssize_t             result;
821         ENTRY;
822
823         io = &ccc_env_info(env)->cti_io;
824         ll_io_init(io, file, iot == CIT_WRITE);
825
826         if (iot == CIT_READ)
827                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
828
829         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
830                 struct vvp_io *vio = vvp_env_io(env);
831                 struct ccc_io *cio = ccc_env_io(env);
832                 if (cl_io_is_sendfile(io)) {
833                         vio->u.read.cui_actor = args->cia_actor;
834                         vio->u.read.cui_target = args->cia_target;
835                 } else {
836                         cio->cui_iov = args->cia_iov;
837                         cio->cui_nrsegs = args->cia_nrsegs;
838 #ifndef HAVE_FILE_WRITEV
839                         cio->cui_iocb = args->cia_iocb;
840 #endif
841                 }
842                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
843                 result = cl_io_loop(env, io);
844         } else
845                 /* cl_io_rw_init() handled IO */
846                 result = io->ci_result;
847         if (io->ci_nob > 0) {
848                 result = io->ci_nob;
849                 *ppos = io->u.ci_wr.wr.crw_pos;
850         }
851         cl_io_fini(env, io);
852         RETURN(result);
853 }
854
855
856 /*
857  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
858  */
859 static int ll_file_get_iov_count(const struct iovec *iov,
860                                  unsigned long *nr_segs, size_t *count)
861 {
862         size_t cnt = 0;
863         unsigned long seg;
864
865         for (seg = 0; seg < *nr_segs; seg++) {
866                 const struct iovec *iv = &iov[seg];
867
868                 /*
869                  * If any segment has a negative length, or the cumulative
870                  * length ever wraps negative then return -EINVAL.
871                  */
872                 cnt += iv->iov_len;
873                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
874                         return -EINVAL;
875                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
876                         continue;
877                 if (seg == 0)
878                         return -EFAULT;
879                 *nr_segs = seg;
880                 cnt -= iv->iov_len;   /* This segment is no good */
881                 break;
882         }
883         *count = cnt;
884         return 0;
885 }
886
887 #ifdef HAVE_FILE_READV
888 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
889                               unsigned long nr_segs, loff_t *ppos)
890 {
891         struct lu_env      *env;
892         struct ccc_io_args *args;
893         size_t              count;
894         ssize_t             result;
895         int                 refcheck;
896         ENTRY;
897
898         result = ll_file_get_iov_count(iov, &nr_segs, &count);
899         if (result)
900                 RETURN(result);
901
902         env = cl_env_get(&refcheck);
903         if (IS_ERR(env))
904                 RETURN(PTR_ERR(env));
905
906         args = &vvp_env_info(env)->vti_args;
907         args->cia_is_sendfile = 0;
908         args->cia_iov = (struct iovec *)iov;
909         args->cia_nrsegs = nr_segs;
910         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
911         cl_env_put(env, &refcheck);
912         RETURN(result);
913 }
914
915 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
916                             loff_t *ppos)
917 {
918         struct lu_env *env;
919         struct iovec  *local_iov;
920         ssize_t        result;
921         int            refcheck;
922         ENTRY;
923
924         env = cl_env_get(&refcheck);
925         if (IS_ERR(env))
926                 RETURN(PTR_ERR(env));
927
928         local_iov = &vvp_env_info(env)->vti_local_iov;
929         local_iov->iov_base = (void __user *)buf;
930         local_iov->iov_len = count;
931         result = ll_file_readv(file, local_iov, 1, ppos);
932         cl_env_put(env, &refcheck);
933         RETURN(result);
934 }
935
936 #else
937 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
938                                 unsigned long nr_segs, loff_t pos)
939 {
940         struct lu_env      *env;
941         struct ccc_io_args *args;
942         size_t              count;
943         ssize_t             result;
944         int                 refcheck;
945         ENTRY;
946
947         result = ll_file_get_iov_count(iov, &nr_segs, &count);
948         if (result)
949                 RETURN(result);
950
951         env = cl_env_get(&refcheck);
952         if (IS_ERR(env))
953                 RETURN(PTR_ERR(env));
954
955         args = &vvp_env_info(env)->vti_args;
956         args->cia_is_sendfile = 0;
957         args->cia_iov = (struct iovec *)iov;
958         args->cia_nrsegs = nr_segs;
959         args->cia_iocb = iocb;
960         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
961                                     &iocb->ki_pos, count);
962         cl_env_put(env, &refcheck);
963         RETURN(result);
964 }
965
966 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
967                             loff_t *ppos)
968 {
969         struct lu_env *env;
970         struct iovec  *local_iov;
971         struct kiocb  *kiocb;
972         ssize_t        result;
973         int            refcheck;
974         ENTRY;
975
976         env = cl_env_get(&refcheck);
977         if (IS_ERR(env))
978                 RETURN(PTR_ERR(env));
979
980         local_iov = &vvp_env_info(env)->vti_local_iov;
981         kiocb = &vvp_env_info(env)->vti_kiocb;
982         local_iov->iov_base = (void __user *)buf;
983         local_iov->iov_len = count;
984         init_sync_kiocb(kiocb, file);
985         kiocb->ki_pos = *ppos;
986         kiocb->ki_left = count;
987
988         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
989         *ppos = kiocb->ki_pos;
990
991         cl_env_put(env, &refcheck);
992         RETURN(result);
993 }
994 #endif
995
996 /*
997  * Write to a file (through the page cache).
998  */
999 #ifdef HAVE_FILE_WRITEV
1000 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1001                               unsigned long nr_segs, loff_t *ppos)
1002 {
1003         struct lu_env      *env;
1004         struct ccc_io_args *args;
1005         size_t              count;
1006         ssize_t             result;
1007         int                 refcheck;
1008         ENTRY;
1009
1010         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1011         if (result)
1012                 RETURN(result);
1013
1014         env = cl_env_get(&refcheck);
1015         if (IS_ERR(env))
1016                 RETURN(PTR_ERR(env));
1017
1018         args = &vvp_env_info(env)->vti_args;
1019         args->cia_iov = (struct iovec *)iov;
1020         args->cia_nrsegs = nr_segs;
1021         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1022         cl_env_put(env, &refcheck);
1023         RETURN(result);
1024 }
1025
1026 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1027                              loff_t *ppos)
1028 {
1029         struct lu_env    *env;
1030         struct iovec     *local_iov;
1031         ssize_t           result;
1032         int               refcheck;
1033         ENTRY;
1034
1035         env = cl_env_get(&refcheck);
1036         if (IS_ERR(env))
1037                 RETURN(PTR_ERR(env));
1038
1039         local_iov = &vvp_env_info(env)->vti_local_iov;
1040         local_iov->iov_base = (void __user *)buf;
1041         local_iov->iov_len = count;
1042
1043         result = ll_file_writev(file, local_iov, 1, ppos);
1044         cl_env_put(env, &refcheck);
1045         RETURN(result);
1046 }
1047
1048 #else /* AIO stuff */
1049 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1050                                  unsigned long nr_segs, loff_t pos)
1051 {
1052         struct lu_env      *env;
1053         struct ccc_io_args *args;
1054         size_t              count;
1055         ssize_t             result;
1056         int                 refcheck;
1057         ENTRY;
1058
1059         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1060         if (result)
1061                 RETURN(result);
1062
1063         env = cl_env_get(&refcheck);
1064         if (IS_ERR(env))
1065                 RETURN(PTR_ERR(env));
1066
1067         args = &vvp_env_info(env)->vti_args;
1068         args->cia_iov = (struct iovec *)iov;
1069         args->cia_nrsegs = nr_segs;
1070         args->cia_iocb = iocb;
1071         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1072                                   &iocb->ki_pos, count);
1073         cl_env_put(env, &refcheck);
1074         RETURN(result);
1075 }
1076
1077 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1078                              loff_t *ppos)
1079 {
1080         struct lu_env *env;
1081         struct iovec  *local_iov;
1082         struct kiocb  *kiocb;
1083         ssize_t        result;
1084         int            refcheck;
1085         ENTRY;
1086
1087         env = cl_env_get(&refcheck);
1088         if (IS_ERR(env))
1089                 RETURN(PTR_ERR(env));
1090
1091         local_iov = &vvp_env_info(env)->vti_local_iov;
1092         kiocb = &vvp_env_info(env)->vti_kiocb;
1093         local_iov->iov_base = (void __user *)buf;
1094         local_iov->iov_len = count;
1095         init_sync_kiocb(kiocb, file);
1096         kiocb->ki_pos = *ppos;
1097         kiocb->ki_left = count;
1098
1099         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1100         *ppos = kiocb->ki_pos;
1101
1102         cl_env_put(env, &refcheck);
1103         RETURN(result);
1104 }
1105 #endif
1106
1107
1108 /*
1109  * Send file content (through pagecache) somewhere with helper
1110  */
1111 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1112                                 read_actor_t actor, void *target)
1113 {
1114         struct lu_env      *env;
1115         struct ccc_io_args *args;
1116         ssize_t             result;
1117         int                 refcheck;
1118         ENTRY;
1119
1120         env = cl_env_get(&refcheck);
1121         if (IS_ERR(env))
1122                 RETURN(PTR_ERR(env));
1123
1124         args = &vvp_env_info(env)->vti_args;
1125         args->cia_is_sendfile = 1;
1126         args->cia_target = target;
1127         args->cia_actor = actor;
1128         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1129         cl_env_put(env, &refcheck);
1130         RETURN(result);
1131 }
1132
1133 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1134                                unsigned long arg)
1135 {
1136         struct obd_export *exp = ll_i2dtexp(inode);
1137         struct ll_recreate_obj ucreatp;
1138         struct obd_trans_info oti = { 0 };
1139         struct obdo *oa = NULL;
1140         int lsm_size;
1141         int rc = 0;
1142         struct lov_stripe_md *lsm, *lsm2;
1143         ENTRY;
1144
1145         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1146                 RETURN(-EPERM);
1147
1148         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1149                            sizeof(struct ll_recreate_obj)))
1150                 RETURN(-EFAULT);
1151
1152         OBDO_ALLOC(oa);
1153         if (oa == NULL)
1154                 RETURN(-ENOMEM);
1155
1156         ll_inode_size_lock(inode, 0);
1157         lsm = ll_i2info(inode)->lli_smd;
1158         if (lsm == NULL)
1159                 GOTO(out, rc = -ENOENT);
1160         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1161                    (lsm->lsm_stripe_count));
1162
1163         OBD_ALLOC(lsm2, lsm_size);
1164         if (lsm2 == NULL)
1165                 GOTO(out, rc = -ENOMEM);
1166
1167         oa->o_id = ucreatp.lrc_id;
1168         oa->o_gr = ucreatp.lrc_group;
1169         oa->o_nlink = ucreatp.lrc_ost_idx;
1170         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1171         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1172         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1173                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1174
1175         memcpy(lsm2, lsm, lsm_size);
1176         rc = obd_create(exp, oa, &lsm2, &oti);
1177
1178         OBD_FREE(lsm2, lsm_size);
1179         GOTO(out, rc);
1180 out:
1181         ll_inode_size_unlock(inode, 0);
1182         OBDO_FREE(oa);
1183         return rc;
1184 }
1185
1186 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1187                              int flags, struct lov_user_md *lum, int lum_size)
1188 {
1189         struct lov_stripe_md *lsm;
1190         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1191         int rc = 0;
1192         ENTRY;
1193
1194         ll_inode_size_lock(inode, 0);
1195         lsm = ll_i2info(inode)->lli_smd;
1196         if (lsm) {
1197                 ll_inode_size_unlock(inode, 0);
1198                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1199                        inode->i_ino);
1200                 RETURN(-EEXIST);
1201         }
1202
1203         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1204         if (rc)
1205                 GOTO(out, rc);
1206         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1207                 GOTO(out_req_free, rc = -ENOENT);
1208         rc = oit.d.lustre.it_status;
1209         if (rc < 0)
1210                 GOTO(out_req_free, rc);
1211
1212         ll_release_openhandle(file->f_dentry, &oit);
1213
1214  out:
1215         ll_inode_size_unlock(inode, 0);
1216         ll_intent_release(&oit);
1217         RETURN(rc);
1218 out_req_free:
1219         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1220         goto out;
1221 }
1222
1223 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1224                              struct lov_mds_md **lmmp, int *lmm_size,
1225                              struct ptlrpc_request **request)
1226 {
1227         struct ll_sb_info *sbi = ll_i2sbi(inode);
1228         struct mdt_body  *body;
1229         struct lov_mds_md *lmm = NULL;
1230         struct ptlrpc_request *req = NULL;
1231         struct obd_capa *oc;
1232         int rc, lmmsize;
1233
1234         rc = ll_get_max_mdsize(sbi, &lmmsize);
1235         if (rc)
1236                 RETURN(rc);
1237
1238         oc = ll_mdscapa_get(inode);
1239         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1240                              oc, filename, strlen(filename) + 1,
1241                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1242                              ll_i2suppgid(inode), &req);
1243         capa_put(oc);
1244         if (rc < 0) {
1245                 CDEBUG(D_INFO, "md_getattr_name failed "
1246                        "on %s: rc %d\n", filename, rc);
1247                 GOTO(out, rc);
1248         }
1249
1250         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1251         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1252
1253         lmmsize = body->eadatasize;
1254
1255         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1256                         lmmsize == 0) {
1257                 GOTO(out, rc = -ENODATA);
1258         }
1259
1260         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1261         LASSERT(lmm != NULL);
1262
1263         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1264             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1265             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1266                 GOTO(out, rc = -EPROTO);
1267         }
1268
1269         /*
1270          * This is coming from the MDS, so is probably in
1271          * little endian.  We convert it to host endian before
1272          * passing it to userspace.
1273          */
1274         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1275                 /* if function called for directory - we should
1276                  * avoid swab not existent lsm objects */
1277                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1278                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1279                         if (S_ISREG(body->mode))
1280                                 lustre_swab_lov_user_md_objects(
1281                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1282                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1283                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1284                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1285                         if (S_ISREG(body->mode))
1286                                 lustre_swab_lov_user_md_objects(
1287                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1288                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1289                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1290                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1291                 }
1292         }
1293
1294         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1295                 struct lov_stripe_md *lsm;
1296                 struct lov_user_md_join *lmj;
1297                 int lmj_size, i, aindex = 0;
1298
1299                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1300                 if (rc < 0)
1301                         GOTO(out, rc = -ENOMEM);
1302                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1303                 if (rc)
1304                         GOTO(out_free_memmd, rc);
1305
1306                 lmj_size = sizeof(struct lov_user_md_join) +
1307                            lsm->lsm_stripe_count *
1308                            sizeof(struct lov_user_ost_data_join);
1309                 OBD_ALLOC(lmj, lmj_size);
1310                 if (!lmj)
1311                         GOTO(out_free_memmd, rc = -ENOMEM);
1312
1313                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1314                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1315                         struct lov_extent *lex =
1316                                 &lsm->lsm_array->lai_ext_array[aindex];
1317
1318                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1319                                 aindex ++;
1320                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1321                                         LPU64" len %d\n", aindex, i,
1322                                         lex->le_start, (int)lex->le_len);
1323                         lmj->lmm_objects[i].l_extent_start =
1324                                 lex->le_start;
1325
1326                         if ((int)lex->le_len == -1)
1327                                 lmj->lmm_objects[i].l_extent_end = -1;
1328                         else
1329                                 lmj->lmm_objects[i].l_extent_end =
1330                                         lex->le_start + lex->le_len;
1331                         lmj->lmm_objects[i].l_object_id =
1332                                 lsm->lsm_oinfo[i]->loi_id;
1333                         lmj->lmm_objects[i].l_object_gr =
1334                                 lsm->lsm_oinfo[i]->loi_gr;
1335                         lmj->lmm_objects[i].l_ost_gen =
1336                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1337                         lmj->lmm_objects[i].l_ost_idx =
1338                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1339                 }
1340                 lmm = (struct lov_mds_md *)lmj;
1341                 lmmsize = lmj_size;
1342 out_free_memmd:
1343                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1344         }
1345 out:
1346         *lmmp = lmm;
1347         *lmm_size = lmmsize;
1348         *request = req;
1349         return rc;
1350 }
1351
1352 static int ll_lov_setea(struct inode *inode, struct file *file,
1353                             unsigned long arg)
1354 {
1355         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1356         struct lov_user_md  *lump;
1357         int lum_size = sizeof(struct lov_user_md) +
1358                        sizeof(struct lov_user_ost_data);
1359         int rc;
1360         ENTRY;
1361
1362         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1363                 RETURN(-EPERM);
1364
1365         OBD_ALLOC(lump, lum_size);
1366         if (lump == NULL) {
1367                 RETURN(-ENOMEM);
1368         }
1369         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1370                 OBD_FREE(lump, lum_size);
1371                 RETURN(-EFAULT);
1372         }
1373
1374         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1375
1376         OBD_FREE(lump, lum_size);
1377         RETURN(rc);
1378 }
1379
1380 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1381                             unsigned long arg)
1382 {
1383         struct lov_user_md_v3 lumv3;
1384         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1385         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1386         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1387         int lum_size;
1388         int rc;
1389         int flags = FMODE_WRITE;
1390         ENTRY;
1391
1392         /* first try with v1 which is smaller than v3 */
1393         lum_size = sizeof(struct lov_user_md_v1);
1394         if (copy_from_user(lumv1, lumv1p, lum_size))
1395                 RETURN(-EFAULT);
1396
1397         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1398                 lum_size = sizeof(struct lov_user_md_v3);
1399                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1400                         RETURN(-EFAULT);
1401         }
1402
1403         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1404         if (rc == 0) {
1405                  put_user(0, &lumv1p->lmm_stripe_count);
1406                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1407                                     0, ll_i2info(inode)->lli_smd,
1408                                     (void *)arg);
1409         }
1410         RETURN(rc);
1411 }
1412
1413 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1414 {
1415         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1416
1417         if (!lsm)
1418                 RETURN(-ENODATA);
1419
1420         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1421                             (void *)arg);
1422 }
1423
/**
 * Placeholder for taking a group lock: not implemented.
 *
 * \retval -ENOSYS always
 */
static int ll_get_grouplock(struct inode *inode, struct file *file,
                            unsigned long arg)
{
        /* XXX */
        int rc = -ENOSYS;

        return rc;
}
1430
/**
 * Placeholder for dropping a group lock: not implemented.
 *
 * \retval -ENOSYS always
 */
static int ll_put_grouplock(struct inode *inode, struct file *file,
                            unsigned long arg)
{
        /* XXX */
        int rc = -ENOSYS;

        return rc;
}
1437
1438 #if LUSTRE_FIX >= 50
1439 static int join_sanity_check(struct inode *head, struct inode *tail)
1440 {
1441         ENTRY;
1442         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1443                 CERROR("server do not support join \n");
1444                 RETURN(-EINVAL);
1445         }
1446         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1447                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1448                        head->i_ino, tail->i_ino);
1449                 RETURN(-EINVAL);
1450         }
1451         if (head->i_ino == tail->i_ino) {
1452                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1453                 RETURN(-EINVAL);
1454         }
1455         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1456                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1457                 RETURN(-EINVAL);
1458         }
1459         RETURN(0);
1460 }
1461
1462 static int join_file(struct inode *head_inode, struct file *head_filp,
1463                      struct file *tail_filp)
1464 {
1465         struct dentry *tail_dentry = tail_filp->f_dentry;
1466         struct lookup_intent oit = {.it_op = IT_OPEN,
1467                                     .it_flags = head_filp->f_flags,
1468                                     .it_create_mode = M_JOIN_FILE};
1469         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1470                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1471
1472         struct lustre_handle lockh;
1473         struct md_op_data *op_data;
1474         int    rc;
1475         loff_t data;
1476         ENTRY;
1477
1478         tail_dentry = tail_filp->f_dentry;
1479
1480         data = i_size_read(head_inode);
1481         op_data = ll_prep_md_op_data(NULL, head_inode,
1482                                      tail_dentry->d_parent->d_inode,
1483                                      tail_dentry->d_name.name,
1484                                      tail_dentry->d_name.len, 0,
1485                                      LUSTRE_OPC_ANY, &data);
1486         if (IS_ERR(op_data))
1487                 RETURN(PTR_ERR(op_data));
1488
1489         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1490                          op_data, &lockh, NULL, 0, NULL, 0);
1491
1492         ll_finish_md_op_data(op_data);
1493         if (rc < 0)
1494                 GOTO(out, rc);
1495
1496         rc = oit.d.lustre.it_status;
1497
1498         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1499                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1500                 ptlrpc_req_finished((struct ptlrpc_request *)
1501                                     oit.d.lustre.it_data);
1502                 GOTO(out, rc);
1503         }
1504
1505         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1506                                            * away */
1507                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1508                 oit.d.lustre.it_lock_mode = 0;
1509         }
1510         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1511         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1512         ll_release_openhandle(head_filp->f_dentry, &oit);
1513 out:
1514         ll_intent_release(&oit);
1515         RETURN(rc);
1516 }
1517
1518 static int ll_file_join(struct inode *head, struct file *filp,
1519                         char *filename_tail)
1520 {
1521         struct inode *tail = NULL, *first = NULL, *second = NULL;
1522         struct dentry *tail_dentry;
1523         struct file *tail_filp, *first_filp, *second_filp;
1524         struct ll_lock_tree first_tree, second_tree;
1525         struct ll_lock_tree_node *first_node, *second_node;
1526         struct ll_inode_info *hlli = ll_i2info(head);
1527         int rc = 0, cleanup_phase = 0;
1528         ENTRY;
1529
1530         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1531                head->i_ino, head->i_generation, head, filename_tail);
1532
1533         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1534         if (IS_ERR(tail_filp)) {
1535                 CERROR("Can not open tail file %s", filename_tail);
1536                 rc = PTR_ERR(tail_filp);
1537                 GOTO(cleanup, rc);
1538         }
1539         tail = igrab(tail_filp->f_dentry->d_inode);
1540
1541         tail_dentry = tail_filp->f_dentry;
1542         LASSERT(tail_dentry);
1543         cleanup_phase = 1;
1544
1545         /*reorder the inode for lock sequence*/
1546         first = head->i_ino > tail->i_ino ? head : tail;
1547         second = head->i_ino > tail->i_ino ? tail : head;
1548         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1549         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1550
1551         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1552                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1553         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1554         if (IS_ERR(first_node)){
1555                 rc = PTR_ERR(first_node);
1556                 GOTO(cleanup, rc);
1557         }
1558         first_tree.lt_fd = first_filp->private_data;
1559         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1560         if (rc != 0)
1561                 GOTO(cleanup, rc);
1562         cleanup_phase = 2;
1563
1564         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1565         if (IS_ERR(second_node)){
1566                 rc = PTR_ERR(second_node);
1567                 GOTO(cleanup, rc);
1568         }
1569         second_tree.lt_fd = second_filp->private_data;
1570         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1571         if (rc != 0)
1572                 GOTO(cleanup, rc);
1573         cleanup_phase = 3;
1574
1575         rc = join_sanity_check(head, tail);
1576         if (rc)
1577                 GOTO(cleanup, rc);
1578
1579         rc = join_file(head, filp, tail_filp);
1580         if (rc)
1581                 GOTO(cleanup, rc);
1582 cleanup:
1583         switch (cleanup_phase) {
1584         case 3:
1585                 ll_tree_unlock(&second_tree);
1586                 obd_cancel_unused(ll_i2dtexp(second),
1587                                   ll_i2info(second)->lli_smd, 0, NULL);
1588         case 2:
1589                 ll_tree_unlock(&first_tree);
1590                 obd_cancel_unused(ll_i2dtexp(first),
1591                                   ll_i2info(first)->lli_smd, 0, NULL);
1592         case 1:
1593                 filp_close(tail_filp, 0);
1594                 if (tail)
1595                         iput(tail);
1596                 if (head && rc == 0) {
1597                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1598                                        &hlli->lli_smd);
1599                         hlli->lli_smd = NULL;
1600                 }
1601         case 0:
1602                 break;
1603         default:
1604                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1605                 LBUG();
1606         }
1607         RETURN(rc);
1608 }
1609 #endif /* LUSTRE_FIX >= 50 */
1610
1611 /**
1612  * Close inode open handle
1613  *
1614  * \param dentry [in]     dentry which contains the inode
1615  * \param it     [in,out] intent which contains open info and result
1616  *
1617  * \retval 0     success
1618  * \retval <0    failure
1619  */
1620 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1621 {
1622         struct inode *inode = dentry->d_inode;
1623         struct obd_client_handle *och;
1624         int rc;
1625         ENTRY;
1626
1627         LASSERT(inode);
1628
1629         /* Root ? Do nothing. */
1630         if (dentry->d_inode->i_sb->s_root == dentry)
1631                 RETURN(0);
1632
1633         /* No open handle to close? Move away */
1634         if (!it_disposition(it, DISP_OPEN_OPEN))
1635                 RETURN(0);
1636
1637         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1638
1639         OBD_ALLOC(och, sizeof(*och));
1640         if (!och)
1641                 GOTO(out, rc = -ENOMEM);
1642
1643         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1644                     ll_i2info(inode), it, och);
1645
1646         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1647                                        inode, och);
1648  out:
1649         /* this one is in place of ll_file_open */
1650         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1651                 ptlrpc_req_finished(it->d.lustre.it_data);
1652         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1653         RETURN(rc);
1654 }
1655
1656 /**
1657  * Get size for inode for which FIEMAP mapping is requested.
1658  * Make the FIEMAP get_info call and returns the result.
1659  */
1660 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1661               int num_bytes)
1662 {
1663         struct obd_export *exp = ll_i2dtexp(inode);
1664         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1665         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1666         int vallen = num_bytes;
1667         int rc;
1668         ENTRY;
1669
1670         /* If the stripe_count > 1 and the application does not understand
1671          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1672          */
1673         if (lsm->lsm_stripe_count > 1 &&
1674             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1675                 return -EOPNOTSUPP;
1676
1677         fm_key.oa.o_id = lsm->lsm_object_id;
1678         fm_key.oa.o_gr = lsm->lsm_object_gr;
1679         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1680
1681         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1682                         OBD_MD_FLSIZE);
1683
1684         /* If filesize is 0, then there would be no objects for mapping */
1685         if (fm_key.oa.o_size == 0) {
1686                 fiemap->fm_mapped_extents = 0;
1687                 RETURN(0);
1688         }
1689
1690         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1691
1692         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1693         if (rc)
1694                 CERROR("obd_get_info failed: rc = %d\n", rc);
1695
1696         RETURN(rc);
1697 }
1698
1699 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1700                   unsigned long arg)
1701 {
1702         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1703         int flags;
1704         ENTRY;
1705
1706         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1707                inode->i_generation, inode, cmd);
1708         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1709
1710         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1711         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1712                 RETURN(-ENOTTY);
1713
1714         switch(cmd) {
1715         case LL_IOC_GETFLAGS:
1716                 /* Get the current value of the file flags */
1717                 return put_user(fd->fd_flags, (int *)arg);
1718         case LL_IOC_SETFLAGS:
1719         case LL_IOC_CLRFLAGS:
1720                 /* Set or clear specific file flags */
1721                 /* XXX This probably needs checks to ensure the flags are
1722                  *     not abused, and to handle any flag side effects.
1723                  */
1724                 if (get_user(flags, (int *) arg))
1725                         RETURN(-EFAULT);
1726
1727                 if (cmd == LL_IOC_SETFLAGS) {
1728                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1729                             !(file->f_flags & O_DIRECT)) {
1730                                 CERROR("%s: unable to disable locking on "
1731                                        "non-O_DIRECT file\n", current->comm);
1732                                 RETURN(-EINVAL);
1733                         }
1734
1735                         fd->fd_flags |= flags;
1736                 } else {
1737                         fd->fd_flags &= ~flags;
1738                 }
1739                 RETURN(0);
1740         case LL_IOC_LOV_SETSTRIPE:
1741                 RETURN(ll_lov_setstripe(inode, file, arg));
1742         case LL_IOC_LOV_SETEA:
1743                 RETURN(ll_lov_setea(inode, file, arg));
1744         case LL_IOC_LOV_GETSTRIPE:
1745                 RETURN(ll_lov_getstripe(inode, arg));
1746         case LL_IOC_RECREATE_OBJ:
1747                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1748         case EXT3_IOC_FIEMAP: {
1749                 struct ll_user_fiemap *fiemap_s;
1750                 size_t num_bytes, ret_bytes;
1751                 unsigned int extent_count;
1752                 int rc = 0;
1753
1754                 /* Get the extent count so we can calculate the size of
1755                  * required fiemap buffer */
1756                 if (get_user(extent_count,
1757                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1758                         RETURN(-EFAULT);
1759                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1760                                                  sizeof(struct ll_fiemap_extent));
1761                 OBD_VMALLOC(fiemap_s, num_bytes);
1762                 if (fiemap_s == NULL)
1763                         RETURN(-ENOMEM);
1764
1765                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1766                                    sizeof(*fiemap_s)))
1767                         GOTO(error, rc = -EFAULT);
1768
1769                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1770                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1771                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1772                         if (copy_to_user((char *)arg, fiemap_s,
1773                                          sizeof(*fiemap_s)))
1774                                 GOTO(error, rc = -EFAULT);
1775
1776                         GOTO(error, rc = -EBADR);
1777                 }
1778
1779                 /* If fm_extent_count is non-zero, read the first extent since
1780                  * it is used to calculate end_offset and device from previous
1781                  * fiemap call. */
1782                 if (extent_count) {
1783                         if (copy_from_user(&fiemap_s->fm_extents[0],
1784                             (char __user *)arg + sizeof(*fiemap_s),
1785                             sizeof(struct ll_fiemap_extent)))
1786                                 GOTO(error, rc = -EFAULT);
1787                 }
1788
1789                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1790                         int rc;
1791
1792                         rc = filemap_fdatawrite(inode->i_mapping);
1793                         if (rc)
1794                                 GOTO(error, rc);
1795                 }
1796
1797                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1798                 if (rc)
1799                         GOTO(error, rc);
1800
1801                 ret_bytes = sizeof(struct ll_user_fiemap);
1802
1803                 if (extent_count != 0)
1804                         ret_bytes += (fiemap_s->fm_mapped_extents *
1805                                          sizeof(struct ll_fiemap_extent));
1806
1807                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1808                         rc = -EFAULT;
1809
1810 error:
1811                 OBD_VFREE(fiemap_s, num_bytes);
1812                 RETURN(rc);
1813         }
1814         case EXT3_IOC_GETFLAGS:
1815         case EXT3_IOC_SETFLAGS:
1816                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1817         case EXT3_IOC_GETVERSION_OLD:
1818         case EXT3_IOC_GETVERSION:
1819                 RETURN(put_user(inode->i_generation, (int *)arg));
1820         case LL_IOC_JOIN: {
1821 #if LUSTRE_FIX >= 50
1822                 /* Allow file join in beta builds to allow debuggging */
1823                 char *ftail;
1824                 int rc;
1825
1826                 ftail = getname((const char *)arg);
1827                 if (IS_ERR(ftail))
1828                         RETURN(PTR_ERR(ftail));
1829                 rc = ll_file_join(inode, file, ftail);
1830                 putname(ftail);
1831                 RETURN(rc);
1832 #else
1833                 CWARN("file join is not supported in this version of Lustre\n");
1834                 RETURN(-ENOTTY);
1835 #endif
1836         }
1837         case LL_IOC_GROUP_LOCK:
1838                 RETURN(ll_get_grouplock(inode, file, arg));
1839         case LL_IOC_GROUP_UNLOCK:
1840                 RETURN(ll_put_grouplock(inode, file, arg));
1841         case IOC_OBD_STATFS:
1842                 RETURN(ll_obd_statfs(inode, (void *)arg));
1843
1844         /* We need to special case any other ioctls we want to handle,
1845          * to send them to the MDS/OST as appropriate and to properly
1846          * network encode the arg field.
1847         case EXT3_IOC_SETVERSION_OLD:
1848         case EXT3_IOC_SETVERSION:
1849         */
1850         case LL_IOC_FLUSHCTX:
1851                 RETURN(ll_flush_ctx(inode));
1852         case LL_IOC_PATH2FID: {
1853                 if (copy_to_user((void *)arg, &ll_i2info(inode)->lli_fid,
1854                                  sizeof(struct lu_fid)))
1855                         RETURN(-EFAULT);
1856
1857                 RETURN(0);
1858         }
1859         default: {
1860                 int err;
1861
1862                 if (LLIOC_STOP ==
1863                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1864                         RETURN(err);
1865
1866                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1867                                      (void *)arg));
1868         }
1869         }
1870 }
1871
1872 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1873 {
1874         struct inode *inode = file->f_dentry->d_inode;
1875         loff_t retval;
1876         ENTRY;
1877         retval = offset + ((origin == 2) ? i_size_read(inode) :
1878                            (origin == 1) ? file->f_pos : 0);
1879         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1880                inode->i_ino, inode->i_generation, inode, retval, retval,
1881                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1882         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1883
1884         if (origin == 2) { /* SEEK_END */
1885                 int nonblock = 0, rc;
1886
1887                 if (file->f_flags & O_NONBLOCK)
1888                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1889
1890                 rc = cl_glimpse_size(inode);
1891                 if (rc != 0)
1892                         RETURN(rc);
1893
1894                 ll_inode_size_lock(inode, 0);
1895                 offset += i_size_read(inode);
1896                 ll_inode_size_unlock(inode, 0);
1897         } else if (origin == 1) { /* SEEK_CUR */
1898                 offset += file->f_pos;
1899         }
1900
1901         retval = -EINVAL;
1902         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1903                 if (offset != file->f_pos) {
1904                         file->f_pos = offset;
1905                 }
1906                 retval = offset;
1907         }
1908
1909         RETURN(retval);
1910 }
1911
1912 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1913 {
1914         struct inode *inode = dentry->d_inode;
1915         struct ll_inode_info *lli = ll_i2info(inode);
1916         struct lov_stripe_md *lsm = lli->lli_smd;
1917         struct ptlrpc_request *req;
1918         struct obd_capa *oc;
1919         int rc, err;
1920         ENTRY;
1921         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1922                inode->i_generation, inode);
1923         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
1924
1925         /* fsync's caller has already called _fdata{sync,write}, we want
1926          * that IO to finish before calling the osc and mdc sync methods */
1927         rc = filemap_fdatawait(inode->i_mapping);
1928
1929         /* catch async errors that were recorded back when async writeback
1930          * failed for pages in this mapping. */
1931         err = lli->lli_async_rc;
1932         lli->lli_async_rc = 0;
1933         if (rc == 0)
1934                 rc = err;
1935         if (lsm) {
1936                 err = lov_test_and_clear_async_rc(lsm);
1937                 if (rc == 0)
1938                         rc = err;
1939         }
1940
1941         oc = ll_mdscapa_get(inode);
1942         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
1943                       &req);
1944         capa_put(oc);
1945         if (!rc)
1946                 rc = err;
1947         if (!err)
1948                 ptlrpc_req_finished(req);
1949
1950         if (data && lsm) {
1951                 struct obdo *oa;
1952
1953                 OBDO_ALLOC(oa);
1954                 if (!oa)
1955                         RETURN(rc ? rc : -ENOMEM);
1956
1957                 oa->o_id = lsm->lsm_object_id;
1958                 oa->o_gr = lsm->lsm_object_gr;
1959                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1960                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1961                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
1962                                            OBD_MD_FLGROUP);
1963
1964                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
1965                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
1966                                0, OBD_OBJECT_EOF, oc);
1967                 capa_put(oc);
1968                 if (!rc)
1969                         rc = err;
1970                 OBDO_FREE(oa);
1971         }
1972
1973         RETURN(rc);
1974 }
1975
1976 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
1977 {
1978         struct inode *inode = file->f_dentry->d_inode;
1979         struct ll_sb_info *sbi = ll_i2sbi(inode);
1980         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
1981                                            .ei_cb_cp =ldlm_flock_completion_ast,
1982                                            .ei_cbdata = file_lock };
1983         struct md_op_data *op_data;
1984         struct lustre_handle lockh = {0};
1985         ldlm_policy_data_t flock;
1986         int flags = 0;
1987         int rc;
1988         ENTRY;
1989
1990         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
1991                inode->i_ino, file_lock);
1992
1993         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
1994
1995         if (file_lock->fl_flags & FL_FLOCK) {
1996                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
1997                 /* set missing params for flock() calls */
1998                 file_lock->fl_end = OFFSET_MAX;
1999                 file_lock->fl_pid = current->tgid;
2000         }
2001         flock.l_flock.pid = file_lock->fl_pid;
2002         flock.l_flock.start = file_lock->fl_start;
2003         flock.l_flock.end = file_lock->fl_end;
2004
2005         switch (file_lock->fl_type) {
2006         case F_RDLCK:
2007                 einfo.ei_mode = LCK_PR;
2008                 break;
2009         case F_UNLCK:
2010                 /* An unlock request may or may not have any relation to
2011                  * existing locks so we may not be able to pass a lock handle
2012                  * via a normal ldlm_lock_cancel() request. The request may even
2013                  * unlock a byte range in the middle of an existing lock. In
2014                  * order to process an unlock request we need all of the same
2015                  * information that is given with a normal read or write record
2016                  * lock request. To avoid creating another ldlm unlock (cancel)
2017                  * message we'll treat a LCK_NL flock request as an unlock. */
2018                 einfo.ei_mode = LCK_NL;
2019                 break;
2020         case F_WRLCK:
2021                 einfo.ei_mode = LCK_PW;
2022                 break;
2023         default:
2024                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2025                 RETURN (-EINVAL);
2026         }
2027
2028         switch (cmd) {
2029         case F_SETLKW:
2030 #ifdef F_SETLKW64
2031         case F_SETLKW64:
2032 #endif
2033                 flags = 0;
2034                 break;
2035         case F_SETLK:
2036 #ifdef F_SETLK64
2037         case F_SETLK64:
2038 #endif
2039                 flags = LDLM_FL_BLOCK_NOWAIT;
2040                 break;
2041         case F_GETLK:
2042 #ifdef F_GETLK64
2043         case F_GETLK64:
2044 #endif
2045                 flags = LDLM_FL_TEST_LOCK;
2046                 /* Save the old mode so that if the mode in the lock changes we
2047                  * can decrement the appropriate reader or writer refcount. */
2048                 file_lock->fl_type = einfo.ei_mode;
2049                 break;
2050         default:
2051                 CERROR("unknown fcntl lock command: %d\n", cmd);
2052                 RETURN (-EINVAL);
2053         }
2054
2055         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2056                                      LUSTRE_OPC_ANY, NULL);
2057         if (IS_ERR(op_data))
2058                 RETURN(PTR_ERR(op_data));
2059
2060         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2061                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2062                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2063
2064         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2065                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2066
2067         ll_finish_md_op_data(op_data);
2068
2069         if ((file_lock->fl_flags & FL_FLOCK) &&
2070             (rc == 0 || file_lock->fl_type == F_UNLCK))
2071                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2072 #ifdef HAVE_F_OP_FLOCK
2073         if ((file_lock->fl_flags & FL_POSIX) &&
2074             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2075             !(flags & LDLM_FL_TEST_LOCK))
2076                 posix_lock_file_wait(file, file_lock);
2077 #endif
2078
2079         RETURN(rc);
2080 }
2081
2082 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2083 {
2084         ENTRY;
2085
2086         RETURN(-ENOSYS);
2087 }
2088
2089 int ll_have_md_lock(struct inode *inode, __u64 bits)
2090 {
2091         struct lustre_handle lockh;
2092         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2093         struct lu_fid *fid;
2094         int flags;
2095         ENTRY;
2096
2097         if (!inode)
2098                RETURN(0);
2099
2100         fid = &ll_i2info(inode)->lli_fid;
2101         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2102
2103         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2104         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2105                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2106                 RETURN(1);
2107         }
2108         RETURN(0);
2109 }
2110
2111 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2112                             struct lustre_handle *lockh)
2113 {
2114         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2115         struct lu_fid *fid;
2116         ldlm_mode_t rc;
2117         int flags;
2118         ENTRY;
2119
2120         fid = &ll_i2info(inode)->lli_fid;
2121         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2122
2123         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2124         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2125                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2126         RETURN(rc);
2127 }
2128
2129 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2130         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2131                               * and return success */
2132                 inode->i_nlink = 0;
2133                 /* This path cannot be hit for regular files unless in
2134                  * case of obscure races, so no need to to validate
2135                  * size. */
2136                 if (!S_ISREG(inode->i_mode) &&
2137                     !S_ISDIR(inode->i_mode))
2138                         return 0;
2139         }
2140
2141         if (rc) {
2142                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2143                 return -abs(rc);
2144
2145         }
2146
2147         return 0;
2148 }
2149
2150 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2151 {
2152         struct inode *inode = dentry->d_inode;
2153         struct ptlrpc_request *req = NULL;
2154         struct ll_sb_info *sbi;
2155         struct obd_export *exp;
2156         int rc;
2157         ENTRY;
2158
2159         if (!inode) {
2160                 CERROR("REPORT THIS LINE TO PETER\n");
2161                 RETURN(0);
2162         }
2163         sbi = ll_i2sbi(inode);
2164
2165         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2166                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2167
2168         exp = ll_i2mdexp(inode);
2169
2170         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2171                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2172                 struct md_op_data *op_data;
2173
2174                 /* Call getattr by fid, so do not provide name at all. */
2175                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2176                                              dentry->d_inode, NULL, 0, 0,
2177                                              LUSTRE_OPC_ANY, NULL);
2178                 if (IS_ERR(op_data))
2179                         RETURN(PTR_ERR(op_data));
2180
2181                 oit.it_create_mode |= M_CHECK_STALE;
2182                 rc = md_intent_lock(exp, op_data, NULL, 0,
2183                                     /* we are not interested in name
2184                                        based lookup */
2185                                     &oit, 0, &req,
2186                                     ll_md_blocking_ast, 0);
2187                 ll_finish_md_op_data(op_data);
2188                 oit.it_create_mode &= ~M_CHECK_STALE;
2189                 if (rc < 0) {
2190                         rc = ll_inode_revalidate_fini(inode, rc);
2191                         GOTO (out, rc);
2192                 }
2193
2194                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2195                 if (rc != 0) {
2196                         ll_intent_release(&oit);
2197                         GOTO(out, rc);
2198                 }
2199
2200                 /* Unlinked? Unhash dentry, so it is not picked up later by
2201                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2202                    here to preserve get_cwd functionality on 2.6.
2203                    Bug 10503 */
2204                 if (!dentry->d_inode->i_nlink) {
2205                         spin_lock(&ll_lookup_lock);
2206                         spin_lock(&dcache_lock);
2207                         ll_drop_dentry(dentry);
2208                         spin_unlock(&dcache_lock);
2209                         spin_unlock(&ll_lookup_lock);
2210                 }
2211
2212                 ll_lookup_finish_locks(&oit, dentry);
2213         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2214                                                      MDS_INODELOCK_LOOKUP)) {
2215                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2216                 obd_valid valid = OBD_MD_FLGETATTR;
2217                 struct obd_capa *oc;
2218                 int ealen = 0;
2219
2220                 if (S_ISREG(inode->i_mode)) {
2221                         rc = ll_get_max_mdsize(sbi, &ealen);
2222                         if (rc)
2223                                 RETURN(rc);
2224                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2225                 }
2226                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2227                  * capa for this inode. Because we only keep capas of dirs
2228                  * fresh. */
2229                 oc = ll_mdscapa_get(inode);
2230                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2231                                 ealen, &req);
2232                 capa_put(oc);
2233                 if (rc) {
2234                         rc = ll_inode_revalidate_fini(inode, rc);
2235                         RETURN(rc);
2236                 }
2237
2238                 rc = ll_prep_inode(&inode, req, NULL);
2239                 if (rc)
2240                         GOTO(out, rc);
2241         }
2242
2243         /* if object not yet allocated, don't validate size */
2244         if (ll_i2info(inode)->lli_smd == NULL)
2245                 GOTO(out, rc = 0);
2246
2247         /* cl_glimpse_size will prefer locally cached writes if they extend
2248          * the file */
2249         rc = cl_glimpse_size(inode);
2250         EXIT;
2251 out:
2252         ptlrpc_req_finished(req);
2253         return rc;
2254 }
2255
2256 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2257                   struct lookup_intent *it, struct kstat *stat)
2258 {
2259         struct inode *inode = de->d_inode;
2260         int res = 0;
2261
2262         res = ll_inode_revalidate_it(de, it);
2263         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2264
2265         if (res)
2266                 return res;
2267
2268         stat->dev = inode->i_sb->s_dev;
2269         stat->ino = inode->i_ino;
2270         stat->mode = inode->i_mode;
2271         stat->nlink = inode->i_nlink;
2272         stat->uid = inode->i_uid;
2273         stat->gid = inode->i_gid;
2274         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2275         stat->atime = inode->i_atime;
2276         stat->mtime = inode->i_mtime;
2277         stat->ctime = inode->i_ctime;
2278 #ifdef HAVE_INODE_BLKSIZE
2279         stat->blksize = inode->i_blksize;
2280 #else
2281         stat->blksize = 1 << inode->i_blkbits;
2282 #endif
2283
2284         ll_inode_size_lock(inode, 0);
2285         stat->size = i_size_read(inode);
2286         stat->blocks = inode->i_blocks;
2287         ll_inode_size_unlock(inode, 0);
2288
2289         return 0;
2290 }
2291 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2292 {
2293         struct lookup_intent it = { .it_op = IT_GETATTR };
2294
2295         return ll_getattr_it(mnt, de, &it, stat);
2296 }
2297
/**
 * ACL check callback: evaluate \a mask against the POSIX ACL cached on the
 * inode.
 *
 * \retval -EAGAIN  no ACL is cached, or the kernel is built without
 *                  CONFIG_FS_POSIX_ACL
 * \retval other    result of posix_acl_permission()
 */
static int lustre_check_acl(struct inode *inode, int mask)
{
#ifndef CONFIG_FS_POSIX_ACL
        return -EAGAIN;
#else
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *cached;
        int rc;
        ENTRY;

        /* Take our own reference under the lock so the ACL can be examined
         * after the lock is dropped. */
        spin_lock(&lli->lli_lock);
        cached = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        if (cached == NULL)
                RETURN(-EAGAIN);

        rc = posix_acl_permission(inode, cached, mask);
        posix_acl_release(cached);

        RETURN(rc);
#endif
}
2322
2323 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2324 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2325 {
2326         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2327                inode->i_ino, inode->i_generation, inode, mask);
2328         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2329                 return lustre_check_remote_perm(inode, mask);
2330
2331         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2332         return generic_permission(inode, mask, lustre_check_acl);
2333 }
2334 #else
2335 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2336 {
2337         int mode = inode->i_mode;
2338         int rc;
2339
2340         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2341                inode->i_ino, inode->i_generation, inode, mask);
2342
2343         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2344                 return lustre_check_remote_perm(inode, mask);
2345
2346         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2347
2348         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2349             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2350                 return -EROFS;
2351         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2352                 return -EACCES;
2353         if (current->fsuid == inode->i_uid) {
2354                 mode >>= 6;
2355         } else if (1) {
2356                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2357                         goto check_groups;
2358                 rc = lustre_check_acl(inode, mask);
2359                 if (rc == -EAGAIN)
2360                         goto check_groups;
2361                 if (rc == -EACCES)
2362                         goto check_capabilities;
2363                 return rc;
2364         } else {
2365 check_groups:
2366                 if (in_group_p(inode->i_gid))
2367                         mode >>= 3;
2368         }
2369         if ((mode & mask & S_IRWXO) == mask)
2370                 return 0;
2371
2372 check_capabilities:
2373         if (!(mask & MAY_EXEC) ||
2374             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2375                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2376                         return 0;
2377
2378         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2379             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2380                 return 0;
2381
2382         return -EACCES;
2383 }
2384 #endif
2385
/*
 * Select which vectored I/O slots of struct file_operations the tables
 * below fill in: the aio_read/aio_write members by default, or the
 * readv/writev members when HAVE_FILE_READV is defined.
 */
#ifndef HAVE_FILE_READV
# define READ_METHOD    aio_read
# define READ_FUNCTION  ll_file_aio_read
# define WRITE_METHOD   aio_write
# define WRITE_FUNCTION ll_file_aio_write
#else
# define READ_METHOD    readv
# define READ_FUNCTION  ll_file_readv
# define WRITE_METHOD   writev
# define WRITE_FUNCTION ll_file_writev
#endif
2397
2398 /* -o localflock - only provides locally consistent flock locks */
2399 struct file_operations ll_file_operations = {
2400         .read           = ll_file_read,
2401         .READ_METHOD    = READ_FUNCTION,
2402         .write          = ll_file_write,
2403         .WRITE_METHOD   = WRITE_FUNCTION,
2404         .ioctl          = ll_file_ioctl,
2405         .open           = ll_file_open,
2406         .release        = ll_file_release,
2407         .mmap           = ll_file_mmap,
2408         .llseek         = ll_file_seek,
2409         .sendfile       = ll_file_sendfile,
2410         .fsync          = ll_fsync,
2411 };
2412
2413 struct file_operations ll_file_operations_flock = {
2414         .read           = ll_file_read,
2415         .READ_METHOD    = READ_FUNCTION,
2416         .write          = ll_file_write,
2417         .WRITE_METHOD   = WRITE_FUNCTION,
2418         .ioctl          = ll_file_ioctl,
2419         .open           = ll_file_open,
2420         .release        = ll_file_release,
2421         .mmap           = ll_file_mmap,
2422         .llseek         = ll_file_seek,
2423         .sendfile       = ll_file_sendfile,
2424         .fsync          = ll_fsync,
2425 #ifdef HAVE_F_OP_FLOCK
2426         .flock          = ll_file_flock,
2427 #endif
2428         .lock           = ll_file_flock
2429 };
2430
2431 /* These are for -o noflock - to return ENOSYS on flock calls */
2432 struct file_operations ll_file_operations_noflock = {
2433         .read           = ll_file_read,
2434         .READ_METHOD    = READ_FUNCTION,
2435         .write          = ll_file_write,
2436         .WRITE_METHOD   = WRITE_FUNCTION,
2437         .ioctl          = ll_file_ioctl,
2438         .open           = ll_file_open,
2439         .release        = ll_file_release,
2440         .mmap           = ll_file_mmap,
2441         .llseek         = ll_file_seek,
2442         .sendfile       = ll_file_sendfile,
2443         .fsync          = ll_fsync,
2444 #ifdef HAVE_F_OP_FLOCK
2445         .flock          = ll_file_noflock,
2446 #endif
2447         .lock           = ll_file_noflock
2448 };
2449
2450 struct inode_operations ll_file_inode_operations = {
2451 #ifdef HAVE_VFS_INTENT_PATCHES
2452         .setattr_raw    = ll_setattr_raw,
2453 #endif
2454         .setattr        = ll_setattr,
2455         .truncate       = ll_truncate,
2456         .getattr        = ll_getattr,
2457         .permission     = ll_inode_permission,
2458         .setxattr       = ll_setxattr,
2459         .getxattr       = ll_getxattr,
2460         .listxattr      = ll_listxattr,
2461         .removexattr    = ll_removexattr,
2462 };
2463
2464 /* dynamic ioctl number support routins */
2465 static struct llioc_ctl_data {
2466         struct rw_semaphore ioc_sem;
2467         struct list_head    ioc_head;
2468 } llioc = {
2469         __RWSEM_INITIALIZER(llioc.ioc_sem),
2470         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2471 };
2472
2473
2474 struct llioc_data {
2475         struct list_head        iocd_list;
2476         unsigned int            iocd_size;
2477         llioc_callback_t        iocd_cb;
2478         unsigned int            iocd_count;
2479         unsigned int            iocd_cmd[0];
2480 };
2481
2482 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2483 {
2484         unsigned int size;
2485         struct llioc_data *in_data = NULL;
2486         ENTRY;
2487
2488         if (cb == NULL || cmd == NULL ||
2489             count > LLIOC_MAX_CMD || count < 0)
2490                 RETURN(NULL);
2491
2492         size = sizeof(*in_data) + count * sizeof(unsigned int);
2493         OBD_ALLOC(in_data, size);
2494         if (in_data == NULL)
2495                 RETURN(NULL);
2496
2497         memset(in_data, 0, sizeof(*in_data));
2498         in_data->iocd_size = size;
2499         in_data->iocd_cb = cb;
2500         in_data->iocd_count = count;
2501         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2502
2503         down_write(&llioc.ioc_sem);
2504         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2505         up_write(&llioc.ioc_sem);
2506
2507         RETURN(in_data);
2508 }
2509
2510 void ll_iocontrol_unregister(void *magic)
2511 {
2512         struct llioc_data *tmp;
2513
2514         if (magic == NULL)
2515                 return;
2516
2517         down_write(&llioc.ioc_sem);
2518         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2519                 if (tmp == magic) {
2520                         unsigned int size = tmp->iocd_size;
2521
2522                         list_del(&tmp->iocd_list);
2523                         up_write(&llioc.ioc_sem);
2524
2525                         OBD_FREE(tmp, size);
2526                         return;
2527                 }
2528         }
2529         up_write(&llioc.ioc_sem);
2530
2531         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2532 }
2533
/* Export the dynamic ioctl registration entry points. */
EXPORT_SYMBOL(ll_iocontrol_unregister);
EXPORT_SYMBOL(ll_iocontrol_register);
2536
2537 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2538                         unsigned int cmd, unsigned long arg, int *rcp)
2539 {
2540         enum llioc_iter ret = LLIOC_CONT;
2541         struct llioc_data *data;
2542         int rc = -EINVAL, i;
2543
2544         down_read(&llioc.ioc_sem);
2545         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2546                 for (i = 0; i < data->iocd_count; i++) {
2547                         if (cmd != data->iocd_cmd[i])
2548                                 continue;
2549
2550                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2551                         break;
2552                 }
2553
2554                 if (ret == LLIOC_STOP)
2555                         break;
2556         }
2557         up_read(&llioc.ioc_sem);
2558
2559         if (rcp)
2560                 *rcp = rc;
2561         return ret;
2562 }