Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
96             !S_ISREG(inode->i_mode))
97                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
98         else
99                 ll_epoch_close(inode, op_data, &och, 0);
100
101 out:
102         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
103         EXIT;
104 }
105
106 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107                                      struct inode *inode,
108                                      struct obd_client_handle *och)
109 {
110         struct obd_export *exp = ll_i2mdexp(inode);
111         struct md_op_data *op_data;
112         struct ptlrpc_request *req = NULL;
113         struct obd_device *obd = class_exp2obd(exp);
114         int epoch_close = 1;
115         int rc;
116         ENTRY;
117
118         if (obd == NULL) {
119                 /*
120                  * XXX: in case of LMV, is this correct to access
121                  * ->exp_handle?
122                  */
123                 CERROR("Invalid MDC connection handle "LPX64"\n",
124                        ll_i2mdexp(inode)->exp_handle.h_cookie);
125                 GOTO(out, rc = 0);
126         }
127
128         /*
129          * here we check if this is forced umount. If so this is called on
130          * canceling "open lock" and we do not call md_close() in this case, as
131          * it will not be successful, as import is already deactivated.
132          */
133         if (obd->obd_force)
134                 GOTO(out, rc = 0);
135
136         OBD_ALLOC_PTR(op_data);
137         if (op_data == NULL)
138                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139
140         ll_prepare_close(inode, op_data, och);
141         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
142         rc = md_close(md_exp, op_data, och->och_mod, &req);
143         if (rc == -EAGAIN) {
144                 /* This close must have the epoch closed. */
145                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
146                 LASSERT(epoch_close);
147                 /* MDS has instructed us to obtain Size-on-MDS attribute from
148                  * OSTs and send setattr to back to MDS. */
149                 rc = ll_sizeonmds_update(inode, &och->och_fh,
150                                          op_data->op_ioepoch);
151                 if (rc) {
152                         CERROR("inode %lu mdc Size-on-MDS update failed: "
153                                "rc = %d\n", inode->i_ino, rc);
154                         rc = 0;
155                 }
156         } else if (rc) {
157                 CERROR("inode %lu mdc close failed: rc = %d\n",
158                        inode->i_ino, rc);
159         }
160         ll_finish_md_op_data(op_data);
161
162         if (rc == 0) {
163                 rc = ll_objects_destroy(req, inode);
164                 if (rc)
165                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
166                                inode->i_ino, rc);
167         }
168
169         EXIT;
170 out:
171
172         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
173             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
174                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
175         } else {
176                 md_clear_open_replay_data(md_exp, och);
177                 /* Free @och if it is not waiting for DONE_WRITING. */
178                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
179                 OBD_FREE_PTR(och);
180         }
181         if (req) /* This is close request */
182                 ptlrpc_req_finished(req);
183         return rc;
184 }
185
186 int ll_md_real_close(struct inode *inode, int flags)
187 {
188         struct ll_inode_info *lli = ll_i2info(inode);
189         struct obd_client_handle **och_p;
190         struct obd_client_handle *och;
191         __u64 *och_usecount;
192         int rc = 0;
193         ENTRY;
194
195         if (flags & FMODE_WRITE) {
196                 och_p = &lli->lli_mds_write_och;
197                 och_usecount = &lli->lli_open_fd_write_count;
198         } else if (flags & FMODE_EXEC) {
199                 och_p = &lli->lli_mds_exec_och;
200                 och_usecount = &lli->lli_open_fd_exec_count;
201         } else {
202                 LASSERT(flags & FMODE_READ);
203                 och_p = &lli->lli_mds_read_och;
204                 och_usecount = &lli->lli_open_fd_read_count;
205         }
206
207         down(&lli->lli_och_sem);
208         if (*och_usecount) { /* There are still users of this handle, so
209                                 skip freeing it. */
210                 up(&lli->lli_och_sem);
211                 RETURN(0);
212         }
213         och=*och_p;
214         *och_p = NULL;
215         up(&lli->lli_och_sem);
216
217         if (och) { /* There might be a race and somebody have freed this och
218                       already */
219                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
220                                                inode, och);
221         }
222
223         RETURN(rc);
224 }
225
226 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
227                 struct file *file)
228 {
229         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
230         struct ll_inode_info *lli = ll_i2info(inode);
231         int rc = 0;
232         ENTRY;
233
234         /* clear group lock, if present */
235         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
236 #if 0 /* XXX */
237                 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
238                 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
239                 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
240                                       &fd->fd_cwlockh);
241 #endif
242         }
243
244         /* Let's see if we have good enough OPEN lock on the file and if
245            we can skip talking to MDS */
246         if (file->f_dentry->d_inode) { /* Can this ever be false? */
247                 int lockmode;
248                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
249                 struct lustre_handle lockh;
250                 struct inode *inode = file->f_dentry->d_inode;
251                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
252
253                 down(&lli->lli_och_sem);
254                 if (fd->fd_omode & FMODE_WRITE) {
255                         lockmode = LCK_CW;
256                         LASSERT(lli->lli_open_fd_write_count);
257                         lli->lli_open_fd_write_count--;
258                 } else if (fd->fd_omode & FMODE_EXEC) {
259                         lockmode = LCK_PR;
260                         LASSERT(lli->lli_open_fd_exec_count);
261                         lli->lli_open_fd_exec_count--;
262                 } else {
263                         lockmode = LCK_CR;
264                         LASSERT(lli->lli_open_fd_read_count);
265                         lli->lli_open_fd_read_count--;
266                 }
267                 up(&lli->lli_och_sem);
268
269                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
270                                    LDLM_IBITS, &policy, lockmode,
271                                    &lockh)) {
272                         rc = ll_md_real_close(file->f_dentry->d_inode,
273                                               fd->fd_omode);
274                 }
275         } else {
276                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
277                        file, file->f_dentry, file->f_dentry->d_name.name);
278         }
279
280         LUSTRE_FPRIVATE(file) = NULL;
281         ll_file_data_put(fd);
282         ll_capa_close(inode);
283
284         RETURN(rc);
285 }
286
287 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
288
289 /* While this returns an error code, fput() the caller does not, so we need
290  * to make every effort to clean up all of our state here.  Also, applications
291  * rarely check close errors and even if an error is returned they will not
292  * re-try the close call.
293  */
294 int ll_file_release(struct inode *inode, struct file *file)
295 {
296         struct ll_file_data *fd;
297         struct ll_sb_info *sbi = ll_i2sbi(inode);
298         struct ll_inode_info *lli = ll_i2info(inode);
299         struct lov_stripe_md *lsm = lli->lli_smd;
300         int rc;
301         ENTRY;
302
303         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
304                inode->i_generation, inode);
305
306 #ifdef CONFIG_FS_POSIX_ACL
307         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
308             inode == inode->i_sb->s_root->d_inode) {
309                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310
311                 LASSERT(fd != NULL);
312                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
313                         fd->fd_flags &= ~LL_FILE_RMTACL;
314                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
315                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
316                 }
317         }
318 #endif
319
320         if (inode->i_sb->s_root != file->f_dentry)
321                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
322         fd = LUSTRE_FPRIVATE(file);
323         LASSERT(fd != NULL);
324
325         /* The last ref on @file, maybe not the the owner pid of statahead.
326          * Different processes can open the same dir, "ll_opendir_key" means:
327          * it is me that should stop the statahead thread. */
328         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
329                 ll_stop_statahead(inode, lli->lli_opendir_key);
330
331         if (inode->i_sb->s_root == file->f_dentry) {
332                 LUSTRE_FPRIVATE(file) = NULL;
333                 ll_file_data_put(fd);
334                 RETURN(0);
335         }
336
337         if (lsm)
338                 lov_test_and_clear_async_rc(lsm);
339         lli->lli_async_rc = 0;
340
341         rc = ll_md_close(sbi->ll_md_exp, inode, file);
342         RETURN(rc);
343 }
344
345 static int ll_intent_file_open(struct file *file, void *lmm,
346                                int lmmsize, struct lookup_intent *itp)
347 {
348         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
349         struct dentry *parent = file->f_dentry->d_parent;
350         const char *name = file->f_dentry->d_name.name;
351         const int len = file->f_dentry->d_name.len;
352         struct md_op_data *op_data;
353         struct ptlrpc_request *req;
354         int rc;
355         ENTRY;
356
357         if (!parent)
358                 RETURN(-ENOENT);
359
360         /* Usually we come here only for NFSD, and we want open lock.
361            But we can also get here with pre 2.6.15 patchless kernels, and in
362            that case that lock is also ok */
363         /* We can also get here if there was cached open handle in revalidate_it
364          * but it disappeared while we were getting from there to ll_file_open.
365          * But this means this file was closed and immediatelly opened which
366          * makes a good candidate for using OPEN lock */
367         /* If lmmsize & lmm are not 0, we are just setting stripe info
368          * parameters. No need for the open lock */
369         if (!lmm && !lmmsize)
370                 itp->it_flags |= MDS_OPEN_LOCK;
371
372         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
373                                       file->f_dentry->d_inode, name, len,
374                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
375         if (IS_ERR(op_data))
376                 RETURN(PTR_ERR(op_data));
377
378         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
379                             0 /*unused */, &req, ll_md_blocking_ast, 0);
380         ll_finish_md_op_data(op_data);
381         if (rc == -ESTALE) {
382                 /* reason for keep own exit path - don`t flood log
383                 * with messages with -ESTALE errors.
384                 */
385                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
386                      it_open_error(DISP_OPEN_OPEN, itp))
387                         GOTO(out, rc);
388                 ll_release_openhandle(file->f_dentry, itp);
389                 GOTO(out, rc);
390         }
391
392         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
393                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
394                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
395                 GOTO(out, rc);
396         }
397
398         if (itp->d.lustre.it_lock_mode)
399                 md_set_lock_data(sbi->ll_md_exp,
400                                  &itp->d.lustre.it_lock_handle,
401                                  file->f_dentry->d_inode);
402
403         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
404 out:
405         ptlrpc_req_finished(itp->d.lustre.it_data);
406         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
407         ll_intent_drop_lock(itp);
408
409         RETURN(rc);
410 }
411
412 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
413                        struct lookup_intent *it, struct obd_client_handle *och)
414 {
415         struct ptlrpc_request *req = it->d.lustre.it_data;
416         struct mdt_body *body;
417
418         LASSERT(och);
419
420         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
421         LASSERT(body != NULL);                      /* reply already checked out */
422
423         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
424         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
425         och->och_fid = lli->lli_fid;
426         och->och_flags = it->it_flags;
427         lli->lli_ioepoch = body->ioepoch;
428
429         return md_set_open_replay_data(md_exp, och, req);
430 }
431
432 int ll_local_open(struct file *file, struct lookup_intent *it,
433                   struct ll_file_data *fd, struct obd_client_handle *och)
434 {
435         struct inode *inode = file->f_dentry->d_inode;
436         struct ll_inode_info *lli = ll_i2info(inode);
437         ENTRY;
438
439         LASSERT(!LUSTRE_FPRIVATE(file));
440
441         LASSERT(fd != NULL);
442
443         if (och) {
444                 struct ptlrpc_request *req = it->d.lustre.it_data;
445                 struct mdt_body *body;
446                 int rc;
447
448                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
449                 if (rc)
450                         RETURN(rc);
451
452                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
453                 if ((it->it_flags & FMODE_WRITE) &&
454                     (body->valid & OBD_MD_FLSIZE))
455                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
456                                lli->lli_ioepoch, PFID(&lli->lli_fid));
457         }
458
459         LUSTRE_FPRIVATE(file) = fd;
460         ll_readahead_init(inode, &fd->fd_ras);
461         fd->fd_omode = it->it_flags;
462         RETURN(0);
463 }
464
465 /* Open a file, and (for the very first open) create objects on the OSTs at
466  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
467  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
468  * lli_open_sem to ensure no other process will create objects, send the
469  * stripe MD to the MDS, or try to destroy the objects if that fails.
470  *
471  * If we already have the stripe MD locally then we don't request it in
472  * md_open(), by passing a lmm_size = 0.
473  *
474  * It is up to the application to ensure no other processes open this file
475  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
476  * used.  We might be able to avoid races of that sort by getting lli_open_sem
477  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
478  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
479  */
480 int ll_file_open(struct inode *inode, struct file *file)
481 {
482         struct ll_inode_info *lli = ll_i2info(inode);
483         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
484                                           .it_flags = file->f_flags };
485         struct lov_stripe_md *lsm;
486         struct ptlrpc_request *req = NULL;
487         struct obd_client_handle **och_p;
488         __u64 *och_usecount;
489         struct ll_file_data *fd;
490         int rc = 0, opendir_set = 0;
491         ENTRY;
492
493         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
494                inode->i_generation, inode, file->f_flags);
495
496 #ifdef HAVE_VFS_INTENT_PATCHES
497         it = file->f_it;
498 #else
499         it = file->private_data; /* XXX: compat macro */
500         file->private_data = NULL; /* prevent ll_local_open assertion */
501 #endif
502
503         fd = ll_file_data_get();
504         if (fd == NULL)
505                 RETURN(-ENOMEM);
506
507         fd->fd_file = file;
508         if (S_ISDIR(inode->i_mode)) {
509 again:
510                 spin_lock(&lli->lli_lock);
511                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
512                         LASSERT(lli->lli_sai == NULL);
513                         lli->lli_opendir_key = fd;
514                         lli->lli_opendir_pid = cfs_curproc_pid();
515                         opendir_set = 1;
516                 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
517                                     lli->lli_opendir_key != NULL)) {
518                         /* Two cases for this:
519                          * (1) The same process open such directory many times.
520                          * (2) The old process opened the directory, and exited
521                          *     before its children processes. Then new process
522                          *     with the same pid opens such directory before the
523                          *     old process's children processes exit.
524                          * reset stat ahead for such cases. */
525                         spin_unlock(&lli->lli_lock);
526                         CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
527                                " reset it.\n", file->f_dentry->d_name.len,
528                                file->f_dentry->d_name.name,
529                                PFID(&lli->lli_fid));
530                         ll_stop_statahead(inode, lli->lli_opendir_key);
531                         goto again;
532                 }
533                 spin_unlock(&lli->lli_lock);
534         }
535
536         if (inode->i_sb->s_root == file->f_dentry) {
537                 LUSTRE_FPRIVATE(file) = fd;
538                 RETURN(0);
539         }
540
541         if (!it || !it->d.lustre.it_disposition) {
542                 /* Convert f_flags into access mode. We cannot use file->f_mode,
543                  * because everything but O_ACCMODE mask was stripped from
544                  * there */
545                 if ((oit.it_flags + 1) & O_ACCMODE)
546                         oit.it_flags++;
547                 if (file->f_flags & O_TRUNC)
548                         oit.it_flags |= FMODE_WRITE;
549
550                 /* kernel only call f_op->open in dentry_open.  filp_open calls
551                  * dentry_open after call to open_namei that checks permissions.
552                  * Only nfsd_open call dentry_open directly without checking
553                  * permissions and because of that this code below is safe. */
554                 if (oit.it_flags & FMODE_WRITE)
555                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
556
557                 /* We do not want O_EXCL here, presumably we opened the file
558                  * already? XXX - NFS implications? */
559                 oit.it_flags &= ~O_EXCL;
560
561                 it = &oit;
562         }
563
564 restart:
565         /* Let's see if we have file open on MDS already. */
566         if (it->it_flags & FMODE_WRITE) {
567                 och_p = &lli->lli_mds_write_och;
568                 och_usecount = &lli->lli_open_fd_write_count;
569         } else if (it->it_flags & FMODE_EXEC) {
570                 och_p = &lli->lli_mds_exec_och;
571                 och_usecount = &lli->lli_open_fd_exec_count;
572          } else {
573                 och_p = &lli->lli_mds_read_och;
574                 och_usecount = &lli->lli_open_fd_read_count;
575         }
576
577         down(&lli->lli_och_sem);
578         if (*och_p) { /* Open handle is present */
579                 if (it_disposition(it, DISP_OPEN_OPEN)) {
580                         /* Well, there's extra open request that we do not need,
581                            let's close it somehow. This will decref request. */
582                         rc = it_open_error(DISP_OPEN_OPEN, it);
583                         if (rc) {
584                                 up(&lli->lli_och_sem);
585                                 ll_file_data_put(fd);
586                                 GOTO(out_openerr, rc);
587                         }
588                         ll_release_openhandle(file->f_dentry, it);
589                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
590                                              LPROC_LL_OPEN);
591                 }
592                 (*och_usecount)++;
593
594                 rc = ll_local_open(file, it, fd, NULL);
595                 if (rc) {
596                         (*och_usecount)--;
597                         up(&lli->lli_och_sem);
598                         ll_file_data_put(fd);
599                         GOTO(out_openerr, rc);
600                 }
601         } else {
602                 LASSERT(*och_usecount == 0);
603                 if (!it->d.lustre.it_disposition) {
604                         /* We cannot just request lock handle now, new ELC code
605                            means that one of other OPEN locks for this file
606                            could be cancelled, and since blocking ast handler
607                            would attempt to grab och_sem as well, that would
608                            result in a deadlock */
609                         up(&lli->lli_och_sem);
610                         it->it_flags |= O_CHECK_STALE;
611                         rc = ll_intent_file_open(file, NULL, 0, it);
612                         it->it_flags &= ~O_CHECK_STALE;
613                         if (rc) {
614                                 ll_file_data_put(fd);
615                                 GOTO(out_openerr, rc);
616                         }
617
618                         /* Got some error? Release the request */
619                         if (it->d.lustre.it_status < 0) {
620                                 req = it->d.lustre.it_data;
621                                 ptlrpc_req_finished(req);
622                         }
623                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
624                                          &it->d.lustre.it_lock_handle,
625                                          file->f_dentry->d_inode);
626                         goto restart;
627                 }
628                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
629                 if (!*och_p) {
630                         ll_file_data_put(fd);
631                         GOTO(out_och_free, rc = -ENOMEM);
632                 }
633                 (*och_usecount)++;
634                 req = it->d.lustre.it_data;
635
636                 /* md_intent_lock() didn't get a request ref if there was an
637                  * open error, so don't do cleanup on the request here
638                  * (bug 3430) */
639                 /* XXX (green): Should not we bail out on any error here, not
640                  * just open error? */
641                 rc = it_open_error(DISP_OPEN_OPEN, it);
642                 if (rc) {
643                         ll_file_data_put(fd);
644                         GOTO(out_och_free, rc);
645                 }
646
647                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
648                 rc = ll_local_open(file, it, fd, *och_p);
649                 if (rc) {
650                         ll_file_data_put(fd);
651                         GOTO(out_och_free, rc);
652                 }
653         }
654         up(&lli->lli_och_sem);
655
656         /* Must do this outside lli_och_sem lock to prevent deadlock where
657            different kind of OPEN lock for this same inode gets cancelled
658            by ldlm_cancel_lru */
659         if (!S_ISREG(inode->i_mode))
660                 GOTO(out, rc);
661
662         ll_capa_open(inode);
663
664         lsm = lli->lli_smd;
665         if (lsm == NULL) {
666                 if (file->f_flags & O_LOV_DELAY_CREATE ||
667                     !(file->f_mode & FMODE_WRITE)) {
668                         CDEBUG(D_INODE, "object creation was delayed\n");
669                         GOTO(out, rc);
670                 }
671         }
672         file->f_flags &= ~O_LOV_DELAY_CREATE;
673         GOTO(out, rc);
674 out:
675         ptlrpc_req_finished(req);
676         if (req)
677                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
678 out_och_free:
679         if (rc) {
680                 if (*och_p) {
681                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
682                         *och_p = NULL; /* OBD_FREE writes some magic there */
683                         (*och_usecount)--;
684                 }
685                 up(&lli->lli_och_sem);
686 out_openerr:
687                 if (opendir_set != 0)
688                         ll_stop_statahead(inode, lli->lli_opendir_key);
689         }
690
691         return rc;
692 }
693
694 /* Fills the obdo with the attributes for the lsm */
695 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
696                           struct obd_capa *capa, struct obdo *obdo)
697 {
698         struct ptlrpc_request_set *set;
699         struct obd_info            oinfo = { { { 0 } } };
700         int                        rc;
701
702         ENTRY;
703
704         LASSERT(lsm != NULL);
705
706         oinfo.oi_md = lsm;
707         oinfo.oi_oa = obdo;
708         oinfo.oi_oa->o_id = lsm->lsm_object_id;
709         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
710         oinfo.oi_oa->o_mode = S_IFREG;
711         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
712                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
713                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
714                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
715                                OBD_MD_FLGROUP;
716         oinfo.oi_capa = capa;
717
718         set = ptlrpc_prep_set();
719         if (set == NULL) {
720                 CERROR("can't allocate ptlrpc set\n");
721                 rc = -ENOMEM;
722         } else {
723                 rc = obd_getattr_async(exp, &oinfo, set);
724                 if (rc == 0)
725                         rc = ptlrpc_set_wait(set);
726                 ptlrpc_set_destroy(set);
727         }
728         if (rc == 0)
729                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
730                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
731                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
732         RETURN(rc);
733 }
734
735 /* Fills the obdo with the attributes for the inode defined by lsm */
736 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
737 {
738         struct ll_inode_info *lli  = ll_i2info(inode);
739         struct obd_capa      *capa = ll_mdscapa_get(inode);
740         int rc;
741         ENTRY;
742
743         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
744         capa_put(capa);
745         if (rc == 0) {
746                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
747                 CDEBUG(D_INODE,
748                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
749                        lli->lli_smd->lsm_object_id, i_size_read(inode),
750                        (unsigned long long)inode->i_blocks,
751                        (unsigned long)ll_inode_blksize(inode));
752         }
753         RETURN(rc);
754 }
755
756 int ll_merge_lvb(struct inode *inode)
757 {
758         struct ll_inode_info *lli = ll_i2info(inode);
759         struct ll_sb_info *sbi = ll_i2sbi(inode);
760         struct ost_lvb lvb;
761         int rc;
762
763         ENTRY;
764
765         ll_inode_size_lock(inode, 1);
766         inode_init_lvb(inode, &lvb);
767         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
768         i_size_write(inode, lvb.lvb_size);
769         inode->i_blocks = lvb.lvb_blocks;
770
771         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
772         LTIME_S(inode->i_atime) = lvb.lvb_atime;
773         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
774         ll_inode_size_unlock(inode, 1);
775
776         RETURN(rc);
777 }
778
779 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
780                      lstat_t *st)
781 {
782         struct obdo obdo = { 0 };
783         int rc;
784
785         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
786         if (rc == 0) {
787                 st->st_size   = obdo.o_size;
788                 st->st_blocks = obdo.o_blocks;
789                 st->st_mtime  = obdo.o_mtime;
790                 st->st_atime  = obdo.o_atime;
791                 st->st_ctime  = obdo.o_ctime;
792         }
793         return rc;
794 }
795
796 void ll_io_init(struct cl_io *io, const struct file *file, int write)
797 {
798         struct inode *inode     = file->f_dentry->d_inode;
799         struct ll_sb_info *sbi  = ll_i2sbi(inode);
800         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
801
802         LASSERT(fd != NULL);
803         memset(io, 0, sizeof *io);
804         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
805         if (write)
806                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
807         io->ci_obj     = ll_i2info(inode)->lli_clob;
808         io->ci_lockreq = CILR_MAYBE;
809         if (fd->fd_flags & LL_FILE_IGNORE_LOCK || sbi->ll_flags & LL_SBI_NOLCK)
810                 io->ci_lockreq = CILR_NEVER;
811         else if (file->f_flags & O_APPEND)
812                 io->ci_lockreq = CILR_MANDATORY;
813 }
814
815 static ssize_t ll_file_io_generic(const struct lu_env *env,
816                 struct ccc_io_args *args, struct file *file,
817                 enum cl_io_type iot, loff_t *ppos, size_t count)
818 {
819         struct cl_io       *io;
820         ssize_t             result;
821         ENTRY;
822
823         io = &ccc_env_info(env)->cti_io;
824         ll_io_init(io, file, iot == CIT_WRITE);
825
826         if (iot == CIT_READ)
827                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
828
829         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
830                 struct vvp_io *vio = vvp_env_io(env);
831                 struct ccc_io *cio = ccc_env_io(env);
832                 if (cl_io_is_sendfile(io)) {
833                         vio->u.read.cui_actor = args->cia_actor;
834                         vio->u.read.cui_target = args->cia_target;
835                 } else {
836                         cio->cui_iov = args->cia_iov;
837                         cio->cui_nrsegs = args->cia_nrsegs;
838 #ifndef HAVE_FILE_WRITEV
839                         cio->cui_iocb = args->cia_iocb;
840 #endif
841                 }
842                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
843                 result = cl_io_loop(env, io);
844         } else
845                 /* cl_io_rw_init() handled IO */
846                 result = io->ci_result;
847         if (io->ci_nob > 0) {
848                 result = io->ci_nob;
849                 *ppos = io->u.ci_wr.wr.crw_pos;
850         }
851         cl_io_fini(env, io);
852         RETURN(result);
853 }
854
855
856 /*
857  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
858  */
859 static int ll_file_get_iov_count(const struct iovec *iov,
860                                  unsigned long *nr_segs, size_t *count)
861 {
862         size_t cnt = 0;
863         unsigned long seg;
864
865         for (seg = 0; seg < *nr_segs; seg++) {
866                 const struct iovec *iv = &iov[seg];
867
868                 /*
869                  * If any segment has a negative length, or the cumulative
870                  * length ever wraps negative then return -EINVAL.
871                  */
872                 cnt += iv->iov_len;
873                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
874                         return -EINVAL;
875                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
876                         continue;
877                 if (seg == 0)
878                         return -EFAULT;
879                 *nr_segs = seg;
880                 cnt -= iv->iov_len;   /* This segment is no good */
881                 break;
882         }
883         *count = cnt;
884         return 0;
885 }
886
887 #ifdef HAVE_FILE_READV
888 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
889                               unsigned long nr_segs, loff_t *ppos)
890 {
891         struct lu_env      *env;
892         struct ccc_io_args *args;
893         size_t              count;
894         ssize_t             result;
895         int                 refcheck;
896         ENTRY;
897
898         result = ll_file_get_iov_count(iov, &nr_segs, &count);
899         if (result)
900                 RETURN(result);
901
902         env = cl_env_get(&refcheck);
903         if (IS_ERR(env))
904                 RETURN(PTR_ERR(env));
905
906         args = &vvp_env_info(env)->vti_args;
907         args->cia_is_sendfile = 0;
908         args->cia_iov = (struct iovec *)iov;
909         args->cia_nrsegs = nr_segs;
910         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
911         cl_env_put(env, &refcheck);
912         RETURN(result);
913 }
914
915 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
916                             loff_t *ppos)
917 {
918         struct lu_env *env;
919         struct iovec  *local_iov;
920         ssize_t        result;
921         int            refcheck;
922         ENTRY;
923
924         env = cl_env_get(&refcheck);
925         if (IS_ERR(env))
926                 RETURN(PTR_ERR(env));
927
928         local_iov = &vvp_env_info(env)->vti_local_iov;
929         local_iov->iov_base = (void __user *)buf;
930         local_iov->iov_len = count;
931         result = ll_file_readv(file, local_iov, 1, ppos);
932         cl_env_put(env, &refcheck);
933         RETURN(result);
934 }
935
936 #else
937 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
938                                 unsigned long nr_segs, loff_t pos)
939 {
940         struct lu_env      *env;
941         struct ccc_io_args *args;
942         size_t              count;
943         ssize_t             result;
944         int                 refcheck;
945         ENTRY;
946
947         result = ll_file_get_iov_count(iov, &nr_segs, &count);
948         if (result)
949                 RETURN(result);
950
951         env = cl_env_get(&refcheck);
952         if (IS_ERR(env))
953                 RETURN(PTR_ERR(env));
954
955         args = &vvp_env_info(env)->vti_args;
956         args->cia_is_sendfile = 0;
957         args->cia_iov = (struct iovec *)iov;
958         args->cia_nrsegs = nr_segs;
959         args->cia_iocb = iocb;
960         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
961                                     &iocb->ki_pos, count);
962         cl_env_put(env, &refcheck);
963         RETURN(result);
964 }
965
966 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
967                             loff_t *ppos)
968 {
969         struct lu_env *env;
970         struct iovec  *local_iov;
971         struct kiocb  *kiocb;
972         ssize_t        result;
973         int            refcheck;
974         ENTRY;
975
976         env = cl_env_get(&refcheck);
977         if (IS_ERR(env))
978                 RETURN(PTR_ERR(env));
979
980         local_iov = &vvp_env_info(env)->vti_local_iov;
981         kiocb = &vvp_env_info(env)->vti_kiocb;
982         local_iov->iov_base = (void __user *)buf;
983         local_iov->iov_len = count;
984         init_sync_kiocb(kiocb, file);
985         kiocb->ki_pos = *ppos;
986         kiocb->ki_left = count;
987
988         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
989         *ppos = kiocb->ki_pos;
990
991         cl_env_put(env, &refcheck);
992         RETURN(result);
993 }
994 #endif
995
996 /*
997  * Write to a file (through the page cache).
998  */
999 #ifdef HAVE_FILE_WRITEV
1000 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
1001                               unsigned long nr_segs, loff_t *ppos)
1002 {
1003         struct lu_env      *env;
1004         struct ccc_io_args *args;
1005         size_t              count;
1006         ssize_t             result;
1007         int                 refcheck;
1008         ENTRY;
1009
1010         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1011         if (result)
1012                 RETURN(result);
1013
1014         env = cl_env_get(&refcheck);
1015         if (IS_ERR(env))
1016                 RETURN(PTR_ERR(env));
1017
1018         args = &vvp_env_info(env)->vti_args;
1019         args->cia_iov = (struct iovec *)iov;
1020         args->cia_nrsegs = nr_segs;
1021         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1022         cl_env_put(env, &refcheck);
1023         RETURN(result);
1024 }
1025
1026 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1027                              loff_t *ppos)
1028 {
1029         struct lu_env    *env;
1030         struct iovec     *local_iov;
1031         ssize_t           result;
1032         int               refcheck;
1033         ENTRY;
1034
1035         env = cl_env_get(&refcheck);
1036         if (IS_ERR(env))
1037                 RETURN(PTR_ERR(env));
1038
1039         local_iov = &vvp_env_info(env)->vti_local_iov;
1040         local_iov->iov_base = (void __user *)buf;
1041         local_iov->iov_len = count;
1042
1043         result = ll_file_writev(file, local_iov, 1, ppos);
1044         cl_env_put(env, &refcheck);
1045         RETURN(result);
1046 }
1047
1048 #else /* AIO stuff */
1049 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1050                                  unsigned long nr_segs, loff_t pos)
1051 {
1052         struct lu_env      *env;
1053         struct ccc_io_args *args;
1054         size_t              count;
1055         ssize_t             result;
1056         int                 refcheck;
1057         ENTRY;
1058
1059         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1060         if (result)
1061                 RETURN(result);
1062
1063         env = cl_env_get(&refcheck);
1064         if (IS_ERR(env))
1065                 RETURN(PTR_ERR(env));
1066
1067         args = &vvp_env_info(env)->vti_args;
1068         args->cia_iov = (struct iovec *)iov;
1069         args->cia_nrsegs = nr_segs;
1070         args->cia_iocb = iocb;
1071         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1072                                   &iocb->ki_pos, count);
1073         cl_env_put(env, &refcheck);
1074         RETURN(result);
1075 }
1076
1077 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1078                              loff_t *ppos)
1079 {
1080         struct lu_env *env;
1081         struct iovec  *local_iov;
1082         struct kiocb  *kiocb;
1083         ssize_t        result;
1084         int            refcheck;
1085         ENTRY;
1086
1087         env = cl_env_get(&refcheck);
1088         if (IS_ERR(env))
1089                 RETURN(PTR_ERR(env));
1090
1091         local_iov = &vvp_env_info(env)->vti_local_iov;
1092         kiocb = &vvp_env_info(env)->vti_kiocb;
1093         local_iov->iov_base = (void __user *)buf;
1094         local_iov->iov_len = count;
1095         init_sync_kiocb(kiocb, file);
1096         kiocb->ki_pos = *ppos;
1097         kiocb->ki_left = count;
1098
1099         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1100         *ppos = kiocb->ki_pos;
1101
1102         cl_env_put(env, &refcheck);
1103         RETURN(result);
1104 }
1105 #endif
1106
1107
1108 /*
1109  * Send file content (through pagecache) somewhere with helper
1110  */
1111 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1112                                 read_actor_t actor, void *target)
1113 {
1114         struct lu_env      *env;
1115         struct ccc_io_args *args;
1116         ssize_t             result;
1117         int                 refcheck;
1118         ENTRY;
1119
1120         env = cl_env_get(&refcheck);
1121         if (IS_ERR(env))
1122                 RETURN(PTR_ERR(env));
1123
1124         args = &vvp_env_info(env)->vti_args;
1125         args->cia_is_sendfile = 1;
1126         args->cia_target = target;
1127         args->cia_actor = actor;
1128         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1129         cl_env_put(env, &refcheck);
1130         RETURN(result);
1131 }
1132
1133 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1134                                unsigned long arg)
1135 {
1136         struct obd_export *exp = ll_i2dtexp(inode);
1137         struct ll_recreate_obj ucreatp;
1138         struct obd_trans_info oti = { 0 };
1139         struct obdo *oa = NULL;
1140         int lsm_size;
1141         int rc = 0;
1142         struct lov_stripe_md *lsm, *lsm2;
1143         ENTRY;
1144
1145         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1146                 RETURN(-EPERM);
1147
1148         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1149                            sizeof(struct ll_recreate_obj)))
1150                 RETURN(-EFAULT);
1151
1152         OBDO_ALLOC(oa);
1153         if (oa == NULL)
1154                 RETURN(-ENOMEM);
1155
1156         ll_inode_size_lock(inode, 0);
1157         lsm = ll_i2info(inode)->lli_smd;
1158         if (lsm == NULL)
1159                 GOTO(out, rc = -ENOENT);
1160         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1161                    (lsm->lsm_stripe_count));
1162
1163         OBD_ALLOC(lsm2, lsm_size);
1164         if (lsm2 == NULL)
1165                 GOTO(out, rc = -ENOMEM);
1166
1167         oa->o_id = ucreatp.lrc_id;
1168         oa->o_gr = ucreatp.lrc_group;
1169         oa->o_nlink = ucreatp.lrc_ost_idx;
1170         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1171         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1172         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1173                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1174
1175         memcpy(lsm2, lsm, lsm_size);
1176         rc = obd_create(exp, oa, &lsm2, &oti);
1177
1178         OBD_FREE(lsm2, lsm_size);
1179         GOTO(out, rc);
1180 out:
1181         ll_inode_size_unlock(inode, 0);
1182         OBDO_FREE(oa);
1183         return rc;
1184 }
1185
1186 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1187                              int flags, struct lov_user_md *lum, int lum_size)
1188 {
1189         struct lov_stripe_md *lsm;
1190         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1191         int rc = 0;
1192         ENTRY;
1193
1194         ll_inode_size_lock(inode, 0);
1195         lsm = ll_i2info(inode)->lli_smd;
1196         if (lsm) {
1197                 ll_inode_size_unlock(inode, 0);
1198                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1199                        inode->i_ino);
1200                 RETURN(-EEXIST);
1201         }
1202
1203         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1204         if (rc)
1205                 GOTO(out, rc);
1206         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1207                 GOTO(out_req_free, rc = -ENOENT);
1208         rc = oit.d.lustre.it_status;
1209         if (rc < 0)
1210                 GOTO(out_req_free, rc);
1211
1212         ll_release_openhandle(file->f_dentry, &oit);
1213
1214  out:
1215         ll_inode_size_unlock(inode, 0);
1216         ll_intent_release(&oit);
1217         RETURN(rc);
1218 out_req_free:
1219         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1220         goto out;
1221 }
1222
1223 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1224                              struct lov_mds_md **lmmp, int *lmm_size,
1225                              struct ptlrpc_request **request)
1226 {
1227         struct ll_sb_info *sbi = ll_i2sbi(inode);
1228         struct mdt_body  *body;
1229         struct lov_mds_md *lmm = NULL;
1230         struct ptlrpc_request *req = NULL;
1231         struct obd_capa *oc;
1232         int rc, lmmsize;
1233
1234         rc = ll_get_max_mdsize(sbi, &lmmsize);
1235         if (rc)
1236                 RETURN(rc);
1237
1238         oc = ll_mdscapa_get(inode);
1239         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1240                              oc, filename, strlen(filename) + 1,
1241                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1242                              ll_i2suppgid(inode), &req);
1243         capa_put(oc);
1244         if (rc < 0) {
1245                 CDEBUG(D_INFO, "md_getattr_name failed "
1246                        "on %s: rc %d\n", filename, rc);
1247                 GOTO(out, rc);
1248         }
1249
1250         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1251         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1252
1253         lmmsize = body->eadatasize;
1254
1255         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1256                         lmmsize == 0) {
1257                 GOTO(out, rc = -ENODATA);
1258         }
1259
1260         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1261         LASSERT(lmm != NULL);
1262
1263         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1264             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1265             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1266                 GOTO(out, rc = -EPROTO);
1267         }
1268
1269         /*
1270          * This is coming from the MDS, so is probably in
1271          * little endian.  We convert it to host endian before
1272          * passing it to userspace.
1273          */
1274         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1275                 /* if function called for directory - we should
1276                  * avoid swab not existent lsm objects */
1277                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1278                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1279                         if (S_ISREG(body->mode))
1280                                 lustre_swab_lov_user_md_objects(
1281                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1282                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1283                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1284                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1285                         if (S_ISREG(body->mode))
1286                                 lustre_swab_lov_user_md_objects(
1287                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1288                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1289                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1290                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1291                 }
1292         }
1293
1294         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1295                 struct lov_stripe_md *lsm;
1296                 struct lov_user_md_join *lmj;
1297                 int lmj_size, i, aindex = 0;
1298
1299                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1300                 if (rc < 0)
1301                         GOTO(out, rc = -ENOMEM);
1302                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1303                 if (rc)
1304                         GOTO(out_free_memmd, rc);
1305
1306                 lmj_size = sizeof(struct lov_user_md_join) +
1307                            lsm->lsm_stripe_count *
1308                            sizeof(struct lov_user_ost_data_join);
1309                 OBD_ALLOC(lmj, lmj_size);
1310                 if (!lmj)
1311                         GOTO(out_free_memmd, rc = -ENOMEM);
1312
1313                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1314                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1315                         struct lov_extent *lex =
1316                                 &lsm->lsm_array->lai_ext_array[aindex];
1317
1318                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1319                                 aindex ++;
1320                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1321                                         LPU64" len %d\n", aindex, i,
1322                                         lex->le_start, (int)lex->le_len);
1323                         lmj->lmm_objects[i].l_extent_start =
1324                                 lex->le_start;
1325
1326                         if ((int)lex->le_len == -1)
1327                                 lmj->lmm_objects[i].l_extent_end = -1;
1328                         else
1329                                 lmj->lmm_objects[i].l_extent_end =
1330                                         lex->le_start + lex->le_len;
1331                         lmj->lmm_objects[i].l_object_id =
1332                                 lsm->lsm_oinfo[i]->loi_id;
1333                         lmj->lmm_objects[i].l_object_gr =
1334                                 lsm->lsm_oinfo[i]->loi_gr;
1335                         lmj->lmm_objects[i].l_ost_gen =
1336                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1337                         lmj->lmm_objects[i].l_ost_idx =
1338                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1339                 }
1340                 lmm = (struct lov_mds_md *)lmj;
1341                 lmmsize = lmj_size;
1342 out_free_memmd:
1343                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1344         }
1345 out:
1346         *lmmp = lmm;
1347         *lmm_size = lmmsize;
1348         *request = req;
1349         return rc;
1350 }
1351
1352 static int ll_lov_setea(struct inode *inode, struct file *file,
1353                             unsigned long arg)
1354 {
1355         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1356         struct lov_user_md  *lump;
1357         int lum_size = sizeof(struct lov_user_md) +
1358                        sizeof(struct lov_user_ost_data);
1359         int rc;
1360         ENTRY;
1361
1362         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1363                 RETURN(-EPERM);
1364
1365         OBD_ALLOC(lump, lum_size);
1366         if (lump == NULL) {
1367                 RETURN(-ENOMEM);
1368         }
1369         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1370                 OBD_FREE(lump, lum_size);
1371                 RETURN(-EFAULT);
1372         }
1373
1374         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1375
1376         OBD_FREE(lump, lum_size);
1377         RETURN(rc);
1378 }
1379
1380 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1381                             unsigned long arg)
1382 {
1383         struct lov_user_md_v3 lumv3;
1384         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1385         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1386         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1387         int lum_size;
1388         int rc;
1389         int flags = FMODE_WRITE;
1390         ENTRY;
1391
1392         /* first try with v1 which is smaller than v3 */
1393         lum_size = sizeof(struct lov_user_md_v1);
1394         if (copy_from_user(lumv1, lumv1p, lum_size))
1395                 RETURN(-EFAULT);
1396
1397         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1398                 lum_size = sizeof(struct lov_user_md_v3);
1399                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1400                         RETURN(-EFAULT);
1401         }
1402
1403         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1404         if (rc == 0) {
1405                  put_user(0, &lumv1p->lmm_stripe_count);
1406                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1407                                     0, ll_i2info(inode)->lli_smd,
1408                                     (void *)arg);
1409         }
1410         RETURN(rc);
1411 }
1412
1413 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1414 {
1415         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1416
1417         if (!lsm)
1418                 RETURN(-ENODATA);
1419
1420         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1421                             (void *)arg);
1422 }
1423
/**
 * Take a group lock.  XXX: not implemented; all arguments are ignored.
 *
 * \retval -ENOSYS always
 */
static int ll_get_grouplock(struct inode *inode, struct file *file,
                            unsigned long arg)
{
        int rc = -ENOSYS;

        return rc;
}
1430
/**
 * Drop a group lock.  XXX: not implemented; all arguments are ignored.
 *
 * \retval -ENOSYS always
 */
static int ll_put_grouplock(struct inode *inode, struct file *file,
                            unsigned long arg)
{
        int rc = -ENOSYS;

        return rc;
}
1437
1438 #if LUSTRE_FIX >= 50
1439 static int join_sanity_check(struct inode *head, struct inode *tail)
1440 {
1441         ENTRY;
1442         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1443                 CERROR("server do not support join \n");
1444                 RETURN(-EINVAL);
1445         }
1446         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1447                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1448                        head->i_ino, tail->i_ino);
1449                 RETURN(-EINVAL);
1450         }
1451         if (head->i_ino == tail->i_ino) {
1452                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1453                 RETURN(-EINVAL);
1454         }
1455         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1456                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1457                 RETURN(-EINVAL);
1458         }
1459         RETURN(0);
1460 }
1461
1462 static int join_file(struct inode *head_inode, struct file *head_filp,
1463                      struct file *tail_filp)
1464 {
1465         struct dentry *tail_dentry = tail_filp->f_dentry;
1466         struct lookup_intent oit = {.it_op = IT_OPEN,
1467                                    .it_flags = head_filp->f_flags|O_JOIN_FILE};
1468         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1469                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1470
1471         struct lustre_handle lockh;
1472         struct md_op_data *op_data;
1473         int    rc;
1474         loff_t data;
1475         ENTRY;
1476
1477         tail_dentry = tail_filp->f_dentry;
1478
1479         data = i_size_read(head_inode);
1480         op_data = ll_prep_md_op_data(NULL, head_inode,
1481                                      tail_dentry->d_parent->d_inode,
1482                                      tail_dentry->d_name.name,
1483                                      tail_dentry->d_name.len, 0,
1484                                      LUSTRE_OPC_ANY, &data);
1485         if (IS_ERR(op_data))
1486                 RETURN(PTR_ERR(op_data));
1487
1488         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1489                          op_data, &lockh, NULL, 0, NULL, 0);
1490
1491         ll_finish_md_op_data(op_data);
1492         if (rc < 0)
1493                 GOTO(out, rc);
1494
1495         rc = oit.d.lustre.it_status;
1496
1497         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1498                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1499                 ptlrpc_req_finished((struct ptlrpc_request *)
1500                                     oit.d.lustre.it_data);
1501                 GOTO(out, rc);
1502         }
1503
1504         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1505                                            * away */
1506                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1507                 oit.d.lustre.it_lock_mode = 0;
1508         }
1509         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1510         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1511         ll_release_openhandle(head_filp->f_dentry, &oit);
1512 out:
1513         ll_intent_release(&oit);
1514         RETURN(rc);
1515 }
1516
1517 static int ll_file_join(struct inode *head, struct file *filp,
1518                         char *filename_tail)
1519 {
1520         struct inode *tail = NULL, *first = NULL, *second = NULL;
1521         struct dentry *tail_dentry;
1522         struct file *tail_filp, *first_filp, *second_filp;
1523         struct ll_lock_tree first_tree, second_tree;
1524         struct ll_lock_tree_node *first_node, *second_node;
1525         struct ll_inode_info *hlli = ll_i2info(head);
1526         int rc = 0, cleanup_phase = 0;
1527         ENTRY;
1528
1529         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1530                head->i_ino, head->i_generation, head, filename_tail);
1531
1532         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1533         if (IS_ERR(tail_filp)) {
1534                 CERROR("Can not open tail file %s", filename_tail);
1535                 rc = PTR_ERR(tail_filp);
1536                 GOTO(cleanup, rc);
1537         }
1538         tail = igrab(tail_filp->f_dentry->d_inode);
1539
1540         tail_dentry = tail_filp->f_dentry;
1541         LASSERT(tail_dentry);
1542         cleanup_phase = 1;
1543
1544         /*reorder the inode for lock sequence*/
1545         first = head->i_ino > tail->i_ino ? head : tail;
1546         second = head->i_ino > tail->i_ino ? tail : head;
1547         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1548         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1549
1550         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1551                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1552         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1553         if (IS_ERR(first_node)){
1554                 rc = PTR_ERR(first_node);
1555                 GOTO(cleanup, rc);
1556         }
1557         first_tree.lt_fd = first_filp->private_data;
1558         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1559         if (rc != 0)
1560                 GOTO(cleanup, rc);
1561         cleanup_phase = 2;
1562
1563         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1564         if (IS_ERR(second_node)){
1565                 rc = PTR_ERR(second_node);
1566                 GOTO(cleanup, rc);
1567         }
1568         second_tree.lt_fd = second_filp->private_data;
1569         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1570         if (rc != 0)
1571                 GOTO(cleanup, rc);
1572         cleanup_phase = 3;
1573
1574         rc = join_sanity_check(head, tail);
1575         if (rc)
1576                 GOTO(cleanup, rc);
1577
1578         rc = join_file(head, filp, tail_filp);
1579         if (rc)
1580                 GOTO(cleanup, rc);
1581 cleanup:
1582         switch (cleanup_phase) {
1583         case 3:
1584                 ll_tree_unlock(&second_tree);
1585                 obd_cancel_unused(ll_i2dtexp(second),
1586                                   ll_i2info(second)->lli_smd, 0, NULL);
1587         case 2:
1588                 ll_tree_unlock(&first_tree);
1589                 obd_cancel_unused(ll_i2dtexp(first),
1590                                   ll_i2info(first)->lli_smd, 0, NULL);
1591         case 1:
1592                 filp_close(tail_filp, 0);
1593                 if (tail)
1594                         iput(tail);
1595                 if (head && rc == 0) {
1596                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1597                                        &hlli->lli_smd);
1598                         hlli->lli_smd = NULL;
1599                 }
1600         case 0:
1601                 break;
1602         default:
1603                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1604                 LBUG();
1605         }
1606         RETURN(rc);
1607 }
1608 #endif /* LUSTRE_FIX >= 50 */
1609
1610 /**
1611  * Close inode open handle
1612  *
1613  * \param dentry [in]     dentry which contains the inode
1614  * \param it     [in,out] intent which contains open info and result
1615  *
1616  * \retval 0     success
1617  * \retval <0    failure
1618  */
1619 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1620 {
1621         struct inode *inode = dentry->d_inode;
1622         struct obd_client_handle *och;
1623         int rc;
1624         ENTRY;
1625
1626         LASSERT(inode);
1627
1628         /* Root ? Do nothing. */
1629         if (dentry->d_inode->i_sb->s_root == dentry)
1630                 RETURN(0);
1631
1632         /* No open handle to close? Move away */
1633         if (!it_disposition(it, DISP_OPEN_OPEN))
1634                 RETURN(0);
1635
1636         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1637
1638         OBD_ALLOC(och, sizeof(*och));
1639         if (!och)
1640                 GOTO(out, rc = -ENOMEM);
1641
1642         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1643                     ll_i2info(inode), it, och);
1644
1645         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1646                                        inode, och);
1647  out:
1648         /* this one is in place of ll_file_open */
1649         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1650                 ptlrpc_req_finished(it->d.lustre.it_data);
1651         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1652         RETURN(rc);
1653 }
1654
1655 /**
1656  * Get size for inode for which FIEMAP mapping is requested.
1657  * Make the FIEMAP get_info call and returns the result.
1658  */
1659 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1660               int num_bytes)
1661 {
1662         struct obd_export *exp = ll_i2dtexp(inode);
1663         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1664         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1665         int vallen = num_bytes;
1666         int rc;
1667         ENTRY;
1668
1669         /* If the stripe_count > 1 and the application does not understand
1670          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1671          */
1672         if (lsm->lsm_stripe_count > 1 &&
1673             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1674                 return -EOPNOTSUPP;
1675
1676         fm_key.oa.o_id = lsm->lsm_object_id;
1677         fm_key.oa.o_gr = lsm->lsm_object_gr;
1678         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1679
1680         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1681                         OBD_MD_FLSIZE);
1682
1683         /* If filesize is 0, then there would be no objects for mapping */
1684         if (fm_key.oa.o_size == 0) {
1685                 fiemap->fm_mapped_extents = 0;
1686                 RETURN(0);
1687         }
1688
1689         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1690
1691         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1692         if (rc)
1693                 CERROR("obd_get_info failed: rc = %d\n", rc);
1694
1695         RETURN(rc);
1696 }
1697
1698 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1699                   unsigned long arg)
1700 {
1701         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1702         int flags;
1703         ENTRY;
1704
1705         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1706                inode->i_generation, inode, cmd);
1707         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1708
1709         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1710         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1711                 RETURN(-ENOTTY);
1712
1713         switch(cmd) {
1714         case LL_IOC_GETFLAGS:
1715                 /* Get the current value of the file flags */
1716                 return put_user(fd->fd_flags, (int *)arg);
1717         case LL_IOC_SETFLAGS:
1718         case LL_IOC_CLRFLAGS:
1719                 /* Set or clear specific file flags */
1720                 /* XXX This probably needs checks to ensure the flags are
1721                  *     not abused, and to handle any flag side effects.
1722                  */
1723                 if (get_user(flags, (int *) arg))
1724                         RETURN(-EFAULT);
1725
1726                 if (cmd == LL_IOC_SETFLAGS) {
1727                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1728                             !(file->f_flags & O_DIRECT)) {
1729                                 CERROR("%s: unable to disable locking on "
1730                                        "non-O_DIRECT file\n", current->comm);
1731                                 RETURN(-EINVAL);
1732                         }
1733
1734                         fd->fd_flags |= flags;
1735                 } else {
1736                         fd->fd_flags &= ~flags;
1737                 }
1738                 RETURN(0);
1739         case LL_IOC_LOV_SETSTRIPE:
1740                 RETURN(ll_lov_setstripe(inode, file, arg));
1741         case LL_IOC_LOV_SETEA:
1742                 RETURN(ll_lov_setea(inode, file, arg));
1743         case LL_IOC_LOV_GETSTRIPE:
1744                 RETURN(ll_lov_getstripe(inode, arg));
1745         case LL_IOC_RECREATE_OBJ:
1746                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1747         case EXT3_IOC_FIEMAP: {
1748                 struct ll_user_fiemap *fiemap_s;
1749                 size_t num_bytes, ret_bytes;
1750                 unsigned int extent_count;
1751                 int rc = 0;
1752
1753                 /* Get the extent count so we can calculate the size of
1754                  * required fiemap buffer */
1755                 if (get_user(extent_count,
1756                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1757                         RETURN(-EFAULT);
1758                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1759                                                  sizeof(struct ll_fiemap_extent));
1760                 OBD_VMALLOC(fiemap_s, num_bytes);
1761                 if (fiemap_s == NULL)
1762                         RETURN(-ENOMEM);
1763
1764                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1765                                    sizeof(*fiemap_s)))
1766                         GOTO(error, rc = -EFAULT);
1767
1768                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1769                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1770                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1771                         if (copy_to_user((char *)arg, fiemap_s,
1772                                          sizeof(*fiemap_s)))
1773                                 GOTO(error, rc = -EFAULT);
1774
1775                         GOTO(error, rc = -EBADR);
1776                 }
1777
1778                 /* If fm_extent_count is non-zero, read the first extent since
1779                  * it is used to calculate end_offset and device from previous
1780                  * fiemap call. */
1781                 if (extent_count) {
1782                         if (copy_from_user(&fiemap_s->fm_extents[0],
1783                             (char __user *)arg + sizeof(*fiemap_s),
1784                             sizeof(struct ll_fiemap_extent)))
1785                                 GOTO(error, rc = -EFAULT);
1786                 }
1787
1788                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1789                         int rc;
1790
1791                         rc = filemap_fdatawrite(inode->i_mapping);
1792                         if (rc)
1793                                 GOTO(error, rc);
1794                 }
1795
1796                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1797                 if (rc)
1798                         GOTO(error, rc);
1799
1800                 ret_bytes = sizeof(struct ll_user_fiemap);
1801
1802                 if (extent_count != 0)
1803                         ret_bytes += (fiemap_s->fm_mapped_extents *
1804                                          sizeof(struct ll_fiemap_extent));
1805
1806                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1807                         rc = -EFAULT;
1808
1809 error:
1810                 OBD_VFREE(fiemap_s, num_bytes);
1811                 RETURN(rc);
1812         }
1813         case EXT3_IOC_GETFLAGS:
1814         case EXT3_IOC_SETFLAGS:
1815                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1816         case EXT3_IOC_GETVERSION_OLD:
1817         case EXT3_IOC_GETVERSION:
1818                 RETURN(put_user(inode->i_generation, (int *)arg));
1819         case LL_IOC_JOIN: {
1820 #if LUSTRE_FIX >= 50
1821                 /* Allow file join in beta builds to allow debuggging */
1822                 char *ftail;
1823                 int rc;
1824
1825                 ftail = getname((const char *)arg);
1826                 if (IS_ERR(ftail))
1827                         RETURN(PTR_ERR(ftail));
1828                 rc = ll_file_join(inode, file, ftail);
1829                 putname(ftail);
1830                 RETURN(rc);
1831 #else
1832                 CWARN("file join is not supported in this version of Lustre\n");
1833                 RETURN(-ENOTTY);
1834 #endif
1835         }
1836         case LL_IOC_GROUP_LOCK:
1837                 RETURN(ll_get_grouplock(inode, file, arg));
1838         case LL_IOC_GROUP_UNLOCK:
1839                 RETURN(ll_put_grouplock(inode, file, arg));
1840         case IOC_OBD_STATFS:
1841                 RETURN(ll_obd_statfs(inode, (void *)arg));
1842
1843         /* We need to special case any other ioctls we want to handle,
1844          * to send them to the MDS/OST as appropriate and to properly
1845          * network encode the arg field.
1846         case EXT3_IOC_SETVERSION_OLD:
1847         case EXT3_IOC_SETVERSION:
1848         */
1849         case LL_IOC_FLUSHCTX:
1850                 RETURN(ll_flush_ctx(inode));
1851         case LL_IOC_PATH2FID: {
1852                 if (copy_to_user((void *)arg, &ll_i2info(inode)->lli_fid,
1853                                  sizeof(struct lu_fid)))
1854                         RETURN(-EFAULT);
1855
1856                 RETURN(0);
1857         }
1858         default: {
1859                 int err;
1860
1861                 if (LLIOC_STOP ==
1862                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1863                         RETURN(err);
1864
1865                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1866                                      (void *)arg));
1867         }
1868         }
1869 }
1870
1871 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1872 {
1873         struct inode *inode = file->f_dentry->d_inode;
1874         loff_t retval;
1875         ENTRY;
1876         retval = offset + ((origin == 2) ? i_size_read(inode) :
1877                            (origin == 1) ? file->f_pos : 0);
1878         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1879                inode->i_ino, inode->i_generation, inode, retval, retval,
1880                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1881         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1882
1883         if (origin == 2) { /* SEEK_END */
1884                 int nonblock = 0, rc;
1885
1886                 if (file->f_flags & O_NONBLOCK)
1887                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1888
1889                 rc = cl_glimpse_size(inode);
1890                 if (rc != 0)
1891                         RETURN(rc);
1892
1893                 ll_inode_size_lock(inode, 0);
1894                 offset += i_size_read(inode);
1895                 ll_inode_size_unlock(inode, 0);
1896         } else if (origin == 1) { /* SEEK_CUR */
1897                 offset += file->f_pos;
1898         }
1899
1900         retval = -EINVAL;
1901         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1902                 if (offset != file->f_pos) {
1903                         file->f_pos = offset;
1904                 }
1905                 retval = offset;
1906         }
1907
1908         RETURN(retval);
1909 }
1910
1911 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1912 {
1913         struct inode *inode = dentry->d_inode;
1914         struct ll_inode_info *lli = ll_i2info(inode);
1915         struct lov_stripe_md *lsm = lli->lli_smd;
1916         struct ptlrpc_request *req;
1917         struct obd_capa *oc;
1918         int rc, err;
1919         ENTRY;
1920         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1921                inode->i_generation, inode);
1922         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
1923
1924         /* fsync's caller has already called _fdata{sync,write}, we want
1925          * that IO to finish before calling the osc and mdc sync methods */
1926         rc = filemap_fdatawait(inode->i_mapping);
1927
1928         /* catch async errors that were recorded back when async writeback
1929          * failed for pages in this mapping. */
1930         err = lli->lli_async_rc;
1931         lli->lli_async_rc = 0;
1932         if (rc == 0)
1933                 rc = err;
1934         if (lsm) {
1935                 err = lov_test_and_clear_async_rc(lsm);
1936                 if (rc == 0)
1937                         rc = err;
1938         }
1939
1940         oc = ll_mdscapa_get(inode);
1941         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
1942                       &req);
1943         capa_put(oc);
1944         if (!rc)
1945                 rc = err;
1946         if (!err)
1947                 ptlrpc_req_finished(req);
1948
1949         if (data && lsm) {
1950                 struct obdo *oa;
1951
1952                 OBDO_ALLOC(oa);
1953                 if (!oa)
1954                         RETURN(rc ? rc : -ENOMEM);
1955
1956                 oa->o_id = lsm->lsm_object_id;
1957                 oa->o_gr = lsm->lsm_object_gr;
1958                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1959                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1960                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
1961                                            OBD_MD_FLGROUP);
1962
1963                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
1964                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
1965                                0, OBD_OBJECT_EOF, oc);
1966                 capa_put(oc);
1967                 if (!rc)
1968                         rc = err;
1969                 OBDO_FREE(oa);
1970         }
1971
1972         RETURN(rc);
1973 }
1974
1975 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
1976 {
1977         struct inode *inode = file->f_dentry->d_inode;
1978         struct ll_sb_info *sbi = ll_i2sbi(inode);
1979         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
1980                                            .ei_cb_cp =ldlm_flock_completion_ast,
1981                                            .ei_cbdata = file_lock };
1982         struct md_op_data *op_data;
1983         struct lustre_handle lockh = {0};
1984         ldlm_policy_data_t flock;
1985         int flags = 0;
1986         int rc;
1987         ENTRY;
1988
1989         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
1990                inode->i_ino, file_lock);
1991
1992         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
1993
1994         if (file_lock->fl_flags & FL_FLOCK) {
1995                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
1996                 /* set missing params for flock() calls */
1997                 file_lock->fl_end = OFFSET_MAX;
1998                 file_lock->fl_pid = current->tgid;
1999         }
2000         flock.l_flock.pid = file_lock->fl_pid;
2001         flock.l_flock.start = file_lock->fl_start;
2002         flock.l_flock.end = file_lock->fl_end;
2003
2004         switch (file_lock->fl_type) {
2005         case F_RDLCK:
2006                 einfo.ei_mode = LCK_PR;
2007                 break;
2008         case F_UNLCK:
2009                 /* An unlock request may or may not have any relation to
2010                  * existing locks so we may not be able to pass a lock handle
2011                  * via a normal ldlm_lock_cancel() request. The request may even
2012                  * unlock a byte range in the middle of an existing lock. In
2013                  * order to process an unlock request we need all of the same
2014                  * information that is given with a normal read or write record
2015                  * lock request. To avoid creating another ldlm unlock (cancel)
2016                  * message we'll treat a LCK_NL flock request as an unlock. */
2017                 einfo.ei_mode = LCK_NL;
2018                 break;
2019         case F_WRLCK:
2020                 einfo.ei_mode = LCK_PW;
2021                 break;
2022         default:
2023                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2024                 RETURN (-EINVAL);
2025         }
2026
2027         switch (cmd) {
2028         case F_SETLKW:
2029 #ifdef F_SETLKW64
2030         case F_SETLKW64:
2031 #endif
2032                 flags = 0;
2033                 break;
2034         case F_SETLK:
2035 #ifdef F_SETLK64
2036         case F_SETLK64:
2037 #endif
2038                 flags = LDLM_FL_BLOCK_NOWAIT;
2039                 break;
2040         case F_GETLK:
2041 #ifdef F_GETLK64
2042         case F_GETLK64:
2043 #endif
2044                 flags = LDLM_FL_TEST_LOCK;
2045                 /* Save the old mode so that if the mode in the lock changes we
2046                  * can decrement the appropriate reader or writer refcount. */
2047                 file_lock->fl_type = einfo.ei_mode;
2048                 break;
2049         default:
2050                 CERROR("unknown fcntl lock command: %d\n", cmd);
2051                 RETURN (-EINVAL);
2052         }
2053
2054         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2055                                      LUSTRE_OPC_ANY, NULL);
2056         if (IS_ERR(op_data))
2057                 RETURN(PTR_ERR(op_data));
2058
2059         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2060                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2061                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2062
2063         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2064                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2065
2066         ll_finish_md_op_data(op_data);
2067
2068         if ((file_lock->fl_flags & FL_FLOCK) &&
2069             (rc == 0 || file_lock->fl_type == F_UNLCK))
2070                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2071 #ifdef HAVE_F_OP_FLOCK
2072         if ((file_lock->fl_flags & FL_POSIX) &&
2073             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2074             !(flags & LDLM_FL_TEST_LOCK))
2075                 posix_lock_file_wait(file, file_lock);
2076 #endif
2077
2078         RETURN(rc);
2079 }
2080
2081 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2082 {
2083         ENTRY;
2084
2085         RETURN(-ENOSYS);
2086 }
2087
2088 int ll_have_md_lock(struct inode *inode, __u64 bits)
2089 {
2090         struct lustre_handle lockh;
2091         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2092         struct lu_fid *fid;
2093         int flags;
2094         ENTRY;
2095
2096         if (!inode)
2097                RETURN(0);
2098
2099         fid = &ll_i2info(inode)->lli_fid;
2100         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2101
2102         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2103         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2104                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2105                 RETURN(1);
2106         }
2107         RETURN(0);
2108 }
2109
2110 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2111                             struct lustre_handle *lockh)
2112 {
2113         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2114         struct lu_fid *fid;
2115         ldlm_mode_t rc;
2116         int flags;
2117         ENTRY;
2118
2119         fid = &ll_i2info(inode)->lli_fid;
2120         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2121
2122         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2123         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2124                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2125         RETURN(rc);
2126 }
2127
2128 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2129         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2130                               * and return success */
2131                 inode->i_nlink = 0;
2132                 /* This path cannot be hit for regular files unless in
2133                  * case of obscure races, so no need to to validate
2134                  * size. */
2135                 if (!S_ISREG(inode->i_mode) &&
2136                     !S_ISDIR(inode->i_mode))
2137                         return 0;
2138         }
2139
2140         if (rc) {
2141                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2142                 return -abs(rc);
2143
2144         }
2145
2146         return 0;
2147 }
2148
2149 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2150 {
2151         struct inode *inode = dentry->d_inode;
2152         struct ptlrpc_request *req = NULL;
2153         struct ll_sb_info *sbi;
2154         struct obd_export *exp;
2155         int rc;
2156         ENTRY;
2157
2158         if (!inode) {
2159                 CERROR("REPORT THIS LINE TO PETER\n");
2160                 RETURN(0);
2161         }
2162         sbi = ll_i2sbi(inode);
2163
2164         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2165                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2166
2167         exp = ll_i2mdexp(inode);
2168
2169         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2170                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2171                 struct md_op_data *op_data;
2172
2173                 /* Call getattr by fid, so do not provide name at all. */
2174                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2175                                              dentry->d_inode, NULL, 0, 0,
2176                                              LUSTRE_OPC_ANY, NULL);
2177                 if (IS_ERR(op_data))
2178                         RETURN(PTR_ERR(op_data));
2179
2180                 oit.it_flags |= O_CHECK_STALE;
2181                 rc = md_intent_lock(exp, op_data, NULL, 0,
2182                                     /* we are not interested in name
2183                                        based lookup */
2184                                     &oit, 0, &req,
2185                                     ll_md_blocking_ast, 0);
2186                 ll_finish_md_op_data(op_data);
2187                 oit.it_flags &= ~O_CHECK_STALE;
2188                 if (rc < 0) {
2189                         rc = ll_inode_revalidate_fini(inode, rc);
2190                         GOTO (out, rc);
2191                 }
2192
2193                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2194                 if (rc != 0) {
2195                         ll_intent_release(&oit);
2196                         GOTO(out, rc);
2197                 }
2198
2199                 /* Unlinked? Unhash dentry, so it is not picked up later by
2200                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2201                    here to preserve get_cwd functionality on 2.6.
2202                    Bug 10503 */
2203                 if (!dentry->d_inode->i_nlink) {
2204                         spin_lock(&ll_lookup_lock);
2205                         spin_lock(&dcache_lock);
2206                         ll_drop_dentry(dentry);
2207                         spin_unlock(&dcache_lock);
2208                         spin_unlock(&ll_lookup_lock);
2209                 }
2210
2211                 ll_lookup_finish_locks(&oit, dentry);
2212         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE |
2213                                                      MDS_INODELOCK_LOOKUP)) {
2214                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2215                 obd_valid valid = OBD_MD_FLGETATTR;
2216                 struct obd_capa *oc;
2217                 int ealen = 0;
2218
2219                 if (S_ISREG(inode->i_mode)) {
2220                         rc = ll_get_max_mdsize(sbi, &ealen);
2221                         if (rc)
2222                                 RETURN(rc);
2223                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2224                 }
2225                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2226                  * capa for this inode. Because we only keep capas of dirs
2227                  * fresh. */
2228                 oc = ll_mdscapa_get(inode);
2229                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2230                                 ealen, &req);
2231                 capa_put(oc);
2232                 if (rc) {
2233                         rc = ll_inode_revalidate_fini(inode, rc);
2234                         RETURN(rc);
2235                 }
2236
2237                 rc = ll_prep_inode(&inode, req, NULL);
2238                 if (rc)
2239                         GOTO(out, rc);
2240         }
2241
2242         /* if object not yet allocated, don't validate size */
2243         if (ll_i2info(inode)->lli_smd == NULL)
2244                 GOTO(out, rc = 0);
2245
2246         /* cl_glimpse_size will prefer locally cached writes if they extend
2247          * the file */
2248         rc = cl_glimpse_size(inode);
2249         EXIT;
2250 out:
2251         ptlrpc_req_finished(req);
2252         return rc;
2253 }
2254
2255 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2256                   struct lookup_intent *it, struct kstat *stat)
2257 {
2258         struct inode *inode = de->d_inode;
2259         int res = 0;
2260
2261         res = ll_inode_revalidate_it(de, it);
2262         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2263
2264         if (res)
2265                 return res;
2266
2267         stat->dev = inode->i_sb->s_dev;
2268         stat->ino = inode->i_ino;
2269         stat->mode = inode->i_mode;
2270         stat->nlink = inode->i_nlink;
2271         stat->uid = inode->i_uid;
2272         stat->gid = inode->i_gid;
2273         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2274         stat->atime = inode->i_atime;
2275         stat->mtime = inode->i_mtime;
2276         stat->ctime = inode->i_ctime;
2277 #ifdef HAVE_INODE_BLKSIZE
2278         stat->blksize = inode->i_blksize;
2279 #else
2280         stat->blksize = 1 << inode->i_blkbits;
2281 #endif
2282
2283         ll_inode_size_lock(inode, 0);
2284         stat->size = i_size_read(inode);
2285         stat->blocks = inode->i_blocks;
2286         ll_inode_size_unlock(inode, 0);
2287
2288         return 0;
2289 }
2290 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2291 {
2292         struct lookup_intent it = { .it_op = IT_GETATTR };
2293
2294         return ll_getattr_it(mnt, de, &it, stat);
2295 }
2296
/**
 * ACL permission callback.
 *
 * With CONFIG_FS_POSIX_ACL, a reference to the ACL cached in the Lustre
 * inode info is taken under lli_lock and checked with
 * posix_acl_permission().
 *
 * \param inode [in] inode to check
 * \param mask  [in] requested access mask
 *
 * \retval -EAGAIN  no cached ACL, or ACL support is compiled out
 * \return otherwise the result of posix_acl_permission()
 */
static
int lustre_check_acl(struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
        struct ll_inode_info *lli = ll_i2info(inode);
        struct posix_acl *cached;
        int verdict;
        ENTRY;

        /* Take our own reference so the ACL can be used after the
         * spinlock is dropped. */
        spin_lock(&lli->lli_lock);
        cached = posix_acl_dup(lli->lli_posix_acl);
        spin_unlock(&lli->lli_lock);

        if (cached == NULL)
                RETURN(-EAGAIN);

        verdict = posix_acl_permission(inode, cached, mask);
        posix_acl_release(cached);

        RETURN(verdict);
#else
        return -EAGAIN;
#endif
}
2321
2322 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2323 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2324 {
2325         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2326                inode->i_ino, inode->i_generation, inode, mask);
2327         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2328                 return lustre_check_remote_perm(inode, mask);
2329
2330         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2331         return generic_permission(inode, mask, lustre_check_acl);
2332 }
2333 #else
2334 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2335 {
2336         int mode = inode->i_mode;
2337         int rc;
2338
2339         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2340                inode->i_ino, inode->i_generation, inode, mask);
2341
2342         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2343                 return lustre_check_remote_perm(inode, mask);
2344
2345         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2346
2347         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2348             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2349                 return -EROFS;
2350         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2351                 return -EACCES;
2352         if (current->fsuid == inode->i_uid) {
2353                 mode >>= 6;
2354         } else if (1) {
2355                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2356                         goto check_groups;
2357                 rc = lustre_check_acl(inode, mask);
2358                 if (rc == -EAGAIN)
2359                         goto check_groups;
2360                 if (rc == -EACCES)
2361                         goto check_capabilities;
2362                 return rc;
2363         } else {
2364 check_groups:
2365                 if (in_group_p(inode->i_gid))
2366                         mode >>= 3;
2367         }
2368         if ((mode & mask & S_IRWXO) == mask)
2369                 return 0;
2370
2371 check_capabilities:
2372         if (!(mask & MAY_EXEC) ||
2373             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2374                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2375                         return 0;
2376
2377         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2378             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2379                 return 0;
2380
2381         return -EACCES;
2382 }
2383 #endif
2384
/*
 * Names of the vectored read/write members used in the file_operations
 * tables below, and of the functions assigned to them.  The aio_* pair is
 * used unless HAVE_FILE_READV is defined, in which case readv/writev are
 * used instead.
 */
#ifndef HAVE_FILE_READV
# define READ_METHOD    aio_read
# define READ_FUNCTION  ll_file_aio_read
# define WRITE_METHOD   aio_write
# define WRITE_FUNCTION ll_file_aio_write
#else
# define READ_METHOD    readv
# define READ_FUNCTION  ll_file_readv
# define WRITE_METHOD   writev
# define WRITE_FUNCTION ll_file_writev
#endif
2396
2397 /* -o localflock - only provides locally consistent flock locks */
2398 struct file_operations ll_file_operations = {
2399         .read           = ll_file_read,
2400         .READ_METHOD    = READ_FUNCTION,
2401         .write          = ll_file_write,
2402         .WRITE_METHOD   = WRITE_FUNCTION,
2403         .ioctl          = ll_file_ioctl,
2404         .open           = ll_file_open,
2405         .release        = ll_file_release,
2406         .mmap           = ll_file_mmap,
2407         .llseek         = ll_file_seek,
2408         .sendfile       = ll_file_sendfile,
2409         .fsync          = ll_fsync,
2410 };
2411
2412 struct file_operations ll_file_operations_flock = {
2413         .read           = ll_file_read,
2414         .READ_METHOD    = READ_FUNCTION,
2415         .write          = ll_file_write,
2416         .WRITE_METHOD   = WRITE_FUNCTION,
2417         .ioctl          = ll_file_ioctl,
2418         .open           = ll_file_open,
2419         .release        = ll_file_release,
2420         .mmap           = ll_file_mmap,
2421         .llseek         = ll_file_seek,
2422         .sendfile       = ll_file_sendfile,
2423         .fsync          = ll_fsync,
2424 #ifdef HAVE_F_OP_FLOCK
2425         .flock          = ll_file_flock,
2426 #endif
2427         .lock           = ll_file_flock
2428 };
2429
2430 /* These are for -o noflock - to return ENOSYS on flock calls */
2431 struct file_operations ll_file_operations_noflock = {
2432         .read           = ll_file_read,
2433         .READ_METHOD    = READ_FUNCTION,
2434         .write          = ll_file_write,
2435         .WRITE_METHOD   = WRITE_FUNCTION,
2436         .ioctl          = ll_file_ioctl,
2437         .open           = ll_file_open,
2438         .release        = ll_file_release,
2439         .mmap           = ll_file_mmap,
2440         .llseek         = ll_file_seek,
2441         .sendfile       = ll_file_sendfile,
2442         .fsync          = ll_fsync,
2443 #ifdef HAVE_F_OP_FLOCK
2444         .flock          = ll_file_noflock,
2445 #endif
2446         .lock           = ll_file_noflock
2447 };
2448
2449 struct inode_operations ll_file_inode_operations = {
2450 #ifdef HAVE_VFS_INTENT_PATCHES
2451         .setattr_raw    = ll_setattr_raw,
2452 #endif
2453         .setattr        = ll_setattr,
2454         .truncate       = ll_truncate,
2455         .getattr        = ll_getattr,
2456         .permission     = ll_inode_permission,
2457         .setxattr       = ll_setxattr,
2458         .getxattr       = ll_getxattr,
2459         .listxattr      = ll_listxattr,
2460         .removexattr    = ll_removexattr,
2461 };
2462
2463 /* dynamic ioctl number support routins */
2464 static struct llioc_ctl_data {
2465         struct rw_semaphore ioc_sem;
2466         struct list_head    ioc_head;
2467 } llioc = {
2468         __RWSEM_INITIALIZER(llioc.ioc_sem),
2469         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2470 };
2471
2472
2473 struct llioc_data {
2474         struct list_head        iocd_list;
2475         unsigned int            iocd_size;
2476         llioc_callback_t        iocd_cb;
2477         unsigned int            iocd_count;
2478         unsigned int            iocd_cmd[0];
2479 };
2480
2481 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2482 {
2483         unsigned int size;
2484         struct llioc_data *in_data = NULL;
2485         ENTRY;
2486
2487         if (cb == NULL || cmd == NULL ||
2488             count > LLIOC_MAX_CMD || count < 0)
2489                 RETURN(NULL);
2490
2491         size = sizeof(*in_data) + count * sizeof(unsigned int);
2492         OBD_ALLOC(in_data, size);
2493         if (in_data == NULL)
2494                 RETURN(NULL);
2495
2496         memset(in_data, 0, sizeof(*in_data));
2497         in_data->iocd_size = size;
2498         in_data->iocd_cb = cb;
2499         in_data->iocd_count = count;
2500         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2501
2502         down_write(&llioc.ioc_sem);
2503         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2504         up_write(&llioc.ioc_sem);
2505
2506         RETURN(in_data);
2507 }
2508
2509 void ll_iocontrol_unregister(void *magic)
2510 {
2511         struct llioc_data *tmp;
2512
2513         if (magic == NULL)
2514                 return;
2515
2516         down_write(&llioc.ioc_sem);
2517         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2518                 if (tmp == magic) {
2519                         unsigned int size = tmp->iocd_size;
2520
2521                         list_del(&tmp->iocd_list);
2522                         up_write(&llioc.ioc_sem);
2523
2524                         OBD_FREE(tmp, size);
2525                         return;
2526                 }
2527         }
2528         up_write(&llioc.ioc_sem);
2529
2530         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2531 }
2532
2533 EXPORT_SYMBOL(ll_iocontrol_register);
2534 EXPORT_SYMBOL(ll_iocontrol_unregister);
2535
2536 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2537                         unsigned int cmd, unsigned long arg, int *rcp)
2538 {
2539         enum llioc_iter ret = LLIOC_CONT;
2540         struct llioc_data *data;
2541         int rc = -EINVAL, i;
2542
2543         down_read(&llioc.ioc_sem);
2544         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2545                 for (i = 0; i < data->iocd_count; i++) {
2546                         if (cmd != data->iocd_cmd[i])
2547                                 continue;
2548
2549                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2550                         break;
2551                 }
2552
2553                 if (ret == LLIOC_STOP)
2554                         break;
2555         }
2556         up_read(&llioc.ioc_sem);
2557
2558         if (rcp)
2559                 *rcp = rc;
2560         return ret;
2561 }