Whamcloud - gitweb
beaf065720972345bcffcdfcd90b6d2de3e4999b
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
96             !S_ISREG(inode->i_mode))
97                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
98         else
99                 ll_epoch_close(inode, op_data, &och, 0);
100
101 out:
102         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
103         EXIT;
104 }
105
106 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107                                      struct inode *inode,
108                                      struct obd_client_handle *och)
109 {
110         struct obd_export *exp = ll_i2mdexp(inode);
111         struct md_op_data *op_data;
112         struct ptlrpc_request *req = NULL;
113         struct obd_device *obd = class_exp2obd(exp);
114         int epoch_close = 1;
115         int rc;
116         ENTRY;
117
118         if (obd == NULL) {
119                 /*
120                  * XXX: in case of LMV, is this correct to access
121                  * ->exp_handle?
122                  */
123                 CERROR("Invalid MDC connection handle "LPX64"\n",
124                        ll_i2mdexp(inode)->exp_handle.h_cookie);
125                 GOTO(out, rc = 0);
126         }
127
128         /*
129          * here we check if this is forced umount. If so this is called on
130          * canceling "open lock" and we do not call md_close() in this case, as
131          * it will not be successful, as import is already deactivated.
132          */
133         if (obd->obd_force)
134                 GOTO(out, rc = 0);
135
136         OBD_ALLOC_PTR(op_data);
137         if (op_data == NULL)
138                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139
140         ll_prepare_close(inode, op_data, och);
141         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
142         rc = md_close(md_exp, op_data, och->och_mod, &req);
143         if (rc == -EAGAIN) {
144                 /* This close must have the epoch closed. */
145                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
146                 LASSERT(epoch_close);
147                 /* MDS has instructed us to obtain Size-on-MDS attribute from
148                  * OSTs and send setattr to back to MDS. */
149                 rc = ll_sizeonmds_update(inode, &och->och_fh,
150                                          op_data->op_ioepoch);
151                 if (rc) {
152                         CERROR("inode %lu mdc Size-on-MDS update failed: "
153                                "rc = %d\n", inode->i_ino, rc);
154                         rc = 0;
155                 }
156         } else if (rc) {
157                 CERROR("inode %lu mdc close failed: rc = %d\n",
158                        inode->i_ino, rc);
159         }
160         ll_finish_md_op_data(op_data);
161
162         if (rc == 0) {
163                 rc = ll_objects_destroy(req, inode);
164                 if (rc)
165                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
166                                inode->i_ino, rc);
167         }
168
169         EXIT;
170 out:
171
172         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
173             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
174                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
175         } else {
176                 md_clear_open_replay_data(md_exp, och);
177                 /* Free @och if it is not waiting for DONE_WRITING. */
178                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
179                 OBD_FREE_PTR(och);
180         }
181         if (req) /* This is close request */
182                 ptlrpc_req_finished(req);
183         return rc;
184 }
185
186 int ll_md_real_close(struct inode *inode, int flags)
187 {
188         struct ll_inode_info *lli = ll_i2info(inode);
189         struct obd_client_handle **och_p;
190         struct obd_client_handle *och;
191         __u64 *och_usecount;
192         int rc = 0;
193         ENTRY;
194
195         if (flags & FMODE_WRITE) {
196                 och_p = &lli->lli_mds_write_och;
197                 och_usecount = &lli->lli_open_fd_write_count;
198         } else if (flags & FMODE_EXEC) {
199                 och_p = &lli->lli_mds_exec_och;
200                 och_usecount = &lli->lli_open_fd_exec_count;
201         } else {
202                 LASSERT(flags & FMODE_READ);
203                 och_p = &lli->lli_mds_read_och;
204                 och_usecount = &lli->lli_open_fd_read_count;
205         }
206
207         down(&lli->lli_och_sem);
208         if (*och_usecount) { /* There are still users of this handle, so
209                                 skip freeing it. */
210                 up(&lli->lli_och_sem);
211                 RETURN(0);
212         }
213         och=*och_p;
214         *och_p = NULL;
215         up(&lli->lli_och_sem);
216
217         if (och) { /* There might be a race and somebody have freed this och
218                       already */
219                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
220                                                inode, och);
221         }
222
223         RETURN(rc);
224 }
225
226 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
227                 struct file *file)
228 {
229         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
230         struct ll_inode_info *lli = ll_i2info(inode);
231         int rc = 0;
232         ENTRY;
233
234         /* clear group lock, if present */
235         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
236                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
237
238         /* Let's see if we have good enough OPEN lock on the file and if
239            we can skip talking to MDS */
240         if (file->f_dentry->d_inode) { /* Can this ever be false? */
241                 int lockmode;
242                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
243                 struct lustre_handle lockh;
244                 struct inode *inode = file->f_dentry->d_inode;
245                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
246
247                 down(&lli->lli_och_sem);
248                 if (fd->fd_omode & FMODE_WRITE) {
249                         lockmode = LCK_CW;
250                         LASSERT(lli->lli_open_fd_write_count);
251                         lli->lli_open_fd_write_count--;
252                 } else if (fd->fd_omode & FMODE_EXEC) {
253                         lockmode = LCK_PR;
254                         LASSERT(lli->lli_open_fd_exec_count);
255                         lli->lli_open_fd_exec_count--;
256                 } else {
257                         lockmode = LCK_CR;
258                         LASSERT(lli->lli_open_fd_read_count);
259                         lli->lli_open_fd_read_count--;
260                 }
261                 up(&lli->lli_och_sem);
262
263                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
264                                    LDLM_IBITS, &policy, lockmode,
265                                    &lockh)) {
266                         rc = ll_md_real_close(file->f_dentry->d_inode,
267                                               fd->fd_omode);
268                 }
269         } else {
270                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
271                        file, file->f_dentry, file->f_dentry->d_name.name);
272         }
273
274         LUSTRE_FPRIVATE(file) = NULL;
275         ll_file_data_put(fd);
276         ll_capa_close(inode);
277
278         RETURN(rc);
279 }
280
281 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
282
283 /* While this returns an error code, fput() the caller does not, so we need
284  * to make every effort to clean up all of our state here.  Also, applications
285  * rarely check close errors and even if an error is returned they will not
286  * re-try the close call.
287  */
288 int ll_file_release(struct inode *inode, struct file *file)
289 {
290         struct ll_file_data *fd;
291         struct ll_sb_info *sbi = ll_i2sbi(inode);
292         struct ll_inode_info *lli = ll_i2info(inode);
293         struct lov_stripe_md *lsm = lli->lli_smd;
294         int rc;
295         ENTRY;
296
297         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
298                inode->i_generation, inode);
299
300 #ifdef CONFIG_FS_POSIX_ACL
301         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
302             inode == inode->i_sb->s_root->d_inode) {
303                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
304
305                 LASSERT(fd != NULL);
306                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
307                         fd->fd_flags &= ~LL_FILE_RMTACL;
308                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
309                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
310                 }
311         }
312 #endif
313
314         if (inode->i_sb->s_root != file->f_dentry)
315                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
316         fd = LUSTRE_FPRIVATE(file);
317         LASSERT(fd != NULL);
318
319         /* The last ref on @file, maybe not the the owner pid of statahead.
320          * Different processes can open the same dir, "ll_opendir_key" means:
321          * it is me that should stop the statahead thread. */
322         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
323                 ll_stop_statahead(inode, lli->lli_opendir_key);
324
325         if (inode->i_sb->s_root == file->f_dentry) {
326                 LUSTRE_FPRIVATE(file) = NULL;
327                 ll_file_data_put(fd);
328                 RETURN(0);
329         }
330
331         if (lsm)
332                 lov_test_and_clear_async_rc(lsm);
333         lli->lli_async_rc = 0;
334
335         rc = ll_md_close(sbi->ll_md_exp, inode, file);
336         RETURN(rc);
337 }
338
339 static int ll_intent_file_open(struct file *file, void *lmm,
340                                int lmmsize, struct lookup_intent *itp)
341 {
342         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
343         struct dentry *parent = file->f_dentry->d_parent;
344         const char *name = file->f_dentry->d_name.name;
345         const int len = file->f_dentry->d_name.len;
346         struct md_op_data *op_data;
347         struct ptlrpc_request *req;
348         int rc;
349         ENTRY;
350
351         if (!parent)
352                 RETURN(-ENOENT);
353
354         /* Usually we come here only for NFSD, and we want open lock.
355            But we can also get here with pre 2.6.15 patchless kernels, and in
356            that case that lock is also ok */
357         /* We can also get here if there was cached open handle in revalidate_it
358          * but it disappeared while we were getting from there to ll_file_open.
359          * But this means this file was closed and immediatelly opened which
360          * makes a good candidate for using OPEN lock */
361         /* If lmmsize & lmm are not 0, we are just setting stripe info
362          * parameters. No need for the open lock */
363         if (!lmm && !lmmsize)
364                 itp->it_flags |= MDS_OPEN_LOCK;
365
366         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
367                                       file->f_dentry->d_inode, name, len,
368                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
369         if (IS_ERR(op_data))
370                 RETURN(PTR_ERR(op_data));
371
372         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
373                             0 /*unused */, &req, ll_md_blocking_ast, 0);
374         ll_finish_md_op_data(op_data);
375         if (rc == -ESTALE) {
376                 /* reason for keep own exit path - don`t flood log
377                 * with messages with -ESTALE errors.
378                 */
379                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
380                      it_open_error(DISP_OPEN_OPEN, itp))
381                         GOTO(out, rc);
382                 ll_release_openhandle(file->f_dentry, itp);
383                 GOTO(out, rc);
384         }
385
386         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
387                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
388                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
389                 GOTO(out, rc);
390         }
391
392         if (itp->d.lustre.it_lock_mode)
393                 md_set_lock_data(sbi->ll_md_exp,
394                                  &itp->d.lustre.it_lock_handle,
395                                  file->f_dentry->d_inode);
396
397         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
398 out:
399         ptlrpc_req_finished(itp->d.lustre.it_data);
400         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
401         ll_intent_drop_lock(itp);
402
403         RETURN(rc);
404 }
405
406 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
407                        struct lookup_intent *it, struct obd_client_handle *och)
408 {
409         struct ptlrpc_request *req = it->d.lustre.it_data;
410         struct mdt_body *body;
411
412         LASSERT(och);
413
414         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
415         LASSERT(body != NULL);                      /* reply already checked out */
416
417         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
418         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
419         och->och_fid = lli->lli_fid;
420         och->och_flags = it->it_flags;
421         lli->lli_ioepoch = body->ioepoch;
422
423         return md_set_open_replay_data(md_exp, och, req);
424 }
425
426 int ll_local_open(struct file *file, struct lookup_intent *it,
427                   struct ll_file_data *fd, struct obd_client_handle *och)
428 {
429         struct inode *inode = file->f_dentry->d_inode;
430         struct ll_inode_info *lli = ll_i2info(inode);
431         ENTRY;
432
433         LASSERT(!LUSTRE_FPRIVATE(file));
434
435         LASSERT(fd != NULL);
436
437         if (och) {
438                 struct ptlrpc_request *req = it->d.lustre.it_data;
439                 struct mdt_body *body;
440                 int rc;
441
442                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
443                 if (rc)
444                         RETURN(rc);
445
446                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
447                 if ((it->it_flags & FMODE_WRITE) &&
448                     (body->valid & OBD_MD_FLSIZE))
449                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
450                                lli->lli_ioepoch, PFID(&lli->lli_fid));
451         }
452
453         LUSTRE_FPRIVATE(file) = fd;
454         ll_readahead_init(inode, &fd->fd_ras);
455         fd->fd_omode = it->it_flags;
456         RETURN(0);
457 }
458
/*
 * Review summary of ll_file_open() (the original comment follows):
 *
 *  1. The lookup intent is taken from the file (f_it or private_data) and a
 *     struct ll_file_data is allocated; -ENOMEM if that fails.
 *  2. For a directory, lli_opendir_key/lli_opendir_pid are claimed under
 *     lli_lock when unclaimed.  If they are already claimed by the current
 *     pid, statahead is stopped and the check is retried.
 *  3. For the filesystem root the fd is attached and 0 is returned.
 *  4. Without a usable intent, a local intent (oit) is built from f_flags.
 *  5. Under lli_och_sem, either the existing open handle for the access
 *     mode is reused (use count bumped), or - with the semaphore dropped -
 *     an open is sent with ll_intent_file_open() and the selection is
 *     restarted, or a new handle is allocated and filled by ll_local_open().
 *  6. For regular files ll_capa_open() is called and O_LOV_DELAY_CREATE is
 *     cleared from f_flags unless object creation stays delayed.
 *
 * Every error path frees fd before jumping.  out_och_free frees a handle
 * left in *och_p, drops the use count and releases lli_och_sem; out_openerr
 * stops statahead if this open had claimed the opendir key.
 *
 * Returns 0 or a negative errno.
 *
 * NOTE(review): the original comment below mentions lli_open_sem; the code
 * visible in this function only takes lli_och_sem and lli_lock.
 */
459 /* Open a file, and (for the very first open) create objects on the OSTs at
460  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
461  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
462  * lli_open_sem to ensure no other process will create objects, send the
463  * stripe MD to the MDS, or try to destroy the objects if that fails.
464  *
465  * If we already have the stripe MD locally then we don't request it in
466  * md_open(), by passing a lmm_size = 0.
467  *
468  * It is up to the application to ensure no other processes open this file
469  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
470  * used.  We might be able to avoid races of that sort by getting lli_open_sem
471  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
472  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
473  */
474 int ll_file_open(struct inode *inode, struct file *file)
475 {
476         struct ll_inode_info *lli = ll_i2info(inode);
477         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
478                                           .it_flags = file->f_flags };
479         struct lov_stripe_md *lsm;
480         struct ptlrpc_request *req = NULL;
481         struct obd_client_handle **och_p;
482         __u64 *och_usecount;
483         struct ll_file_data *fd;
484         int rc = 0, opendir_set = 0;
485         ENTRY;
486
487         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
488                inode->i_generation, inode, file->f_flags);
489
490 #ifdef HAVE_VFS_INTENT_PATCHES
491         it = file->f_it;
492 #else
493         it = file->private_data; /* XXX: compat macro */
494         file->private_data = NULL; /* prevent ll_local_open assertion */
495 #endif
496
497         fd = ll_file_data_get();
498         if (fd == NULL)
499                 RETURN(-ENOMEM);
500
501         fd->fd_file = file;
/* Directories: try to become the statahead owner (opendir key/pid). */
502         if (S_ISDIR(inode->i_mode)) {
503 again:
504                 spin_lock(&lli->lli_lock);
505                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
506                         LASSERT(lli->lli_sai == NULL);
507                         lli->lli_opendir_key = fd;
508                         lli->lli_opendir_pid = cfs_curproc_pid();
509                         opendir_set = 1;
510                 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
511                                     lli->lli_opendir_key != NULL)) {
512                         /* Two cases for this:
513                          * (1) The same process open such directory many times.
514                          * (2) The old process opened the directory, and exited
515                          *     before its children processes. Then new process
516                          *     with the same pid opens such directory before the
517                          *     old process's children processes exit.
518                          * reset stat ahead for such cases. */
519                         spin_unlock(&lli->lli_lock);
520                         CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
521                                " reset it.\n", file->f_dentry->d_name.len,
522                                file->f_dentry->d_name.name,
523                                PFID(&lli->lli_fid));
524                         ll_stop_statahead(inode, lli->lli_opendir_key);
525                         goto again;
526                 }
527                 spin_unlock(&lli->lli_lock);
528         }
529
/* The root dentry needs no MDS open handle: attach fd and finish. */
530         if (inode->i_sb->s_root == file->f_dentry) {
531                 LUSTRE_FPRIVATE(file) = fd;
532                 RETURN(0);
533         }
534
535         if (!it || !it->d.lustre.it_disposition) {
536                 /* Convert f_flags into access mode. We cannot use file->f_mode,
537                  * because everything but O_ACCMODE mask was stripped from
538                  * there */
539                 if ((oit.it_flags + 1) & O_ACCMODE)
540                         oit.it_flags++;
541                 if (file->f_flags & O_TRUNC)
542                         oit.it_flags |= FMODE_WRITE;
543
544                 /* kernel only call f_op->open in dentry_open.  filp_open calls
545                  * dentry_open after call to open_namei that checks permissions.
546                  * Only nfsd_open call dentry_open directly without checking
547                  * permissions and because of that this code below is safe. */
548                 if (oit.it_flags & FMODE_WRITE)
549                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
550
551                 /* We do not want O_EXCL here, presumably we opened the file
552                  * already? XXX - NFS implications? */
553                 oit.it_flags &= ~O_EXCL;
554
555                 it = &oit;
556         }
557
/* Re-entered via "goto restart" after ll_intent_file_open() has filled in
 * the intent; och_p/och_usecount are (re)selected from the intent flags. */
558 restart:
559         /* Let's see if we have file open on MDS already. */
560         if (it->it_flags & FMODE_WRITE) {
561                 och_p = &lli->lli_mds_write_och;
562                 och_usecount = &lli->lli_open_fd_write_count;
563         } else if (it->it_flags & FMODE_EXEC) {
564                 och_p = &lli->lli_mds_exec_och;
565                 och_usecount = &lli->lli_open_fd_exec_count;
566          } else {
567                 och_p = &lli->lli_mds_read_och;
568                 och_usecount = &lli->lli_open_fd_read_count;
569         }
570
571         down(&lli->lli_och_sem);
572         if (*och_p) { /* Open handle is present */
573                 if (it_disposition(it, DISP_OPEN_OPEN)) {
574                         /* Well, there's extra open request that we do not need,
575                            let's close it somehow. This will decref request. */
576                         rc = it_open_error(DISP_OPEN_OPEN, it);
577                         if (rc) {
578                                 up(&lli->lli_och_sem);
579                                 ll_file_data_put(fd);
580                                 GOTO(out_openerr, rc);
581                         }
582                         ll_release_openhandle(file->f_dentry, it);
583                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
584                                              LPROC_LL_OPEN);
585                 }
586                 (*och_usecount)++;
587
588                 rc = ll_local_open(file, it, fd, NULL);
589                 if (rc) {
590                         (*och_usecount)--;
591                         up(&lli->lli_och_sem);
592                         ll_file_data_put(fd);
593                         GOTO(out_openerr, rc);
594                 }
595         } else {
596                 LASSERT(*och_usecount == 0);
597                 if (!it->d.lustre.it_disposition) {
598                         /* We cannot just request lock handle now, new ELC code
599                            means that one of other OPEN locks for this file
600                            could be cancelled, and since blocking ast handler
601                            would attempt to grab och_sem as well, that would
602                            result in a deadlock */
603                         up(&lli->lli_och_sem);
604                         it->it_create_mode |= M_CHECK_STALE;
605                         rc = ll_intent_file_open(file, NULL, 0, it);
606                         it->it_create_mode &= ~M_CHECK_STALE;
607                         if (rc) {
608                                 ll_file_data_put(fd);
609                                 GOTO(out_openerr, rc);
610                         }
611
612                         /* Got some error? Release the request */
613                         if (it->d.lustre.it_status < 0) {
614                                 req = it->d.lustre.it_data;
615                                 ptlrpc_req_finished(req);
616                         }
617                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
618                                          &it->d.lustre.it_lock_handle,
619                                          file->f_dentry->d_inode);
620                         goto restart;
621                 }
/* No handle cached for this mode: allocate one while still holding
 * lli_och_sem.  The failure paths below leave through out_och_free, which
 * releases the semaphore. */
622                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
623                 if (!*och_p) {
624                         ll_file_data_put(fd);
625                         GOTO(out_och_free, rc = -ENOMEM);
626                 }
627                 (*och_usecount)++;
628                 req = it->d.lustre.it_data;
629
630                 /* md_intent_lock() didn't get a request ref if there was an
631                  * open error, so don't do cleanup on the request here
632                  * (bug 3430) */
633                 /* XXX (green): Should not we bail out on any error here, not
634                  * just open error? */
635                 rc = it_open_error(DISP_OPEN_OPEN, it);
636                 if (rc) {
637                         ll_file_data_put(fd);
638                         GOTO(out_och_free, rc);
639                 }
640
641                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
642                 rc = ll_local_open(file, it, fd, *och_p);
643                 if (rc) {
644                         ll_file_data_put(fd);
645                         GOTO(out_och_free, rc);
646                 }
647         }
648         up(&lli->lli_och_sem);
649
650         /* Must do this outside lli_och_sem lock to prevent deadlock where
651            different kind of OPEN lock for this same inode gets cancelled
652            by ldlm_cancel_lru */
653         if (!S_ISREG(inode->i_mode))
654                 GOTO(out, rc);
655
656         ll_capa_open(inode);
657
658         lsm = lli->lli_smd;
659         if (lsm == NULL) {
660                 if (file->f_flags & O_LOV_DELAY_CREATE ||
661                     !(file->f_mode & FMODE_WRITE)) {
662                         CDEBUG(D_INODE, "object creation was delayed\n");
663                         GOTO(out, rc);
664                 }
665         }
666         file->f_flags &= ~O_LOV_DELAY_CREATE;
667         GOTO(out, rc);
/* "out" is only reached after lli_och_sem was released above, with rc == 0
 * on every visible path, so the cleanup under out_och_free is skipped when
 * falling through from here. */
668 out:
669         ptlrpc_req_finished(req);
670         if (req)
671                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
672 out_och_free:
673         if (rc) {
674                 if (*och_p) {
675                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
676                         *och_p = NULL; /* OBD_FREE writes some magic there */
677                         (*och_usecount)--;
678                 }
679                 up(&lli->lli_och_sem);
/* out_openerr is jumped to from paths that have already released
 * lli_och_sem (or never took it) and have already freed fd. */
680 out_openerr:
681                 if (opendir_set != 0)
682                         ll_stop_statahead(inode, lli->lli_opendir_key);
683         }
684
685         return rc;
686 }
687
688 /* Fills the obdo with the attributes for the lsm */
689 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
690                           struct obd_capa *capa, struct obdo *obdo)
691 {
692         struct ptlrpc_request_set *set;
693         struct obd_info            oinfo = { { { 0 } } };
694         int                        rc;
695
696         ENTRY;
697
698         LASSERT(lsm != NULL);
699
700         oinfo.oi_md = lsm;
701         oinfo.oi_oa = obdo;
702         oinfo.oi_oa->o_id = lsm->lsm_object_id;
703         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
704         oinfo.oi_oa->o_mode = S_IFREG;
705         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
706                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
707                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
708                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
709                                OBD_MD_FLGROUP;
710         oinfo.oi_capa = capa;
711
712         set = ptlrpc_prep_set();
713         if (set == NULL) {
714                 CERROR("can't allocate ptlrpc set\n");
715                 rc = -ENOMEM;
716         } else {
717                 rc = obd_getattr_async(exp, &oinfo, set);
718                 if (rc == 0)
719                         rc = ptlrpc_set_wait(set);
720                 ptlrpc_set_destroy(set);
721         }
722         if (rc == 0)
723                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
724                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
725                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
726         RETURN(rc);
727 }
728
729 /* Fills the obdo with the attributes for the inode defined by lsm */
730 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
731 {
732         struct ll_inode_info *lli  = ll_i2info(inode);
733         struct obd_capa      *capa = ll_mdscapa_get(inode);
734         int rc;
735         ENTRY;
736
737         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
738         capa_put(capa);
739         if (rc == 0) {
740                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
741                 CDEBUG(D_INODE,
742                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
743                        lli->lli_smd->lsm_object_id, i_size_read(inode),
744                        (unsigned long long)inode->i_blocks,
745                        (unsigned long)ll_inode_blksize(inode));
746         }
747         RETURN(rc);
748 }
749
750 int ll_merge_lvb(struct inode *inode)
751 {
752         struct ll_inode_info *lli = ll_i2info(inode);
753         struct ll_sb_info *sbi = ll_i2sbi(inode);
754         struct ost_lvb lvb;
755         int rc;
756
757         ENTRY;
758
759         ll_inode_size_lock(inode, 1);
760         inode_init_lvb(inode, &lvb);
761         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
762         i_size_write(inode, lvb.lvb_size);
763         inode->i_blocks = lvb.lvb_blocks;
764
765         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
766         LTIME_S(inode->i_atime) = lvb.lvb_atime;
767         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
768         ll_inode_size_unlock(inode, 1);
769
770         RETURN(rc);
771 }
772
773 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
774                      lstat_t *st)
775 {
776         struct obdo obdo = { 0 };
777         int rc;
778
779         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
780         if (rc == 0) {
781                 st->st_size   = obdo.o_size;
782                 st->st_blocks = obdo.o_blocks;
783                 st->st_mtime  = obdo.o_mtime;
784                 st->st_atime  = obdo.o_atime;
785                 st->st_ctime  = obdo.o_ctime;
786         }
787         return rc;
788 }
789
790 void ll_io_init(struct cl_io *io, const struct file *file, int write)
791 {
792         struct inode *inode     = file->f_dentry->d_inode;
793         struct ll_sb_info *sbi  = ll_i2sbi(inode);
794         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
795
796         LASSERT(fd != NULL);
797         memset(io, 0, sizeof *io);
798         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
799         if (write)
800                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
801         io->ci_obj     = ll_i2info(inode)->lli_clob;
802         io->ci_lockreq = CILR_MAYBE;
803         if (fd->fd_flags & LL_FILE_IGNORE_LOCK ||
804             sbi->ll_flags & LL_SBI_NOLCK) {
805                 io->ci_lockreq = CILR_NEVER;
806                 io->ci_no_srvlock = 1;
807         } else if (file->f_flags & O_APPEND) {
808                 io->ci_lockreq = CILR_MANDATORY;
809         }
810 }
811
812 static ssize_t ll_file_io_generic(const struct lu_env *env,
813                 struct ccc_io_args *args, struct file *file,
814                 enum cl_io_type iot, loff_t *ppos, size_t count)
815 {
816         struct cl_io       *io;
817         ssize_t             result;
818         ENTRY;
819
820         io = &ccc_env_info(env)->cti_io;
821         ll_io_init(io, file, iot == CIT_WRITE);
822
823         if (iot == CIT_READ)
824                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
825
826         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
827                 struct vvp_io *vio = vvp_env_io(env);
828                 struct ccc_io *cio = ccc_env_io(env);
829                 if (cl_io_is_sendfile(io)) {
830                         vio->u.read.cui_actor = args->cia_actor;
831                         vio->u.read.cui_target = args->cia_target;
832                 } else {
833                         cio->cui_iov = args->cia_iov;
834                         cio->cui_nrsegs = args->cia_nrsegs;
835 #ifndef HAVE_FILE_WRITEV
836                         cio->cui_iocb = args->cia_iocb;
837 #endif
838                 }
839                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
840                 result = cl_io_loop(env, io);
841         } else
842                 /* cl_io_rw_init() handled IO */
843                 result = io->ci_result;
844         if (io->ci_nob > 0) {
845                 result = io->ci_nob;
846                 *ppos = io->u.ci_wr.wr.crw_pos;
847         }
848         cl_io_fini(env, io);
849         RETURN(result);
850 }
851
852
853 /*
854  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
855  */
856 static int ll_file_get_iov_count(const struct iovec *iov,
857                                  unsigned long *nr_segs, size_t *count)
858 {
859         size_t cnt = 0;
860         unsigned long seg;
861
862         for (seg = 0; seg < *nr_segs; seg++) {
863                 const struct iovec *iv = &iov[seg];
864
865                 /*
866                  * If any segment has a negative length, or the cumulative
867                  * length ever wraps negative then return -EINVAL.
868                  */
869                 cnt += iv->iov_len;
870                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
871                         return -EINVAL;
872                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
873                         continue;
874                 if (seg == 0)
875                         return -EFAULT;
876                 *nr_segs = seg;
877                 cnt -= iv->iov_len;   /* This segment is no good */
878                 break;
879         }
880         *count = cnt;
881         return 0;
882 }
883
884 #ifdef HAVE_FILE_READV
885 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
886                               unsigned long nr_segs, loff_t *ppos)
887 {
888         struct lu_env      *env;
889         struct ccc_io_args *args;
890         size_t              count;
891         ssize_t             result;
892         int                 refcheck;
893         ENTRY;
894
895         result = ll_file_get_iov_count(iov, &nr_segs, &count);
896         if (result)
897                 RETURN(result);
898
899         env = cl_env_get(&refcheck);
900         if (IS_ERR(env))
901                 RETURN(PTR_ERR(env));
902
903         args = &vvp_env_info(env)->vti_args;
904         args->cia_is_sendfile = 0;
905         args->cia_iov = (struct iovec *)iov;
906         args->cia_nrsegs = nr_segs;
907         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
908         cl_env_put(env, &refcheck);
909         RETURN(result);
910 }
911
912 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
913                             loff_t *ppos)
914 {
915         struct lu_env *env;
916         struct iovec  *local_iov;
917         ssize_t        result;
918         int            refcheck;
919         ENTRY;
920
921         env = cl_env_get(&refcheck);
922         if (IS_ERR(env))
923                 RETURN(PTR_ERR(env));
924
925         local_iov = &vvp_env_info(env)->vti_local_iov;
926         local_iov->iov_base = (void __user *)buf;
927         local_iov->iov_len = count;
928         result = ll_file_readv(file, local_iov, 1, ppos);
929         cl_env_put(env, &refcheck);
930         RETURN(result);
931 }
932
933 #else
934 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
935                                 unsigned long nr_segs, loff_t pos)
936 {
937         struct lu_env      *env;
938         struct ccc_io_args *args;
939         size_t              count;
940         ssize_t             result;
941         int                 refcheck;
942         ENTRY;
943
944         result = ll_file_get_iov_count(iov, &nr_segs, &count);
945         if (result)
946                 RETURN(result);
947
948         env = cl_env_get(&refcheck);
949         if (IS_ERR(env))
950                 RETURN(PTR_ERR(env));
951
952         args = &vvp_env_info(env)->vti_args;
953         args->cia_is_sendfile = 0;
954         args->cia_iov = (struct iovec *)iov;
955         args->cia_nrsegs = nr_segs;
956         args->cia_iocb = iocb;
957         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
958                                     &iocb->ki_pos, count);
959         cl_env_put(env, &refcheck);
960         RETURN(result);
961 }
962
963 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
964                             loff_t *ppos)
965 {
966         struct lu_env *env;
967         struct iovec  *local_iov;
968         struct kiocb  *kiocb;
969         ssize_t        result;
970         int            refcheck;
971         ENTRY;
972
973         env = cl_env_get(&refcheck);
974         if (IS_ERR(env))
975                 RETURN(PTR_ERR(env));
976
977         local_iov = &vvp_env_info(env)->vti_local_iov;
978         kiocb = &vvp_env_info(env)->vti_kiocb;
979         local_iov->iov_base = (void __user *)buf;
980         local_iov->iov_len = count;
981         init_sync_kiocb(kiocb, file);
982         kiocb->ki_pos = *ppos;
983         kiocb->ki_left = count;
984
985         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
986         *ppos = kiocb->ki_pos;
987
988         cl_env_put(env, &refcheck);
989         RETURN(result);
990 }
991 #endif
992
993 /*
994  * Write to a file (through the page cache).
995  */
996 #ifdef HAVE_FILE_WRITEV
997 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
998                               unsigned long nr_segs, loff_t *ppos)
999 {
1000         struct lu_env      *env;
1001         struct ccc_io_args *args;
1002         size_t              count;
1003         ssize_t             result;
1004         int                 refcheck;
1005         ENTRY;
1006
1007         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1008         if (result)
1009                 RETURN(result);
1010
1011         env = cl_env_get(&refcheck);
1012         if (IS_ERR(env))
1013                 RETURN(PTR_ERR(env));
1014
1015         args = &vvp_env_info(env)->vti_args;
1016         args->cia_iov = (struct iovec *)iov;
1017         args->cia_nrsegs = nr_segs;
1018         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1019         cl_env_put(env, &refcheck);
1020         RETURN(result);
1021 }
1022
1023 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1024                              loff_t *ppos)
1025 {
1026         struct lu_env    *env;
1027         struct iovec     *local_iov;
1028         ssize_t           result;
1029         int               refcheck;
1030         ENTRY;
1031
1032         env = cl_env_get(&refcheck);
1033         if (IS_ERR(env))
1034                 RETURN(PTR_ERR(env));
1035
1036         local_iov = &vvp_env_info(env)->vti_local_iov;
1037         local_iov->iov_base = (void __user *)buf;
1038         local_iov->iov_len = count;
1039
1040         result = ll_file_writev(file, local_iov, 1, ppos);
1041         cl_env_put(env, &refcheck);
1042         RETURN(result);
1043 }
1044
1045 #else /* AIO stuff */
1046 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1047                                  unsigned long nr_segs, loff_t pos)
1048 {
1049         struct lu_env      *env;
1050         struct ccc_io_args *args;
1051         size_t              count;
1052         ssize_t             result;
1053         int                 refcheck;
1054         ENTRY;
1055
1056         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1057         if (result)
1058                 RETURN(result);
1059
1060         env = cl_env_get(&refcheck);
1061         if (IS_ERR(env))
1062                 RETURN(PTR_ERR(env));
1063
1064         args = &vvp_env_info(env)->vti_args;
1065         args->cia_iov = (struct iovec *)iov;
1066         args->cia_nrsegs = nr_segs;
1067         args->cia_iocb = iocb;
1068         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1069                                   &iocb->ki_pos, count);
1070         cl_env_put(env, &refcheck);
1071         RETURN(result);
1072 }
1073
1074 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1075                              loff_t *ppos)
1076 {
1077         struct lu_env *env;
1078         struct iovec  *local_iov;
1079         struct kiocb  *kiocb;
1080         ssize_t        result;
1081         int            refcheck;
1082         ENTRY;
1083
1084         env = cl_env_get(&refcheck);
1085         if (IS_ERR(env))
1086                 RETURN(PTR_ERR(env));
1087
1088         local_iov = &vvp_env_info(env)->vti_local_iov;
1089         kiocb = &vvp_env_info(env)->vti_kiocb;
1090         local_iov->iov_base = (void __user *)buf;
1091         local_iov->iov_len = count;
1092         init_sync_kiocb(kiocb, file);
1093         kiocb->ki_pos = *ppos;
1094         kiocb->ki_left = count;
1095
1096         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1097         *ppos = kiocb->ki_pos;
1098
1099         cl_env_put(env, &refcheck);
1100         RETURN(result);
1101 }
1102 #endif
1103
1104
1105 /*
1106  * Send file content (through pagecache) somewhere with helper
1107  */
1108 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1109                                 read_actor_t actor, void *target)
1110 {
1111         struct lu_env      *env;
1112         struct ccc_io_args *args;
1113         ssize_t             result;
1114         int                 refcheck;
1115         ENTRY;
1116
1117         env = cl_env_get(&refcheck);
1118         if (IS_ERR(env))
1119                 RETURN(PTR_ERR(env));
1120
1121         args = &vvp_env_info(env)->vti_args;
1122         args->cia_is_sendfile = 1;
1123         args->cia_target = target;
1124         args->cia_actor = actor;
1125         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1126         cl_env_put(env, &refcheck);
1127         RETURN(result);
1128 }
1129
1130 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1131                                unsigned long arg)
1132 {
1133         struct obd_export *exp = ll_i2dtexp(inode);
1134         struct ll_recreate_obj ucreatp;
1135         struct obd_trans_info oti = { 0 };
1136         struct obdo *oa = NULL;
1137         int lsm_size;
1138         int rc = 0;
1139         struct lov_stripe_md *lsm, *lsm2;
1140         ENTRY;
1141
1142         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1143                 RETURN(-EPERM);
1144
1145         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1146                            sizeof(struct ll_recreate_obj)))
1147                 RETURN(-EFAULT);
1148
1149         OBDO_ALLOC(oa);
1150         if (oa == NULL)
1151                 RETURN(-ENOMEM);
1152
1153         ll_inode_size_lock(inode, 0);
1154         lsm = ll_i2info(inode)->lli_smd;
1155         if (lsm == NULL)
1156                 GOTO(out, rc = -ENOENT);
1157         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1158                    (lsm->lsm_stripe_count));
1159
1160         OBD_ALLOC(lsm2, lsm_size);
1161         if (lsm2 == NULL)
1162                 GOTO(out, rc = -ENOMEM);
1163
1164         oa->o_id = ucreatp.lrc_id;
1165         oa->o_gr = ucreatp.lrc_group;
1166         oa->o_nlink = ucreatp.lrc_ost_idx;
1167         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1168         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1169         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1170                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1171
1172         memcpy(lsm2, lsm, lsm_size);
1173         rc = obd_create(exp, oa, &lsm2, &oti);
1174
1175         OBD_FREE(lsm2, lsm_size);
1176         GOTO(out, rc);
1177 out:
1178         ll_inode_size_unlock(inode, 0);
1179         OBDO_FREE(oa);
1180         return rc;
1181 }
1182
1183 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1184                              int flags, struct lov_user_md *lum, int lum_size)
1185 {
1186         struct lov_stripe_md *lsm;
1187         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1188         int rc = 0;
1189         ENTRY;
1190
1191         ll_inode_size_lock(inode, 0);
1192         lsm = ll_i2info(inode)->lli_smd;
1193         if (lsm) {
1194                 ll_inode_size_unlock(inode, 0);
1195                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1196                        inode->i_ino);
1197                 RETURN(-EEXIST);
1198         }
1199
1200         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1201         if (rc)
1202                 GOTO(out, rc);
1203         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1204                 GOTO(out_req_free, rc = -ENOENT);
1205         rc = oit.d.lustre.it_status;
1206         if (rc < 0)
1207                 GOTO(out_req_free, rc);
1208
1209         ll_release_openhandle(file->f_dentry, &oit);
1210
1211  out:
1212         ll_inode_size_unlock(inode, 0);
1213         ll_intent_release(&oit);
1214         RETURN(rc);
1215 out_req_free:
1216         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1217         goto out;
1218 }
1219
1220 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1221                              struct lov_mds_md **lmmp, int *lmm_size,
1222                              struct ptlrpc_request **request)
1223 {
1224         struct ll_sb_info *sbi = ll_i2sbi(inode);
1225         struct mdt_body  *body;
1226         struct lov_mds_md *lmm = NULL;
1227         struct ptlrpc_request *req = NULL;
1228         struct obd_capa *oc;
1229         int rc, lmmsize;
1230
1231         rc = ll_get_max_mdsize(sbi, &lmmsize);
1232         if (rc)
1233                 RETURN(rc);
1234
1235         oc = ll_mdscapa_get(inode);
1236         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1237                              oc, filename, strlen(filename) + 1,
1238                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1239                              ll_i2suppgid(inode), &req);
1240         capa_put(oc);
1241         if (rc < 0) {
1242                 CDEBUG(D_INFO, "md_getattr_name failed "
1243                        "on %s: rc %d\n", filename, rc);
1244                 GOTO(out, rc);
1245         }
1246
1247         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1248         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1249
1250         lmmsize = body->eadatasize;
1251
1252         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1253                         lmmsize == 0) {
1254                 GOTO(out, rc = -ENODATA);
1255         }
1256
1257         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1258         LASSERT(lmm != NULL);
1259
1260         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1261             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1262             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1263                 GOTO(out, rc = -EPROTO);
1264         }
1265
1266         /*
1267          * This is coming from the MDS, so is probably in
1268          * little endian.  We convert it to host endian before
1269          * passing it to userspace.
1270          */
1271         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1272                 /* if function called for directory - we should
1273                  * avoid swab not existent lsm objects */
1274                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1275                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1276                         if (S_ISREG(body->mode))
1277                                 lustre_swab_lov_user_md_objects(
1278                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1279                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1280                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1281                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1282                         if (S_ISREG(body->mode))
1283                                 lustre_swab_lov_user_md_objects(
1284                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1285                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1286                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1287                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1288                 }
1289         }
1290
1291         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1292                 struct lov_stripe_md *lsm;
1293                 struct lov_user_md_join *lmj;
1294                 int lmj_size, i, aindex = 0;
1295
1296                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1297                 if (rc < 0)
1298                         GOTO(out, rc = -ENOMEM);
1299                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1300                 if (rc)
1301                         GOTO(out_free_memmd, rc);
1302
1303                 lmj_size = sizeof(struct lov_user_md_join) +
1304                            lsm->lsm_stripe_count *
1305                            sizeof(struct lov_user_ost_data_join);
1306                 OBD_ALLOC(lmj, lmj_size);
1307                 if (!lmj)
1308                         GOTO(out_free_memmd, rc = -ENOMEM);
1309
1310                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1311                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1312                         struct lov_extent *lex =
1313                                 &lsm->lsm_array->lai_ext_array[aindex];
1314
1315                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1316                                 aindex ++;
1317                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1318                                         LPU64" len %d\n", aindex, i,
1319                                         lex->le_start, (int)lex->le_len);
1320                         lmj->lmm_objects[i].l_extent_start =
1321                                 lex->le_start;
1322
1323                         if ((int)lex->le_len == -1)
1324                                 lmj->lmm_objects[i].l_extent_end = -1;
1325                         else
1326                                 lmj->lmm_objects[i].l_extent_end =
1327                                         lex->le_start + lex->le_len;
1328                         lmj->lmm_objects[i].l_object_id =
1329                                 lsm->lsm_oinfo[i]->loi_id;
1330                         lmj->lmm_objects[i].l_object_gr =
1331                                 lsm->lsm_oinfo[i]->loi_gr;
1332                         lmj->lmm_objects[i].l_ost_gen =
1333                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1334                         lmj->lmm_objects[i].l_ost_idx =
1335                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1336                 }
1337                 lmm = (struct lov_mds_md *)lmj;
1338                 lmmsize = lmj_size;
1339 out_free_memmd:
1340                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1341         }
1342 out:
1343         *lmmp = lmm;
1344         *lmm_size = lmmsize;
1345         *request = req;
1346         return rc;
1347 }
1348
1349 static int ll_lov_setea(struct inode *inode, struct file *file,
1350                             unsigned long arg)
1351 {
1352         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1353         struct lov_user_md  *lump;
1354         int lum_size = sizeof(struct lov_user_md) +
1355                        sizeof(struct lov_user_ost_data);
1356         int rc;
1357         ENTRY;
1358
1359         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1360                 RETURN(-EPERM);
1361
1362         OBD_ALLOC(lump, lum_size);
1363         if (lump == NULL) {
1364                 RETURN(-ENOMEM);
1365         }
1366         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1367                 OBD_FREE(lump, lum_size);
1368                 RETURN(-EFAULT);
1369         }
1370
1371         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1372
1373         OBD_FREE(lump, lum_size);
1374         RETURN(rc);
1375 }
1376
1377 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1378                             unsigned long arg)
1379 {
1380         struct lov_user_md_v3 lumv3;
1381         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1382         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1383         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1384         int lum_size;
1385         int rc;
1386         int flags = FMODE_WRITE;
1387         ENTRY;
1388
1389         /* first try with v1 which is smaller than v3 */
1390         lum_size = sizeof(struct lov_user_md_v1);
1391         if (copy_from_user(lumv1, lumv1p, lum_size))
1392                 RETURN(-EFAULT);
1393
1394         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1395                 lum_size = sizeof(struct lov_user_md_v3);
1396                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1397                         RETURN(-EFAULT);
1398         }
1399
1400         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1401         if (rc == 0) {
1402                  put_user(0, &lumv1p->lmm_stripe_count);
1403                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1404                                     0, ll_i2info(inode)->lli_smd,
1405                                     (void *)arg);
1406         }
1407         RETURN(rc);
1408 }
1409
1410 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1411 {
1412         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1413
1414         if (!lsm)
1415                 RETURN(-ENODATA);
1416
1417         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1418                             (void *)arg);
1419 }
1420
1421 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1422 {
1423         struct ll_inode_info   *lli = ll_i2info(inode);
1424         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1425         struct ccc_grouplock    grouplock;
1426         int                     rc;
1427         ENTRY;
1428
1429         spin_lock(&lli->lli_lock);
1430         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1431                 CERROR("group lock already existed with gid %lu\n",
1432                        fd->fd_grouplock.cg_gid);
1433                 spin_unlock(&lli->lli_lock);
1434                 RETURN(-EINVAL);
1435         }
1436         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1437         spin_unlock(&lli->lli_lock);
1438
1439         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1440                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1441         if (rc)
1442                 RETURN(rc);
1443
1444         spin_lock(&lli->lli_lock);
1445         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1446                 spin_unlock(&lli->lli_lock);
1447                 CERROR("another thread just won the race\n");
1448                 cl_put_grouplock(&grouplock);
1449                 RETURN(-EINVAL);
1450         }
1451
1452         fd->fd_flags |= (LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1453         fd->fd_grouplock = grouplock;
1454         spin_unlock(&lli->lli_lock);
1455
1456         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1457         RETURN(0);
1458 }
1459
1460 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1461 {
1462         struct ll_inode_info   *lli = ll_i2info(inode);
1463         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1464         struct ccc_grouplock    grouplock;
1465         ENTRY;
1466
1467         spin_lock(&lli->lli_lock);
1468         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1469                 spin_unlock(&lli->lli_lock);
1470                 CERROR("no group lock held\n");
1471                 RETURN(-EINVAL);
1472         }
1473         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1474
1475         if (fd->fd_grouplock.cg_gid != arg) {
1476                 CERROR("group lock %lu doesn't match current id %lu\n",
1477                        arg, fd->fd_grouplock.cg_gid);
1478                 spin_unlock(&lli->lli_lock);
1479                 RETURN(-EINVAL);
1480         }
1481
1482         grouplock = fd->fd_grouplock;
1483         fd->fd_grouplock.cg_env = NULL;
1484         fd->fd_grouplock.cg_lock = NULL;
1485         fd->fd_grouplock.cg_gid = 0;
1486         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1487         spin_unlock(&lli->lli_lock);
1488
1489         cl_put_grouplock(&grouplock);
1490         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1491         RETURN(0);
1492 }
1493
1494 #if LUSTRE_FIX >= 50
1495 static int join_sanity_check(struct inode *head, struct inode *tail)
1496 {
1497         ENTRY;
1498         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1499                 CERROR("server do not support join \n");
1500                 RETURN(-EINVAL);
1501         }
1502         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1503                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1504                        head->i_ino, tail->i_ino);
1505                 RETURN(-EINVAL);
1506         }
1507         if (head->i_ino == tail->i_ino) {
1508                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1509                 RETURN(-EINVAL);
1510         }
1511         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1512                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1513                 RETURN(-EINVAL);
1514         }
1515         RETURN(0);
1516 }
1517
1518 static int join_file(struct inode *head_inode, struct file *head_filp,
1519                      struct file *tail_filp)
1520 {
1521         struct dentry *tail_dentry = tail_filp->f_dentry;
1522         struct lookup_intent oit = {.it_op = IT_OPEN,
1523                                     .it_flags = head_filp->f_flags,
1524                                     .it_create_mode = M_JOIN_FILE};
1525         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1526                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1527
1528         struct lustre_handle lockh;
1529         struct md_op_data *op_data;
1530         int    rc;
1531         loff_t data;
1532         ENTRY;
1533
1534         tail_dentry = tail_filp->f_dentry;
1535
1536         data = i_size_read(head_inode);
1537         op_data = ll_prep_md_op_data(NULL, head_inode,
1538                                      tail_dentry->d_parent->d_inode,
1539                                      tail_dentry->d_name.name,
1540                                      tail_dentry->d_name.len, 0,
1541                                      LUSTRE_OPC_ANY, &data);
1542         if (IS_ERR(op_data))
1543                 RETURN(PTR_ERR(op_data));
1544
1545         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1546                          op_data, &lockh, NULL, 0, NULL, 0);
1547
1548         ll_finish_md_op_data(op_data);
1549         if (rc < 0)
1550                 GOTO(out, rc);
1551
1552         rc = oit.d.lustre.it_status;
1553
1554         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1555                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1556                 ptlrpc_req_finished((struct ptlrpc_request *)
1557                                     oit.d.lustre.it_data);
1558                 GOTO(out, rc);
1559         }
1560
1561         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1562                                            * away */
1563                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1564                 oit.d.lustre.it_lock_mode = 0;
1565         }
1566         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1567         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1568         ll_release_openhandle(head_filp->f_dentry, &oit);
1569 out:
1570         ll_intent_release(&oit);
1571         RETURN(rc);
1572 }
1573
1574 static int ll_file_join(struct inode *head, struct file *filp,
1575                         char *filename_tail)
1576 {
1577         struct inode *tail = NULL, *first = NULL, *second = NULL;
1578         struct dentry *tail_dentry;
1579         struct file *tail_filp, *first_filp, *second_filp;
1580         struct ll_lock_tree first_tree, second_tree;
1581         struct ll_lock_tree_node *first_node, *second_node;
1582         struct ll_inode_info *hlli = ll_i2info(head);
1583         int rc = 0, cleanup_phase = 0;
1584         ENTRY;
1585
1586         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1587                head->i_ino, head->i_generation, head, filename_tail);
1588
1589         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1590         if (IS_ERR(tail_filp)) {
1591                 CERROR("Can not open tail file %s", filename_tail);
1592                 rc = PTR_ERR(tail_filp);
1593                 GOTO(cleanup, rc);
1594         }
1595         tail = igrab(tail_filp->f_dentry->d_inode);
1596
1597         tail_dentry = tail_filp->f_dentry;
1598         LASSERT(tail_dentry);
1599         cleanup_phase = 1;
1600
1601         /*reorder the inode for lock sequence*/
1602         first = head->i_ino > tail->i_ino ? head : tail;
1603         second = head->i_ino > tail->i_ino ? tail : head;
1604         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1605         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1606
1607         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1608                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1609         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1610         if (IS_ERR(first_node)){
1611                 rc = PTR_ERR(first_node);
1612                 GOTO(cleanup, rc);
1613         }
1614         first_tree.lt_fd = first_filp->private_data;
1615         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1616         if (rc != 0)
1617                 GOTO(cleanup, rc);
1618         cleanup_phase = 2;
1619
1620         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1621         if (IS_ERR(second_node)){
1622                 rc = PTR_ERR(second_node);
1623                 GOTO(cleanup, rc);
1624         }
1625         second_tree.lt_fd = second_filp->private_data;
1626         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1627         if (rc != 0)
1628                 GOTO(cleanup, rc);
1629         cleanup_phase = 3;
1630
1631         rc = join_sanity_check(head, tail);
1632         if (rc)
1633                 GOTO(cleanup, rc);
1634
1635         rc = join_file(head, filp, tail_filp);
1636         if (rc)
1637                 GOTO(cleanup, rc);
1638 cleanup:
1639         switch (cleanup_phase) {
1640         case 3:
1641                 ll_tree_unlock(&second_tree);
1642                 obd_cancel_unused(ll_i2dtexp(second),
1643                                   ll_i2info(second)->lli_smd, 0, NULL);
1644         case 2:
1645                 ll_tree_unlock(&first_tree);
1646                 obd_cancel_unused(ll_i2dtexp(first),
1647                                   ll_i2info(first)->lli_smd, 0, NULL);
1648         case 1:
1649                 filp_close(tail_filp, 0);
1650                 if (tail)
1651                         iput(tail);
1652                 if (head && rc == 0) {
1653                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1654                                        &hlli->lli_smd);
1655                         hlli->lli_smd = NULL;
1656                 }
1657         case 0:
1658                 break;
1659         default:
1660                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1661                 LBUG();
1662         }
1663         RETURN(rc);
1664 }
1665 #endif /* LUSTRE_FIX >= 50 */
1666
1667 /**
1668  * Close inode open handle
1669  *
1670  * \param dentry [in]     dentry which contains the inode
1671  * \param it     [in,out] intent which contains open info and result
1672  *
1673  * \retval 0     success
1674  * \retval <0    failure
1675  */
1676 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1677 {
1678         struct inode *inode = dentry->d_inode;
1679         struct obd_client_handle *och;
1680         int rc;
1681         ENTRY;
1682
1683         LASSERT(inode);
1684
1685         /* Root ? Do nothing. */
1686         if (dentry->d_inode->i_sb->s_root == dentry)
1687                 RETURN(0);
1688
1689         /* No open handle to close? Move away */
1690         if (!it_disposition(it, DISP_OPEN_OPEN))
1691                 RETURN(0);
1692
1693         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1694
1695         OBD_ALLOC(och, sizeof(*och));
1696         if (!och)
1697                 GOTO(out, rc = -ENOMEM);
1698
1699         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1700                     ll_i2info(inode), it, och);
1701
1702         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1703                                        inode, och);
1704  out:
1705         /* this one is in place of ll_file_open */
1706         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1707                 ptlrpc_req_finished(it->d.lustre.it_data);
1708         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1709         RETURN(rc);
1710 }
1711
1712 /**
1713  * Get size for inode for which FIEMAP mapping is requested.
1714  * Make the FIEMAP get_info call and returns the result.
1715  */
1716 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1717               int num_bytes)
1718 {
1719         struct obd_export *exp = ll_i2dtexp(inode);
1720         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1721         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1722         int vallen = num_bytes;
1723         int rc;
1724         ENTRY;
1725
1726         /* If the stripe_count > 1 and the application does not understand
1727          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1728          */
1729         if (lsm->lsm_stripe_count > 1 &&
1730             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1731                 return -EOPNOTSUPP;
1732
1733         fm_key.oa.o_id = lsm->lsm_object_id;
1734         fm_key.oa.o_gr = lsm->lsm_object_gr;
1735         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1736
1737         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1738                         OBD_MD_FLSIZE);
1739
1740         /* If filesize is 0, then there would be no objects for mapping */
1741         if (fm_key.oa.o_size == 0) {
1742                 fiemap->fm_mapped_extents = 0;
1743                 RETURN(0);
1744         }
1745
1746         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1747
1748         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1749         if (rc)
1750                 CERROR("obd_get_info failed: rc = %d\n", rc);
1751
1752         RETURN(rc);
1753 }
1754
1755 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1756                   unsigned long arg)
1757 {
1758         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1759         int flags;
1760         ENTRY;
1761
1762         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1763                inode->i_generation, inode, cmd);
1764         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1765
1766         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1767         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1768                 RETURN(-ENOTTY);
1769
1770         switch(cmd) {
1771         case LL_IOC_GETFLAGS:
1772                 /* Get the current value of the file flags */
1773                 return put_user(fd->fd_flags, (int *)arg);
1774         case LL_IOC_SETFLAGS:
1775         case LL_IOC_CLRFLAGS:
1776                 /* Set or clear specific file flags */
1777                 /* XXX This probably needs checks to ensure the flags are
1778                  *     not abused, and to handle any flag side effects.
1779                  */
1780                 if (get_user(flags, (int *) arg))
1781                         RETURN(-EFAULT);
1782
1783                 if (cmd == LL_IOC_SETFLAGS) {
1784                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1785                             !(file->f_flags & O_DIRECT)) {
1786                                 CERROR("%s: unable to disable locking on "
1787                                        "non-O_DIRECT file\n", current->comm);
1788                                 RETURN(-EINVAL);
1789                         }
1790
1791                         fd->fd_flags |= flags;
1792                 } else {
1793                         fd->fd_flags &= ~flags;
1794                 }
1795                 RETURN(0);
1796         case LL_IOC_LOV_SETSTRIPE:
1797                 RETURN(ll_lov_setstripe(inode, file, arg));
1798         case LL_IOC_LOV_SETEA:
1799                 RETURN(ll_lov_setea(inode, file, arg));
1800         case LL_IOC_LOV_GETSTRIPE:
1801                 RETURN(ll_lov_getstripe(inode, arg));
1802         case LL_IOC_RECREATE_OBJ:
1803                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1804         case EXT3_IOC_FIEMAP: {
1805                 struct ll_user_fiemap *fiemap_s;
1806                 size_t num_bytes, ret_bytes;
1807                 unsigned int extent_count;
1808                 int rc = 0;
1809
1810                 /* Get the extent count so we can calculate the size of
1811                  * required fiemap buffer */
1812                 if (get_user(extent_count,
1813                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1814                         RETURN(-EFAULT);
1815                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1816                                                  sizeof(struct ll_fiemap_extent));
1817                 OBD_VMALLOC(fiemap_s, num_bytes);
1818                 if (fiemap_s == NULL)
1819                         RETURN(-ENOMEM);
1820
1821                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1822                                    sizeof(*fiemap_s)))
1823                         GOTO(error, rc = -EFAULT);
1824
1825                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1826                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1827                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1828                         if (copy_to_user((char *)arg, fiemap_s,
1829                                          sizeof(*fiemap_s)))
1830                                 GOTO(error, rc = -EFAULT);
1831
1832                         GOTO(error, rc = -EBADR);
1833                 }
1834
1835                 /* If fm_extent_count is non-zero, read the first extent since
1836                  * it is used to calculate end_offset and device from previous
1837                  * fiemap call. */
1838                 if (extent_count) {
1839                         if (copy_from_user(&fiemap_s->fm_extents[0],
1840                             (char __user *)arg + sizeof(*fiemap_s),
1841                             sizeof(struct ll_fiemap_extent)))
1842                                 GOTO(error, rc = -EFAULT);
1843                 }
1844
1845                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1846                         int rc;
1847
1848                         rc = filemap_fdatawrite(inode->i_mapping);
1849                         if (rc)
1850                                 GOTO(error, rc);
1851                 }
1852
1853                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1854                 if (rc)
1855                         GOTO(error, rc);
1856
1857                 ret_bytes = sizeof(struct ll_user_fiemap);
1858
1859                 if (extent_count != 0)
1860                         ret_bytes += (fiemap_s->fm_mapped_extents *
1861                                          sizeof(struct ll_fiemap_extent));
1862
1863                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1864                         rc = -EFAULT;
1865
1866 error:
1867                 OBD_VFREE(fiemap_s, num_bytes);
1868                 RETURN(rc);
1869         }
1870         case EXT3_IOC_GETFLAGS:
1871         case EXT3_IOC_SETFLAGS:
1872                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1873         case EXT3_IOC_GETVERSION_OLD:
1874         case EXT3_IOC_GETVERSION:
1875                 RETURN(put_user(inode->i_generation, (int *)arg));
1876         case LL_IOC_JOIN: {
1877 #if LUSTRE_FIX >= 50
1878                 /* Allow file join in beta builds to allow debuggging */
1879                 char *ftail;
1880                 int rc;
1881
1882                 ftail = getname((const char *)arg);
1883                 if (IS_ERR(ftail))
1884                         RETURN(PTR_ERR(ftail));
1885                 rc = ll_file_join(inode, file, ftail);
1886                 putname(ftail);
1887                 RETURN(rc);
1888 #else
1889                 CWARN("file join is not supported in this version of Lustre\n");
1890                 RETURN(-ENOTTY);
1891 #endif
1892         }
1893         case LL_IOC_GROUP_LOCK:
1894                 RETURN(ll_get_grouplock(inode, file, arg));
1895         case LL_IOC_GROUP_UNLOCK:
1896                 RETURN(ll_put_grouplock(inode, file, arg));
1897         case IOC_OBD_STATFS:
1898                 RETURN(ll_obd_statfs(inode, (void *)arg));
1899
1900         /* We need to special case any other ioctls we want to handle,
1901          * to send them to the MDS/OST as appropriate and to properly
1902          * network encode the arg field.
1903         case EXT3_IOC_SETVERSION_OLD:
1904         case EXT3_IOC_SETVERSION:
1905         */
1906         case LL_IOC_FLUSHCTX:
1907                 RETURN(ll_flush_ctx(inode));
1908         case LL_IOC_PATH2FID: {
1909                 if (copy_to_user((void *)arg, &ll_i2info(inode)->lli_fid,
1910                                  sizeof(struct lu_fid)))
1911                         RETURN(-EFAULT);
1912
1913                 RETURN(0);
1914         }
1915         default: {
1916                 int err;
1917
1918                 if (LLIOC_STOP ==
1919                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1920                         RETURN(err);
1921
1922                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1923                                      (void *)arg));
1924         }
1925         }
1926 }
1927
1928 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1929 {
1930         struct inode *inode = file->f_dentry->d_inode;
1931         loff_t retval;
1932         ENTRY;
1933         retval = offset + ((origin == 2) ? i_size_read(inode) :
1934                            (origin == 1) ? file->f_pos : 0);
1935         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1936                inode->i_ino, inode->i_generation, inode, retval, retval,
1937                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1938         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1939
1940         if (origin == 2) { /* SEEK_END */
1941                 int nonblock = 0, rc;
1942
1943                 if (file->f_flags & O_NONBLOCK)
1944                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1945
1946                 rc = cl_glimpse_size(inode);
1947                 if (rc != 0)
1948                         RETURN(rc);
1949
1950                 ll_inode_size_lock(inode, 0);
1951                 offset += i_size_read(inode);
1952                 ll_inode_size_unlock(inode, 0);
1953         } else if (origin == 1) { /* SEEK_CUR */
1954                 offset += file->f_pos;
1955         }
1956
1957         retval = -EINVAL;
1958         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1959                 if (offset != file->f_pos) {
1960                         file->f_pos = offset;
1961                 }
1962                 retval = offset;
1963         }
1964
1965         RETURN(retval);
1966 }
1967
1968 int ll_fsync(struct file *file, struct dentry *dentry, int data)
1969 {
1970         struct inode *inode = dentry->d_inode;
1971         struct ll_inode_info *lli = ll_i2info(inode);
1972         struct lov_stripe_md *lsm = lli->lli_smd;
1973         struct ptlrpc_request *req;
1974         struct obd_capa *oc;
1975         int rc, err;
1976         ENTRY;
1977         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
1978                inode->i_generation, inode);
1979         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
1980
1981         /* fsync's caller has already called _fdata{sync,write}, we want
1982          * that IO to finish before calling the osc and mdc sync methods */
1983         rc = filemap_fdatawait(inode->i_mapping);
1984
1985         /* catch async errors that were recorded back when async writeback
1986          * failed for pages in this mapping. */
1987         err = lli->lli_async_rc;
1988         lli->lli_async_rc = 0;
1989         if (rc == 0)
1990                 rc = err;
1991         if (lsm) {
1992                 err = lov_test_and_clear_async_rc(lsm);
1993                 if (rc == 0)
1994                         rc = err;
1995         }
1996
1997         oc = ll_mdscapa_get(inode);
1998         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
1999                       &req);
2000         capa_put(oc);
2001         if (!rc)
2002                 rc = err;
2003         if (!err)
2004                 ptlrpc_req_finished(req);
2005
2006         if (data && lsm) {
2007                 struct obdo *oa;
2008
2009                 OBDO_ALLOC(oa);
2010                 if (!oa)
2011                         RETURN(rc ? rc : -ENOMEM);
2012
2013                 oa->o_id = lsm->lsm_object_id;
2014                 oa->o_gr = lsm->lsm_object_gr;
2015                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2016                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2017                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2018                                            OBD_MD_FLGROUP);
2019
2020                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2021                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2022                                0, OBD_OBJECT_EOF, oc);
2023                 capa_put(oc);
2024                 if (!rc)
2025                         rc = err;
2026                 OBDO_FREE(oa);
2027         }
2028
2029         RETURN(rc);
2030 }
2031
2032 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2033 {
2034         struct inode *inode = file->f_dentry->d_inode;
2035         struct ll_sb_info *sbi = ll_i2sbi(inode);
2036         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2037                                            .ei_cb_cp =ldlm_flock_completion_ast,
2038                                            .ei_cbdata = file_lock };
2039         struct md_op_data *op_data;
2040         struct lustre_handle lockh = {0};
2041         ldlm_policy_data_t flock;
2042         int flags = 0;
2043         int rc;
2044         ENTRY;
2045
2046         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2047                inode->i_ino, file_lock);
2048
2049         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2050
2051         if (file_lock->fl_flags & FL_FLOCK) {
2052                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2053                 /* set missing params for flock() calls */
2054                 file_lock->fl_end = OFFSET_MAX;
2055                 file_lock->fl_pid = current->tgid;
2056         }
2057         flock.l_flock.pid = file_lock->fl_pid;
2058         flock.l_flock.start = file_lock->fl_start;
2059         flock.l_flock.end = file_lock->fl_end;
2060
2061         switch (file_lock->fl_type) {
2062         case F_RDLCK:
2063                 einfo.ei_mode = LCK_PR;
2064                 break;
2065         case F_UNLCK:
2066                 /* An unlock request may or may not have any relation to
2067                  * existing locks so we may not be able to pass a lock handle
2068                  * via a normal ldlm_lock_cancel() request. The request may even
2069                  * unlock a byte range in the middle of an existing lock. In
2070                  * order to process an unlock request we need all of the same
2071                  * information that is given with a normal read or write record
2072                  * lock request. To avoid creating another ldlm unlock (cancel)
2073                  * message we'll treat a LCK_NL flock request as an unlock. */
2074                 einfo.ei_mode = LCK_NL;
2075                 break;
2076         case F_WRLCK:
2077                 einfo.ei_mode = LCK_PW;
2078                 break;
2079         default:
2080                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2081                 RETURN (-EINVAL);
2082         }
2083
2084         switch (cmd) {
2085         case F_SETLKW:
2086 #ifdef F_SETLKW64
2087         case F_SETLKW64:
2088 #endif
2089                 flags = 0;
2090                 break;
2091         case F_SETLK:
2092 #ifdef F_SETLK64
2093         case F_SETLK64:
2094 #endif
2095                 flags = LDLM_FL_BLOCK_NOWAIT;
2096                 break;
2097         case F_GETLK:
2098 #ifdef F_GETLK64
2099         case F_GETLK64:
2100 #endif
2101                 flags = LDLM_FL_TEST_LOCK;
2102                 /* Save the old mode so that if the mode in the lock changes we
2103                  * can decrement the appropriate reader or writer refcount. */
2104                 file_lock->fl_type = einfo.ei_mode;
2105                 break;
2106         default:
2107                 CERROR("unknown fcntl lock command: %d\n", cmd);
2108                 RETURN (-EINVAL);
2109         }
2110
2111         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2112                                      LUSTRE_OPC_ANY, NULL);
2113         if (IS_ERR(op_data))
2114                 RETURN(PTR_ERR(op_data));
2115
2116         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2117                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2118                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2119
2120         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2121                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2122
2123         ll_finish_md_op_data(op_data);
2124
2125         if ((file_lock->fl_flags & FL_FLOCK) &&
2126             (rc == 0 || file_lock->fl_type == F_UNLCK))
2127                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2128 #ifdef HAVE_F_OP_FLOCK
2129         if ((file_lock->fl_flags & FL_POSIX) &&
2130             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2131             !(flags & LDLM_FL_TEST_LOCK))
2132                 posix_lock_file_wait(file, file_lock);
2133 #endif
2134
2135         RETURN(rc);
2136 }
2137
2138 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2139 {
2140         ENTRY;
2141
2142         RETURN(-ENOSYS);
2143 }
2144
2145 int ll_have_md_lock(struct inode *inode, __u64 bits)
2146 {
2147         struct lustre_handle lockh;
2148         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2149         struct lu_fid *fid;
2150         int flags;
2151         ENTRY;
2152
2153         if (!inode)
2154                RETURN(0);
2155
2156         fid = &ll_i2info(inode)->lli_fid;
2157         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2158
2159         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2160         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2161                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2162                 RETURN(1);
2163         }
2164         RETURN(0);
2165 }
2166
2167 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2168                             struct lustre_handle *lockh)
2169 {
2170         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2171         struct lu_fid *fid;
2172         ldlm_mode_t rc;
2173         int flags;
2174         ENTRY;
2175
2176         fid = &ll_i2info(inode)->lli_fid;
2177         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2178
2179         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2180         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2181                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2182         RETURN(rc);
2183 }
2184
2185 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2186         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2187                               * and return success */
2188                 inode->i_nlink = 0;
2189                 /* This path cannot be hit for regular files unless in
2190                  * case of obscure races, so no need to to validate
2191                  * size. */
2192                 if (!S_ISREG(inode->i_mode) &&
2193                     !S_ISDIR(inode->i_mode))
2194                         return 0;
2195         }
2196
2197         if (rc) {
2198                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2199                 return -abs(rc);
2200
2201         }
2202
2203         return 0;
2204 }
2205
2206 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2207                              __u64 ibits)
2208 {
2209         struct inode *inode = dentry->d_inode;
2210         struct ptlrpc_request *req = NULL;
2211         struct ll_sb_info *sbi;
2212         struct obd_export *exp;
2213         int rc = 0;
2214         ENTRY;
2215
2216         if (!inode) {
2217                 CERROR("REPORT THIS LINE TO PETER\n");
2218                 RETURN(0);
2219         }
2220         sbi = ll_i2sbi(inode);
2221
2222         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2223                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2224
2225         exp = ll_i2mdexp(inode);
2226
2227         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2228                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2229                 struct md_op_data *op_data;
2230
2231                 /* Call getattr by fid, so do not provide name at all. */
2232                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2233                                              dentry->d_inode, NULL, 0, 0,
2234                                              LUSTRE_OPC_ANY, NULL);
2235                 if (IS_ERR(op_data))
2236                         RETURN(PTR_ERR(op_data));
2237
2238                 oit.it_create_mode |= M_CHECK_STALE;
2239                 rc = md_intent_lock(exp, op_data, NULL, 0,
2240                                     /* we are not interested in name
2241                                        based lookup */
2242                                     &oit, 0, &req,
2243                                     ll_md_blocking_ast, 0);
2244                 ll_finish_md_op_data(op_data);
2245                 oit.it_create_mode &= ~M_CHECK_STALE;
2246                 if (rc < 0) {
2247                         rc = ll_inode_revalidate_fini(inode, rc);
2248                         GOTO (out, rc);
2249                 }
2250
2251                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2252                 if (rc != 0) {
2253                         ll_intent_release(&oit);
2254                         GOTO(out, rc);
2255                 }
2256
2257                 /* Unlinked? Unhash dentry, so it is not picked up later by
2258                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2259                    here to preserve get_cwd functionality on 2.6.
2260                    Bug 10503 */
2261                 if (!dentry->d_inode->i_nlink) {
2262                         spin_lock(&ll_lookup_lock);
2263                         spin_lock(&dcache_lock);
2264                         ll_drop_dentry(dentry);
2265                         spin_unlock(&dcache_lock);
2266                         spin_unlock(&ll_lookup_lock);
2267                 }
2268
2269                 ll_lookup_finish_locks(&oit, dentry);
2270         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2271
2272                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2273                 obd_valid valid = OBD_MD_FLGETATTR;
2274                 struct obd_capa *oc;
2275                 int ealen = 0;
2276
2277                 if (S_ISREG(inode->i_mode)) {
2278                         rc = ll_get_max_mdsize(sbi, &ealen);
2279                         if (rc)
2280                                 RETURN(rc);
2281                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2282                 }
2283                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2284                  * capa for this inode. Because we only keep capas of dirs
2285                  * fresh. */
2286                 oc = ll_mdscapa_get(inode);
2287                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2288                                 ealen, &req);
2289                 capa_put(oc);
2290                 if (rc) {
2291                         rc = ll_inode_revalidate_fini(inode, rc);
2292                         RETURN(rc);
2293                 }
2294
2295                 rc = ll_prep_inode(&inode, req, NULL);
2296         }
2297 out:
2298         ptlrpc_req_finished(req);
2299         return rc;
2300 }
2301
2302 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2303 {
2304         int rc;
2305         ENTRY;
2306
2307         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2308                                                   MDS_INODELOCK_LOOKUP);
2309
2310         /* if object not yet allocated, don't validate size */
2311         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2312                 RETURN(0);
2313
2314         /* cl_glimpse_size will prefer locally cached writes if they extend
2315          * the file */
2316
2317         if (rc == 0)
2318                 rc = cl_glimpse_size(dentry->d_inode);
2319
2320         RETURN(rc);
2321 }
2322
2323 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2324                   struct lookup_intent *it, struct kstat *stat)
2325 {
2326         struct inode *inode = de->d_inode;
2327         int res = 0;
2328
2329         res = ll_inode_revalidate_it(de, it);
2330         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2331
2332         if (res)
2333                 return res;
2334
2335         stat->dev = inode->i_sb->s_dev;
2336         stat->ino = inode->i_ino;
2337         stat->mode = inode->i_mode;
2338         stat->nlink = inode->i_nlink;
2339         stat->uid = inode->i_uid;
2340         stat->gid = inode->i_gid;
2341         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2342         stat->atime = inode->i_atime;
2343         stat->mtime = inode->i_mtime;
2344         stat->ctime = inode->i_ctime;
2345 #ifdef HAVE_INODE_BLKSIZE
2346         stat->blksize = inode->i_blksize;
2347 #else
2348         stat->blksize = 1 << inode->i_blkbits;
2349 #endif
2350
2351         ll_inode_size_lock(inode, 0);
2352         stat->size = i_size_read(inode);
2353         stat->blocks = inode->i_blocks;
2354         ll_inode_size_unlock(inode, 0);
2355
2356         return 0;
2357 }
2358 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2359 {
2360         struct lookup_intent it = { .it_op = IT_GETATTR };
2361
2362         return ll_getattr_it(mnt, de, &it, stat);
2363 }
2364
2365 static
2366 int lustre_check_acl(struct inode *inode, int mask)
2367 {
2368 #ifdef CONFIG_FS_POSIX_ACL
2369         struct ll_inode_info *lli = ll_i2info(inode);
2370         struct posix_acl *acl;
2371         int rc;
2372         ENTRY;
2373
2374         spin_lock(&lli->lli_lock);
2375         acl = posix_acl_dup(lli->lli_posix_acl);
2376         spin_unlock(&lli->lli_lock);
2377
2378         if (!acl)
2379                 RETURN(-EAGAIN);
2380
2381         rc = posix_acl_permission(inode, acl, mask);
2382         posix_acl_release(acl);
2383
2384         RETURN(rc);
2385 #else
2386         return -EAGAIN;
2387 #endif
2388 }
2389
2390 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2391 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2392 {
2393         int rc = 0;
2394         ENTRY;
2395
2396        /* as root inode are NOT getting validated in lookup operation,
2397         * need to do it before permission check. */
2398
2399         if (inode == inode->i_sb->s_root->d_inode) {
2400                 struct lookup_intent it = { .it_op = IT_GETATTR };
2401
2402                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2403                                               MDS_INODELOCK_LOOKUP);
2404                 if (rc)
2405                         RETURN(rc);
2406         }
2407
2408         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2409                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2410
2411         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2412                 return lustre_check_remote_perm(inode, mask);
2413
2414         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2415         rc = generic_permission(inode, mask, lustre_check_acl);
2416
2417         RETURN(rc);
2418 }
2419 #else
2420 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2421 {
2422         int mode = inode->i_mode;
2423         int rc;
2424
2425         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2426                inode->i_ino, inode->i_generation, inode, mask);
2427
2428         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2429                 return lustre_check_remote_perm(inode, mask);
2430
2431         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2432
2433         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2434             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2435                 return -EROFS;
2436         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2437                 return -EACCES;
2438         if (current->fsuid == inode->i_uid) {
2439                 mode >>= 6;
2440         } else if (1) {
2441                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2442                         goto check_groups;
2443                 rc = lustre_check_acl(inode, mask);
2444                 if (rc == -EAGAIN)
2445                         goto check_groups;
2446                 if (rc == -EACCES)
2447                         goto check_capabilities;
2448                 return rc;
2449         } else {
2450 check_groups:
2451                 if (in_group_p(inode->i_gid))
2452                         mode >>= 3;
2453         }
2454         if ((mode & mask & S_IRWXO) == mask)
2455                 return 0;
2456
2457 check_capabilities:
2458         if (!(mask & MAY_EXEC) ||
2459             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2460                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2461                         return 0;
2462
2463         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2464             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2465                 return 0;
2466
2467         return -EACCES;
2468 }
2469 #endif
2470
/*
 * Names of the file_operations members (READ_METHOD / WRITE_METHOD) and of
 * the functions assigned to them (READ_FUNCTION / WRITE_FUNCTION) in the
 * tables below, selected by HAVE_FILE_READV.
 */
#ifndef HAVE_FILE_READV
# define READ_METHOD    aio_read
# define READ_FUNCTION  ll_file_aio_read
# define WRITE_METHOD   aio_write
# define WRITE_FUNCTION ll_file_aio_write
#else
# define READ_METHOD    readv
# define READ_FUNCTION  ll_file_readv
# define WRITE_METHOD   writev
# define WRITE_FUNCTION ll_file_writev
#endif
2482
2483 /* -o localflock - only provides locally consistent flock locks */
2484 struct file_operations ll_file_operations = {
2485         .read           = ll_file_read,
2486         .READ_METHOD    = READ_FUNCTION,
2487         .write          = ll_file_write,
2488         .WRITE_METHOD   = WRITE_FUNCTION,
2489         .ioctl          = ll_file_ioctl,
2490         .open           = ll_file_open,
2491         .release        = ll_file_release,
2492         .mmap           = ll_file_mmap,
2493         .llseek         = ll_file_seek,
2494         .sendfile       = ll_file_sendfile,
2495         .fsync          = ll_fsync,
2496 };
2497
2498 struct file_operations ll_file_operations_flock = {
2499         .read           = ll_file_read,
2500         .READ_METHOD    = READ_FUNCTION,
2501         .write          = ll_file_write,
2502         .WRITE_METHOD   = WRITE_FUNCTION,
2503         .ioctl          = ll_file_ioctl,
2504         .open           = ll_file_open,
2505         .release        = ll_file_release,
2506         .mmap           = ll_file_mmap,
2507         .llseek         = ll_file_seek,
2508         .sendfile       = ll_file_sendfile,
2509         .fsync          = ll_fsync,
2510 #ifdef HAVE_F_OP_FLOCK
2511         .flock          = ll_file_flock,
2512 #endif
2513         .lock           = ll_file_flock
2514 };
2515
2516 /* These are for -o noflock - to return ENOSYS on flock calls */
2517 struct file_operations ll_file_operations_noflock = {
2518         .read           = ll_file_read,
2519         .READ_METHOD    = READ_FUNCTION,
2520         .write          = ll_file_write,
2521         .WRITE_METHOD   = WRITE_FUNCTION,
2522         .ioctl          = ll_file_ioctl,
2523         .open           = ll_file_open,
2524         .release        = ll_file_release,
2525         .mmap           = ll_file_mmap,
2526         .llseek         = ll_file_seek,
2527         .sendfile       = ll_file_sendfile,
2528         .fsync          = ll_fsync,
2529 #ifdef HAVE_F_OP_FLOCK
2530         .flock          = ll_file_noflock,
2531 #endif
2532         .lock           = ll_file_noflock
2533 };
2534
2535 struct inode_operations ll_file_inode_operations = {
2536 #ifdef HAVE_VFS_INTENT_PATCHES
2537         .setattr_raw    = ll_setattr_raw,
2538 #endif
2539         .setattr        = ll_setattr,
2540         .truncate       = ll_truncate,
2541         .getattr        = ll_getattr,
2542         .permission     = ll_inode_permission,
2543         .setxattr       = ll_setxattr,
2544         .getxattr       = ll_getxattr,
2545         .listxattr      = ll_listxattr,
2546         .removexattr    = ll_removexattr,
2547 };
2548
2549 /* dynamic ioctl number support routins */
2550 static struct llioc_ctl_data {
2551         struct rw_semaphore ioc_sem;
2552         struct list_head    ioc_head;
2553 } llioc = {
2554         __RWSEM_INITIALIZER(llioc.ioc_sem),
2555         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2556 };
2557
2558
2559 struct llioc_data {
2560         struct list_head        iocd_list;
2561         unsigned int            iocd_size;
2562         llioc_callback_t        iocd_cb;
2563         unsigned int            iocd_count;
2564         unsigned int            iocd_cmd[0];
2565 };
2566
2567 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2568 {
2569         unsigned int size;
2570         struct llioc_data *in_data = NULL;
2571         ENTRY;
2572
2573         if (cb == NULL || cmd == NULL ||
2574             count > LLIOC_MAX_CMD || count < 0)
2575                 RETURN(NULL);
2576
2577         size = sizeof(*in_data) + count * sizeof(unsigned int);
2578         OBD_ALLOC(in_data, size);
2579         if (in_data == NULL)
2580                 RETURN(NULL);
2581
2582         memset(in_data, 0, sizeof(*in_data));
2583         in_data->iocd_size = size;
2584         in_data->iocd_cb = cb;
2585         in_data->iocd_count = count;
2586         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2587
2588         down_write(&llioc.ioc_sem);
2589         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2590         up_write(&llioc.ioc_sem);
2591
2592         RETURN(in_data);
2593 }
2594
2595 void ll_iocontrol_unregister(void *magic)
2596 {
2597         struct llioc_data *tmp;
2598
2599         if (magic == NULL)
2600                 return;
2601
2602         down_write(&llioc.ioc_sem);
2603         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2604                 if (tmp == magic) {
2605                         unsigned int size = tmp->iocd_size;
2606
2607                         list_del(&tmp->iocd_list);
2608                         up_write(&llioc.ioc_sem);
2609
2610                         OBD_FREE(tmp, size);
2611                         return;
2612                 }
2613         }
2614         up_write(&llioc.ioc_sem);
2615
2616         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2617 }
2618
2619 EXPORT_SYMBOL(ll_iocontrol_register);
2620 EXPORT_SYMBOL(ll_iocontrol_unregister);
2621
2622 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2623                         unsigned int cmd, unsigned long arg, int *rcp)
2624 {
2625         enum llioc_iter ret = LLIOC_CONT;
2626         struct llioc_data *data;
2627         int rc = -EINVAL, i;
2628
2629         down_read(&llioc.ioc_sem);
2630         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2631                 for (i = 0; i < data->iocd_count; i++) {
2632                         if (cmd != data->iocd_cmd[i])
2633                                 continue;
2634
2635                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2636                         break;
2637                 }
2638
2639                 if (ret == LLIOC_STOP)
2640                         break;
2641         }
2642         up_read(&llioc.ioc_sem);
2643
2644         if (rcp)
2645                 *rcp = rc;
2646         return ret;
2647 }