Whamcloud - gitweb
b=19389
[fs/lustre-release.git] / lustre / llite / file.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * GPL HEADER START
5  *
6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 only,
10  * as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License version 2 for more details (a copy is included
16  * in the LICENSE file that accompanied this code).
17  *
18  * You should have received a copy of the GNU General Public License
19  * version 2 along with this program; If not, see
20  * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
21  *
22  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23  * CA 95054 USA or visit www.sun.com if you need additional information or
24  * have any questions.
25  *
26  * GPL HEADER END
27  */
28 /*
29  * Copyright  2008 Sun Microsystems, Inc. All rights reserved
30  * Use is subject to license terms.
31  */
32 /*
33  * This file is part of Lustre, http://www.lustre.org/
34  * Lustre is a trademark of Sun Microsystems, Inc.
35  *
36  * lustre/llite/file.c
37  *
38  * Author: Peter Braam <braam@clusterfs.com>
39  * Author: Phil Schwan <phil@clusterfs.com>
40  * Author: Andreas Dilger <adilger@clusterfs.com>
41  */
42
43 #define DEBUG_SUBSYSTEM S_LLITE
44 #include <lustre_dlm.h>
45 #include <lustre_lite.h>
46 #include <lustre_mdc.h>
47 #include <linux/pagemap.h>
48 #include <linux/file.h>
49 #include "llite_internal.h"
50 #include <lustre/ll_fiemap.h>
51
52 #include "cl_object.h"
53
54 struct ll_file_data *ll_file_data_get(void)
55 {
56         struct ll_file_data *fd;
57
58         OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, CFS_ALLOC_IO);
59         return fd;
60 }
61
62 static void ll_file_data_put(struct ll_file_data *fd)
63 {
64         if (fd != NULL)
65                 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
66 }
67
68 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
69                           struct lustre_handle *fh)
70 {
71         op_data->op_fid1 = ll_i2info(inode)->lli_fid;
72         op_data->op_attr.ia_mode = inode->i_mode;
73         op_data->op_attr.ia_atime = inode->i_atime;
74         op_data->op_attr.ia_mtime = inode->i_mtime;
75         op_data->op_attr.ia_ctime = inode->i_ctime;
76         op_data->op_attr.ia_size = i_size_read(inode);
77         op_data->op_attr_blocks = inode->i_blocks;
78         ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
79         op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
80         memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
81         op_data->op_capa1 = ll_mdscapa_get(inode);
82 }
83
84 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
85                              struct obd_client_handle *och)
86 {
87         ENTRY;
88
89         op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
90                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
91
92         if (!(och->och_flags & FMODE_WRITE))
93                 goto out;
94
95         if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
96             !S_ISREG(inode->i_mode))
97                 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
98         else
99                 ll_epoch_close(inode, op_data, &och, 0);
100
101 out:
102         ll_pack_inode2opdata(inode, op_data, &och->och_fh);
103         EXIT;
104 }
105
106 static int ll_close_inode_openhandle(struct obd_export *md_exp,
107                                      struct inode *inode,
108                                      struct obd_client_handle *och)
109 {
110         struct obd_export *exp = ll_i2mdexp(inode);
111         struct md_op_data *op_data;
112         struct ptlrpc_request *req = NULL;
113         struct obd_device *obd = class_exp2obd(exp);
114         int epoch_close = 1;
115         int rc;
116         ENTRY;
117
118         if (obd == NULL) {
119                 /*
120                  * XXX: in case of LMV, is this correct to access
121                  * ->exp_handle?
122                  */
123                 CERROR("Invalid MDC connection handle "LPX64"\n",
124                        ll_i2mdexp(inode)->exp_handle.h_cookie);
125                 GOTO(out, rc = 0);
126         }
127
128         /*
129          * here we check if this is forced umount. If so this is called on
130          * canceling "open lock" and we do not call md_close() in this case, as
131          * it will not be successful, as import is already deactivated.
132          */
133         if (obd->obd_force)
134                 GOTO(out, rc = 0);
135
136         OBD_ALLOC_PTR(op_data);
137         if (op_data == NULL)
138                 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
139
140         ll_prepare_close(inode, op_data, och);
141         epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
142         rc = md_close(md_exp, op_data, och->och_mod, &req);
143         if (rc == -EAGAIN) {
144                 /* This close must have the epoch closed. */
145                 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
146                 LASSERT(epoch_close);
147                 /* MDS has instructed us to obtain Size-on-MDS attribute from
148                  * OSTs and send setattr to back to MDS. */
149                 rc = ll_sizeonmds_update(inode, &och->och_fh,
150                                          op_data->op_ioepoch);
151                 if (rc) {
152                         CERROR("inode %lu mdc Size-on-MDS update failed: "
153                                "rc = %d\n", inode->i_ino, rc);
154                         rc = 0;
155                 }
156         } else if (rc) {
157                 CERROR("inode %lu mdc close failed: rc = %d\n",
158                        inode->i_ino, rc);
159         }
160         ll_finish_md_op_data(op_data);
161
162         if (rc == 0) {
163                 rc = ll_objects_destroy(req, inode);
164                 if (rc)
165                         CERROR("inode %lu ll_objects destroy: rc = %d\n",
166                                inode->i_ino, rc);
167         }
168
169         EXIT;
170 out:
171
172         if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
173             S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
174                 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
175         } else {
176                 md_clear_open_replay_data(md_exp, och);
177                 /* Free @och if it is not waiting for DONE_WRITING. */
178                 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
179                 OBD_FREE_PTR(och);
180         }
181         if (req) /* This is close request */
182                 ptlrpc_req_finished(req);
183         return rc;
184 }
185
186 int ll_md_real_close(struct inode *inode, int flags)
187 {
188         struct ll_inode_info *lli = ll_i2info(inode);
189         struct obd_client_handle **och_p;
190         struct obd_client_handle *och;
191         __u64 *och_usecount;
192         int rc = 0;
193         ENTRY;
194
195         if (flags & FMODE_WRITE) {
196                 och_p = &lli->lli_mds_write_och;
197                 och_usecount = &lli->lli_open_fd_write_count;
198         } else if (flags & FMODE_EXEC) {
199                 och_p = &lli->lli_mds_exec_och;
200                 och_usecount = &lli->lli_open_fd_exec_count;
201         } else {
202                 LASSERT(flags & FMODE_READ);
203                 och_p = &lli->lli_mds_read_och;
204                 och_usecount = &lli->lli_open_fd_read_count;
205         }
206
207         down(&lli->lli_och_sem);
208         if (*och_usecount) { /* There are still users of this handle, so
209                                 skip freeing it. */
210                 up(&lli->lli_och_sem);
211                 RETURN(0);
212         }
213         och=*och_p;
214         *och_p = NULL;
215         up(&lli->lli_och_sem);
216
217         if (och) { /* There might be a race and somebody have freed this och
218                       already */
219                 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
220                                                inode, och);
221         }
222
223         RETURN(rc);
224 }
225
226 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
227                 struct file *file)
228 {
229         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
230         struct ll_inode_info *lli = ll_i2info(inode);
231         int rc = 0;
232         ENTRY;
233
234         /* clear group lock, if present */
235         if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
236                 ll_put_grouplock(inode, file, fd->fd_grouplock.cg_gid);
237
238         /* Let's see if we have good enough OPEN lock on the file and if
239            we can skip talking to MDS */
240         if (file->f_dentry->d_inode) { /* Can this ever be false? */
241                 int lockmode;
242                 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
243                 struct lustre_handle lockh;
244                 struct inode *inode = file->f_dentry->d_inode;
245                 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
246
247                 down(&lli->lli_och_sem);
248                 if (fd->fd_omode & FMODE_WRITE) {
249                         lockmode = LCK_CW;
250                         LASSERT(lli->lli_open_fd_write_count);
251                         lli->lli_open_fd_write_count--;
252                 } else if (fd->fd_omode & FMODE_EXEC) {
253                         lockmode = LCK_PR;
254                         LASSERT(lli->lli_open_fd_exec_count);
255                         lli->lli_open_fd_exec_count--;
256                 } else {
257                         lockmode = LCK_CR;
258                         LASSERT(lli->lli_open_fd_read_count);
259                         lli->lli_open_fd_read_count--;
260                 }
261                 up(&lli->lli_och_sem);
262
263                 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
264                                    LDLM_IBITS, &policy, lockmode,
265                                    &lockh)) {
266                         rc = ll_md_real_close(file->f_dentry->d_inode,
267                                               fd->fd_omode);
268                 }
269         } else {
270                 CERROR("Releasing a file %p with negative dentry %p. Name %s",
271                        file, file->f_dentry, file->f_dentry->d_name.name);
272         }
273
274         LUSTRE_FPRIVATE(file) = NULL;
275         ll_file_data_put(fd);
276         ll_capa_close(inode);
277
278         RETURN(rc);
279 }
280
281 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
282
283 /* While this returns an error code, fput() the caller does not, so we need
284  * to make every effort to clean up all of our state here.  Also, applications
285  * rarely check close errors and even if an error is returned they will not
286  * re-try the close call.
287  */
288 int ll_file_release(struct inode *inode, struct file *file)
289 {
290         struct ll_file_data *fd;
291         struct ll_sb_info *sbi = ll_i2sbi(inode);
292         struct ll_inode_info *lli = ll_i2info(inode);
293         struct lov_stripe_md *lsm = lli->lli_smd;
294         int rc;
295         ENTRY;
296
297         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
298                inode->i_generation, inode);
299
300 #ifdef CONFIG_FS_POSIX_ACL
301         if (sbi->ll_flags & LL_SBI_RMT_CLIENT &&
302             inode == inode->i_sb->s_root->d_inode) {
303                 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
304
305                 LASSERT(fd != NULL);
306                 if (unlikely(fd->fd_flags & LL_FILE_RMTACL)) {
307                         fd->fd_flags &= ~LL_FILE_RMTACL;
308                         rct_del(&sbi->ll_rct, cfs_curproc_pid());
309                         et_search_free(&sbi->ll_et, cfs_curproc_pid());
310                 }
311         }
312 #endif
313
314         if (inode->i_sb->s_root != file->f_dentry)
315                 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
316         fd = LUSTRE_FPRIVATE(file);
317         LASSERT(fd != NULL);
318
319         /* The last ref on @file, maybe not the the owner pid of statahead.
320          * Different processes can open the same dir, "ll_opendir_key" means:
321          * it is me that should stop the statahead thread. */
322         if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0)
323                 ll_stop_statahead(inode, lli->lli_opendir_key);
324
325         if (inode->i_sb->s_root == file->f_dentry) {
326                 LUSTRE_FPRIVATE(file) = NULL;
327                 ll_file_data_put(fd);
328                 RETURN(0);
329         }
330
331         if (lsm)
332                 lov_test_and_clear_async_rc(lsm);
333         lli->lli_async_rc = 0;
334
335         rc = ll_md_close(sbi->ll_md_exp, inode, file);
336         RETURN(rc);
337 }
338
339 static int ll_intent_file_open(struct file *file, void *lmm,
340                                int lmmsize, struct lookup_intent *itp)
341 {
342         struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
343         struct dentry *parent = file->f_dentry->d_parent;
344         const char *name = file->f_dentry->d_name.name;
345         const int len = file->f_dentry->d_name.len;
346         struct md_op_data *op_data;
347         struct ptlrpc_request *req;
348         int rc;
349         ENTRY;
350
351         if (!parent)
352                 RETURN(-ENOENT);
353
354         /* Usually we come here only for NFSD, and we want open lock.
355            But we can also get here with pre 2.6.15 patchless kernels, and in
356            that case that lock is also ok */
357         /* We can also get here if there was cached open handle in revalidate_it
358          * but it disappeared while we were getting from there to ll_file_open.
359          * But this means this file was closed and immediatelly opened which
360          * makes a good candidate for using OPEN lock */
361         /* If lmmsize & lmm are not 0, we are just setting stripe info
362          * parameters. No need for the open lock */
363         if (!lmm && !lmmsize)
364                 itp->it_flags |= MDS_OPEN_LOCK;
365
366         op_data  = ll_prep_md_op_data(NULL, parent->d_inode,
367                                       file->f_dentry->d_inode, name, len,
368                                       O_RDWR, LUSTRE_OPC_ANY, NULL);
369         if (IS_ERR(op_data))
370                 RETURN(PTR_ERR(op_data));
371
372         rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
373                             0 /*unused */, &req, ll_md_blocking_ast, 0);
374         ll_finish_md_op_data(op_data);
375         if (rc == -ESTALE) {
376                 /* reason for keep own exit path - don`t flood log
377                 * with messages with -ESTALE errors.
378                 */
379                 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
380                      it_open_error(DISP_OPEN_OPEN, itp))
381                         GOTO(out, rc);
382                 ll_release_openhandle(file->f_dentry, itp);
383                 GOTO(out, rc);
384         }
385
386         if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
387                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
388                 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
389                 GOTO(out, rc);
390         }
391
392         if (itp->d.lustre.it_lock_mode)
393                 md_set_lock_data(sbi->ll_md_exp,
394                                  &itp->d.lustre.it_lock_handle,
395                                  file->f_dentry->d_inode);
396
397         rc = ll_prep_inode(&file->f_dentry->d_inode, req, NULL);
398 out:
399         ptlrpc_req_finished(itp->d.lustre.it_data);
400         it_clear_disposition(itp, DISP_ENQ_COMPLETE);
401         ll_intent_drop_lock(itp);
402
403         RETURN(rc);
404 }
405
406 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
407                        struct lookup_intent *it, struct obd_client_handle *och)
408 {
409         struct ptlrpc_request *req = it->d.lustre.it_data;
410         struct mdt_body *body;
411
412         LASSERT(och);
413
414         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
415         LASSERT(body != NULL);                      /* reply already checked out */
416
417         memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
418         och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
419         och->och_fid = lli->lli_fid;
420         och->och_flags = it->it_flags;
421         lli->lli_ioepoch = body->ioepoch;
422
423         return md_set_open_replay_data(md_exp, och, req);
424 }
425
426 int ll_local_open(struct file *file, struct lookup_intent *it,
427                   struct ll_file_data *fd, struct obd_client_handle *och)
428 {
429         struct inode *inode = file->f_dentry->d_inode;
430         struct ll_inode_info *lli = ll_i2info(inode);
431         ENTRY;
432
433         LASSERT(!LUSTRE_FPRIVATE(file));
434
435         LASSERT(fd != NULL);
436
437         if (och) {
438                 struct ptlrpc_request *req = it->d.lustre.it_data;
439                 struct mdt_body *body;
440                 int rc;
441
442                 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
443                 if (rc)
444                         RETURN(rc);
445
446                 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
447                 if ((it->it_flags & FMODE_WRITE) &&
448                     (body->valid & OBD_MD_FLSIZE))
449                         CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
450                                lli->lli_ioepoch, PFID(&lli->lli_fid));
451         }
452
453         LUSTRE_FPRIVATE(file) = fd;
454         ll_readahead_init(inode, &fd->fd_ras);
455         fd->fd_omode = it->it_flags;
456         RETURN(0);
457 }
458
459 /* Open a file, and (for the very first open) create objects on the OSTs at
460  * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
461  * creation or open until ll_lov_setstripe() ioctl is called.  We grab
462  * lli_open_sem to ensure no other process will create objects, send the
463  * stripe MD to the MDS, or try to destroy the objects if that fails.
464  *
465  * If we already have the stripe MD locally then we don't request it in
466  * md_open(), by passing a lmm_size = 0.
467  *
468  * It is up to the application to ensure no other processes open this file
469  * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
470  * used.  We might be able to avoid races of that sort by getting lli_open_sem
471  * before returning in the O_LOV_DELAY_CREATE case and dropping it here
472  * or in ll_file_release(), but I'm not sure that is desirable/necessary.
473  */
474 int ll_file_open(struct inode *inode, struct file *file)
475 {
476         struct ll_inode_info *lli = ll_i2info(inode);
477         struct lookup_intent *it, oit = { .it_op = IT_OPEN,
478                                           .it_flags = file->f_flags };
479         struct lov_stripe_md *lsm;
480         struct ptlrpc_request *req = NULL;
481         struct obd_client_handle **och_p;
482         __u64 *och_usecount;
483         struct ll_file_data *fd;
484         int rc = 0, opendir_set = 0;
485         ENTRY;
486
487         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
488                inode->i_generation, inode, file->f_flags);
489
490 #ifdef HAVE_VFS_INTENT_PATCHES
491         it = file->f_it;
492 #else
493         it = file->private_data; /* XXX: compat macro */
494         file->private_data = NULL; /* prevent ll_local_open assertion */
495 #endif
496
497         fd = ll_file_data_get();
498         if (fd == NULL)
499                 RETURN(-ENOMEM);
500
501         fd->fd_file = file;
502         if (S_ISDIR(inode->i_mode)) {
503 again:
504                 spin_lock(&lli->lli_lock);
505                 if (lli->lli_opendir_key == NULL && lli->lli_opendir_pid == 0) {
506                         LASSERT(lli->lli_sai == NULL);
507                         lli->lli_opendir_key = fd;
508                         lli->lli_opendir_pid = cfs_curproc_pid();
509                         opendir_set = 1;
510                 } else if (unlikely(lli->lli_opendir_pid == cfs_curproc_pid() &&
511                                     lli->lli_opendir_key != NULL)) {
512                         /* Two cases for this:
513                          * (1) The same process open such directory many times.
514                          * (2) The old process opened the directory, and exited
515                          *     before its children processes. Then new process
516                          *     with the same pid opens such directory before the
517                          *     old process's children processes exit.
518                          * reset stat ahead for such cases. */
519                         spin_unlock(&lli->lli_lock);
520                         CDEBUG(D_INFO, "Conflict statahead for %.*s "DFID
521                                " reset it.\n", file->f_dentry->d_name.len,
522                                file->f_dentry->d_name.name,
523                                PFID(&lli->lli_fid));
524                         ll_stop_statahead(inode, lli->lli_opendir_key);
525                         goto again;
526                 }
527                 spin_unlock(&lli->lli_lock);
528         }
529
530         if (inode->i_sb->s_root == file->f_dentry) {
531                 LUSTRE_FPRIVATE(file) = fd;
532                 RETURN(0);
533         }
534
535         if (!it || !it->d.lustre.it_disposition) {
536                 /* Convert f_flags into access mode. We cannot use file->f_mode,
537                  * because everything but O_ACCMODE mask was stripped from
538                  * there */
539                 if ((oit.it_flags + 1) & O_ACCMODE)
540                         oit.it_flags++;
541                 if (file->f_flags & O_TRUNC)
542                         oit.it_flags |= FMODE_WRITE;
543
544                 /* kernel only call f_op->open in dentry_open.  filp_open calls
545                  * dentry_open after call to open_namei that checks permissions.
546                  * Only nfsd_open call dentry_open directly without checking
547                  * permissions and because of that this code below is safe. */
548                 if (oit.it_flags & FMODE_WRITE)
549                         oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
550
551                 /* We do not want O_EXCL here, presumably we opened the file
552                  * already? XXX - NFS implications? */
553                 oit.it_flags &= ~O_EXCL;
554
555                 it = &oit;
556         }
557
558 restart:
559         /* Let's see if we have file open on MDS already. */
560         if (it->it_flags & FMODE_WRITE) {
561                 och_p = &lli->lli_mds_write_och;
562                 och_usecount = &lli->lli_open_fd_write_count;
563         } else if (it->it_flags & FMODE_EXEC) {
564                 och_p = &lli->lli_mds_exec_och;
565                 och_usecount = &lli->lli_open_fd_exec_count;
566          } else {
567                 och_p = &lli->lli_mds_read_och;
568                 och_usecount = &lli->lli_open_fd_read_count;
569         }
570
571         down(&lli->lli_och_sem);
572         if (*och_p) { /* Open handle is present */
573                 if (it_disposition(it, DISP_OPEN_OPEN)) {
574                         /* Well, there's extra open request that we do not need,
575                            let's close it somehow. This will decref request. */
576                         rc = it_open_error(DISP_OPEN_OPEN, it);
577                         if (rc) {
578                                 up(&lli->lli_och_sem);
579                                 ll_file_data_put(fd);
580                                 GOTO(out_openerr, rc);
581                         }
582                         ll_release_openhandle(file->f_dentry, it);
583                         lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
584                                              LPROC_LL_OPEN);
585                 }
586                 (*och_usecount)++;
587
588                 rc = ll_local_open(file, it, fd, NULL);
589                 if (rc) {
590                         (*och_usecount)--;
591                         up(&lli->lli_och_sem);
592                         ll_file_data_put(fd);
593                         GOTO(out_openerr, rc);
594                 }
595         } else {
596                 LASSERT(*och_usecount == 0);
597                 if (!it->d.lustre.it_disposition) {
598                         /* We cannot just request lock handle now, new ELC code
599                            means that one of other OPEN locks for this file
600                            could be cancelled, and since blocking ast handler
601                            would attempt to grab och_sem as well, that would
602                            result in a deadlock */
603                         up(&lli->lli_och_sem);
604                         it->it_create_mode |= M_CHECK_STALE;
605                         rc = ll_intent_file_open(file, NULL, 0, it);
606                         it->it_create_mode &= ~M_CHECK_STALE;
607                         if (rc) {
608                                 ll_file_data_put(fd);
609                                 GOTO(out_openerr, rc);
610                         }
611
612                         /* Got some error? Release the request */
613                         if (it->d.lustre.it_status < 0) {
614                                 req = it->d.lustre.it_data;
615                                 ptlrpc_req_finished(req);
616                         }
617                         md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
618                                          &it->d.lustre.it_lock_handle,
619                                          file->f_dentry->d_inode);
620                         goto restart;
621                 }
622                 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
623                 if (!*och_p) {
624                         ll_file_data_put(fd);
625                         GOTO(out_och_free, rc = -ENOMEM);
626                 }
627                 (*och_usecount)++;
628                 req = it->d.lustre.it_data;
629
630                 /* md_intent_lock() didn't get a request ref if there was an
631                  * open error, so don't do cleanup on the request here
632                  * (bug 3430) */
633                 /* XXX (green): Should not we bail out on any error here, not
634                  * just open error? */
635                 rc = it_open_error(DISP_OPEN_OPEN, it);
636                 if (rc) {
637                         ll_file_data_put(fd);
638                         GOTO(out_och_free, rc);
639                 }
640
641                 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
642                 rc = ll_local_open(file, it, fd, *och_p);
643                 if (rc) {
644                         ll_file_data_put(fd);
645                         GOTO(out_och_free, rc);
646                 }
647         }
648         up(&lli->lli_och_sem);
649
650         /* Must do this outside lli_och_sem lock to prevent deadlock where
651            different kind of OPEN lock for this same inode gets cancelled
652            by ldlm_cancel_lru */
653         if (!S_ISREG(inode->i_mode))
654                 GOTO(out, rc);
655
656         ll_capa_open(inode);
657
658         lsm = lli->lli_smd;
659         if (lsm == NULL) {
660                 if (file->f_flags & O_LOV_DELAY_CREATE ||
661                     !(file->f_mode & FMODE_WRITE)) {
662                         CDEBUG(D_INODE, "object creation was delayed\n");
663                         GOTO(out, rc);
664                 }
665         }
666         file->f_flags &= ~O_LOV_DELAY_CREATE;
667         GOTO(out, rc);
668 out:
669         ptlrpc_req_finished(req);
670         if (req)
671                 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
672 out_och_free:
673         if (rc) {
674                 if (*och_p) {
675                         OBD_FREE(*och_p, sizeof (struct obd_client_handle));
676                         *och_p = NULL; /* OBD_FREE writes some magic there */
677                         (*och_usecount)--;
678                 }
679                 up(&lli->lli_och_sem);
680 out_openerr:
681                 if (opendir_set != 0)
682                         ll_stop_statahead(inode, lli->lli_opendir_key);
683         }
684
685         return rc;
686 }
687
688 /* Fills the obdo with the attributes for the lsm */
689 static int ll_lsm_getattr(struct lov_stripe_md *lsm, struct obd_export *exp,
690                           struct obd_capa *capa, struct obdo *obdo)
691 {
692         struct ptlrpc_request_set *set;
693         struct obd_info            oinfo = { { { 0 } } };
694         int                        rc;
695
696         ENTRY;
697
698         LASSERT(lsm != NULL);
699
700         oinfo.oi_md = lsm;
701         oinfo.oi_oa = obdo;
702         oinfo.oi_oa->o_id = lsm->lsm_object_id;
703         oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
704         oinfo.oi_oa->o_mode = S_IFREG;
705         oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
706                                OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
707                                OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
708                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
709                                OBD_MD_FLGROUP;
710         oinfo.oi_capa = capa;
711
712         set = ptlrpc_prep_set();
713         if (set == NULL) {
714                 CERROR("can't allocate ptlrpc set\n");
715                 rc = -ENOMEM;
716         } else {
717                 rc = obd_getattr_async(exp, &oinfo, set);
718                 if (rc == 0)
719                         rc = ptlrpc_set_wait(set);
720                 ptlrpc_set_destroy(set);
721         }
722         if (rc == 0)
723                 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
724                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
725                                          OBD_MD_FLCTIME | OBD_MD_FLSIZE);
726         RETURN(rc);
727 }
728
729 /* Fills the obdo with the attributes for the inode defined by lsm */
730 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
731 {
732         struct ll_inode_info *lli  = ll_i2info(inode);
733         struct obd_capa      *capa = ll_mdscapa_get(inode);
734         int rc;
735         ENTRY;
736
737         rc = ll_lsm_getattr(lli->lli_smd, ll_i2dtexp(inode), capa, obdo);
738         capa_put(capa);
739         if (rc == 0) {
740                 obdo_refresh_inode(inode, obdo, obdo->o_valid);
741                 CDEBUG(D_INODE,
742                        "objid "LPX64" size %Lu, blocks %llu, blksize %lu\n",
743                        lli->lli_smd->lsm_object_id, i_size_read(inode),
744                        (unsigned long long)inode->i_blocks,
745                        (unsigned long)ll_inode_blksize(inode));
746         }
747         RETURN(rc);
748 }
749
750 int ll_merge_lvb(struct inode *inode)
751 {
752         struct ll_inode_info *lli = ll_i2info(inode);
753         struct ll_sb_info *sbi = ll_i2sbi(inode);
754         struct ost_lvb lvb;
755         int rc;
756
757         ENTRY;
758
759         ll_inode_size_lock(inode, 1);
760         inode_init_lvb(inode, &lvb);
761         rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
762         i_size_write(inode, lvb.lvb_size);
763         inode->i_blocks = lvb.lvb_blocks;
764
765         LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
766         LTIME_S(inode->i_atime) = lvb.lvb_atime;
767         LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
768         ll_inode_size_unlock(inode, 1);
769
770         RETURN(rc);
771 }
772
773 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
774                      lstat_t *st)
775 {
776         struct obdo obdo = { 0 };
777         int rc;
778
779         rc = ll_lsm_getattr(lsm, sbi->ll_dt_exp, NULL, &obdo);
780         if (rc == 0) {
781                 st->st_size   = obdo.o_size;
782                 st->st_blocks = obdo.o_blocks;
783                 st->st_mtime  = obdo.o_mtime;
784                 st->st_atime  = obdo.o_atime;
785                 st->st_ctime  = obdo.o_ctime;
786         }
787         return rc;
788 }
789
790 void ll_io_init(struct cl_io *io, const struct file *file, int write)
791 {
792         struct inode *inode     = file->f_dentry->d_inode;
793         struct ll_sb_info *sbi  = ll_i2sbi(inode);
794         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
795
796         LASSERT(fd != NULL);
797         memset(io, 0, sizeof *io);
798         io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
799         if (write)
800                 io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
801         io->ci_obj     = ll_i2info(inode)->lli_clob;
802         io->ci_lockreq = CILR_MAYBE;
803         if (fd->fd_flags & LL_FILE_IGNORE_LOCK ||
804             sbi->ll_flags & LL_SBI_NOLCK) {
805                 io->ci_lockreq = CILR_NEVER;
806                 io->ci_no_srvlock = 1;
807         } else if (file->f_flags & O_APPEND) {
808                 io->ci_lockreq = CILR_MANDATORY;
809         }
810 }
811
812 static ssize_t ll_file_io_generic(const struct lu_env *env,
813                 struct ccc_io_args *args, struct file *file,
814                 enum cl_io_type iot, loff_t *ppos, size_t count)
815 {
816         struct cl_io       *io;
817         ssize_t             result;
818         ENTRY;
819
820         io = &ccc_env_info(env)->cti_io;
821         ll_io_init(io, file, iot == CIT_WRITE);
822
823         if (iot == CIT_READ)
824                 io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
825
826         if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
827                 struct vvp_io *vio = vvp_env_io(env);
828                 struct ccc_io *cio = ccc_env_io(env);
829                 if (cl_io_is_sendfile(io)) {
830                         vio->u.read.cui_actor = args->cia_actor;
831                         vio->u.read.cui_target = args->cia_target;
832                 } else {
833                         cio->cui_iov = args->cia_iov;
834                         cio->cui_nrsegs = args->cia_nrsegs;
835 #ifndef HAVE_FILE_WRITEV
836                         cio->cui_iocb = args->cia_iocb;
837 #endif
838                 }
839                 cio->cui_fd  = LUSTRE_FPRIVATE(file);
840                 result = cl_io_loop(env, io);
841         } else
842                 /* cl_io_rw_init() handled IO */
843                 result = io->ci_result;
844         if (io->ci_nob > 0) {
845                 result = io->ci_nob;
846                 *ppos = io->u.ci_wr.wr.crw_pos;
847         }
848         cl_io_fini(env, io);
849         RETURN(result);
850 }
851
852
853 /*
854  * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
855  */
856 static int ll_file_get_iov_count(const struct iovec *iov,
857                                  unsigned long *nr_segs, size_t *count)
858 {
859         size_t cnt = 0;
860         unsigned long seg;
861
862         for (seg = 0; seg < *nr_segs; seg++) {
863                 const struct iovec *iv = &iov[seg];
864
865                 /*
866                  * If any segment has a negative length, or the cumulative
867                  * length ever wraps negative then return -EINVAL.
868                  */
869                 cnt += iv->iov_len;
870                 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
871                         return -EINVAL;
872                 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
873                         continue;
874                 if (seg == 0)
875                         return -EFAULT;
876                 *nr_segs = seg;
877                 cnt -= iv->iov_len;   /* This segment is no good */
878                 break;
879         }
880         *count = cnt;
881         return 0;
882 }
883
884 #ifdef HAVE_FILE_READV
885 static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
886                               unsigned long nr_segs, loff_t *ppos)
887 {
888         struct lu_env      *env;
889         struct ccc_io_args *args;
890         size_t              count;
891         ssize_t             result;
892         int                 refcheck;
893         ENTRY;
894
895         result = ll_file_get_iov_count(iov, &nr_segs, &count);
896         if (result)
897                 RETURN(result);
898
899         env = cl_env_get(&refcheck);
900         if (IS_ERR(env))
901                 RETURN(PTR_ERR(env));
902
903         args = &vvp_env_info(env)->vti_args;
904         args->cia_is_sendfile = 0;
905         args->cia_iov = (struct iovec *)iov;
906         args->cia_nrsegs = nr_segs;
907         result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
908         cl_env_put(env, &refcheck);
909         RETURN(result);
910 }
911
912 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
913                             loff_t *ppos)
914 {
915         struct lu_env *env;
916         struct iovec  *local_iov;
917         ssize_t        result;
918         int            refcheck;
919         ENTRY;
920
921         env = cl_env_get(&refcheck);
922         if (IS_ERR(env))
923                 RETURN(PTR_ERR(env));
924
925         local_iov = &vvp_env_info(env)->vti_local_iov;
926         local_iov->iov_base = (void __user *)buf;
927         local_iov->iov_len = count;
928         result = ll_file_readv(file, local_iov, 1, ppos);
929         cl_env_put(env, &refcheck);
930         RETURN(result);
931 }
932
933 #else
934 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
935                                 unsigned long nr_segs, loff_t pos)
936 {
937         struct lu_env      *env;
938         struct ccc_io_args *args;
939         size_t              count;
940         ssize_t             result;
941         int                 refcheck;
942         ENTRY;
943
944         result = ll_file_get_iov_count(iov, &nr_segs, &count);
945         if (result)
946                 RETURN(result);
947
948         env = cl_env_get(&refcheck);
949         if (IS_ERR(env))
950                 RETURN(PTR_ERR(env));
951
952         args = &vvp_env_info(env)->vti_args;
953         args->cia_is_sendfile = 0;
954         args->cia_iov = (struct iovec *)iov;
955         args->cia_nrsegs = nr_segs;
956         args->cia_iocb = iocb;
957         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
958                                     &iocb->ki_pos, count);
959         cl_env_put(env, &refcheck);
960         RETURN(result);
961 }
962
963 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
964                             loff_t *ppos)
965 {
966         struct lu_env *env;
967         struct iovec  *local_iov;
968         struct kiocb  *kiocb;
969         ssize_t        result;
970         int            refcheck;
971         ENTRY;
972
973         env = cl_env_get(&refcheck);
974         if (IS_ERR(env))
975                 RETURN(PTR_ERR(env));
976
977         local_iov = &vvp_env_info(env)->vti_local_iov;
978         kiocb = &vvp_env_info(env)->vti_kiocb;
979         local_iov->iov_base = (void __user *)buf;
980         local_iov->iov_len = count;
981         init_sync_kiocb(kiocb, file);
982         kiocb->ki_pos = *ppos;
983         kiocb->ki_left = count;
984
985         result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
986         *ppos = kiocb->ki_pos;
987
988         cl_env_put(env, &refcheck);
989         RETURN(result);
990 }
991 #endif
992
993 /*
994  * Write to a file (through the page cache).
995  */
996 #ifdef HAVE_FILE_WRITEV
997 static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
998                               unsigned long nr_segs, loff_t *ppos)
999 {
1000         struct lu_env      *env;
1001         struct ccc_io_args *args;
1002         size_t              count;
1003         ssize_t             result;
1004         int                 refcheck;
1005         ENTRY;
1006
1007         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1008         if (result)
1009                 RETURN(result);
1010
1011         env = cl_env_get(&refcheck);
1012         if (IS_ERR(env))
1013                 RETURN(PTR_ERR(env));
1014
1015         args = &vvp_env_info(env)->vti_args;
1016         args->cia_iov = (struct iovec *)iov;
1017         args->cia_nrsegs = nr_segs;
1018         result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
1019         cl_env_put(env, &refcheck);
1020         RETURN(result);
1021 }
1022
1023 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1024                              loff_t *ppos)
1025 {
1026         struct lu_env    *env;
1027         struct iovec     *local_iov;
1028         ssize_t           result;
1029         int               refcheck;
1030         ENTRY;
1031
1032         env = cl_env_get(&refcheck);
1033         if (IS_ERR(env))
1034                 RETURN(PTR_ERR(env));
1035
1036         local_iov = &vvp_env_info(env)->vti_local_iov;
1037         local_iov->iov_base = (void __user *)buf;
1038         local_iov->iov_len = count;
1039
1040         result = ll_file_writev(file, local_iov, 1, ppos);
1041         cl_env_put(env, &refcheck);
1042         RETURN(result);
1043 }
1044
1045 #else /* AIO stuff */
1046 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1047                                  unsigned long nr_segs, loff_t pos)
1048 {
1049         struct lu_env      *env;
1050         struct ccc_io_args *args;
1051         size_t              count;
1052         ssize_t             result;
1053         int                 refcheck;
1054         ENTRY;
1055
1056         result = ll_file_get_iov_count(iov, &nr_segs, &count);
1057         if (result)
1058                 RETURN(result);
1059
1060         env = cl_env_get(&refcheck);
1061         if (IS_ERR(env))
1062                 RETURN(PTR_ERR(env));
1063
1064         args = &vvp_env_info(env)->vti_args;
1065         args->cia_iov = (struct iovec *)iov;
1066         args->cia_nrsegs = nr_segs;
1067         args->cia_iocb = iocb;
1068         result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1069                                   &iocb->ki_pos, count);
1070         cl_env_put(env, &refcheck);
1071         RETURN(result);
1072 }
1073
1074 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1075                              loff_t *ppos)
1076 {
1077         struct lu_env *env;
1078         struct iovec  *local_iov;
1079         struct kiocb  *kiocb;
1080         ssize_t        result;
1081         int            refcheck;
1082         ENTRY;
1083
1084         env = cl_env_get(&refcheck);
1085         if (IS_ERR(env))
1086                 RETURN(PTR_ERR(env));
1087
1088         local_iov = &vvp_env_info(env)->vti_local_iov;
1089         kiocb = &vvp_env_info(env)->vti_kiocb;
1090         local_iov->iov_base = (void __user *)buf;
1091         local_iov->iov_len = count;
1092         init_sync_kiocb(kiocb, file);
1093         kiocb->ki_pos = *ppos;
1094         kiocb->ki_left = count;
1095
1096         result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
1097         *ppos = kiocb->ki_pos;
1098
1099         cl_env_put(env, &refcheck);
1100         RETURN(result);
1101 }
1102 #endif
1103
1104
1105 /*
1106  * Send file content (through pagecache) somewhere with helper
1107  */
1108 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1109                                 read_actor_t actor, void *target)
1110 {
1111         struct lu_env      *env;
1112         struct ccc_io_args *args;
1113         ssize_t             result;
1114         int                 refcheck;
1115         ENTRY;
1116
1117         env = cl_env_get(&refcheck);
1118         if (IS_ERR(env))
1119                 RETURN(PTR_ERR(env));
1120
1121         args = &vvp_env_info(env)->vti_args;
1122         args->cia_is_sendfile = 1;
1123         args->cia_target = target;
1124         args->cia_actor = actor;
1125         result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1126         cl_env_put(env, &refcheck);
1127         RETURN(result);
1128 }
1129
1130 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1131                                unsigned long arg)
1132 {
1133         struct obd_export *exp = ll_i2dtexp(inode);
1134         struct ll_recreate_obj ucreatp;
1135         struct obd_trans_info oti = { 0 };
1136         struct obdo *oa = NULL;
1137         int lsm_size;
1138         int rc = 0;
1139         struct lov_stripe_md *lsm, *lsm2;
1140         ENTRY;
1141
1142         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1143                 RETURN(-EPERM);
1144
1145         if (copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1146                            sizeof(struct ll_recreate_obj)))
1147                 RETURN(-EFAULT);
1148
1149         OBDO_ALLOC(oa);
1150         if (oa == NULL)
1151                 RETURN(-ENOMEM);
1152
1153         ll_inode_size_lock(inode, 0);
1154         lsm = ll_i2info(inode)->lli_smd;
1155         if (lsm == NULL)
1156                 GOTO(out, rc = -ENOENT);
1157         lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1158                    (lsm->lsm_stripe_count));
1159
1160         OBD_ALLOC(lsm2, lsm_size);
1161         if (lsm2 == NULL)
1162                 GOTO(out, rc = -ENOMEM);
1163
1164         oa->o_id = ucreatp.lrc_id;
1165         oa->o_gr = ucreatp.lrc_group;
1166         oa->o_nlink = ucreatp.lrc_ost_idx;
1167         oa->o_flags |= OBD_FL_RECREATE_OBJS;
1168         oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1169         obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1170                         OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1171
1172         memcpy(lsm2, lsm, lsm_size);
1173         rc = obd_create(exp, oa, &lsm2, &oti);
1174
1175         OBD_FREE(lsm2, lsm_size);
1176         GOTO(out, rc);
1177 out:
1178         ll_inode_size_unlock(inode, 0);
1179         OBDO_FREE(oa);
1180         return rc;
1181 }
1182
1183 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1184                              int flags, struct lov_user_md *lum, int lum_size)
1185 {
1186         struct lov_stripe_md *lsm;
1187         struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1188         int rc = 0;
1189         ENTRY;
1190
1191         ll_inode_size_lock(inode, 0);
1192         lsm = ll_i2info(inode)->lli_smd;
1193         if (lsm) {
1194                 ll_inode_size_unlock(inode, 0);
1195                 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1196                        inode->i_ino);
1197                 RETURN(-EEXIST);
1198         }
1199
1200         rc = ll_intent_file_open(file, lum, lum_size, &oit);
1201         if (rc)
1202                 GOTO(out, rc);
1203         if (it_disposition(&oit, DISP_LOOKUP_NEG))
1204                 GOTO(out_req_free, rc = -ENOENT);
1205         rc = oit.d.lustre.it_status;
1206         if (rc < 0)
1207                 GOTO(out_req_free, rc);
1208
1209         ll_release_openhandle(file->f_dentry, &oit);
1210
1211  out:
1212         ll_inode_size_unlock(inode, 0);
1213         ll_intent_release(&oit);
1214         RETURN(rc);
1215 out_req_free:
1216         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1217         goto out;
1218 }
1219
1220 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1221                              struct lov_mds_md **lmmp, int *lmm_size,
1222                              struct ptlrpc_request **request)
1223 {
1224         struct ll_sb_info *sbi = ll_i2sbi(inode);
1225         struct mdt_body  *body;
1226         struct lov_mds_md *lmm = NULL;
1227         struct ptlrpc_request *req = NULL;
1228         struct obd_capa *oc;
1229         int rc, lmmsize;
1230
1231         rc = ll_get_max_mdsize(sbi, &lmmsize);
1232         if (rc)
1233                 RETURN(rc);
1234
1235         oc = ll_mdscapa_get(inode);
1236         rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1237                              oc, filename, strlen(filename) + 1,
1238                              OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize,
1239                              ll_i2suppgid(inode), &req);
1240         capa_put(oc);
1241         if (rc < 0) {
1242                 CDEBUG(D_INFO, "md_getattr_name failed "
1243                        "on %s: rc %d\n", filename, rc);
1244                 GOTO(out, rc);
1245         }
1246
1247         body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1248         LASSERT(body != NULL); /* checked by mdc_getattr_name */
1249
1250         lmmsize = body->eadatasize;
1251
1252         if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1253                         lmmsize == 0) {
1254                 GOTO(out, rc = -ENODATA);
1255         }
1256
1257         lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1258         LASSERT(lmm != NULL);
1259
1260         if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) &&
1261             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3)) &&
1262             (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) {
1263                 GOTO(out, rc = -EPROTO);
1264         }
1265
1266         /*
1267          * This is coming from the MDS, so is probably in
1268          * little endian.  We convert it to host endian before
1269          * passing it to userspace.
1270          */
1271         if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1272                 /* if function called for directory - we should
1273                  * avoid swab not existent lsm objects */
1274                 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1275                         lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
1276                         if (S_ISREG(body->mode))
1277                                 lustre_swab_lov_user_md_objects(
1278                                  ((struct lov_user_md_v1 *)lmm)->lmm_objects,
1279                                  ((struct lov_user_md_v1 *)lmm)->lmm_stripe_count);
1280                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1281                         lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
1282                         if (S_ISREG(body->mode))
1283                                 lustre_swab_lov_user_md_objects(
1284                                  ((struct lov_user_md_v3 *)lmm)->lmm_objects,
1285                                  ((struct lov_user_md_v3 *)lmm)->lmm_stripe_count);
1286                 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) {
1287                         lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
1288                 }
1289         }
1290
1291         if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1292                 struct lov_stripe_md *lsm;
1293                 struct lov_user_md_join *lmj;
1294                 int lmj_size, i, aindex = 0;
1295
1296                 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1297                 if (rc < 0)
1298                         GOTO(out, rc = -ENOMEM);
1299                 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1300                 if (rc)
1301                         GOTO(out_free_memmd, rc);
1302
1303                 lmj_size = sizeof(struct lov_user_md_join) +
1304                            lsm->lsm_stripe_count *
1305                            sizeof(struct lov_user_ost_data_join);
1306                 OBD_ALLOC(lmj, lmj_size);
1307                 if (!lmj)
1308                         GOTO(out_free_memmd, rc = -ENOMEM);
1309
1310                 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1311                 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1312                         struct lov_extent *lex =
1313                                 &lsm->lsm_array->lai_ext_array[aindex];
1314
1315                         if (lex->le_loi_idx + lex->le_stripe_count <= i)
1316                                 aindex ++;
1317                         CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1318                                         LPU64" len %d\n", aindex, i,
1319                                         lex->le_start, (int)lex->le_len);
1320                         lmj->lmm_objects[i].l_extent_start =
1321                                 lex->le_start;
1322
1323                         if ((int)lex->le_len == -1)
1324                                 lmj->lmm_objects[i].l_extent_end = -1;
1325                         else
1326                                 lmj->lmm_objects[i].l_extent_end =
1327                                         lex->le_start + lex->le_len;
1328                         lmj->lmm_objects[i].l_object_id =
1329                                 lsm->lsm_oinfo[i]->loi_id;
1330                         lmj->lmm_objects[i].l_object_gr =
1331                                 lsm->lsm_oinfo[i]->loi_gr;
1332                         lmj->lmm_objects[i].l_ost_gen =
1333                                 lsm->lsm_oinfo[i]->loi_ost_gen;
1334                         lmj->lmm_objects[i].l_ost_idx =
1335                                 lsm->lsm_oinfo[i]->loi_ost_idx;
1336                 }
1337                 lmm = (struct lov_mds_md *)lmj;
1338                 lmmsize = lmj_size;
1339 out_free_memmd:
1340                 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1341         }
1342 out:
1343         *lmmp = lmm;
1344         *lmm_size = lmmsize;
1345         *request = req;
1346         return rc;
1347 }
1348
1349 static int ll_lov_setea(struct inode *inode, struct file *file,
1350                             unsigned long arg)
1351 {
1352         int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1353         struct lov_user_md  *lump;
1354         int lum_size = sizeof(struct lov_user_md) +
1355                        sizeof(struct lov_user_ost_data);
1356         int rc;
1357         ENTRY;
1358
1359         if (!cfs_capable(CFS_CAP_SYS_ADMIN))
1360                 RETURN(-EPERM);
1361
1362         OBD_ALLOC(lump, lum_size);
1363         if (lump == NULL) {
1364                 RETURN(-ENOMEM);
1365         }
1366         if (copy_from_user(lump, (struct lov_user_md  *)arg, lum_size)) {
1367                 OBD_FREE(lump, lum_size);
1368                 RETURN(-EFAULT);
1369         }
1370
1371         rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1372
1373         OBD_FREE(lump, lum_size);
1374         RETURN(rc);
1375 }
1376
1377 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1378                             unsigned long arg)
1379 {
1380         struct lov_user_md_v3 lumv3;
1381         struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3;
1382         struct lov_user_md_v1 *lumv1p = (struct lov_user_md_v1 *)arg;
1383         struct lov_user_md_v3 *lumv3p = (struct lov_user_md_v3 *)arg;
1384         int lum_size;
1385         int rc;
1386         int flags = FMODE_WRITE;
1387         ENTRY;
1388
1389         /* first try with v1 which is smaller than v3 */
1390         lum_size = sizeof(struct lov_user_md_v1);
1391         if (copy_from_user(lumv1, lumv1p, lum_size))
1392                 RETURN(-EFAULT);
1393
1394         if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) {
1395                 lum_size = sizeof(struct lov_user_md_v3);
1396                 if (copy_from_user(&lumv3, lumv3p, lum_size))
1397                         RETURN(-EFAULT);
1398         }
1399
1400         rc = ll_lov_setstripe_ea_info(inode, file, flags, lumv1, lum_size);
1401         if (rc == 0) {
1402                  put_user(0, &lumv1p->lmm_stripe_count);
1403                  rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1404                                     0, ll_i2info(inode)->lli_smd,
1405                                     (void *)arg);
1406         }
1407         RETURN(rc);
1408 }
1409
1410 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1411 {
1412         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1413
1414         if (!lsm)
1415                 RETURN(-ENODATA);
1416
1417         return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
1418                             (void *)arg);
1419 }
1420
1421 int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1422 {
1423         struct ll_inode_info   *lli = ll_i2info(inode);
1424         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1425         struct ccc_grouplock    grouplock;
1426         int                     rc;
1427         ENTRY;
1428
1429         spin_lock(&lli->lli_lock);
1430         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1431                 CERROR("group lock already existed with gid %lu\n",
1432                        fd->fd_grouplock.cg_gid);
1433                 spin_unlock(&lli->lli_lock);
1434                 RETURN(-EINVAL);
1435         }
1436         LASSERT(fd->fd_grouplock.cg_lock == NULL);
1437         spin_unlock(&lli->lli_lock);
1438
1439         rc = cl_get_grouplock(cl_i2info(inode)->lli_clob,
1440                               arg, (file->f_flags & O_NONBLOCK), &grouplock);
1441         if (rc)
1442                 RETURN(rc);
1443
1444         spin_lock(&lli->lli_lock);
1445         if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1446                 spin_unlock(&lli->lli_lock);
1447                 CERROR("another thread just won the race\n");
1448                 cl_put_grouplock(&grouplock);
1449                 RETURN(-EINVAL);
1450         }
1451
1452         fd->fd_flags |= (LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1453         fd->fd_grouplock = grouplock;
1454         spin_unlock(&lli->lli_lock);
1455
1456         CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
1457         RETURN(0);
1458 }
1459
1460 int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg)
1461 {
1462         struct ll_inode_info   *lli = ll_i2info(inode);
1463         struct ll_file_data    *fd = LUSTRE_FPRIVATE(file);
1464         struct ccc_grouplock    grouplock;
1465         ENTRY;
1466
1467         spin_lock(&lli->lli_lock);
1468         if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1469                 spin_unlock(&lli->lli_lock);
1470                 CERROR("no group lock held\n");
1471                 RETURN(-EINVAL);
1472         }
1473         LASSERT(fd->fd_grouplock.cg_lock != NULL);
1474
1475         if (fd->fd_grouplock.cg_gid != arg) {
1476                 CERROR("group lock %lu doesn't match current id %lu\n",
1477                        arg, fd->fd_grouplock.cg_gid);
1478                 spin_unlock(&lli->lli_lock);
1479                 RETURN(-EINVAL);
1480         }
1481
1482         grouplock = fd->fd_grouplock;
1483         fd->fd_grouplock.cg_env = NULL;
1484         fd->fd_grouplock.cg_lock = NULL;
1485         fd->fd_grouplock.cg_gid = 0;
1486         fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED | LL_FILE_IGNORE_LOCK);
1487         spin_unlock(&lli->lli_lock);
1488
1489         cl_put_grouplock(&grouplock);
1490         CDEBUG(D_INFO, "group lock %lu released\n", arg);
1491         RETURN(0);
1492 }
1493
1494 #if LUSTRE_FIX >= 50
1495 static int join_sanity_check(struct inode *head, struct inode *tail)
1496 {
1497         ENTRY;
1498         if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
1499                 CERROR("server do not support join \n");
1500                 RETURN(-EINVAL);
1501         }
1502         if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
1503                 CERROR("tail ino %lu and ino head %lu must be regular\n",
1504                        head->i_ino, tail->i_ino);
1505                 RETURN(-EINVAL);
1506         }
1507         if (head->i_ino == tail->i_ino) {
1508                 CERROR("file %lu can not be joined to itself \n", head->i_ino);
1509                 RETURN(-EINVAL);
1510         }
1511         if (i_size_read(head) % JOIN_FILE_ALIGN) {
1512                 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
1513                 RETURN(-EINVAL);
1514         }
1515         RETURN(0);
1516 }
1517
1518 static int join_file(struct inode *head_inode, struct file *head_filp,
1519                      struct file *tail_filp)
1520 {
1521         struct dentry *tail_dentry = tail_filp->f_dentry;
1522         struct lookup_intent oit = {.it_op = IT_OPEN,
1523                                     .it_flags = head_filp->f_flags,
1524                                     .it_create_mode = M_JOIN_FILE};
1525         struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
1526                 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL };
1527
1528         struct lustre_handle lockh;
1529         struct md_op_data *op_data;
1530         int    rc;
1531         loff_t data;
1532         ENTRY;
1533
1534         tail_dentry = tail_filp->f_dentry;
1535
1536         data = i_size_read(head_inode);
1537         op_data = ll_prep_md_op_data(NULL, head_inode,
1538                                      tail_dentry->d_parent->d_inode,
1539                                      tail_dentry->d_name.name,
1540                                      tail_dentry->d_name.len, 0,
1541                                      LUSTRE_OPC_ANY, &data);
1542         if (IS_ERR(op_data))
1543                 RETURN(PTR_ERR(op_data));
1544
1545         rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
1546                          op_data, &lockh, NULL, 0, NULL, 0);
1547
1548         ll_finish_md_op_data(op_data);
1549         if (rc < 0)
1550                 GOTO(out, rc);
1551
1552         rc = oit.d.lustre.it_status;
1553
1554         if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
1555                 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
1556                 ptlrpc_req_finished((struct ptlrpc_request *)
1557                                     oit.d.lustre.it_data);
1558                 GOTO(out, rc);
1559         }
1560
1561         if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
1562                                            * away */
1563                 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
1564                 oit.d.lustre.it_lock_mode = 0;
1565         }
1566         ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
1567         it_clear_disposition(&oit, DISP_ENQ_COMPLETE);
1568         ll_release_openhandle(head_filp->f_dentry, &oit);
1569 out:
1570         ll_intent_release(&oit);
1571         RETURN(rc);
1572 }
1573
1574 static int ll_file_join(struct inode *head, struct file *filp,
1575                         char *filename_tail)
1576 {
1577         struct inode *tail = NULL, *first = NULL, *second = NULL;
1578         struct dentry *tail_dentry;
1579         struct file *tail_filp, *first_filp, *second_filp;
1580         struct ll_lock_tree first_tree, second_tree;
1581         struct ll_lock_tree_node *first_node, *second_node;
1582         struct ll_inode_info *hlli = ll_i2info(head);
1583         int rc = 0, cleanup_phase = 0;
1584         ENTRY;
1585
1586         CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
1587                head->i_ino, head->i_generation, head, filename_tail);
1588
1589         tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
1590         if (IS_ERR(tail_filp)) {
1591                 CERROR("Can not open tail file %s", filename_tail);
1592                 rc = PTR_ERR(tail_filp);
1593                 GOTO(cleanup, rc);
1594         }
1595         tail = igrab(tail_filp->f_dentry->d_inode);
1596
1597         tail_dentry = tail_filp->f_dentry;
1598         LASSERT(tail_dentry);
1599         cleanup_phase = 1;
1600
1601         /*reorder the inode for lock sequence*/
1602         first = head->i_ino > tail->i_ino ? head : tail;
1603         second = head->i_ino > tail->i_ino ? tail : head;
1604         first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
1605         second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
1606
1607         CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
1608                head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
1609         first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
1610         if (IS_ERR(first_node)){
1611                 rc = PTR_ERR(first_node);
1612                 GOTO(cleanup, rc);
1613         }
1614         first_tree.lt_fd = first_filp->private_data;
1615         rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
1616         if (rc != 0)
1617                 GOTO(cleanup, rc);
1618         cleanup_phase = 2;
1619
1620         second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
1621         if (IS_ERR(second_node)){
1622                 rc = PTR_ERR(second_node);
1623                 GOTO(cleanup, rc);
1624         }
1625         second_tree.lt_fd = second_filp->private_data;
1626         rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
1627         if (rc != 0)
1628                 GOTO(cleanup, rc);
1629         cleanup_phase = 3;
1630
1631         rc = join_sanity_check(head, tail);
1632         if (rc)
1633                 GOTO(cleanup, rc);
1634
1635         rc = join_file(head, filp, tail_filp);
1636         if (rc)
1637                 GOTO(cleanup, rc);
1638 cleanup:
1639         switch (cleanup_phase) {
1640         case 3:
1641                 ll_tree_unlock(&second_tree);
1642                 obd_cancel_unused(ll_i2dtexp(second),
1643                                   ll_i2info(second)->lli_smd, 0, NULL);
1644         case 2:
1645                 ll_tree_unlock(&first_tree);
1646                 obd_cancel_unused(ll_i2dtexp(first),
1647                                   ll_i2info(first)->lli_smd, 0, NULL);
1648         case 1:
1649                 filp_close(tail_filp, 0);
1650                 if (tail)
1651                         iput(tail);
1652                 if (head && rc == 0) {
1653                         obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
1654                                        &hlli->lli_smd);
1655                         hlli->lli_smd = NULL;
1656                 }
1657         case 0:
1658                 break;
1659         default:
1660                 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
1661                 LBUG();
1662         }
1663         RETURN(rc);
1664 }
1665 #endif /* LUSTRE_FIX >= 50 */
1666
1667 /**
1668  * Close inode open handle
1669  *
1670  * \param dentry [in]     dentry which contains the inode
1671  * \param it     [in,out] intent which contains open info and result
1672  *
1673  * \retval 0     success
1674  * \retval <0    failure
1675  */
1676 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
1677 {
1678         struct inode *inode = dentry->d_inode;
1679         struct obd_client_handle *och;
1680         int rc;
1681         ENTRY;
1682
1683         LASSERT(inode);
1684
1685         /* Root ? Do nothing. */
1686         if (dentry->d_inode->i_sb->s_root == dentry)
1687                 RETURN(0);
1688
1689         /* No open handle to close? Move away */
1690         if (!it_disposition(it, DISP_OPEN_OPEN))
1691                 RETURN(0);
1692
1693         LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
1694
1695         OBD_ALLOC(och, sizeof(*och));
1696         if (!och)
1697                 GOTO(out, rc = -ENOMEM);
1698
1699         ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
1700                     ll_i2info(inode), it, och);
1701
1702         rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
1703                                        inode, och);
1704  out:
1705         /* this one is in place of ll_file_open */
1706         if (it_disposition(it, DISP_ENQ_OPEN_REF))
1707                 ptlrpc_req_finished(it->d.lustre.it_data);
1708         it_clear_disposition(it, DISP_ENQ_OPEN_REF);
1709         RETURN(rc);
1710 }
1711
1712 /**
1713  * Get size for inode for which FIEMAP mapping is requested.
1714  * Make the FIEMAP get_info call and returns the result.
1715  */
1716 int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
1717               int num_bytes)
1718 {
1719         struct obd_export *exp = ll_i2dtexp(inode);
1720         struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1721         struct ll_fiemap_info_key fm_key = { .name = KEY_FIEMAP, };
1722         int vallen = num_bytes;
1723         int rc;
1724         ENTRY;
1725
1726         /* If the stripe_count > 1 and the application does not understand
1727          * DEVICE_ORDER flag, then it cannot interpret the extents correctly.
1728          */
1729         if (lsm->lsm_stripe_count > 1 &&
1730             !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER))
1731                 return -EOPNOTSUPP;
1732
1733         fm_key.oa.o_id = lsm->lsm_object_id;
1734         fm_key.oa.o_gr = lsm->lsm_object_gr;
1735         fm_key.oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
1736
1737         obdo_from_inode(&fm_key.oa, inode, OBD_MD_FLFID | OBD_MD_FLGROUP |
1738                         OBD_MD_FLSIZE);
1739
1740         /* If filesize is 0, then there would be no objects for mapping */
1741         if (fm_key.oa.o_size == 0) {
1742                 fiemap->fm_mapped_extents = 0;
1743                 RETURN(0);
1744         }
1745
1746         memcpy(&fm_key.fiemap, fiemap, sizeof(*fiemap));
1747
1748         rc = obd_get_info(exp, sizeof(fm_key), &fm_key, &vallen, fiemap, lsm);
1749         if (rc)
1750                 CERROR("obd_get_info failed: rc = %d\n", rc);
1751
1752         RETURN(rc);
1753 }
1754
1755 int ll_fid2path(struct obd_export *exp, void *arg)
1756 {
1757         struct getinfo_fid2path *gfout, *gfin;
1758         int outsize, rc;
1759         ENTRY;
1760
1761         /* Need to get the buflen */
1762         OBD_ALLOC_PTR(gfin);
1763         if (gfin == NULL)
1764                 RETURN(-ENOMEM);
1765         if (copy_from_user(gfin, arg, sizeof(*gfin))) {
1766                 OBD_FREE_PTR(gfin);
1767                 RETURN(-EFAULT);
1768         }
1769
1770         outsize = sizeof(*gfout) + gfin->gf_pathlen;
1771         OBD_ALLOC(gfout, outsize);
1772         if (gfout == NULL) {
1773                 OBD_FREE_PTR(gfin);
1774                 RETURN(-ENOMEM);
1775         }
1776         memcpy(gfout, gfin, sizeof(*gfout));
1777         OBD_FREE_PTR(gfin);
1778
1779         /* Call mdc_iocontrol */
1780         rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
1781         if (rc)
1782                 GOTO(gf_free, rc);
1783         if (copy_to_user(arg, gfout, outsize))
1784                 rc = -EFAULT;
1785
1786 gf_free:
1787         OBD_FREE(gfout, outsize);
1788         RETURN(rc);
1789 }
1790
1791 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
1792                   unsigned long arg)
1793 {
1794         struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1795         int flags;
1796         ENTRY;
1797
1798         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
1799                inode->i_generation, inode, cmd);
1800         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
1801
1802         /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
1803         if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
1804                 RETURN(-ENOTTY);
1805
1806         switch(cmd) {
1807         case LL_IOC_GETFLAGS:
1808                 /* Get the current value of the file flags */
1809                 return put_user(fd->fd_flags, (int *)arg);
1810         case LL_IOC_SETFLAGS:
1811         case LL_IOC_CLRFLAGS:
1812                 /* Set or clear specific file flags */
1813                 /* XXX This probably needs checks to ensure the flags are
1814                  *     not abused, and to handle any flag side effects.
1815                  */
1816                 if (get_user(flags, (int *) arg))
1817                         RETURN(-EFAULT);
1818
1819                 if (cmd == LL_IOC_SETFLAGS) {
1820                         if ((flags & LL_FILE_IGNORE_LOCK) &&
1821                             !(file->f_flags & O_DIRECT)) {
1822                                 CERROR("%s: unable to disable locking on "
1823                                        "non-O_DIRECT file\n", current->comm);
1824                                 RETURN(-EINVAL);
1825                         }
1826
1827                         fd->fd_flags |= flags;
1828                 } else {
1829                         fd->fd_flags &= ~flags;
1830                 }
1831                 RETURN(0);
1832         case LL_IOC_LOV_SETSTRIPE:
1833                 RETURN(ll_lov_setstripe(inode, file, arg));
1834         case LL_IOC_LOV_SETEA:
1835                 RETURN(ll_lov_setea(inode, file, arg));
1836         case LL_IOC_LOV_GETSTRIPE:
1837                 RETURN(ll_lov_getstripe(inode, arg));
1838         case LL_IOC_RECREATE_OBJ:
1839                 RETURN(ll_lov_recreate_obj(inode, file, arg));
1840         case EXT3_IOC_FIEMAP: {
1841                 struct ll_user_fiemap *fiemap_s;
1842                 size_t num_bytes, ret_bytes;
1843                 unsigned int extent_count;
1844                 int rc = 0;
1845
1846                 /* Get the extent count so we can calculate the size of
1847                  * required fiemap buffer */
1848                 if (get_user(extent_count,
1849                     &((struct ll_user_fiemap __user *)arg)->fm_extent_count))
1850                         RETURN(-EFAULT);
1851                 num_bytes = sizeof(*fiemap_s) + (extent_count *
1852                                                  sizeof(struct ll_fiemap_extent));
1853                 OBD_VMALLOC(fiemap_s, num_bytes);
1854                 if (fiemap_s == NULL)
1855                         RETURN(-ENOMEM);
1856
1857                 if (copy_from_user(fiemap_s,(struct ll_user_fiemap __user *)arg,
1858                                    sizeof(*fiemap_s)))
1859                         GOTO(error, rc = -EFAULT);
1860
1861                 if (fiemap_s->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
1862                         fiemap_s->fm_flags = fiemap_s->fm_flags &
1863                                                     ~LUSTRE_FIEMAP_FLAGS_COMPAT;
1864                         if (copy_to_user((char *)arg, fiemap_s,
1865                                          sizeof(*fiemap_s)))
1866                                 GOTO(error, rc = -EFAULT);
1867
1868                         GOTO(error, rc = -EBADR);
1869                 }
1870
1871                 /* If fm_extent_count is non-zero, read the first extent since
1872                  * it is used to calculate end_offset and device from previous
1873                  * fiemap call. */
1874                 if (extent_count) {
1875                         if (copy_from_user(&fiemap_s->fm_extents[0],
1876                             (char __user *)arg + sizeof(*fiemap_s),
1877                             sizeof(struct ll_fiemap_extent)))
1878                                 GOTO(error, rc = -EFAULT);
1879                 }
1880
1881                 if (fiemap_s->fm_flags & FIEMAP_FLAG_SYNC) {
1882                         int rc;
1883
1884                         rc = filemap_fdatawrite(inode->i_mapping);
1885                         if (rc)
1886                                 GOTO(error, rc);
1887                 }
1888
1889                 rc = ll_fiemap(inode, fiemap_s, num_bytes);
1890                 if (rc)
1891                         GOTO(error, rc);
1892
1893                 ret_bytes = sizeof(struct ll_user_fiemap);
1894
1895                 if (extent_count != 0)
1896                         ret_bytes += (fiemap_s->fm_mapped_extents *
1897                                          sizeof(struct ll_fiemap_extent));
1898
1899                 if (copy_to_user((void *)arg, fiemap_s, ret_bytes))
1900                         rc = -EFAULT;
1901
1902 error:
1903                 OBD_VFREE(fiemap_s, num_bytes);
1904                 RETURN(rc);
1905         }
1906         case EXT3_IOC_GETFLAGS:
1907         case EXT3_IOC_SETFLAGS:
1908                 RETURN(ll_iocontrol(inode, file, cmd, arg));
1909         case EXT3_IOC_GETVERSION_OLD:
1910         case EXT3_IOC_GETVERSION:
1911                 RETURN(put_user(inode->i_generation, (int *)arg));
1912         case LL_IOC_JOIN: {
1913 #if LUSTRE_FIX >= 50
1914                 /* Allow file join in beta builds to allow debuggging */
1915                 char *ftail;
1916                 int rc;
1917
1918                 ftail = getname((const char *)arg);
1919                 if (IS_ERR(ftail))
1920                         RETURN(PTR_ERR(ftail));
1921                 rc = ll_file_join(inode, file, ftail);
1922                 putname(ftail);
1923                 RETURN(rc);
1924 #else
1925                 CWARN("file join is not supported in this version of Lustre\n");
1926                 RETURN(-ENOTTY);
1927 #endif
1928         }
1929         case LL_IOC_GROUP_LOCK:
1930                 RETURN(ll_get_grouplock(inode, file, arg));
1931         case LL_IOC_GROUP_UNLOCK:
1932                 RETURN(ll_put_grouplock(inode, file, arg));
1933         case IOC_OBD_STATFS:
1934                 RETURN(ll_obd_statfs(inode, (void *)arg));
1935
1936         /* We need to special case any other ioctls we want to handle,
1937          * to send them to the MDS/OST as appropriate and to properly
1938          * network encode the arg field.
1939         case EXT3_IOC_SETVERSION_OLD:
1940         case EXT3_IOC_SETVERSION:
1941         */
1942         case LL_IOC_FLUSHCTX:
1943                 RETURN(ll_flush_ctx(inode));
1944         case LL_IOC_PATH2FID: {
1945                 if (copy_to_user((void *)arg, &ll_i2info(inode)->lli_fid,
1946                                  sizeof(struct lu_fid)))
1947                         RETURN(-EFAULT);
1948
1949                 RETURN(0);
1950         }
1951         case OBD_IOC_FID2PATH:
1952                 RETURN(ll_fid2path(ll_i2mdexp(inode), (void *)arg));
1953
1954         default: {
1955                 int err;
1956
1957                 if (LLIOC_STOP ==
1958                     ll_iocontrol_call(inode, file, cmd, arg, &err))
1959                         RETURN(err);
1960
1961                 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
1962                                      (void *)arg));
1963         }
1964         }
1965 }
1966
1967 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
1968 {
1969         struct inode *inode = file->f_dentry->d_inode;
1970         loff_t retval;
1971         ENTRY;
1972         retval = offset + ((origin == 2) ? i_size_read(inode) :
1973                            (origin == 1) ? file->f_pos : 0);
1974         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
1975                inode->i_ino, inode->i_generation, inode, retval, retval,
1976                origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
1977         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
1978
1979         if (origin == 2) { /* SEEK_END */
1980                 int nonblock = 0, rc;
1981
1982                 if (file->f_flags & O_NONBLOCK)
1983                         nonblock = LDLM_FL_BLOCK_NOWAIT;
1984
1985                 rc = cl_glimpse_size(inode);
1986                 if (rc != 0)
1987                         RETURN(rc);
1988
1989                 ll_inode_size_lock(inode, 0);
1990                 offset += i_size_read(inode);
1991                 ll_inode_size_unlock(inode, 0);
1992         } else if (origin == 1) { /* SEEK_CUR */
1993                 offset += file->f_pos;
1994         }
1995
1996         retval = -EINVAL;
1997         if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
1998                 if (offset != file->f_pos) {
1999                         file->f_pos = offset;
2000                 }
2001                 retval = offset;
2002         }
2003
2004         RETURN(retval);
2005 }
2006
2007 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2008 {
2009         struct inode *inode = dentry->d_inode;
2010         struct ll_inode_info *lli = ll_i2info(inode);
2011         struct lov_stripe_md *lsm = lli->lli_smd;
2012         struct ptlrpc_request *req;
2013         struct obd_capa *oc;
2014         int rc, err;
2015         ENTRY;
2016         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2017                inode->i_generation, inode);
2018         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2019
2020         /* fsync's caller has already called _fdata{sync,write}, we want
2021          * that IO to finish before calling the osc and mdc sync methods */
2022         rc = filemap_fdatawait(inode->i_mapping);
2023
2024         /* catch async errors that were recorded back when async writeback
2025          * failed for pages in this mapping. */
2026         err = lli->lli_async_rc;
2027         lli->lli_async_rc = 0;
2028         if (rc == 0)
2029                 rc = err;
2030         if (lsm) {
2031                 err = lov_test_and_clear_async_rc(lsm);
2032                 if (rc == 0)
2033                         rc = err;
2034         }
2035
2036         oc = ll_mdscapa_get(inode);
2037         err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2038                       &req);
2039         capa_put(oc);
2040         if (!rc)
2041                 rc = err;
2042         if (!err)
2043                 ptlrpc_req_finished(req);
2044
2045         if (data && lsm) {
2046                 struct obdo *oa;
2047
2048                 OBDO_ALLOC(oa);
2049                 if (!oa)
2050                         RETURN(rc ? rc : -ENOMEM);
2051
2052                 oa->o_id = lsm->lsm_object_id;
2053                 oa->o_gr = lsm->lsm_object_gr;
2054                 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2055                 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2056                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2057                                            OBD_MD_FLGROUP);
2058
2059                 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2060                 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2061                                0, OBD_OBJECT_EOF, oc);
2062                 capa_put(oc);
2063                 if (!rc)
2064                         rc = err;
2065                 OBDO_FREE(oa);
2066         }
2067
2068         RETURN(rc);
2069 }
2070
2071 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2072 {
2073         struct inode *inode = file->f_dentry->d_inode;
2074         struct ll_sb_info *sbi = ll_i2sbi(inode);
2075         struct ldlm_enqueue_info einfo = { .ei_type = LDLM_FLOCK,
2076                                            .ei_cb_cp =ldlm_flock_completion_ast,
2077                                            .ei_cbdata = file_lock };
2078         struct md_op_data *op_data;
2079         struct lustre_handle lockh = {0};
2080         ldlm_policy_data_t flock;
2081         int flags = 0;
2082         int rc;
2083         ENTRY;
2084
2085         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2086                inode->i_ino, file_lock);
2087
2088         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2089
2090         if (file_lock->fl_flags & FL_FLOCK) {
2091                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2092                 /* set missing params for flock() calls */
2093                 file_lock->fl_end = OFFSET_MAX;
2094                 file_lock->fl_pid = current->tgid;
2095         }
2096         flock.l_flock.pid = file_lock->fl_pid;
2097         flock.l_flock.start = file_lock->fl_start;
2098         flock.l_flock.end = file_lock->fl_end;
2099
2100         switch (file_lock->fl_type) {
2101         case F_RDLCK:
2102                 einfo.ei_mode = LCK_PR;
2103                 break;
2104         case F_UNLCK:
2105                 /* An unlock request may or may not have any relation to
2106                  * existing locks so we may not be able to pass a lock handle
2107                  * via a normal ldlm_lock_cancel() request. The request may even
2108                  * unlock a byte range in the middle of an existing lock. In
2109                  * order to process an unlock request we need all of the same
2110                  * information that is given with a normal read or write record
2111                  * lock request. To avoid creating another ldlm unlock (cancel)
2112                  * message we'll treat a LCK_NL flock request as an unlock. */
2113                 einfo.ei_mode = LCK_NL;
2114                 break;
2115         case F_WRLCK:
2116                 einfo.ei_mode = LCK_PW;
2117                 break;
2118         default:
2119                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
2120                 RETURN (-EINVAL);
2121         }
2122
2123         switch (cmd) {
2124         case F_SETLKW:
2125 #ifdef F_SETLKW64
2126         case F_SETLKW64:
2127 #endif
2128                 flags = 0;
2129                 break;
2130         case F_SETLK:
2131 #ifdef F_SETLK64
2132         case F_SETLK64:
2133 #endif
2134                 flags = LDLM_FL_BLOCK_NOWAIT;
2135                 break;
2136         case F_GETLK:
2137 #ifdef F_GETLK64
2138         case F_GETLK64:
2139 #endif
2140                 flags = LDLM_FL_TEST_LOCK;
2141                 /* Save the old mode so that if the mode in the lock changes we
2142                  * can decrement the appropriate reader or writer refcount. */
2143                 file_lock->fl_type = einfo.ei_mode;
2144                 break;
2145         default:
2146                 CERROR("unknown fcntl lock command: %d\n", cmd);
2147                 RETURN (-EINVAL);
2148         }
2149
2150         op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2151                                      LUSTRE_OPC_ANY, NULL);
2152         if (IS_ERR(op_data))
2153                 RETURN(PTR_ERR(op_data));
2154
2155         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2156                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2157                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2158
2159         rc = md_enqueue(sbi->ll_md_exp, &einfo, NULL,
2160                         op_data, &lockh, &flock, 0, NULL /* req */, flags);
2161
2162         ll_finish_md_op_data(op_data);
2163
2164         if ((file_lock->fl_flags & FL_FLOCK) &&
2165             (rc == 0 || file_lock->fl_type == F_UNLCK))
2166                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2167 #ifdef HAVE_F_OP_FLOCK
2168         if ((file_lock->fl_flags & FL_POSIX) &&
2169             (rc == 0 || file_lock->fl_type == F_UNLCK) &&
2170             !(flags & LDLM_FL_TEST_LOCK))
2171                 posix_lock_file_wait(file, file_lock);
2172 #endif
2173
2174         RETURN(rc);
2175 }
2176
2177 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
2178 {
2179         ENTRY;
2180
2181         RETURN(-ENOSYS);
2182 }
2183
2184 int ll_have_md_lock(struct inode *inode, __u64 bits)
2185 {
2186         struct lustre_handle lockh;
2187         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2188         struct lu_fid *fid;
2189         int flags;
2190         ENTRY;
2191
2192         if (!inode)
2193                RETURN(0);
2194
2195         fid = &ll_i2info(inode)->lli_fid;
2196         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2197
2198         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2199         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2200                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
2201                 RETURN(1);
2202         }
2203         RETURN(0);
2204 }
2205
2206 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2207                             struct lustre_handle *lockh)
2208 {
2209         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2210         struct lu_fid *fid;
2211         ldlm_mode_t rc;
2212         int flags;
2213         ENTRY;
2214
2215         fid = &ll_i2info(inode)->lli_fid;
2216         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2217
2218         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2219         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2220                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
2221         RETURN(rc);
2222 }
2223
2224 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2225         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2226                               * and return success */
2227                 inode->i_nlink = 0;
2228                 /* This path cannot be hit for regular files unless in
2229                  * case of obscure races, so no need to to validate
2230                  * size. */
2231                 if (!S_ISREG(inode->i_mode) &&
2232                     !S_ISDIR(inode->i_mode))
2233                         return 0;
2234         }
2235
2236         if (rc) {
2237                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
2238                 return -abs(rc);
2239
2240         }
2241
2242         return 0;
2243 }
2244
2245 int __ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it,
2246                              __u64 ibits)
2247 {
2248         struct inode *inode = dentry->d_inode;
2249         struct ptlrpc_request *req = NULL;
2250         struct ll_sb_info *sbi;
2251         struct obd_export *exp;
2252         int rc = 0;
2253         ENTRY;
2254
2255         if (!inode) {
2256                 CERROR("REPORT THIS LINE TO PETER\n");
2257                 RETURN(0);
2258         }
2259         sbi = ll_i2sbi(inode);
2260
2261         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2262                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2263
2264         exp = ll_i2mdexp(inode);
2265
2266         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2267                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2268                 struct md_op_data *op_data;
2269
2270                 /* Call getattr by fid, so do not provide name at all. */
2271                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2272                                              dentry->d_inode, NULL, 0, 0,
2273                                              LUSTRE_OPC_ANY, NULL);
2274                 if (IS_ERR(op_data))
2275                         RETURN(PTR_ERR(op_data));
2276
2277                 oit.it_create_mode |= M_CHECK_STALE;
2278                 rc = md_intent_lock(exp, op_data, NULL, 0,
2279                                     /* we are not interested in name
2280                                        based lookup */
2281                                     &oit, 0, &req,
2282                                     ll_md_blocking_ast, 0);
2283                 ll_finish_md_op_data(op_data);
2284                 oit.it_create_mode &= ~M_CHECK_STALE;
2285                 if (rc < 0) {
2286                         rc = ll_inode_revalidate_fini(inode, rc);
2287                         GOTO (out, rc);
2288                 }
2289
2290                 rc = ll_revalidate_it_finish(req, &oit, dentry);
2291                 if (rc != 0) {
2292                         ll_intent_release(&oit);
2293                         GOTO(out, rc);
2294                 }
2295
2296                 /* Unlinked? Unhash dentry, so it is not picked up later by
2297                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2298                    here to preserve get_cwd functionality on 2.6.
2299                    Bug 10503 */
2300                 if (!dentry->d_inode->i_nlink) {
2301                         spin_lock(&ll_lookup_lock);
2302                         spin_lock(&dcache_lock);
2303                         ll_drop_dentry(dentry);
2304                         spin_unlock(&dcache_lock);
2305                         spin_unlock(&ll_lookup_lock);
2306                 }
2307
2308                 ll_lookup_finish_locks(&oit, dentry);
2309         } else if (!ll_have_md_lock(dentry->d_inode, ibits)) {
2310
2311                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2312                 obd_valid valid = OBD_MD_FLGETATTR;
2313                 struct obd_capa *oc;
2314                 int ealen = 0;
2315
2316                 if (S_ISREG(inode->i_mode)) {
2317                         rc = ll_get_max_mdsize(sbi, &ealen);
2318                         if (rc)
2319                                 RETURN(rc);
2320                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2321                 }
2322                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2323                  * capa for this inode. Because we only keep capas of dirs
2324                  * fresh. */
2325                 oc = ll_mdscapa_get(inode);
2326                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2327                                 ealen, &req);
2328                 capa_put(oc);
2329                 if (rc) {
2330                         rc = ll_inode_revalidate_fini(inode, rc);
2331                         RETURN(rc);
2332                 }
2333
2334                 rc = ll_prep_inode(&inode, req, NULL);
2335         }
2336 out:
2337         ptlrpc_req_finished(req);
2338         return rc;
2339 }
2340
2341 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2342 {
2343         int rc;
2344         ENTRY;
2345
2346         rc = __ll_inode_revalidate_it(dentry, it, MDS_INODELOCK_UPDATE |
2347                                                   MDS_INODELOCK_LOOKUP);
2348
2349         /* if object not yet allocated, don't validate size */
2350         if (rc == 0 && ll_i2info(dentry->d_inode)->lli_smd == NULL)
2351                 RETURN(0);
2352
2353         /* cl_glimpse_size will prefer locally cached writes if they extend
2354          * the file */
2355
2356         if (rc == 0)
2357                 rc = cl_glimpse_size(dentry->d_inode);
2358
2359         RETURN(rc);
2360 }
2361
2362 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2363                   struct lookup_intent *it, struct kstat *stat)
2364 {
2365         struct inode *inode = de->d_inode;
2366         int res = 0;
2367
2368         res = ll_inode_revalidate_it(de, it);
2369         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2370
2371         if (res)
2372                 return res;
2373
2374         stat->dev = inode->i_sb->s_dev;
2375         stat->ino = inode->i_ino;
2376         stat->mode = inode->i_mode;
2377         stat->nlink = inode->i_nlink;
2378         stat->uid = inode->i_uid;
2379         stat->gid = inode->i_gid;
2380         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2381         stat->atime = inode->i_atime;
2382         stat->mtime = inode->i_mtime;
2383         stat->ctime = inode->i_ctime;
2384 #ifdef HAVE_INODE_BLKSIZE
2385         stat->blksize = inode->i_blksize;
2386 #else
2387         stat->blksize = 1 << inode->i_blkbits;
2388 #endif
2389
2390         ll_inode_size_lock(inode, 0);
2391         stat->size = i_size_read(inode);
2392         stat->blocks = inode->i_blocks;
2393         ll_inode_size_unlock(inode, 0);
2394
2395         return 0;
2396 }
2397 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2398 {
2399         struct lookup_intent it = { .it_op = IT_GETATTR };
2400
2401         return ll_getattr_it(mnt, de, &it, stat);
2402 }
2403
2404 static
2405 int lustre_check_acl(struct inode *inode, int mask)
2406 {
2407 #ifdef CONFIG_FS_POSIX_ACL
2408         struct ll_inode_info *lli = ll_i2info(inode);
2409         struct posix_acl *acl;
2410         int rc;
2411         ENTRY;
2412
2413         spin_lock(&lli->lli_lock);
2414         acl = posix_acl_dup(lli->lli_posix_acl);
2415         spin_unlock(&lli->lli_lock);
2416
2417         if (!acl)
2418                 RETURN(-EAGAIN);
2419
2420         rc = posix_acl_permission(inode, acl, mask);
2421         posix_acl_release(acl);
2422
2423         RETURN(rc);
2424 #else
2425         return -EAGAIN;
2426 #endif
2427 }
2428
2429 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2430 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2431 {
2432         int rc = 0;
2433         ENTRY;
2434
2435        /* as root inode are NOT getting validated in lookup operation,
2436         * need to do it before permission check. */
2437
2438         if (inode == inode->i_sb->s_root->d_inode) {
2439                 struct lookup_intent it = { .it_op = IT_GETATTR };
2440
2441                 rc = __ll_inode_revalidate_it(inode->i_sb->s_root, &it,
2442                                               MDS_INODELOCK_LOOKUP);
2443                 if (rc)
2444                         RETURN(rc);
2445         }
2446
2447         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), inode mode %x mask %o\n",
2448                inode->i_ino, inode->i_generation, inode, inode->i_mode, mask);
2449
2450         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2451                 return lustre_check_remote_perm(inode, mask);
2452
2453         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2454         rc = generic_permission(inode, mask, lustre_check_acl);
2455
2456         RETURN(rc);
2457 }
2458 #else
2459 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2460 {
2461         int mode = inode->i_mode;
2462         int rc;
2463
2464         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2465                inode->i_ino, inode->i_generation, inode, mask);
2466
2467         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2468                 return lustre_check_remote_perm(inode, mask);
2469
2470         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2471
2472         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2473             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2474                 return -EROFS;
2475         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2476                 return -EACCES;
2477         if (current->fsuid == inode->i_uid) {
2478                 mode >>= 6;
2479         } else if (1) {
2480                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2481                         goto check_groups;
2482                 rc = lustre_check_acl(inode, mask);
2483                 if (rc == -EAGAIN)
2484                         goto check_groups;
2485                 if (rc == -EACCES)
2486                         goto check_capabilities;
2487                 return rc;
2488         } else {
2489 check_groups:
2490                 if (in_group_p(inode->i_gid))
2491                         mode >>= 3;
2492         }
2493         if ((mode & mask & S_IRWXO) == mask)
2494                 return 0;
2495
2496 check_capabilities:
2497         if (!(mask & MAY_EXEC) ||
2498             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2499                 if (cfs_capable(CFS_CAP_DAC_OVERRIDE))
2500                         return 0;
2501
2502         if (cfs_capable(CFS_CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2503             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
2504                 return 0;
2505
2506         return -EACCES;
2507 }
2508 #endif
2509
/*
 * Names of the vectored read/write members of struct file_operations and
 * of the functions assigned to them in the tables below: readv/writev when
 * HAVE_FILE_READV is defined, aio_read/aio_write otherwise.
 */
#ifndef HAVE_FILE_READV
#define READ_METHOD     aio_read
#define READ_FUNCTION   ll_file_aio_read
#define WRITE_METHOD    aio_write
#define WRITE_FUNCTION  ll_file_aio_write
#else
#define READ_METHOD     readv
#define READ_FUNCTION   ll_file_readv
#define WRITE_METHOD    writev
#define WRITE_FUNCTION  ll_file_writev
#endif
2521
2522 /* -o localflock - only provides locally consistent flock locks */
2523 struct file_operations ll_file_operations = {
2524         .read           = ll_file_read,
2525         .READ_METHOD    = READ_FUNCTION,
2526         .write          = ll_file_write,
2527         .WRITE_METHOD   = WRITE_FUNCTION,
2528         .ioctl          = ll_file_ioctl,
2529         .open           = ll_file_open,
2530         .release        = ll_file_release,
2531         .mmap           = ll_file_mmap,
2532         .llseek         = ll_file_seek,
2533         .sendfile       = ll_file_sendfile,
2534         .fsync          = ll_fsync,
2535 };
2536
2537 struct file_operations ll_file_operations_flock = {
2538         .read           = ll_file_read,
2539         .READ_METHOD    = READ_FUNCTION,
2540         .write          = ll_file_write,
2541         .WRITE_METHOD   = WRITE_FUNCTION,
2542         .ioctl          = ll_file_ioctl,
2543         .open           = ll_file_open,
2544         .release        = ll_file_release,
2545         .mmap           = ll_file_mmap,
2546         .llseek         = ll_file_seek,
2547         .sendfile       = ll_file_sendfile,
2548         .fsync          = ll_fsync,
2549 #ifdef HAVE_F_OP_FLOCK
2550         .flock          = ll_file_flock,
2551 #endif
2552         .lock           = ll_file_flock
2553 };
2554
2555 /* These are for -o noflock - to return ENOSYS on flock calls */
2556 struct file_operations ll_file_operations_noflock = {
2557         .read           = ll_file_read,
2558         .READ_METHOD    = READ_FUNCTION,
2559         .write          = ll_file_write,
2560         .WRITE_METHOD   = WRITE_FUNCTION,
2561         .ioctl          = ll_file_ioctl,
2562         .open           = ll_file_open,
2563         .release        = ll_file_release,
2564         .mmap           = ll_file_mmap,
2565         .llseek         = ll_file_seek,
2566         .sendfile       = ll_file_sendfile,
2567         .fsync          = ll_fsync,
2568 #ifdef HAVE_F_OP_FLOCK
2569         .flock          = ll_file_noflock,
2570 #endif
2571         .lock           = ll_file_noflock
2572 };
2573
2574 struct inode_operations ll_file_inode_operations = {
2575 #ifdef HAVE_VFS_INTENT_PATCHES
2576         .setattr_raw    = ll_setattr_raw,
2577 #endif
2578         .setattr        = ll_setattr,
2579         .truncate       = ll_truncate,
2580         .getattr        = ll_getattr,
2581         .permission     = ll_inode_permission,
2582         .setxattr       = ll_setxattr,
2583         .getxattr       = ll_getxattr,
2584         .listxattr      = ll_listxattr,
2585         .removexattr    = ll_removexattr,
2586 };
2587
2588 /* dynamic ioctl number support routins */
2589 static struct llioc_ctl_data {
2590         struct rw_semaphore ioc_sem;
2591         struct list_head    ioc_head;
2592 } llioc = {
2593         __RWSEM_INITIALIZER(llioc.ioc_sem),
2594         CFS_LIST_HEAD_INIT(llioc.ioc_head)
2595 };
2596
2597
2598 struct llioc_data {
2599         struct list_head        iocd_list;
2600         unsigned int            iocd_size;
2601         llioc_callback_t        iocd_cb;
2602         unsigned int            iocd_count;
2603         unsigned int            iocd_cmd[0];
2604 };
2605
2606 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2607 {
2608         unsigned int size;
2609         struct llioc_data *in_data = NULL;
2610         ENTRY;
2611
2612         if (cb == NULL || cmd == NULL ||
2613             count > LLIOC_MAX_CMD || count < 0)
2614                 RETURN(NULL);
2615
2616         size = sizeof(*in_data) + count * sizeof(unsigned int);
2617         OBD_ALLOC(in_data, size);
2618         if (in_data == NULL)
2619                 RETURN(NULL);
2620
2621         memset(in_data, 0, sizeof(*in_data));
2622         in_data->iocd_size = size;
2623         in_data->iocd_cb = cb;
2624         in_data->iocd_count = count;
2625         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2626
2627         down_write(&llioc.ioc_sem);
2628         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2629         up_write(&llioc.ioc_sem);
2630
2631         RETURN(in_data);
2632 }
2633
2634 void ll_iocontrol_unregister(void *magic)
2635 {
2636         struct llioc_data *tmp;
2637
2638         if (magic == NULL)
2639                 return;
2640
2641         down_write(&llioc.ioc_sem);
2642         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
2643                 if (tmp == magic) {
2644                         unsigned int size = tmp->iocd_size;
2645
2646                         list_del(&tmp->iocd_list);
2647                         up_write(&llioc.ioc_sem);
2648
2649                         OBD_FREE(tmp, size);
2650                         return;
2651                 }
2652         }
2653         up_write(&llioc.ioc_sem);
2654
2655         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2656 }
2657
/* Export the dynamic ioctl registration interface. */
EXPORT_SYMBOL(ll_iocontrol_unregister);
EXPORT_SYMBOL(ll_iocontrol_register);
2660
2661 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2662                         unsigned int cmd, unsigned long arg, int *rcp)
2663 {
2664         enum llioc_iter ret = LLIOC_CONT;
2665         struct llioc_data *data;
2666         int rc = -EINVAL, i;
2667
2668         down_read(&llioc.ioc_sem);
2669         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2670                 for (i = 0; i < data->iocd_count; i++) {
2671                         if (cmd != data->iocd_cmd[i])
2672                                 continue;
2673
2674                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2675                         break;
2676                 }
2677
2678                 if (ret == LLIOC_STOP)
2679                         break;
2680         }
2681         up_read(&llioc.ioc_sem);
2682
2683         if (rcp)
2684                 *rcp = rc;
2685         return ret;
2686 }