1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
/*
 * Allocate a per-open-file ll_file_data from the ll_file_data_slab
 * cache.  NOTE(review): the allocation-failure path and the return
 * statement are elided in this excerpt -- presumably returns fd
 * (NULL on allocation failure); confirm against the full source.
 */
34 struct ll_file_data *ll_file_data_get(void)
36 struct ll_file_data *fd;
38 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/* Release a ll_file_data back to the slab cache; counterpart of
 * ll_file_data_get(). */
42 static void ll_file_data_put(struct ll_file_data *fd)
45 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Pack the current attributes of @inode (fid, mode, a/m/ctime, size,
 * blocks, flags), its I/O epoch and the open file handle @fh into
 * @op_data for an MDS RPC.  Also takes an MDS capability reference
 * into op_capa1.  NOTE(review): op_capa1 presumably must be released
 * by the caller -- confirm against ll_mdscapa_get() semantics.
 */
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49 struct lustre_handle *fh)
51 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52 op_data->op_attr.ia_mode = inode->i_mode;
53 op_data->op_attr.ia_atime = inode->i_atime;
54 op_data->op_attr.ia_mtime = inode->i_mtime;
55 op_data->op_attr.ia_ctime = inode->i_ctime;
56 op_data->op_attr.ia_size = i_size_read(inode);
57 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper around iattr, hence the cast */
58 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
60 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
61 op_data->op_capa1 = ll_mdscapa_get(inode);
/*
 * Prepare @op_data for closing open handle @och on @inode.  Mode and
 * timestamps are always marked valid; size/blocks are additionally
 * sent when the MDT lacks Size-on-MDS support or the file is not a
 * regular file.  NOTE(review): intervening lines are elided here, so
 * the exact branch structure around the FMODE_WRITE test is not fully
 * visible.
 */
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65 struct obd_client_handle *och)
69 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70 ATTR_MTIME_SET | ATTR_CTIME_SET;
72 if (!(och->och_flags & FMODE_WRITE))
75 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76 !S_ISREG(inode->i_mode))
77 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
/* NOTE(review): &och (pointer-to-pointer) is passed here while the
 * plain pointer is used just below -- verify whether ll_epoch_close()
 * may clear the caller's handle pointer. */
79 ll_epoch_close(inode, op_data, &och, 0);
82 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/*
 * Send an MDS close for open handle @och.  If the reply indicates the
 * epoch was not closed with Size-on-MDS in effect, perform the
 * Size-on-MDS attribute update; on any path, destroy OST objects
 * listed in the close reply, clear open replay data and poison the
 * handle cookie.  Returns 0 or a negative errno.
 */
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88 struct obd_client_handle *och)
90 struct obd_export *exp = ll_i2mdexp(inode);
91 struct md_op_data *op_data;
92 struct ptlrpc_request *req = NULL;
93 struct obd_device *obd = class_exp2obd(exp);
100 * XXX: in case of LMV, is this correct to access
103 CERROR("Invalid MDC connection handle "LPX64"\n",
104 ll_i2mdexp(inode)->exp_handle.h_cookie);
109 * here we check if this is forced umount. If so this is called on
110 * canceling "open lock" and we do not call md_close() in this case, as
111 * it will not be successful, as import is already deactivated.
116 OBD_ALLOC_PTR(op_data);
/* pre-existing XXX: och and request are leaked on this -ENOMEM path */
118 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120 ll_prepare_close(inode, op_data, och);
121 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122 rc = md_close(md_exp, op_data, och->och_mod, &req);
127 /* This close must have the epoch closed. */
128 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129 LASSERT(epoch_close);
130 /* MDS has instructed us to obtain Size-on-MDS attribute from
131 * OSTs and send setattr to back to MDS. */
132 rc = ll_sizeonmds_update(inode, och->och_mod,
133 &och->och_fh, op_data->op_ioepoch);
135 CERROR("inode %lu mdc Size-on-MDS update failed: "
136 "rc = %d\n", inode->i_ino, rc);
140 CERROR("inode %lu mdc close failed: rc = %d\n",
143 ll_finish_md_op_data(op_data);
146 rc = ll_objects_destroy(req, inode);
148 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* SOM enabled, epoch still open on a writable regular-file handle:
 * defer the DONE_WRITING work instead of freeing och now */
155 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160 ptlrpc_close_replay_seq(req);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166 if (req) /* This is close request */
167 ptlrpc_req_finished(req);
/*
 * Drop the MDS open handle of the given kind (write/exec/read, chosen
 * by @flags) for @inode.  Under lli_och_sem: if other users still hold
 * the handle, bail out; otherwise detach it and close it via
 * ll_close_inode_openhandle() outside the semaphore.
 */
171 int ll_md_real_close(struct inode *inode, int flags)
173 struct ll_inode_info *lli = ll_i2info(inode);
174 struct obd_client_handle **och_p;
175 struct obd_client_handle *och;
/* select the handle slot and use-count matching the open mode */
180 if (flags & FMODE_WRITE) {
181 och_p = &lli->lli_mds_write_och;
182 och_usecount = &lli->lli_open_fd_write_count;
183 } else if (flags & FMODE_EXEC) {
184 och_p = &lli->lli_mds_exec_och;
185 och_usecount = &lli->lli_open_fd_exec_count;
187 LASSERT(flags & FMODE_READ);
188 och_p = &lli->lli_mds_read_och;
189 och_usecount = &lli->lli_open_fd_read_count;
192 down(&lli->lli_och_sem);
193 if (*och_usecount) { /* There are still users of this handle, so
195 up(&lli->lli_och_sem);
200 up(&lli->lli_och_sem);
202 if (och) { /* There might be a race and somebody have freed this och
204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-struct-file close: release any group lock, drop the per-mode
 * open-fd use count, and only talk to the MDS (ll_md_real_close())
 * when no cached OPEN DLM lock matches -- otherwise the real close is
 * deferred until the lock is cancelled.  Frees the ll_file_data and
 * closes the capability.
 */
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215 struct ll_inode_info *lli = ll_i2info(inode);
219 /* clear group lock, if present */
220 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227 /* Let's see if we have good enough OPEN lock on the file and if
228 we can skip talking to MDS */
229 if (file->f_dentry->d_inode) { /* Can this ever be false? */
231 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232 struct lustre_handle lockh;
/* NOTE(review): this local shadows the function's inode parameter;
 * harmless only if both always refer to the same inode -- verify */
233 struct inode *inode = file->f_dentry->d_inode;
234 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
236 down(&lli->lli_och_sem);
237 if (fd->fd_omode & FMODE_WRITE) {
239 LASSERT(lli->lli_open_fd_write_count);
240 lli->lli_open_fd_write_count--;
241 } else if (fd->fd_omode & FMODE_EXEC) {
243 LASSERT(lli->lli_open_fd_exec_count);
244 lli->lli_open_fd_exec_count--;
247 LASSERT(lli->lli_open_fd_read_count);
248 lli->lli_open_fd_read_count--;
250 up(&lli->lli_och_sem);
/* no matching cached OPEN lock: must do the real MDS close now */
252 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253 LDLM_IBITS, &policy, lockmode,
255 rc = ll_md_real_close(file->f_dentry->d_inode,
259 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260 file, file->f_dentry, file->f_dentry->d_name.name);
263 LUSTRE_FPRIVATE(file) = NULL;
264 ll_file_data_put(fd);
265 ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, fput() the caller does not, so we need
273 * to make every effort to clean up all of our state here. Also, applications
274 * rarely check close errors and even if an error is returned they will not
275 * re-try the close call.
/*
 * VFS ->release() handler: bump stats, flush any pending async write
 * error into lli_async_rc, and delegate the real work to
 * ll_md_close().  The root dentry is special-cased and skipped.
 */
277 int ll_file_release(struct inode *inode, struct file *file)
279 struct ll_file_data *fd;
280 struct ll_sb_info *sbi = ll_i2sbi(inode);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lov_stripe_md *lsm = lli->lli_smd;
286 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287 inode->i_generation, inode);
289 /* don't do anything for / */
290 if (inode->i_sb->s_root == file->f_dentry)
293 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
294 fd = LUSTRE_FPRIVATE(file);
/* NOTE(review): the root check appears twice in this excerpt; the
 * second one also frees fd -- presumably one guards a different
 * configuration, confirm against the full source */
297 /* don't do anything for / */
298 if (inode->i_sb->s_root == file->f_dentry) {
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
305 lov_test_and_clear_async_rc(lsm);
306 lli->lli_async_rc = 0;
308 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/*
 * Perform an MDS intent-open for @file.  When no striping data is
 * being set (@lmm/@lmmsize both zero) an OPEN DLM lock is also
 * requested.  On success the inode is refreshed from the reply and
 * the intent's lock data is bound to it.  Returns 0 or negative errno.
 */
312 static int ll_intent_file_open(struct file *file, void *lmm,
313 int lmmsize, struct lookup_intent *itp)
315 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
316 struct dentry *parent = file->f_dentry->d_parent;
317 const char *name = file->f_dentry->d_name.name;
318 const int len = file->f_dentry->d_name.len;
319 struct md_op_data *op_data;
320 struct ptlrpc_request *req;
326 /* Usually we come here only for NFSD, and we want open lock.
327 But we can also get here with pre 2.6.15 patchless kernels, and in
328 that case that lock is also ok */
329 /* We can also get here if there was cached open handle in revalidate_it
330 * but it disappeared while we were getting from there to ll_file_open.
331 * But this means this file was closed and immediatelly opened which
332 * makes a good candidate for using OPEN lock */
333 /* If lmmsize & lmm are not 0, we are just setting stripe info
334 * parameters. No need for the open lock */
335 if (!lmm && !lmmsize)
336 itp->it_flags |= MDS_OPEN_LOCK;
338 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
339 file->f_dentry->d_inode, name, len,
340 O_RDWR, LUSTRE_OPC_ANY, NULL);
342 RETURN(PTR_ERR(op_data));
344 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
345 0 /*unused */, &req, ll_md_blocking_ast, 0);
346 ll_finish_md_op_data(op_data);
348 /* reason for keep own exit path - don`t flood log
349 * with messages with -ESTALE errors.
351 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
352 it_open_error(DISP_OPEN_OPEN, itp))
354 ll_release_openhandle(file->f_dentry, itp);
358 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
359 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
360 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* bind the returned DLM lock (if any) to this inode */
364 if (itp->d.lustre.it_lock_mode)
365 md_set_lock_data(sbi->ll_md_exp,
366 &itp->d.lustre.it_lock_handle,
367 file->f_dentry->d_inode);
369 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
372 ptlrpc_req_finished(itp->d.lustre.it_data);
375 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
376 ll_intent_drop_lock(itp);
/*
 * Populate an obd_client_handle from the mdt_body in the intent-open
 * reply: file handle, fid, open flags and I/O epoch.  Registers the
 * open for replay with the MDC and returns its status.
 */
381 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
382 struct lookup_intent *it, struct obd_client_handle *och)
384 struct ptlrpc_request *req = it->d.lustre.it_data;
385 struct mdt_body *body;
389 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
390 /* reply already checked out */
391 LASSERT(body != NULL);
392 /* and swabbed in md_enqueue */
393 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
395 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
396 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
397 och->och_fid = lli->lli_fid;
398 och->och_flags = it->it_flags;
399 lli->lli_ioepoch = body->ioepoch;
401 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish the client-side part of an open: optionally fill @och from
 * the intent reply (when a new MDS handle was obtained), then attach
 * @fd to the struct file, initialise readahead state and record the
 * open mode.
 */
404 int ll_local_open(struct file *file, struct lookup_intent *it,
405 struct ll_file_data *fd, struct obd_client_handle *och)
407 struct inode *inode = file->f_dentry->d_inode;
408 struct ll_inode_info *lli = ll_i2info(inode);
411 LASSERT(!LUSTRE_FPRIVATE(file));
416 struct ptlrpc_request *req = it->d.lustre.it_data;
417 struct mdt_body *body;
420 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
424 body = lustre_msg_buf(req->rq_repmsg,
425 DLM_REPLY_REC_OFF, sizeof(*body));
427 if ((it->it_flags & FMODE_WRITE) &&
428 (body->valid & OBD_MD_FLSIZE))
430 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
431 lli->lli_ioepoch, PFID(&lli->lli_fid));
435 LUSTRE_FPRIVATE(file) = fd;
436 ll_readahead_init(inode, &fd->fd_ras);
/* remember the open mode for the close path (ll_md_close) */
437 fd->fd_omode = it->it_flags;
441 /* Open a file, and (for the very first open) create objects on the OSTs at
442 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
443 * creation or open until ll_lov_setstripe() ioctl is called. We grab
444 * lli_open_sem to ensure no other process will create objects, send the
445 * stripe MD to the MDS, or try to destroy the objects if that fails.
447 * If we already have the stripe MD locally then we don't request it in
448 * md_open(), by passing a lmm_size = 0.
450 * It is up to the application to ensure no other processes open this file
451 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
452 * used. We might be able to avoid races of that sort by getting lli_open_sem
453 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
454 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
456 int ll_file_open(struct inode *inode, struct file *file)
458 struct ll_inode_info *lli = ll_i2info(inode);
459 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
460 .it_flags = file->f_flags };
461 struct lov_stripe_md *lsm;
462 struct ptlrpc_request *req = NULL;
463 struct obd_client_handle **och_p;
465 struct ll_file_data *fd;
469 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
470 inode->i_generation, inode, file->f_flags);
472 /* don't do anything for / */
473 if (inode->i_sb->s_root == file->f_dentry)
476 #ifdef HAVE_VFS_INTENT_PATCHES
479 it = file->private_data; /* XXX: compat macro */
480 file->private_data = NULL; /* prevent ll_local_open assertion */
483 fd = ll_file_data_get();
487 /* don't do anything for / */
488 if (inode->i_sb->s_root == file->f_dentry) {
489 LUSTRE_FPRIVATE(file) = fd;
/* no usable intent from the VFS: build our own (oit) from f_flags */
493 if (!it || !it->d.lustre.it_disposition) {
494 /* Convert f_flags into access mode. We cannot use file->f_mode,
495 * because everything but O_ACCMODE mask was stripped from
497 if ((oit.it_flags + 1) & O_ACCMODE)
499 if (file->f_flags & O_TRUNC)
500 oit.it_flags |= FMODE_WRITE;
502 /* kernel only call f_op->open in dentry_open. filp_open calls
503 * dentry_open after call to open_namei that checks permissions.
504 * Only nfsd_open call dentry_open directly without checking
505 * permissions and because of that this code below is safe. */
506 if (oit.it_flags & FMODE_WRITE)
507 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
509 /* We do not want O_EXCL here, presumably we opened the file
510 * already? XXX - NFS implications? */
511 oit.it_flags &= ~O_EXCL;
517 /* Let's see if we have file open on MDS already. */
518 if (it->it_flags & FMODE_WRITE) {
519 och_p = &lli->lli_mds_write_och;
520 och_usecount = &lli->lli_open_fd_write_count;
521 } else if (it->it_flags & FMODE_EXEC) {
522 och_p = &lli->lli_mds_exec_och;
523 och_usecount = &lli->lli_open_fd_exec_count;
525 och_p = &lli->lli_mds_read_och;
526 och_usecount = &lli->lli_open_fd_read_count;
529 down(&lli->lli_och_sem);
530 if (*och_p) { /* Open handle is present */
531 if (it_disposition(it, DISP_OPEN_OPEN)) {
532 /* Well, there's extra open request that we do not need,
533 let's close it somehow. This will decref request. */
534 rc = it_open_error(DISP_OPEN_OPEN, it);
536 ll_file_data_put(fd);
537 GOTO(out_och_free, rc);
539 ll_release_openhandle(file->f_dentry, it);
540 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* reuse the cached handle; NULL och means "don't fill a new one" */
545 rc = ll_local_open(file, it, fd, NULL);
547 up(&lli->lli_och_sem);
548 ll_file_data_put(fd);
552 LASSERT(*och_usecount == 0);
553 if (!it->d.lustre.it_disposition) {
554 /* We cannot just request lock handle now, new ELC code
555 means that one of other OPEN locks for this file
556 could be cancelled, and since blocking ast handler
557 would attempt to grab och_sem as well, that would
558 result in a deadlock */
559 up(&lli->lli_och_sem);
560 it->it_flags |= O_CHECK_STALE;
561 rc = ll_intent_file_open(file, NULL, 0, it);
562 it->it_flags &= ~O_CHECK_STALE;
564 ll_file_data_put(fd);
565 GOTO(out_openerr, rc);
568 /* Got some error? Release the request */
569 if (it->d.lustre.it_status < 0) {
570 req = it->d.lustre.it_data;
571 ptlrpc_req_finished(req);
573 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
574 &it->d.lustre.it_lock_handle,
575 file->f_dentry->d_inode);
578 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
580 ll_file_data_put(fd);
581 GOTO(out_och_free, rc = -ENOMEM);
584 req = it->d.lustre.it_data;
586 /* md_intent_lock() didn't get a request ref if there was an
587 * open error, so don't do cleanup on the request here
589 /* XXX (green): Should not we bail out on any error here, not
590 * just open error? */
591 rc = it_open_error(DISP_OPEN_OPEN, it);
593 ll_file_data_put(fd);
594 GOTO(out_och_free, rc);
597 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
598 rc = ll_local_open(file, it, fd, *och_p);
600 up(&lli->lli_och_sem);
601 ll_file_data_put(fd);
602 GOTO(out_och_free, rc);
605 up(&lli->lli_och_sem);
607 /* Must do this outside lli_och_sem lock to prevent deadlock where
608 different kind of OPEN lock for this same inode gets cancelled
609 by ldlm_cancel_lru */
610 if (!S_ISREG(inode->i_mode))
/* O_LOV_DELAY_CREATE / read-only open: defer OST object creation */
617 if (file->f_flags & O_LOV_DELAY_CREATE ||
618 !(file->f_mode & FMODE_WRITE)) {
619 CDEBUG(D_INODE, "object creation was delayed\n");
623 file->f_flags &= ~O_LOV_DELAY_CREATE;
626 ptlrpc_req_finished(req);
628 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
632 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
633 *och_p = NULL; /* OBD_FREE writes some magic there */
636 up(&lli->lli_och_sem);
637 out_openerr: ;/* Looks weierd, eh? Just wait for statahead code to insert
638 a statement here <-- remove this comment after statahead
645 /* Fills the obdo with the attributes for the inode defined by lsm */
646 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
648 struct ptlrpc_request_set *set;
649 struct ll_inode_info *lli = ll_i2info(inode);
650 struct lov_stripe_md *lsm = lli->lli_smd;
652 struct obd_info oinfo = { { { 0 } } };
656 LASSERT(lsm != NULL);
/* identify the object and ask the OSTs for size/time attributes */
660 oinfo.oi_oa->o_id = lsm->lsm_object_id;
661 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
662 oinfo.oi_oa->o_mode = S_IFREG;
663 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
664 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
665 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
666 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
668 oinfo.oi_capa = ll_mdscapa_get(inode);
670 set = ptlrpc_prep_set();
672 CERROR("can't allocate ptlrpc set\n");
/* issue the async getattr and wait for the whole set to complete */
675 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
677 rc = ptlrpc_set_wait(set);
678 ptlrpc_set_destroy(set);
680 capa_put(oinfo.oi_capa);
/* keep only the attribute bits we refresh into the inode below */
684 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
685 OBD_MD_FLATIME | OBD_MD_FLMTIME |
686 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
688 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
689 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
690 lli->lli_smd->lsm_object_id, i_size_read(inode),
691 inode->i_blocks, inode->i_blksize);
/*
 * Clear setuid (and setgid when group-exec) bits from @inode's mode,
 * unless the caller has CAP_FSETID.  Mirrors the kernel's remove_suid
 * behaviour for writes by unprivileged users.
 */
695 static inline void ll_remove_suid(struct inode *inode)
/* build the candidate mask: S_ISUID always; S_ISGID only if S_IXGRP
 * (the multiply maps the S_IXGRP bit up to the S_ISGID position) */
699 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
700 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
702 /* was any of the uid bits set? */
703 mode &= inode->i_mode;
704 if (mode && !capable(CAP_FSETID)) {
705 inode->i_mode &= ~mode;
706 // XXX careful here - we cannot change the size
/*
 * Map an extent DLM @lock back to the stripe index it covers within
 * @inode's LOV stripe set, via obd_get_info("lock_to_stripe").
 * Sanity-checks that the lock's resource matches the stripe's object
 * id/group.  Returns the stripe index or a negative error.
 */
710 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
712 struct ll_inode_info *lli = ll_i2info(inode);
713 struct lov_stripe_md *lsm = lli->lli_smd;
714 struct obd_export *exp = ll_i2dtexp(inode);
717 struct ldlm_lock *lock;
718 struct lov_stripe_md *lsm;
719 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
720 __u32 stripe, vallen = sizeof(stripe);
/* single-stripe files trivially map to stripe 0 */
724 if (lsm->lsm_stripe_count == 1)
725 GOTO(check, stripe = 0);
727 /* get our offset in the lov */
728 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
730 CERROR("obd_get_info: rc = %d\n", rc);
733 LASSERT(stripe < lsm->lsm_stripe_count);
/* resource name [0]=object id, [2]=object group must match the stripe */
736 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
737 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
738 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
739 lsm->lsm_oinfo[stripe]->loi_id,
740 lsm->lsm_oinfo[stripe]->loi_gr);
741 RETURN(-ELDLM_NO_LOCK_DATA);
747 /* Flush the page cache for an extent as its canceled. When we're on an LOV,
748 * we get a lock cancellation for each stripe, so we have to map the obd's
749 * region back onto the stripes in the file that it held.
751 * No one can dirty the extent until we've finished our work and they can
752 * enqueue another lock. The DLM protects us from ll_file_read/write here,
753 * but other kernel actors could have pages locked.
755 * Called with the DLM lock held. */
756 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
757 struct ldlm_lock *lock, __u32 stripe)
759 ldlm_policy_data_t tmpex;
760 unsigned long start, end, count, skip, i, j;
762 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
763 struct lustre_handle lockh;
764 struct address_space *mapping = inode->i_mapping;
767 tmpex = lock->l_policy_data;
768 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
769 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
772 /* our locks are page granular thanks to osc_enqueue, we invalidate the
774 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
775 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
776 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
778 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
779 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* translate the per-stripe extent into file-level page indices: with
 * N stripes, each stripe contributes `count` pages per stripe width
 * and skips the other stripes' pages */
783 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
784 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
785 if (lsm->lsm_stripe_count > 1) {
786 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
787 skip = (lsm->lsm_stripe_count - 1) * count;
788 start += start/count * skip + stripe * count;
790 end += end/count * skip + stripe * count;
792 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
795 i = i_size_read(inode) ? (__u64)(i_size_read(inode) - 1) >>
800 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
801 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
802 count, skip, end, discard ? " (DISCARDING)" : "");
804 /* walk through the vmas on the inode and tear down mmaped pages that
805 * intersect with the lock. this stops immediately if there are no
806 * mmap()ed regions of the file. This is not efficient at all and
807 * should be short lived. We'll associate mmap()ed pages with the lock
808 * and will be able to find them directly */
809 for (i = start; i <= end; i += (j + skip)) {
810 j = min(count - (i % count), end - i + 1);
813 if (ll_teardown_mmaps(mapping,
814 (__u64)i << CFS_PAGE_SHIFT,
815 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
819 /* this is the simplistic implementation of page eviction at
820 * cancelation. It is careful to get races with other page
821 * lockers handled correctly. fixes from bug 20 will make it
822 * more efficient by associating locks with pages and with
823 * batching writeback under the lock explicitly. */
824 for (i = start, j = start % count; i <= end;
825 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
827 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
833 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
834 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
835 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
838 if (!mapping_has_pages(mapping)) {
839 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
845 page = find_get_page(mapping, i);
848 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
849 i, tmpex.l_extent.start);
852 /* page->mapping to check with racing against teardown */
853 if (!discard && clear_page_dirty_for_io(page)) {
854 rc = ll_call_writepage(inode, page);
855 /* either waiting for io to complete or reacquiring
856 * the lock that the failed writepage released */
858 wait_on_page_writeback(page);
860 CERROR("writepage inode %lu(%p) of page %p "
861 "failed: %d\n", inode->i_ino, inode,
864 set_bit(AS_ENOSPC, &mapping->flags);
866 set_bit(AS_EIO, &mapping->flags);
870 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
871 /* check to see if another DLM lock covers this page b=2765 */
872 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
873 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
875 &lock->l_resource->lr_name, LDLM_EXTENT,
876 &tmpex, LCK_PR | LCK_PW, &lockh);
/* no other lock protects this page: drop it from the page cache */
878 if (rc2 <= 0 && page->mapping != NULL) {
879 struct ll_async_page *llap = llap_cast_private(page);
880 /* checking again to account for writeback's
882 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
884 ll_ra_accounting(llap, mapping);
885 ll_truncate_complete_page(page);
888 page_cache_release(page);
890 LASSERTF(tmpex.l_extent.start <=
891 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
892 lock->l_policy_data.l_extent.end + 1),
893 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
894 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/*
 * Blocking/cancel AST for extent DLM locks.  On LDLM_CB_BLOCKING it
 * cancels the lock; on LDLM_CB_CANCELING it evicts the covered pages
 * (ll_pgcache_remove_extent) and shrinks the stripe's known-minimum
 * size (kms) accordingly.
 */
899 static int ll_extent_lock_callback(struct ldlm_lock *lock,
900 struct ldlm_lock_desc *new, void *data,
903 struct lustre_handle lockh = { 0 };
/* small non-NULL values indicate a corrupted cbdata pointer */
907 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
908 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
913 case LDLM_CB_BLOCKING:
914 ldlm_lock2handle(lock, &lockh);
915 rc = ldlm_cli_cancel(&lockh);
917 CERROR("ldlm_cli_cancel failed: %d\n", rc);
919 case LDLM_CB_CANCELING: {
921 struct ll_inode_info *lli;
922 struct lov_stripe_md *lsm;
926 /* This lock wasn't granted, don't try to evict pages */
927 if (lock->l_req_mode != lock->l_granted_mode)
930 inode = ll_inode_from_lock(lock);
933 lli = ll_i2info(inode);
936 if (lli->lli_smd == NULL)
940 stripe = ll_lock_to_stripe_offset(inode, lock);
944 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* recompute kms with this lock gone, under both the stripe lock
 * and the DLM resource lock */
946 lov_stripe_lock(lsm);
947 lock_res_and_lock(lock);
948 kms = ldlm_extent_shift_kms(lock,
949 lsm->lsm_oinfo[stripe]->loi_kms);
951 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
952 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
953 lsm->lsm_oinfo[stripe]->loi_kms, kms);
954 lsm->lsm_oinfo[stripe]->loi_kms = kms;
955 unlock_res_and_lock(lock);
956 lov_stripe_unlock(lsm);
/*
 * Completion AST for client-side async extent lock enqueues: when the
 * lock is granted (or glimpsed) with an LVB, fold the server-reported
 * size into the stripe's rss/kms, then wake waiters and drop the PR
 * reference taken at enqueue time.
 */
969 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
971 /* XXX ALLOCATE - 160 bytes */
972 struct inode *inode = ll_inode_from_lock(lock);
973 struct ll_inode_info *lli = ll_i2info(inode);
974 struct lustre_handle lockh = { 0 };
979 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
980 LDLM_FL_BLOCK_CONV)) {
/* NOTE(review): LBUG() does not return, so the debug/reprocess
 * statements below it appear unreachable -- likely intentional
 * placeholder for when blocked async locks become supported */
981 LBUG(); /* not expecting any blocked async locks yet */
982 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
984 ldlm_lock_dump(D_OTHER, lock, 0);
985 ldlm_reprocess_all(lock->l_resource);
989 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
991 stripe = ll_lock_to_stripe_offset(inode, lock);
995 if (lock->l_lvb_len) {
996 struct lov_stripe_md *lsm = lli->lli_smd;
998 lvb = lock->l_lvb_data;
999 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
1001 lock_res_and_lock(lock);
1002 ll_inode_size_lock(inode, 1);
1003 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
1004 kms = ldlm_extent_shift_kms(NULL, kms);
1005 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
1006 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
1007 lsm->lsm_oinfo[stripe].loi_kms, kms);
1008 lsm->lsm_oinfo[stripe].loi_kms = kms;
1009 ll_inode_size_unlock(inode, 1);
1010 unlock_res_and_lock(lock);
1015 wake_up(&lock->l_waitq);
1017 ldlm_lock2handle(lock, &lockh);
1018 ldlm_lock_decref(&lockh, LCK_PR);
/*
 * Glimpse AST: another client wants this file's size.  Reply to the
 * glimpse request with an ost_lvb built from the local stripe kms and
 * inode timestamps.  -ELDLM_NO_LOCK_DATA races are normal and get a
 * quiet empty reply instead of ptlrpc_error().
 */
1023 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1025 struct ptlrpc_request *req = reqp;
1026 struct inode *inode = ll_inode_from_lock(lock);
1027 struct ll_inode_info *lli;
1028 struct lov_stripe_md *lsm;
1029 struct ost_lvb *lvb;
1031 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1035 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1036 lli = ll_i2info(inode);
1038 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1041 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1043 /* First, find out which stripe index this lock corresponds to. */
1044 stripe = ll_lock_to_stripe_offset(inode, lock);
1046 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1048 rc = lustre_pack_reply(req, 2, size, NULL);
1052 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
1053 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1054 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1055 lvb->lvb_atime = LTIME_S(inode->i_atime);
1056 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1058 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1059 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1060 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1061 lvb->lvb_atime, lvb->lvb_ctime);
1066 /* These errors are normal races, so we don't want to fill the console
1067 * with messages by calling ptlrpc_error() */
1068 if (rc == -ELDLM_NO_LOCK_DATA)
1069 lustre_pack_reply(req, 1, NULL, NULL);
1071 req->rq_status = rc;
/*
 * Merge the per-stripe LVB data into a single view and refresh the
 * inode's size, blocks and timestamps from it, under the inode size
 * lock.
 */
1075 static int ll_merge_lvb(struct inode *inode)
1077 struct ll_inode_info *lli = ll_i2info(inode);
1078 struct ll_sb_info *sbi = ll_i2sbi(inode);
1084 ll_inode_size_lock(inode, 1);
1085 inode_init_lvb(inode, &lvb);
1086 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1087 i_size_write(inode, lvb.lvb_size);
1088 inode->i_blocks = lvb.lvb_blocks;
1090 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1091 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1092 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1093 ll_inode_size_unlock(inode, 1);
/*
 * Try to determine the file size using only locally cached [0, EOF]
 * PR extent locks (obd_match); if one matches, merge the LVBs into
 * the inode and release the match reference.  Avoids a glimpse RPC
 * when the whole file is already covered locally.
 */
1098 int ll_local_size(struct inode *inode)
1100 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1101 struct ll_inode_info *lli = ll_i2info(inode);
1102 struct ll_sb_info *sbi = ll_i2sbi(inode);
1103 struct lustre_handle lockh = { 0 };
1108 if (lli->lli_smd->lsm_stripe_count == 0)
1111 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1112 &policy, LCK_PR, &flags, inode, &lockh);
1118 rc = ll_merge_lvb(inode);
1119 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/*
 * Glimpse the given stripe object (intent enqueue with
 * LDLM_FL_HAS_INTENT) and fill a stat-like structure from the merged
 * LVB.  Positive enqueue results are mapped to -EIO.
 */
1123 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1126 struct lustre_handle lockh = { 0 };
1127 struct ldlm_enqueue_info einfo = { 0 };
1128 struct obd_info oinfo = { { { 0 } } };
1134 einfo.ei_type = LDLM_EXTENT;
1135 einfo.ei_mode = LCK_PR;
1136 einfo.ei_cb_bl = ll_extent_lock_callback;
1137 einfo.ei_cb_cp = ldlm_completion_ast;
1138 einfo.ei_cb_gl = ll_glimpse_callback;
1139 einfo.ei_cbdata = NULL;
1141 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1142 oinfo.oi_lockh = &lockh;
1144 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1146 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1150 CERROR("obd_enqueue returned rc %d, "
1151 "returning -EIO\n", rc);
1152 RETURN(rc > 0 ? -EIO : rc);
/* publish the merged size/times under the stripe lock */
1155 lov_stripe_lock(lsm);
1156 memset(&lvb, 0, sizeof(lvb));
1157 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1158 st->st_size = lvb.lvb_size;
1159 st->st_blocks = lvb.lvb_blocks;
1160 st->st_mtime = lvb.lvb_mtime;
1161 st->st_atime = lvb.lvb_atime;
1162 st->st_ctime = lvb.lvb_ctime;
1163 lov_stripe_unlock(lsm);
1168 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1169 * file (because it prefers KMS over RSS when larger) */
/*
 * Refresh @inode's size via a glimpse: an intent extent enqueue over
 * [0, EOF] that triggers ll_glimpse_callback() on every lock holder
 * instead of revoking their locks, then merges the returned LVBs into
 * the inode.  Skipped when the MDS already holds the size
 * (LLIF_MDS_SIZE_LOCK) or the file has no objects.
 */
1170 int ll_glimpse_size(struct inode *inode, int ast_flags)
1172 struct ll_inode_info *lli = ll_i2info(inode);
1173 struct ll_sb_info *sbi = ll_i2sbi(inode);
1174 struct lustre_handle lockh = { 0 };
1175 struct ldlm_enqueue_info einfo = { 0 };
1176 struct obd_info oinfo = { { { 0 } } };
1180 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1183 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1185 if (!lli->lli_smd) {
1186 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1190 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1191 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1192 * won't revoke any conflicting DLM locks held. Instead,
1193 * ll_glimpse_callback() will be called on each client
1194 * holding a DLM lock against this file, and resulting size
1195 * will be returned for each stripe. DLM lock on [0, EOF] is
1196 * acquired only if there were no conflicting locks. */
1197 einfo.ei_type = LDLM_EXTENT;
1198 einfo.ei_mode = LCK_PR;
1199 einfo.ei_cb_bl = ll_extent_lock_callback;
1200 einfo.ei_cb_cp = ldlm_completion_ast;
1201 einfo.ei_cb_gl = ll_glimpse_callback;
1202 einfo.ei_cbdata = inode;
1204 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1205 oinfo.oi_lockh = &lockh;
1206 oinfo.oi_md = lli->lli_smd;
1207 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1209 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1213 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1214 RETURN(rc > 0 ? -EIO : rc);
1217 rc = ll_merge_lvb(inode);
1219 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1220 i_size_read(inode), inode->i_blocks);
/*
 * Acquire a DLM extent lock covering *policy on the file's OST objects
 * and refresh inode attributes (size, m/a/ctime) from the merged LVB.
 * *policy is updated to the extent actually granted by the server.
 * Locking is skipped when the fd has LL_FILE_IGNORE_LOCK or the mount
 * uses LL_SBI_NOLCK (the early return is elided in this extract).
 */
1225 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1226 struct lov_stripe_md *lsm, int mode,
1227 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1230 struct ll_sb_info *sbi = ll_i2sbi(inode);
1232 struct ldlm_enqueue_info einfo = { 0 };
1233 struct obd_info oinfo = { { { 0 } } };
1237 LASSERT(!lustre_handle_is_used(lockh));
1238 LASSERT(lsm != NULL);
1240 /* don't drop the mmapped file to LRU */
1241 if (mapping_mapped(inode->i_mapping))
1242 ast_flags |= LDLM_FL_NO_LRU;
1244 /* XXX phil: can we do this? won't it screw the file size up? */
1245 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1246 (sbi->ll_flags & LL_SBI_NOLCK))
1249 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1250 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1252 einfo.ei_type = LDLM_EXTENT;
1253 einfo.ei_mode = mode;
1254 einfo.ei_cb_bl = ll_extent_lock_callback;
1255 einfo.ei_cb_cp = ldlm_completion_ast;
1256 einfo.ei_cb_gl = ll_glimpse_callback;
1257 einfo.ei_cbdata = inode;
1259 oinfo.oi_policy = *policy;
1260 oinfo.oi_lockh = lockh;
1262 oinfo.oi_flags = ast_flags;
1264 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
/* Report back the extent the server actually granted. */
1265 *policy = oinfo.oi_policy;
1269 ll_inode_size_lock(inode, 1);
1270 inode_init_lvb(inode, &lvb);
1271 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
/* Only a full-file [0, EOF] lock makes the merged size authoritative
 * enough to write back into i_size. */
1273 if (policy->l_extent.start == 0 &&
1274 policy->l_extent.end == OBD_OBJECT_EOF) {
1275 /* vmtruncate()->ll_truncate() first sets the i_size and then
1276 * the kms under both a DLM lock and the
1277 * ll_inode_size_lock(). If we don't get the
1278 * ll_inode_size_lock() here we can match the DLM lock and
1279 * reset i_size from the kms before the truncating path has
1280 * updated the kms. generic_file_write can then trust the
1281 * stale i_size when doing appending writes and effectively
1282 * cancel the result of the truncate. Getting the
1283 * ll_inode_size_lock() after the enqueue maintains the DLM
1284 * -> ll_inode_size_lock() acquiring order. */
1285 i_size_write(inode, lvb.lvb_size);
1286 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1287 inode->i_ino, i_size_read(inode));
/* Timestamps are safe to refresh regardless of the locked extent. */
1291 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1292 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1293 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1295 ll_inode_size_unlock(inode, 1);
/*
 * Release a DLM extent lock previously taken by ll_extent_lock().
 * Mirrors the IGNORE_LOCK / NOLCK short-circuit of the lock path
 * (the early-return body is elided in this extract), then cancels
 * the lock via obd_cancel().
 */
1300 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1301 struct lov_stripe_md *lsm, int mode,
1302 struct lustre_handle *lockh)
1304 struct ll_sb_info *sbi = ll_i2sbi(inode);
1308 /* XXX phil: can we do this? won't it screw the file size up? */
1309 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1310 (sbi->ll_flags & LL_SBI_NOLCK))
1313 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/*
 * read(2) entry point.  Takes a PR extent lock over the region (or over
 * one bounded "chunk" when ll_max_rw_chunk limits lock size), validates
 * the region against the known minimum size (kms), then hands the copy
 * to generic_file_read().  Files without OST objects are served from
 * zeroes up to i_size.  Returns bytes read or a negative errno.
 */
1318 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1321 struct inode *inode = file->f_dentry->d_inode;
1322 struct ll_inode_info *lli = ll_i2info(inode);
1323 struct lov_stripe_md *lsm = lli->lli_smd;
1324 struct ll_sb_info *sbi = ll_i2sbi(inode);
1325 struct ll_lock_tree tree;
1326 struct ll_lock_tree_node *node;
1328 struct ll_ra_read bead;
1331 ssize_t retval, chunk, sum = 0;
1335 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1336 inode->i_ino, inode->i_generation, inode, count, *ppos);
1337 /* "If nbyte is 0, read() will return 0 and have no other results."
1338 * -- Single Unix Spec */
1342 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1345 /* Read on file with no objects should return zero-filled
1346 * buffers up to file size (we can get non-zero sizes with
1347 * mknod + truncate, then opening file for read. This is a
1348 * common pattern in NFS case, it seems). Bug 6243 */
1350 /* Since there are no objects on OSTs, we have nothing to get
1351 * lock on and so we are forced to access inode->i_size
1354 /* Read beyond end of file */
1355 if (*ppos >= i_size_read(inode))
/* Clamp the request to EOF for the object-less case. */
1358 if (count > i_size_read(inode) - *ppos)
1359 count = i_size_read(inode) - *ppos;
1360 /* Make sure to correctly adjust the file pos pointer for
1362 notzeroed = clear_user(buf, count);
/* Bound the locked region to the current stripe and to
 * ll_max_rw_chunk so wide-striped files don't take huge locks. */
1371 if (sbi->ll_max_rw_chunk != 0) {
1372 /* first, let's know the end of the current stripe */
1374 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1377 /* correct, the end is beyond the request */
1378 if (end > *ppos + count - 1)
1379 end = *ppos + count - 1;
1381 /* and chunk shouldn't be too large even if striping is wide */
1382 if (end - *ppos > sbi->ll_max_rw_chunk)
1383 end = *ppos + sbi->ll_max_rw_chunk - 1;
1385 end = *ppos + count - 1;
1388 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1390 GOTO(out, retval = PTR_ERR(node));
1393 tree.lt_fd = LUSTRE_FPRIVATE(file);
/* O_NONBLOCK readers must not sleep waiting for a conflicting lock. */
1394 rc = ll_tree_lock(&tree, node, buf, count,
1395 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1397 GOTO(out, retval = rc);
1399 ll_inode_size_lock(inode, 1);
1401 * Consistency guarantees: following possibilities exist for the
1402 * relation between region being read and real file size at this
1405 * (A): the region is completely inside of the file;
1407 * (B-x): x bytes of region are inside of the file, the rest is
1410 * (C): the region is completely outside of the file.
1412 * This classification is stable under DLM lock acquired by
1413 * ll_tree_lock() above, because to change class, other client has to
1414 * take DLM lock conflicting with our lock. Also, any updates to
1415 * ->i_size by other threads on this client are serialized by
1416 * ll_inode_size_lock(). This guarantees that short reads are handled
1417 * correctly in the face of concurrent writes and truncates.
1419 inode_init_lvb(inode, &lvb);
1420 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1422 if (*ppos + count - 1 > kms) {
1423 /* A glimpse is necessary to determine whether we return a
1424 * short read (B) or some zeroes at the end of the buffer (C) */
1425 ll_inode_size_unlock(inode, 1);
1426 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1428 ll_tree_unlock(&tree);
1432 /* region is within kms and, hence, within real file size (A).
1433 * We need to increase i_size to cover the read region so that
1434 * generic_file_read() will do its job, but that doesn't mean
1435 * the kms size is _correct_, it is only the _minimum_ size.
1436 * If someone does a stat they will get the correct size which
1437 * will always be >= the kms value here. b=11081 */
1438 if (i_size_read(inode) < kms)
1439 i_size_write(inode, kms);
1440 ll_inode_size_unlock(inode, 1);
1443 chunk = end - *ppos + 1;
1444 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1445 inode->i_ino, chunk, *ppos, i_size_read(inode));
1447 /* turn off the kernel's read-ahead */
1448 file->f_ra.ra_pages = 0;
1450 /* initialize read-ahead window once per syscall */
1453 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1454 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1455 ll_ra_read_in(file, &bead);
1459 file_accessed(file);
1460 retval = generic_file_read(file, buf, chunk, ppos);
1461 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1463 ll_tree_unlock(&tree);
/* Full chunk consumed and more requested: loop for the next chunk
 * (loop construct elided in this extract). */
1469 if (retval == chunk && count > 0)
1475 ll_ra_read_ex(file, &bead);
/* Report accumulated bytes if any chunk succeeded, else the error. */
1476 retval = (sum > 0) ? sum : retval;
1481 * Write to a file (through the page cache).
/*
 * write(2) entry point.  Serializes writers on lli_write_sem, takes a
 * PW extent lock ([pos, end] or [0, EOF] for O_APPEND, chunk-bounded by
 * ll_max_rw_chunk), enforces the per-file maxbytes limit (SIGXFSZ past
 * it, per POSIX), then delegates to generic_file_write().  Returns
 * bytes written or a negative errno.
 */
1483 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1486 struct inode *inode = file->f_dentry->d_inode;
1487 struct ll_sb_info *sbi = ll_i2sbi(inode);
1488 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1489 struct ll_lock_tree tree;
1490 struct ll_lock_tree_node *node;
1491 loff_t maxbytes = ll_file_maxbytes(inode);
1492 loff_t lock_start, lock_end, end;
1493 ssize_t retval, chunk, sum = 0;
1497 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1498 inode->i_ino, inode->i_generation, inode, count, *ppos);
1500 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1502 /* POSIX, but surprised the VFS doesn't check this already */
1506 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1507 * called on the file, don't fail the below assertion (bug 2388). */
1508 if (file->f_flags & O_LOV_DELAY_CREATE &&
1509 ll_i2info(inode)->lli_smd == NULL)
1512 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1514 down(&ll_i2info(inode)->lli_write_sem);
1517 chunk = 0; /* just to fix gcc's warning */
1518 end = *ppos + count - 1;
1520 if (file->f_flags & O_APPEND) {
/* Append needs [0, EOF] so the up-to-date size is valid under lock. */
1522 lock_end = OBD_OBJECT_EOF;
1523 } else if (sbi->ll_max_rw_chunk != 0) {
1524 /* first, let's know the end of the current stripe */
1526 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1529 /* correct, the end is beyond the request */
1530 if (end > *ppos + count - 1)
1531 end = *ppos + count - 1;
1533 /* and chunk shouldn't be too large even if striping is wide */
1534 if (end - *ppos > sbi->ll_max_rw_chunk)
1535 end = *ppos + sbi->ll_max_rw_chunk - 1;
1540 lock_end = *ppos + count - 1;
1542 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1545 GOTO(out, retval = PTR_ERR(node));
1547 tree.lt_fd = LUSTRE_FPRIVATE(file);
1548 rc = ll_tree_lock(&tree, node, buf, count,
1549 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1551 GOTO(out, retval = rc);
1553 /* This is ok, g_f_w will overwrite this under i_sem if it races
1554 * with a local truncate, it just makes our maxbyte checking easier.
1555 * The i_size value gets updated in ll_extent_lock() as a consequence
1556 * of the [0,EOF] extent lock we requested above. */
1557 if (file->f_flags & O_APPEND) {
1558 *ppos = i_size_read(inode);
1559 end = *ppos + count - 1;
/* Past the per-file limit entirely: POSIX mandates SIGXFSZ + EFBIG. */
1562 if (*ppos >= maxbytes) {
1563 send_sig(SIGXFSZ, current, 0);
1564 GOTO(out_unlock, retval = -EFBIG);
1566 if (*ppos + count > maxbytes)
1567 count = maxbytes - *ppos;
1569 /* generic_file_write handles O_APPEND after getting i_mutex */
1570 chunk = end - *ppos + 1;
1571 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1572 inode->i_ino, chunk, *ppos);
1573 retval = generic_file_write(file, buf, chunk, ppos);
1574 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1577 ll_tree_unlock(&tree);
/* Full chunk written and more remains: continue with the next chunk
 * (loop construct elided in this extract). */
1584 if (retval == chunk && count > 0)
1588 up(&ll_i2info(inode)->lli_write_sem);
1590 retval = (sum > 0) ? sum : retval;
1591 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1592 retval > 0 ? retval : 0);
1597 * Send file content (through pagecache) somewhere with helper
/*
 * sendfile(2) entry point: push file pages to `target` via `actor`.
 * Same consistency protocol as ll_file_read() — PR extent lock over
 * the region, kms check, optional glimpse — but without the chunking
 * loop, then delegates to generic_file_sendfile().
 */
1599 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1600 read_actor_t actor, void *target)
1602 struct inode *inode = in_file->f_dentry->d_inode;
1603 struct ll_inode_info *lli = ll_i2info(inode);
1604 struct lov_stripe_md *lsm = lli->lli_smd;
1605 struct ll_lock_tree tree;
1606 struct ll_lock_tree_node *node;
1608 struct ll_ra_read bead;
1613 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1614 inode->i_ino, inode->i_generation, inode, count, *ppos);
1616 /* "If nbyte is 0, read() will return 0 and have no other results."
1617 * -- Single Unix Spec */
1621 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1622 /* turn off the kernel's read-ahead */
1623 in_file->f_ra.ra_pages = 0;
1625 /* File with no objects, nothing to lock */
1627 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
1629 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1631 RETURN(PTR_ERR(node));
1633 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1634 rc = ll_tree_lock(&tree, node, NULL, count,
1635 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1639 ll_inode_size_lock(inode, 1);
1641 * Consistency guarantees: following possibilities exist for the
1642 * relation between region being read and real file size at this
1645 * (A): the region is completely inside of the file;
1647 * (B-x): x bytes of region are inside of the file, the rest is
1650 * (C): the region is completely outside of the file.
1652 * This classification is stable under DLM lock acquired by
1653 * ll_tree_lock() above, because to change class, other client has to
1654 * take DLM lock conflicting with our lock. Also, any updates to
1655 * ->i_size by other threads on this client are serialized by
1656 * ll_inode_size_lock(). This guarantees that short reads are handled
1657 * correctly in the face of concurrent writes and truncates.
1659 inode_init_lvb(inode, &lvb);
1660 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1662 if (*ppos + count - 1 > kms) {
1663 /* A glimpse is necessary to determine whether we return a
1664 * short read (B) or some zeroes at the end of the buffer (C) */
1665 ll_inode_size_unlock(inode, 1);
1666 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1670 /* region is within kms and, hence, within real file size (A) */
1671 i_size_write(inode, kms);
1672 ll_inode_size_unlock(inode, 1);
1675 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1676 inode->i_ino, count, *ppos, i_size_read(inode));
/* Prime the llite read-ahead window for this transfer. */
1678 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1679 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1680 ll_ra_read_in(in_file, &bead);
1682 file_accessed(in_file);
1683 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1684 ll_ra_read_ex(in_file, &bead);
1687 ll_tree_unlock(&tree);
/*
 * LL_IOC_RECREATE_OBJ ioctl handler: ask the OST to recreate a lost
 * object for this file (administrative recovery).  Requires
 * CAP_SYS_ADMIN; copies a struct ll_recreate_obj from userspace,
 * clones the stripe metadata, and issues obd_create() with
 * OBD_FL_RECREATE_OBJS set.
 */
1691 static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
1694 struct ll_inode_info *lli = ll_i2info(inode);
1695 struct obd_export *exp = ll_i2dtexp(inode);
1696 struct ll_recreate_obj ucreatp;
1697 struct obd_trans_info oti = { 0 };
1698 struct obdo *oa = NULL;
1701 struct lov_stripe_md *lsm, *lsm2;
1704 if (!capable (CAP_SYS_ADMIN))
1707 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1708 sizeof(struct ll_recreate_obj));
/* lli_size_sem protects lli_smd while we clone it. */
1716 down(&lli->lli_size_sem);
1719 GOTO(out, rc = -ENOENT);
1720 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1721 (lsm->lsm_stripe_count));
1723 OBD_ALLOC(lsm2, lsm_size);
1725 GOTO(out, rc = -ENOMEM);
/* Target object identity and the OST index to recreate on. */
1727 oa->o_id = ucreatp.lrc_id;
1728 oa->o_gr = ucreatp.lrc_group;
1729 oa->o_nlink = ucreatp.lrc_ost_idx;
1730 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1731 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1732 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1733 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
/* Work on a copy of the stripe md so the live lsm is untouched. */
1735 memcpy(lsm2, lsm, lsm_size);
1736 rc = obd_create(exp, oa, &lsm2, &oti);
1738 OBD_FREE(lsm2, lsm_size);
1741 up(&lli->lli_size_sem);
/*
 * Apply user-supplied striping (lov_user_md) to a file by re-opening it
 * with an IT_OPEN intent carrying the EA.  Fails if the file already
 * has stripe metadata (striping can only be set once, before objects
 * exist).  The open handle obtained purely for this purpose is closed
 * again via ll_release_openhandle().
 */
1746 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1747 int flags, struct lov_user_md *lum, int lum_size)
1749 struct ll_inode_info *lli = ll_i2info(inode);
1750 struct lov_stripe_md *lsm;
1751 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1755 down(&lli->lli_size_sem);
/* Already striped: report and bail (error return elided in extract). */
1758 up(&lli->lli_size_sem);
1759 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1764 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1767 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1768 GOTO(out_req_free, rc = -ENOENT);
1769 rc = oit.d.lustre.it_status;
1771 GOTO(out_req_free, rc);
1773 ll_release_openhandle(file->f_dentry, &oit);
1776 up(&lli->lli_size_sem);
1777 ll_intent_release(&oit);
1780 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/*
 * Fetch the LOV EA (stripe metadata) for `filename` from the MDS via
 * md_getattr_name().  The wire format is little-endian; it is swabbed
 * to host order before being handed to userspace.  LOV_MAGIC_JOIN
 * files get their lov_mds_md expanded into a lov_user_md_join with
 * per-stripe extent ranges.  On success *lmmp/*lmm_size point into
 * (or alongside) *request, which the caller must release.
 */
1784 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1785 struct lov_mds_md **lmmp, int *lmm_size,
1786 struct ptlrpc_request **request)
1788 struct ll_sb_info *sbi = ll_i2sbi(inode);
1789 struct mdt_body *body;
1790 struct lov_mds_md *lmm = NULL;
1791 struct ptlrpc_request *req = NULL;
1792 struct obd_capa *oc;
1795 rc = ll_get_max_mdsize(sbi, &lmmsize);
1799 oc = ll_mdscapa_get(inode);
1800 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1801 oc, filename, strlen(filename) + 1,
1802 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1805 CDEBUG(D_INFO, "md_getattr_name failed "
1806 "on %s: rc %d\n", filename, rc);
1810 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1811 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1812 /* swabbed by mdc_getattr_name */
1813 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1815 lmmsize = body->eadatasize;
/* No EA present (or zero-length): nothing to return. */
1817 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1819 GOTO(out, rc = -ENODATA);
1822 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1823 LASSERT(lmm != NULL);
1824 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1827 * This is coming from the MDS, so is probably in
1828 * little endian. We convert it to host endian before
1829 * passing it to userspace.
1831 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1832 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1833 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1834 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1835 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* JOIN files: unpack to an lsm so per-stripe join extents can be
 * materialized into a lov_user_md_join for userspace. */
1838 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1839 struct lov_stripe_md *lsm;
1840 struct lov_user_md_join *lmj;
1841 int lmj_size, i, aindex = 0;
1843 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1845 GOTO(out, rc = -ENOMEM);
1846 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1848 GOTO(out_free_memmd, rc);
1850 lmj_size = sizeof(struct lov_user_md_join) +
1851 lsm->lsm_stripe_count *
1852 sizeof(struct lov_user_ost_data_join);
1853 OBD_ALLOC(lmj, lmj_size);
1855 GOTO(out_free_memmd, rc = -ENOMEM);
1857 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1858 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1859 struct lov_extent *lex =
1860 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the join extent that covers stripe i
 * (aindex increment elided in this extract). */
1862 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1864 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1865 LPU64" len %d\n", aindex, i,
1866 lex->le_start, (int)lex->le_len);
1867 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 marks an extent running to EOF. */
1870 if ((int)lex->le_len == -1)
1871 lmj->lmm_objects[i].l_extent_end = -1;
1873 lmj->lmm_objects[i].l_extent_end =
1874 lex->le_start + lex->le_len;
1875 lmj->lmm_objects[i].l_object_id =
1876 lsm->lsm_oinfo[i]->loi_id;
1877 lmj->lmm_objects[i].l_object_gr =
1878 lsm->lsm_oinfo[i]->loi_gr;
1879 lmj->lmm_objects[i].l_ost_gen =
1880 lsm->lsm_oinfo[i]->loi_ost_gen;
1881 lmj->lmm_objects[i].l_ost_idx =
1882 lsm->lsm_oinfo[i]->loi_ost_idx;
1884 lmm = (struct lov_mds_md *)lmj;
1887 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1891 *lmm_size = lmmsize;
1896 static int ll_lov_setea(struct inode *inode, struct file *file,
1899 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1900 struct lov_user_md *lump;
1901 int lum_size = sizeof(struct lov_user_md) +
1902 sizeof(struct lov_user_ost_data);
1906 if (!capable (CAP_SYS_ADMIN))
1909 OBD_ALLOC(lump, lum_size);
1913 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
1915 OBD_FREE(lump, lum_size);
1919 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1921 OBD_FREE(lump, lum_size);
/*
 * LL_IOC_LOV_SETSTRIPE ioctl handler: copy the user's lov_user_md onto
 * the stack, apply it via ll_lov_setstripe_ea_info(), then (on the
 * success path shown) echo the resulting striping back to userspace
 * through LL_IOC_LOV_GETSTRIPE.
 */
1925 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1928 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1930 int flags = FMODE_WRITE;
1933 /* Bug 1152: copy properly when this is no longer true */
1934 LASSERT(sizeof(lum) == sizeof(*lump));
1935 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1936 rc = copy_from_user(&lum, lump, sizeof(lum));
/* Clear the user's stripe count before GETSTRIPE fills real values. */
1940 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
1942 put_user(0, &lump->lmm_stripe_count);
1943 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1944 0, ll_i2info(inode)->lli_smd, lump);
/*
 * LL_IOC_LOV_GETSTRIPE ioctl handler: return the file's striping to
 * userspace by delegating to obd_iocontrol() on the data export.
 */
1949 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1951 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1956 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/*
 * LL_IOC_GROUP_LOCK handler: take a whole-file ([0, EOF]) LCK_GROUP
 * extent lock with group id `arg`, stash the handle in the fd, and
 * mark the fd GROUP_LOCKED|IGNORE_LOCK so normal extent locking is
 * bypassed while the group lock is held.  Fails if this fd already
 * holds a group lock (early return elided in this extract).
 */
1960 static int ll_get_grouplock(struct inode *inode, struct file *file,
1963 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1964 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1965 .end = OBD_OBJECT_EOF}};
1966 struct lustre_handle lockh = { 0 };
1967 struct ll_inode_info *lli = ll_i2info(inode);
1968 struct lov_stripe_md *lsm = lli->lli_smd;
1972 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
1976 policy.l_extent.gid = arg;
1977 if (file->f_flags & O_NONBLOCK)
1978 flags = LDLM_FL_BLOCK_NOWAIT;
1980 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1984 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
/* Remember the handle so ll_put_grouplock() can cancel it later. */
1986 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/*
 * LL_IOC_GROUP_UNLOCK handler: drop the group lock taken by
 * ll_get_grouplock().  Rejects unlock attempts when no group lock is
 * held or when the supplied gid does not match the one locked
 * (error returns elided in this extract), then cancels the saved
 * handle and clears the fd flags.
 */
1991 static int ll_put_grouplock(struct inode *inode, struct file *file,
1994 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1995 struct ll_inode_info *lli = ll_i2info(inode);
1996 struct lov_stripe_md *lsm = lli->lli_smd;
2000 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2001 /* Ugh, it's already unlocked. */
2005 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
2008 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
2010 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
2015 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/*
 * Validate a file-join request: the server must advertise LL_SBI_JOIN,
 * both inodes must be regular files, a file cannot be joined to
 * itself, and the head's size must be a multiple of JOIN_FILE_ALIGN
 * (64K).  Error-return statements are elided in this extract.
 */
2020 static int join_sanity_check(struct inode *head, struct inode *tail)
2023 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2024 CERROR("server do not support join \n");
2027 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2028 CERROR("tail ino %lu and ino head %lu must be regular\n",
2029 head->i_ino, tail->i_ino);
2032 if (head->i_ino == tail->i_ino) {
2033 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2036 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2037 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/*
 * Issue the actual MDS join operation: open the head file with
 * O_JOIN_FILE intent, passing the head's size and the tail's name in
 * the op_data so the MDS splices the tail onto the head.  Any lock
 * granted alongside the intent is released immediately, and the
 * temporary open handle is closed via ll_release_openhandle().
 */
2043 static int join_file(struct inode *head_inode, struct file *head_filp,
2044 struct file *tail_filp)
2046 struct dentry *tail_dentry = tail_filp->f_dentry;
2047 struct lookup_intent oit = {.it_op = IT_OPEN,
2048 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2049 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2050 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2052 struct lustre_handle lockh;
2053 struct md_op_data *op_data;
2058 tail_dentry = tail_filp->f_dentry;
/* The head's current size tells the MDS where the tail is appended. */
2060 data = i_size_read(head_inode);
2061 op_data = ll_prep_md_op_data(NULL, head_inode,
2062 tail_dentry->d_parent->d_inode,
2063 tail_dentry->d_name.name,
2064 tail_dentry->d_name.len, 0,
2065 LUSTRE_OPC_ANY, &data);
2066 if (IS_ERR(op_data))
2067 RETURN(PTR_ERR(op_data));
2069 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2070 op_data, &lockh, NULL, 0, 0);
2072 ll_finish_md_op_data(op_data);
2076 rc = oit.d.lustre.it_status;
2078 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2079 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2080 ptlrpc_req_finished((struct ptlrpc_request *)
2081 oit.d.lustre.it_data);
2085 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2087 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2088 oit.d.lustre.it_lock_mode = 0;
2090 ll_release_openhandle(head_filp->f_dentry, &oit);
2092 ll_intent_release(&oit);
/*
 * LL_IOC_JOIN top-level: append the file named by `filename_tail` onto
 * `head`.  Opens the tail, takes whole-file LCK_EX tree locks on both
 * inodes in ascending-ino order (deadlock avoidance), sanity-checks
 * the pair, and calls join_file().  Cleanup is staged through
 * `cleanup_phase` so each acquired resource is torn down exactly once;
 * on success the head's stale stripe md is freed so it is refetched.
 */
2096 static int ll_file_join(struct inode *head, struct file *filp,
2097 char *filename_tail)
2099 struct inode *tail = NULL, *first = NULL, *second = NULL;
2100 struct dentry *tail_dentry;
2101 struct file *tail_filp, *first_filp, *second_filp;
2102 struct ll_lock_tree first_tree, second_tree;
2103 struct ll_lock_tree_node *first_node, *second_node;
2104 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2105 int rc = 0, cleanup_phase = 0;
2108 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2109 head->i_ino, head->i_generation, head, filename_tail);
2111 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2112 if (IS_ERR(tail_filp)) {
2113 CERROR("Can not open tail file %s", filename_tail);
2114 rc = PTR_ERR(tail_filp);
2117 tail = igrab(tail_filp->f_dentry->d_inode);
2119 tlli = ll_i2info(tail);
2120 tail_dentry = tail_filp->f_dentry;
2121 LASSERT(tail_dentry);
2124 /*reorder the inode for lock sequence*/
2125 first = head->i_ino > tail->i_ino ? head : tail;
2126 second = head->i_ino > tail->i_ino ? tail : head;
2127 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2128 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2130 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2131 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2132 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2133 if (IS_ERR(first_node)){
2134 rc = PTR_ERR(first_node);
2137 first_tree.lt_fd = first_filp->private_data;
2138 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2143 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2144 if (IS_ERR(second_node)){
2145 rc = PTR_ERR(second_node);
2148 second_tree.lt_fd = second_filp->private_data;
2149 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2154 rc = join_sanity_check(head, tail);
2158 rc = join_file(head, filp, tail_filp);
/* Staged cleanup: fall through from the deepest phase reached.
 * (case labels are elided in this extract). */
2162 switch (cleanup_phase) {
2164 ll_tree_unlock(&second_tree);
2165 obd_cancel_unused(ll_i2dtexp(second),
2166 ll_i2info(second)->lli_smd, 0, NULL);
2168 ll_tree_unlock(&first_tree);
2169 obd_cancel_unused(ll_i2dtexp(first),
2170 ll_i2info(first)->lli_smd, 0, NULL);
2172 filp_close(tail_filp, 0);
/* Join succeeded: drop the head's cached stripe md, now stale. */
2175 if (head && rc == 0) {
2176 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2178 hlli->lli_smd = NULL;
2183 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/*
 * Close an MDS open handle that was obtained as a side effect of an
 * intent operation (setstripe, join) rather than a real open(2).
 * No-ops for the fs root and for intents that carry no open
 * disposition; otherwise fills an obd_client_handle and closes it via
 * ll_close_inode_openhandle(), releasing the intent's request.
 */
2189 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2191 struct inode *inode = dentry->d_inode;
2192 struct obd_client_handle *och;
2198 /* Root ? Do nothing. */
2199 if (dentry->d_inode->i_sb->s_root == dentry)
2202 /* No open handle to close? Move away */
2203 if (!it_disposition(it, DISP_OPEN_OPEN))
2206 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2208 OBD_ALLOC(och, sizeof(*och));
2210 GOTO(out, rc = -ENOMEM);
2212 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2213 ll_i2info(inode), it, och);
2215 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2218 /* this one is in place of ll_file_open */
2219 ptlrpc_req_finished(it->d.lustre.it_data);
2220 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/*
 * ioctl(2) dispatcher for regular Lustre files.  Handles the llite
 * ioctls (flags, striping, group locks, join, facl, statfs, flushctx),
 * forwards ext3-compatible ioctls to ll_iocontrol(), and falls back to
 * registered ioctl handlers / obd_iocontrol() for anything else.
 */
2224 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2227 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2231 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2232 inode->i_generation, inode, cmd);
2233 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2235 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2236 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2240 case LL_IOC_GETFLAGS:
2241 /* Get the current value of the file flags */
2242 return put_user(fd->fd_flags, (int *)arg);
2243 case LL_IOC_SETFLAGS:
2244 case LL_IOC_CLRFLAGS:
2245 /* Set or clear specific file flags */
2246 /* XXX This probably needs checks to ensure the flags are
2247 * not abused, and to handle any flag side effects.
2249 if (get_user(flags, (int *) arg))
2252 if (cmd == LL_IOC_SETFLAGS) {
/* IGNORE_LOCK is only safe with O_DIRECT: page-cache I/O
 * without DLM locks would be incoherent. */
2253 if ((flags & LL_FILE_IGNORE_LOCK) &&
2254 !(file->f_flags & O_DIRECT)) {
2255 CERROR("%s: unable to disable locking on "
2256 "non-O_DIRECT file\n", current->comm);
2260 fd->fd_flags |= flags;
2262 fd->fd_flags &= ~flags;
2265 case LL_IOC_LOV_SETSTRIPE:
2266 RETURN(ll_lov_setstripe(inode, file, arg));
2267 case LL_IOC_LOV_SETEA:
2268 RETURN(ll_lov_setea(inode, file, arg));
2269 case LL_IOC_LOV_GETSTRIPE:
2270 RETURN(ll_lov_getstripe(inode, arg));
2271 case LL_IOC_RECREATE_OBJ:
2272 RETURN(ll_lov_recreate_obj(inode, file, arg));
2273 case EXT3_IOC_GETFLAGS:
2274 case EXT3_IOC_SETFLAGS:
2275 RETURN(ll_iocontrol(inode, file, cmd, arg));
2276 case EXT3_IOC_GETVERSION_OLD:
2277 case EXT3_IOC_GETVERSION:
2278 RETURN(put_user(inode->i_generation, (int *)arg));
/* NOTE(review): the case label above this line (presumably
 * LL_IOC_JOIN) is elided in this extract. */
2283 ftail = getname((const char *)arg);
2285 RETURN(PTR_ERR(ftail));
2286 rc = ll_file_join(inode, file, ftail);
2290 case LL_IOC_GROUP_LOCK:
2291 RETURN(ll_get_grouplock(inode, file, arg));
2292 case LL_IOC_GROUP_UNLOCK:
2293 RETURN(ll_put_grouplock(inode, file, arg));
2294 case IOC_OBD_STATFS:
2295 RETURN(ll_obd_statfs(inode, (void *)arg));
2297 /* We need to special case any other ioctls we want to handle,
2298 * to send them to the MDS/OST as appropriate and to properly
2299 * network encode the arg field.
2300 case EXT3_IOC_SETVERSION_OLD:
2301 case EXT3_IOC_SETVERSION:
2303 case LL_IOC_FLUSHCTX:
2304 RETURN(ll_flush_ctx(inode));
2305 case LL_IOC_GETFACL: {
2306 struct rmtacl_ioctl_data ioc;
2308 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2311 RETURN(ll_ioctl_getfacl(inode, &ioc));
2313 case LL_IOC_SETFACL: {
2314 struct rmtacl_ioctl_data ioc;
2316 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2319 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* Default: try dynamically registered handlers, then pass the
 * ioctl through to the data export. */
2325 ll_iocontrol_call(inode, file, cmd, arg, &err))
2328 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/*
 * llseek(2) entry point.  SEEK_END must first glimpse the OSTs so the
 * cached i_size is current before the offset is computed; SEEK_CUR is
 * relative to f_pos.  The resulting offset is validated against
 * ll_file_maxbytes() before f_pos is updated.
 */
2334 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2336 struct inode *inode = file->f_dentry->d_inode;
2337 struct ll_inode_info *lli = ll_i2info(inode);
2338 struct lov_stripe_md *lsm = lli->lli_smd;
2341 retval = offset + ((origin == 2) ? i_size_read(inode) :
2342 (origin == 1) ? file->f_pos : 0);
2343 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2344 inode->i_ino, inode->i_generation, inode, retval, retval,
2345 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2346 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2348 if (origin == 2) { /* SEEK_END */
2349 int nonblock = 0, rc;
2351 if (file->f_flags & O_NONBLOCK)
2352 nonblock = LDLM_FL_BLOCK_NOWAIT;
/* Refresh i_size from the OSTs before seeking relative to EOF. */
2355 rc = ll_glimpse_size(inode, nonblock);
2360 ll_inode_size_lock(inode, 0);
2361 offset += i_size_read(inode);
2362 ll_inode_size_unlock(inode, 0);
2363 } else if (origin == 1) { /* SEEK_CUR */
2364 offset += file->f_pos;
2368 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2369 if (offset != file->f_pos) {
2370 file->f_pos = offset;
2371 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2373 file->f_version = ++event;
/*
 * fsync(2) entry point.  Waits for in-flight writeback, surfaces any
 * async write errors recorded on the inode/stripes, syncs metadata to
 * the MDS via md_sync(), and (when the file has objects) flushes
 * [0, EOF] data to the OSTs via obd_sync().  The first error seen
 * wins.
 */
2382 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2384 struct inode *inode = dentry->d_inode;
2385 struct ll_inode_info *lli = ll_i2info(inode);
2386 struct lov_stripe_md *lsm = lli->lli_smd;
2387 struct ptlrpc_request *req;
2388 struct obd_capa *oc;
2391 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2392 inode->i_generation, inode);
2393 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2395 /* fsync's caller has already called _fdata{sync,write}, we want
2396 * that IO to finish before calling the osc and mdc sync methods */
2397 rc = filemap_fdatawait(inode->i_mapping);
2399 /* catch async errors that were recorded back when async writeback
2400 * failed for pages in this mapping. */
2401 err = lli->lli_async_rc;
2402 lli->lli_async_rc = 0;
2406 err = lov_test_and_clear_async_rc(lsm);
2411 oc = ll_mdscapa_get(inode);
2412 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2418 ptlrpc_req_finished(req);
/* obdo allocation failure path (allocation elided in this extract). */
2425 RETURN(rc ? rc : -ENOMEM);
2427 oa->o_id = lsm->lsm_object_id;
2428 oa->o_gr = lsm->lsm_object_gr;
2429 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2430 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2431 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2434 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2435 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2436 0, OBD_OBJECT_EOF, oc);
/*
 * ll_file_flock(): handle flock()/fcntl() byte-range lock requests by
 * enqueueing an LDLM_FLOCK lock on the MDS resource derived from the
 * file's FID.  ldlm_flock_completion_ast is the completion callback, and
 * the kernel's file_lock is carried as enqueue-info cb_data.  On success
 * the lock is also recorded locally (flock_lock_file_wait /
 * posix_lock_file_wait) so the VFS bookkeeping matches the cluster state.
 * Switch-case labels and some error paths are not visible in this excerpt.
 */
2446 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2448         struct inode *inode = file->f_dentry->d_inode;
2449         struct ll_sb_info *sbi = ll_i2sbi(inode);
         /* LDLM resource name = the file's FID (seq/oid/ver) */
2450         struct ldlm_res_id res_id =
2451                 { .name = { fid_seq(ll_inode2fid(inode)),
2452                             fid_oid(ll_inode2fid(inode)),
2453                             fid_ver(ll_inode2fid(inode)),
2455         struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2456                 ldlm_flock_completion_ast, NULL, file_lock };
2457         struct lustre_handle lockh = {0};
2458         ldlm_policy_data_t flock;
2463         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2464                inode->i_ino, file_lock);
2466         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2468         if (file_lock->fl_flags & FL_FLOCK) {
2469                 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2470                 /* set missing params for flock() calls */
2471                 file_lock->fl_end = OFFSET_MAX;
2472                 file_lock->fl_pid = current->tgid;
         /* translate the kernel file_lock into LDLM flock policy data */
2474         flock.l_flock.pid = file_lock->fl_pid;
2475         flock.l_flock.start = file_lock->fl_start;
2476         flock.l_flock.end = file_lock->fl_end;
2478         switch (file_lock->fl_type) {
         /* read lock -> protected-read mode (case label elided above) */
2480                 einfo.ei_mode = LCK_PR;
2483                 /* An unlock request may or may not have any relation to
2484                  * existing locks so we may not be able to pass a lock handle
2485                  * via a normal ldlm_lock_cancel() request. The request may even
2486                  * unlock a byte range in the middle of an existing lock. In
2487                  * order to process an unlock request we need all of the same
2488                  * information that is given with a normal read or write record
2489                  * lock request. To avoid creating another ldlm unlock (cancel)
2490                  * message we'll treat a LCK_NL flock request as an unlock. */
2491                 einfo.ei_mode = LCK_NL;
         /* write lock -> protected-write mode (case label elided above) */
2494                 einfo.ei_mode = LCK_PW;
2497                 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
         /* non-blocking set: fail rather than wait on conflict */
2512                 flags = LDLM_FL_BLOCK_NOWAIT;
         /* F_GETLK-style query: test only, do not grant */
2518                 flags = LDLM_FL_TEST_LOCK;
2519                 /* Save the old mode so that if the mode in the lock changes we
2520                  * can decrement the appropriate reader or writer refcount. */
2521                 file_lock->fl_type = einfo.ei_mode;
2524                 CERROR("unknown fcntl lock command: %d\n", cmd);
2528         CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2529                "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2530                flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2532         rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2533                               &flock, &flags, NULL, 0, NULL, &lockh, 0);
         /* mirror a granted cluster lock into the local VFS lock lists */
2534         if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2535                 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2536 #ifdef HAVE_F_OP_FLOCK
2537         if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2538             !(flags & LDLM_FL_TEST_LOCK))
2539                 posix_lock_file_wait(file, file_lock);
/*
 * ll_file_noflock(): stub lock handler installed by the "-o noflock"
 * mount option (see ll_file_operations_noflock below); per that table's
 * comment it reports ENOSYS for flock calls.  Body elided in this excerpt.
 */
2545 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/*
 * ll_have_md_lock(): non-intrusively test whether this client already
 * holds an MDS inodebits lock covering @bits on @inode, in any of the
 * CR/CW/PR/PW modes.  LDLM_FL_TEST_LOCK means the match takes no
 * reference, so no corresponding decref is needed.
 */
2552 int ll_have_md_lock(struct inode *inode, __u64 bits)
2554         struct lustre_handle lockh;
2555         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2563         fid = &ll_i2info(inode)->lli_fid;
2564         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2566         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2567         if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2568                           LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/*
 * ll_take_md_lock(): like ll_have_md_lock() but without
 * LDLM_FL_TEST_LOCK, so a successful match takes a reference on the
 * lock; the handle is returned through @lockh and the caller must
 * release it.  Returns the matched mode (rc from md_lock_match).
 */
2574 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2575                             struct lustre_handle *lockh)
2577         ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2583         fid = &ll_i2info(inode)->lli_fid;
2584         CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2586         flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2587         rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2588                            LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/*
 * ll_inode_revalidate_fini(): common tail for revalidation.  -ENOENT
 * means the object was already unlinked on the MDS and is treated as
 * success (nlink update elided here); for regular files and directories
 * hitting this path is an obscure race.  Other errors are logged.
 */
2592 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2593         if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2594                               * and return success */
2596                 /* This path cannot be hit for regular files unless in
2597                  * case of obscure races, so no need to to validate
2599                 if (!S_ISREG(inode->i_mode) &&
2600                     !S_ISDIR(inode->i_mode))
2605                 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/*
 * ll_inode_revalidate_it(): refresh @dentry's inode attributes from the
 * MDS.  Two paths:
 *   - if the MDS supports getattr-by-FID (OBD_CONNECT_ATTRFID), issue an
 *     IT_GETATTR intent lock with O_CHECK_STALE and finish it through
 *     ll_revalidate_it_finish(), dropping unlinked dentries from the
 *     dcache;
 *   - otherwise, if no MDS UPDATE inodebits lock is cached, do a plain
 *     md_getattr() (requesting EA size for regular files) and rebuild the
 *     inode via ll_prep_inode().
 * Finally the file size is refreshed with ll_glimpse_size() unless the
 * object has no stripe MD yet.  Error paths are partly elided here.
 */
2613 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2615         struct inode *inode = dentry->d_inode;
2616         struct ptlrpc_request *req = NULL;
2617         struct ll_sb_info *sbi;
2618         struct obd_export *exp;
         /* NULL inode here should be impossible; loud canary if it happens */
2623                 CERROR("REPORT THIS LINE TO PETER\n");
2626         sbi = ll_i2sbi(inode);
2628         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2629                inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2631         exp = ll_i2mdexp(inode);
2633         if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2634                 struct lookup_intent oit = { .it_op = IT_GETATTR };
2635                 struct md_op_data *op_data;
2637                 /* Call getattr by fid, so do not provide name at all. */
2638                 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2639                                              dentry->d_inode, NULL, 0, 0,
2640                                              LUSTRE_OPC_ANY, NULL);
2641                 if (IS_ERR(op_data))
2642                         RETURN(PTR_ERR(op_data));
2644                 oit.it_flags |= O_CHECK_STALE;
2645                 rc = md_intent_lock(exp, op_data, NULL, 0,
2646                                     /* we are not interested in name
2649                                     ll_md_blocking_ast, 0);
2650                 ll_finish_md_op_data(op_data);
2651                 oit.it_flags &= ~O_CHECK_STALE;
2653                         rc = ll_inode_revalidate_fini(inode, rc);
2657                 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2659                         ll_intent_release(&oit);
2663                 /* Unlinked? Unhash dentry, so it is not picked up later by
2664                    do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2665                    here to preserve get_cwd functionality on 2.6.
2667                 if (!dentry->d_inode->i_nlink) {
2668                         spin_lock(&dcache_lock);
2669                         ll_drop_dentry(dentry);
2670                         spin_unlock(&dcache_lock);
2673                 ll_lookup_finish_locks(&oit, dentry);
2674         } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE)) {
2675                 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2676                 obd_valid valid = OBD_MD_FLGETATTR;
2677                 struct obd_capa *oc;
             /* regular files also need the (max-sized) EA back */
2680                 if (S_ISREG(inode->i_mode)) {
2681                         rc = ll_get_max_mdsize(sbi, &ealen);
2684                         valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2686                 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2687                  * capa for this inode. Because we only keep capas of dirs
2689                 oc = ll_mdscapa_get(inode);
2690                 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2694                         rc = ll_inode_revalidate_fini(inode, rc);
2698                 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2704         /* if object not yet allocated, don't validate size */
2705         if (ll_i2info(inode)->lli_smd == NULL)
2708         /* ll_glimpse_size will prefer locally cached writes if they extend
2710         rc = ll_glimpse_size(inode, 0);
2713         ptlrpc_req_finished(req);
/*
 * ll_getattr_it(): revalidate the inode via ll_inode_revalidate_it(),
 * then populate @stat from the (now fresh) inode fields.  size/blocks
 * are read under the Lustre inode size lock for consistency.
 */
2717 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2718                   struct lookup_intent *it, struct kstat *stat)
2720         struct inode *inode = de->d_inode;
2723         res = ll_inode_revalidate_it(de, it);
2724         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2729         stat->dev = inode->i_sb->s_dev;
2730         stat->ino = inode->i_ino;
2731         stat->mode = inode->i_mode;
2732         stat->nlink = inode->i_nlink;
2733         stat->uid = inode->i_uid;
2734         stat->gid = inode->i_gid;
2735         stat->rdev = kdev_t_to_nr(inode->i_rdev);
2736         stat->atime = inode->i_atime;
2737         stat->mtime = inode->i_mtime;
2738         stat->ctime = inode->i_ctime;
     /* i_blksize was removed from struct inode in later kernels */
2739 #ifdef HAVE_INODE_BLKSIZE
2740         stat->blksize = inode->i_blksize;
2742         stat->blksize = 1 << inode->i_blkbits;
     /* size/blocks must be read atomically w.r.t. concurrent updates */
2745         ll_inode_size_lock(inode, 0);
2746         stat->size = i_size_read(inode);
2747         stat->blocks = inode->i_blocks;
2748         ll_inode_size_unlock(inode, 0);
/*
 * ll_getattr(): VFS ->getattr entry point; wraps ll_getattr_it() with a
 * default IT_GETATTR lookup intent.
 */
2752 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2754         struct lookup_intent it = { .it_op = IT_GETATTR };
2756         return ll_getattr_it(mnt, de, &it, stat);
/*
 * lustre_check_acl(): POSIX ACL permission check used as the
 * generic_permission() callback.  Takes a reference to the cached ACL
 * under lli_lock (so the check runs without holding the spinlock), then
 * defers to posix_acl_permission().  Compiled out without
 * CONFIG_FS_POSIX_ACL (fallback path elided in this excerpt).
 */
2760 int lustre_check_acl(struct inode *inode, int mask)
2762 #ifdef CONFIG_FS_POSIX_ACL
2763         struct ll_inode_info *lli = ll_i2info(inode);
2764         struct posix_acl *acl;
2768         spin_lock(&lli->lli_lock);
         /* dup under the lock; released below after the check */
2769         acl = posix_acl_dup(lli->lli_posix_acl);
2770         spin_unlock(&lli->lli_lock);
2775         rc = posix_acl_permission(inode, acl, mask);
2776         posix_acl_release(acl);
/*
 * ll_inode_permission() for kernels >= 2.6.10: remote-client mounts get
 * server-side permission checking (lustre_check_remote_perm); everyone
 * else goes through generic_permission() with lustre_check_acl as the
 * ACL callback.
 */
2784 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2785 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2787         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2788                inode->i_ino, inode->i_generation, inode, mask);
2789         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2790                 return lustre_check_remote_perm(inode, mask);
2792         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2793         return generic_permission(inode, mask, lustre_check_acl);
/*
 * ll_inode_permission() for older kernels lacking the ACL-aware
 * generic_permission(): an open-coded re-implementation of the standard
 * Unix permission algorithm (owner -> ACL -> group -> other bits, then
 * CAP_DAC_OVERRIDE / CAP_DAC_READ_SEARCH capability overrides), with the
 * same remote-client short-circuit as the modern variant.  Several
 * control-flow lines (labels, returns) are elided in this excerpt.
 */
2796 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2798         int mode = inode->i_mode;
2801         CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2802                inode->i_ino, inode->i_generation, inode, mask);
2804         if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2805                 return lustre_check_remote_perm(inode, mask);
2807         ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
     /* writes denied on read-only or immutable inodes */
2809         if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2810             (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2812         if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
2814         if (current->fsuid == inode->i_uid) {
     /* group-class bits: if they don't cover the mask, consult the ACL */
2817                 if (((mode >> 3) & mask & S_IRWXO) != mask)
2819                         rc = lustre_check_acl(inode, mask);
2823                                 goto check_capabilities;
2827                 if (in_group_p(inode->i_gid))
     /* other-class bits */
2830         if ((mode & mask & S_IRWXO) == mask)
     /* CAP_DAC_OVERRIDE: everything except exec on non-exec regular files */
2834         if (!(mask & MAY_EXEC) ||
2835             (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2836                 if (capable(CAP_DAC_OVERRIDE))
     /* CAP_DAC_READ_SEARCH: reads, and dir access short of writing */
2839         if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2840             (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* -o localflock - only provides locally consistent flock locks */
/* Default file ops table: no .flock/.lock entries, so the VFS falls back
 * to purely local (single-node) lock handling. */
2848 struct file_operations ll_file_operations = {
2849         .read           = ll_file_read,
2850         .write          = ll_file_write,
2851         .ioctl          = ll_file_ioctl,
2852         .open           = ll_file_open,
2853         .release        = ll_file_release,
2854         .mmap           = ll_file_mmap,
2855         .llseek         = ll_file_seek,
2856         .sendfile       = ll_file_sendfile,
/* "-o flock" table: identical to the default except flock/posix lock
 * requests are routed to ll_file_flock() for cluster-wide consistency.
 * .flock exists only on kernels with HAVE_F_OP_FLOCK; older ones use
 * .lock for both. */
2860 struct file_operations ll_file_operations_flock = {
2861         .read           = ll_file_read,
2862         .write          = ll_file_write,
2863         .ioctl          = ll_file_ioctl,
2864         .open           = ll_file_open,
2865         .release        = ll_file_release,
2866         .mmap           = ll_file_mmap,
2867         .llseek         = ll_file_seek,
2868         .sendfile       = ll_file_sendfile,
2870 #ifdef HAVE_F_OP_FLOCK
2871         .flock          = ll_file_flock,
2873         .lock           = ll_file_flock
/* These are for -o noflock - to return ENOSYS on flock calls */
2877 struct file_operations ll_file_operations_noflock = {
2878         .read           = ll_file_read,
2879         .write          = ll_file_write,
2880         .ioctl          = ll_file_ioctl,
2881         .open           = ll_file_open,
2882         .release        = ll_file_release,
2883         .mmap           = ll_file_mmap,
2884         .llseek         = ll_file_seek,
2885         .sendfile       = ll_file_sendfile,
2887 #ifdef HAVE_F_OP_FLOCK
2888         .flock          = ll_file_noflock,
2890         .lock           = ll_file_noflock
/* Inode ops for regular files.  With the vendor VFS intent patches the
 * raw setattr entry point is used instead of the stock one. */
2893 struct inode_operations ll_file_inode_operations = {
2894 #ifdef HAVE_VFS_INTENT_PATCHES
2895         .setattr_raw    = ll_setattr_raw,
2897         .setattr        = ll_setattr,
2898         .truncate       = ll_truncate,
2899         .getattr        = ll_getattr,
2900         .permission     = ll_inode_permission,
2901         .setxattr       = ll_setxattr,
2902         .getxattr       = ll_getxattr,
2903         .listxattr      = ll_listxattr,
2904         .removexattr    = ll_removexattr,
/* dynamic ioctl number support routins */
/* Registry of externally-registered ioctl handlers: a list of
 * llioc_data entries protected by an rwsem (readers iterate in
 * ll_iocontrol_call; writers register/unregister). */
2908 static struct llioc_ctl_data {
2909         struct rw_semaphore     ioc_sem;
2910         struct list_head        ioc_head;
2912         __RWSEM_INITIALIZER(llioc.ioc_sem),
2913         CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registration: callback plus the ioctl command numbers it claims.
 * iocd_cmd is a trailing variable-length array; iocd_size records the
 * full allocation size for freeing. */
2918         struct list_head        iocd_list;
2919         unsigned int            iocd_size;
2920         llioc_callback_t        iocd_cb;
2921         unsigned int            iocd_count;
2922         unsigned int            iocd_cmd[0];
/*
 * ll_iocontrol_register(): register callback @cb for @count ioctl
 * command numbers in @cmd.  Allocates an llioc_data (header plus
 * trailing cmd array), copies the command list, and links it onto the
 * global registry under the write lock.  Returns the entry pointer as
 * an opaque "magic" cookie for ll_iocontrol_unregister(); error returns
 * (bad args / allocation failure) are elided in this excerpt.
 */
2925 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2928         struct llioc_data *in_data = NULL;
2931         if (cb == NULL || cmd == NULL ||
2932             count > LLIOC_MAX_CMD || count < 0)
2935         size = sizeof(*in_data) + count * sizeof(unsigned int);
2936         OBD_ALLOC(in_data, size);
2937         if (in_data == NULL)
2940         memset(in_data, 0, sizeof(*in_data));
2941         in_data->iocd_size = size;
2942         in_data->iocd_cb = cb;
2943         in_data->iocd_count = count;
2944         memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
2946         down_write(&llioc.ioc_sem);
2947         list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2948         up_write(&llioc.ioc_sem);
/*
 * ll_iocontrol_unregister(): remove and free the registration identified
 * by the @magic cookie returned from ll_iocontrol_register().  The entry
 * is unlinked under the write lock; the free happens after dropping the
 * lock (size is cached first since OBD_FREE needs it).  A stale/unknown
 * magic just logs a warning.
 */
2953 void ll_iocontrol_unregister(void *magic)
2955         struct llioc_data *tmp;
2960         down_write(&llioc.ioc_sem);
2961         list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
             /* (magic-match test elided) save size before freeing */
2963                         unsigned int size = tmp->iocd_size;
2965                         list_del(&tmp->iocd_list);
2966                         up_write(&llioc.ioc_sem);
2968                         OBD_FREE(tmp, size);
2972         up_write(&llioc.ioc_sem);
2974         CWARN("didn't find iocontrol register block with magic: %p\n", magic);
/* Exported so other modules can hook custom ioctls into llite. */
2977 EXPORT_SYMBOL(ll_iocontrol_register);
2978 EXPORT_SYMBOL(ll_iocontrol_unregister);
/*
 * ll_iocontrol_call(): dispatch ioctl @cmd to registered handlers.
 * Walks the registry under the read lock; the first registration whose
 * command list contains @cmd gets its callback invoked, and iteration
 * stops when a callback returns LLIOC_STOP.  The handler's status is
 * passed back via *rcp (assignment elided in this excerpt); rc defaults
 * to -EINVAL when no handler claims the command.
 */
2980 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2981                                   unsigned int cmd, unsigned long arg, int *rcp)
2983         enum llioc_iter ret = LLIOC_CONT;
2984         struct llioc_data *data;
2985         int rc = -EINVAL, i;
2987         down_read(&llioc.ioc_sem);
2988         list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2989                 for (i = 0; i < data->iocd_count; i++) {
2990                         if (cmd != data->iocd_cmd[i])
             /* this registration claims cmd: invoke its callback */
2993                         ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2997                 if (ret == LLIOC_STOP)
3000         up_read(&llioc.ioc_sem);