1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Peter Braam <braam@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
7 * Author: Andreas Dilger <adilger@clusterfs.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 #define DEBUG_SUBSYSTEM S_LLITE
26 #include <lustre_dlm.h>
27 #include <lustre_lite.h>
28 #include <lustre_mdc.h>
29 #include <linux/pagemap.h>
30 #include <linux/file.h>
31 #include "llite_internal.h"
33 /* also used by llite/special.c:ll_special_open() */
/*
 * Allocate a per-open-file ll_file_data from the dedicated slab cache.
 * NOTE(review): this excerpt elides lines (gaps in the embedded numbering);
 * the opening brace and the return of 'fd' are not visible here.
 */
34 struct ll_file_data *ll_file_data_get(void)
36 struct ll_file_data *fd;
/* slab allocation; presumably fd may be NULL on failure -- caller must check */
38 OBD_SLAB_ALLOC_PTR(fd, ll_file_data_slab);
/*
 * Release an ll_file_data previously obtained from ll_file_data_get()
 * back to the ll_file_data_slab cache.
 */
42 static void ll_file_data_put(struct ll_file_data *fd)
45 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
/*
 * Copy the inode's current state (fid, mode, times, size, blocks, flags,
 * I/O epoch), the given MDS file handle @fh, and an MDS capability into
 * @op_data, ready to be sent in an MDS request (e.g. close).
 */
48 void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
49 struct lustre_handle *fh)
51 op_data->op_fid1 = ll_i2info(inode)->lli_fid;
52 op_data->op_attr.ia_mode = inode->i_mode;
53 op_data->op_attr.ia_atime = inode->i_atime;
54 op_data->op_attr.ia_mtime = inode->i_mtime;
55 op_data->op_attr.ia_ctime = inode->i_ctime;
56 op_data->op_attr.ia_size = i_size_read(inode);
57 op_data->op_attr_blocks = inode->i_blocks;
/* ia_attr_flags lives in the ll_iattr wrapper around struct iattr, hence the cast */
58 ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = inode->i_flags;
59 op_data->op_ioepoch = ll_i2info(inode)->lli_ioepoch;
60 memcpy(&op_data->op_handle, fh, sizeof(op_data->op_handle));
/* NOTE(review): ll_mdscapa_get() presumably returns a referenced capability;
 * confirm which path releases op_capa1 */
61 op_data->op_capa1 = ll_mdscapa_get(inode);
64 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
65 struct obd_client_handle *och)
69 op_data->op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
70 ATTR_MTIME_SET | ATTR_CTIME_SET;
72 if (!(och->och_flags & FMODE_WRITE))
75 if (!(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM) ||
76 !S_ISREG(inode->i_mode))
77 op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS;
79 ll_epoch_close(inode, op_data, &och, 0);
82 ll_pack_inode2opdata(inode, op_data, &och->och_fh);
/*
 * Send an MDS close for open handle @och.  Allocates an md_op_data,
 * prepares the close (possibly closing the I/O epoch), calls md_close(),
 * and on an MDS instruction performs the Size-on-MDS update.  Finally
 * destroys OST objects if the close reply requests it, queues DONE_WRITING
 * if the epoch is still open for a written regular file, and clears the
 * open replay data, poisoning the handle cookie.
 * NOTE(review): many lines are elided in this excerpt (error branches,
 * RETURNs, braces); comments describe only the visible code.
 */
86 static int ll_close_inode_openhandle(struct obd_export *md_exp,
88 struct obd_client_handle *och)
90 struct obd_export *exp = ll_i2mdexp(inode);
91 struct md_op_data *op_data;
92 struct ptlrpc_request *req = NULL;
93 struct obd_device *obd = class_exp2obd(exp);
100 * XXX: in case of LMV, is this correct to access
103 CERROR("Invalid MDC connection handle "LPX64"\n",
104 ll_i2mdexp(inode)->exp_handle.h_cookie);
109 * here we check if this is forced umount. If so this is called on
110 * canceling "open lock" and we do not call md_close() in this case, as
111 * it will not be successful, as import is already deactivated.
116 OBD_ALLOC_PTR(op_data);
118 GOTO(out, rc = -ENOMEM); // XXX We leak openhandle and request here.
120 ll_prepare_close(inode, op_data, och);
/* remember whether this close ends the I/O epoch; checked again below */
121 epoch_close = (op_data->op_flags & MF_EPOCH_CLOSE);
122 rc = md_close(md_exp, op_data, och->och_mod, &req);
127 /* This close must have the epoch closed. */
128 LASSERT(exp->exp_connect_flags & OBD_CONNECT_SOM);
129 LASSERT(epoch_close);
130 /* MDS has instructed us to obtain Size-on-MDS attribute from
131 * OSTs and send setattr to back to MDS. */
132 rc = ll_sizeonmds_update(inode, och->och_mod,
133 &och->och_fh, op_data->op_ioepoch);
135 CERROR("inode %lu mdc Size-on-MDS update failed: "
136 "rc = %d\n", inode->i_ino, rc);
140 CERROR("inode %lu mdc close failed: rc = %d\n",
143 ll_finish_md_op_data(op_data);
/* destroy OST objects if the close reply carried an unlink request */
146 rc = ll_objects_destroy(req, inode);
148 CERROR("inode %lu ll_objects destroy: rc = %d\n",
/* epoch still open on a written regular file: defer DONE_WRITING */
155 if ((exp->exp_connect_flags & OBD_CONNECT_SOM) && !epoch_close &&
156 S_ISREG(inode->i_mode) && (och->och_flags & FMODE_WRITE)) {
157 ll_queue_done_writing(inode, LLIF_DONE_WRITING);
160 ptlrpc_close_replay_seq(req);
161 md_clear_open_replay_data(md_exp, och);
162 /* Free @och if it is not waiting for DONE_WRITING. */
163 och->och_fh.cookie = DEAD_HANDLE_MAGIC;
166 if (req) /* This is close request */
167 ptlrpc_req_finished(req);
/*
 * Really close the MDS open handle of the given kind (@flags selects the
 * write, exec, or read handle).  Under lli_och_sem: if the handle still has
 * users, leave it alone; otherwise detach it and (outside the semaphore)
 * send the MDS close via ll_close_inode_openhandle().
 * NOTE(review): lines are elided in this excerpt (the och detach and some
 * RETURNs); comments describe only the visible code.
 */
171 int ll_md_real_close(struct inode *inode, int flags)
173 struct ll_inode_info *lli = ll_i2info(inode);
174 struct obd_client_handle **och_p;
175 struct obd_client_handle *och;
/* pick the handle slot and its use count by open mode */
180 if (flags & FMODE_WRITE) {
181 och_p = &lli->lli_mds_write_och;
182 och_usecount = &lli->lli_open_fd_write_count;
183 } else if (flags & FMODE_EXEC) {
184 och_p = &lli->lli_mds_exec_och;
185 och_usecount = &lli->lli_open_fd_exec_count;
187 LASSERT(flags & FMODE_READ);
188 och_p = &lli->lli_mds_read_och;
189 och_usecount = &lli->lli_open_fd_read_count;
192 down(&lli->lli_och_sem);
193 if (*och_usecount) { /* There are still users of this handle, so
195 up(&lli->lli_och_sem);
200 up(&lli->lli_och_sem);
202 if (och) { /* There might be a race and somebody have freed this och
204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
/*
 * Per-file-descriptor close: drop any group lock held by this fd, decrement
 * the per-mode open count under lli_och_sem, and if no matching OPEN DLM
 * lock lets us skip it, do the real MDS close via ll_md_real_close().
 * Finally free the ll_file_data and close the capability.
 * NOTE(review): lines are elided in this excerpt (lockmode selection,
 * md_lock_match() tail arguments, RETURNs); comments describe only the
 * visible code.
 */
211 int ll_md_close(struct obd_export *md_exp, struct inode *inode,
214 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
215 struct ll_inode_info *lli = ll_i2info(inode);
219 /* clear group lock, if present */
220 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
221 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
222 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
223 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
227 /* Let's see if we have good enough OPEN lock on the file and if
228 we can skip talking to MDS */
229 if (file->f_dentry->d_inode) { /* Can this ever be false? */
/* TEST_LOCK: probe for a cached OPEN ibits lock without taking a ref */
231 int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
232 struct lustre_handle lockh;
233 struct inode *inode = file->f_dentry->d_inode;
234 ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}};
236 down(&lli->lli_och_sem);
237 if (fd->fd_omode & FMODE_WRITE) {
239 LASSERT(lli->lli_open_fd_write_count);
240 lli->lli_open_fd_write_count--;
241 } else if (fd->fd_omode & FMODE_EXEC) {
243 LASSERT(lli->lli_open_fd_exec_count);
244 lli->lli_open_fd_exec_count--;
247 LASSERT(lli->lli_open_fd_read_count);
248 lli->lli_open_fd_read_count--;
250 up(&lli->lli_och_sem);
/* no matching cached OPEN lock: must really close the MDS handle */
252 if (!md_lock_match(md_exp, flags, ll_inode2fid(inode),
253 LDLM_IBITS, &policy, lockmode,
255 rc = ll_md_real_close(file->f_dentry->d_inode,
259 CERROR("Releasing a file %p with negative dentry %p. Name %s",
260 file, file->f_dentry, file->f_dentry->d_name.name);
263 LUSTRE_FPRIVATE(file) = NULL;
264 ll_file_data_put(fd);
265 ll_capa_close(inode);
270 int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
272 /* While this returns an error code, fput() the caller does not, so we need
273 * to make every effort to clean up all of our state here. Also, applications
274 * rarely check close errors and even if an error is returned they will not
275 * re-try the close call.
/*
 * VFS ->release() entry point.  The root dentry is special-cased (only the
 * ll_file_data is freed); otherwise any pending async write error on the
 * stripes is cleared and ll_md_close() performs the real close.
 * NOTE(review): lines are elided in this excerpt (ENTRY/RETURN and some
 * braces); comments describe only the visible code.
 */
277 int ll_file_release(struct inode *inode, struct file *file)
279 struct ll_file_data *fd;
280 struct ll_sb_info *sbi = ll_i2sbi(inode);
281 struct ll_inode_info *lli = ll_i2info(inode);
282 struct lov_stripe_md *lsm = lli->lli_smd;
286 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
287 inode->i_generation, inode);
289 /* don't do anything for / */
290 if (inode->i_sb->s_root == file->f_dentry)
293 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
294 fd = LUSTRE_FPRIVATE(file);
297 /* don't do anything for / */
298 if (inode->i_sb->s_root == file->f_dentry) {
299 LUSTRE_FPRIVATE(file) = NULL;
300 ll_file_data_put(fd);
/* surface any deferred async write error from the stripes at close time */
305 lov_test_and_clear_async_rc(lsm);
306 lli->lli_async_rc = 0;
308 rc = ll_md_close(sbi->ll_md_exp, inode, file);
/*
 * Issue an MDS intent-open for @file (used e.g. when no cached open handle
 * exists).  Requests an OPEN lock unless stripe info (@lmm/@lmmsize) is
 * being set, sends the intent via md_intent_lock(), releases a successfully
 * created-but-unneeded open handle on the -ESTALE path, attaches the DLM
 * lock to the inode, and refreshes the inode from the reply.
 * NOTE(review): lines are elided in this excerpt (IS_ERR check body,
 * GOTO/RETURN lines); comments describe only the visible code.
 */
312 static int ll_intent_file_open(struct file *file, void *lmm,
313 int lmmsize, struct lookup_intent *itp)
315 struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode);
316 struct dentry *parent = file->f_dentry->d_parent;
317 const char *name = file->f_dentry->d_name.name;
318 const int len = file->f_dentry->d_name.len;
319 struct md_op_data *op_data;
320 struct ptlrpc_request *req;
326 /* Usually we come here only for NFSD, and we want open lock.
327 But we can also get here with pre 2.6.15 patchless kernels, and in
328 that case that lock is also ok */
329 /* We can also get here if there was cached open handle in revalidate_it
330 * but it disappeared while we were getting from there to ll_file_open.
331 * But this means this file was closed and immediately opened which
332 * makes a good candidate for using OPEN lock */
333 /* If lmmsize & lmm are not 0, we are just setting stripe info
334 * parameters. No need for the open lock */
335 if (!lmm && !lmmsize)
336 itp->it_flags |= MDS_OPEN_LOCK;
338 op_data = ll_prep_md_op_data(NULL, parent->d_inode,
339 file->f_dentry->d_inode, name, len,
340 O_RDWR, LUSTRE_OPC_ANY, NULL);
342 RETURN(PTR_ERR(op_data));
344 rc = md_intent_lock(sbi->ll_md_exp, op_data, lmm, lmmsize, itp,
345 0 /*unused */, &req, ll_md_blocking_ast, 0);
346 ll_finish_md_op_data(op_data);
348 /* reason for keep own exit path - don't flood log
349 * with messages with -ESTALE errors.
351 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
352 it_open_error(DISP_OPEN_OPEN, itp))
354 ll_release_openhandle(file->f_dentry, itp);
358 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
359 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
360 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
/* bind the granted DLM lock to the inode so cancellation finds it */
364 if (itp->d.lustre.it_lock_mode)
365 md_set_lock_data(sbi->ll_md_exp,
366 &itp->d.lustre.it_lock_handle,
367 file->f_dentry->d_inode);
369 rc = ll_prep_inode(&file->f_dentry->d_inode, req, DLM_REPLY_REC_OFF,
372 ptlrpc_req_finished(itp->d.lustre.it_data);
375 it_clear_disposition(itp, DISP_ENQ_COMPLETE);
376 ll_intent_drop_lock(itp);
/*
 * Fill an obd_client_handle @och from the MDS reply carried in @it:
 * copy the open file handle, set magic/fid/flags, record the I/O epoch
 * on the inode, and register the open for replay.
 * Returns the result of md_set_open_replay_data().
 */
381 static int ll_och_fill(struct obd_export *md_exp, struct ll_inode_info *lli,
382 struct lookup_intent *it, struct obd_client_handle *och)
384 struct ptlrpc_request *req = it->d.lustre.it_data;
385 struct mdt_body *body;
389 body = lustre_msg_buf(req->rq_repmsg, DLM_REPLY_REC_OFF, sizeof(*body));
390 /* reply already checked out */
391 LASSERT(body != NULL);
392 /* and swabbed in md_enqueue */
393 LASSERT(lustre_rep_swabbed(req, DLM_REPLY_REC_OFF));
395 memcpy(&och->och_fh, &body->handle, sizeof(body->handle));
396 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
397 och->och_fid = lli->lli_fid;
398 och->och_flags = it->it_flags;
399 lli->lli_ioepoch = body->ioepoch;
401 return md_set_open_replay_data(md_exp, och, req);
/*
 * Finish an open locally: if @och is given, fill it from the intent reply
 * (ll_och_fill) and log the opened epoch; then attach @fd as the file's
 * private data, initialise read-ahead state, and record the open mode.
 * NOTE(review): lines are elided in this excerpt (the branch structure
 * around the och != NULL case, RETURN); comments describe only the
 * visible code.
 */
404 int ll_local_open(struct file *file, struct lookup_intent *it,
405 struct ll_file_data *fd, struct obd_client_handle *och)
407 struct inode *inode = file->f_dentry->d_inode;
408 struct ll_inode_info *lli = ll_i2info(inode);
411 LASSERT(!LUSTRE_FPRIVATE(file));
416 struct ptlrpc_request *req = it->d.lustre.it_data;
417 struct mdt_body *body;
420 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, lli, it, och);
424 body = lustre_msg_buf(req->rq_repmsg,
425 DLM_REPLY_REC_OFF, sizeof(*body));
427 if ((it->it_flags & FMODE_WRITE) &&
428 (body->valid & OBD_MD_FLSIZE))
430 CDEBUG(D_INODE, "Epoch "LPU64" opened on "DFID"\n",
431 lli->lli_ioepoch, PFID(&lli->lli_fid));
435 LUSTRE_FPRIVATE(file) = fd;
436 ll_readahead_init(inode, &fd->fd_ras);
437 fd->fd_omode = it->it_flags;
441 /* Open a file, and (for the very first open) create objects on the OSTs at
442 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
443 * creation or open until ll_lov_setstripe() ioctl is called. We grab
444 * lli_open_sem to ensure no other process will create objects, send the
445 * stripe MD to the MDS, or try to destroy the objects if that fails.
447 * If we already have the stripe MD locally then we don't request it in
448 * md_open(), by passing a lmm_size = 0.
450 * It is up to the application to ensure no other processes open this file
451 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
452 * used. We might be able to avoid races of that sort by getting lli_open_sem
453 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
454 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/*
 * VFS ->open() entry point.  Either reuses a cached MDS open handle of the
 * matching mode under lli_och_sem, or builds an IT_OPEN intent and sends it
 * via ll_intent_file_open(), then completes with ll_local_open().
 * NOTE(review): this excerpt elides many lines (RETURNs, GOTO targets,
 * the #else branch of HAVE_VFS_INTENT_PATCHES, och_usecount updates);
 * comments describe only the visible code.
 */
456 int ll_file_open(struct inode *inode, struct file *file)
458 struct ll_inode_info *lli = ll_i2info(inode);
459 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
460 .it_flags = file->f_flags };
461 struct lov_stripe_md *lsm;
462 struct ptlrpc_request *req = NULL;
463 struct obd_client_handle **och_p;
465 struct ll_file_data *fd;
469 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), flags %o\n", inode->i_ino,
470 inode->i_generation, inode, file->f_flags);
472 /* don't do anything for / */
473 if (inode->i_sb->s_root == file->f_dentry)
476 #ifdef HAVE_VFS_INTENT_PATCHES
479 it = file->private_data; /* XXX: compat macro */
480 file->private_data = NULL; /* prevent ll_local_open assertion */
483 fd = ll_file_data_get();
487 /* don't do anything for / */
488 if (inode->i_sb->s_root == file->f_dentry) {
489 LUSTRE_FPRIVATE(file) = fd;
/* no intent from the VFS (or nothing resolved): build our own IT_OPEN */
493 if (!it || !it->d.lustre.it_disposition) {
494 /* Convert f_flags into access mode. We cannot use file->f_mode,
495 * because everything but O_ACCMODE mask was stripped from
497 if ((oit.it_flags + 1) & O_ACCMODE)
499 if (file->f_flags & O_TRUNC)
500 oit.it_flags |= FMODE_WRITE;
502 /* kernel only call f_op->open in dentry_open. filp_open calls
503 * dentry_open after call to open_namei that checks permissions.
504 * Only nfsd_open call dentry_open directly without checking
505 * permissions and because of that this code below is safe. */
506 if (oit.it_flags & FMODE_WRITE)
507 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
509 /* We do not want O_EXCL here, presumably we opened the file
510 * already? XXX - NFS implications? */
511 oit.it_flags &= ~O_EXCL;
516 /* Let's see if we have file open on MDS already. */
517 if (it->it_flags & FMODE_WRITE) {
518 och_p = &lli->lli_mds_write_och;
519 och_usecount = &lli->lli_open_fd_write_count;
520 } else if (it->it_flags & FMODE_EXEC) {
521 och_p = &lli->lli_mds_exec_och;
522 och_usecount = &lli->lli_open_fd_exec_count;
524 och_p = &lli->lli_mds_read_och;
525 och_usecount = &lli->lli_open_fd_read_count;
528 down(&lli->lli_och_sem);
529 if (*och_p) { /* Open handle is present */
530 if (it_disposition(it, DISP_OPEN_OPEN)) {
531 /* Well, there's extra open request that we do not need,
532 let's close it somehow. This will decref request. */
533 rc = it_open_error(DISP_OPEN_OPEN, it);
535 ll_file_data_put(fd);
536 GOTO(out_och_free, rc);
538 ll_release_openhandle(file->f_dentry, it);
539 lprocfs_counter_incr(ll_i2sbi(inode)->ll_stats,
/* reuse the cached handle: NULL och means "do not fill a new one" */
544 rc = ll_local_open(file, it, fd, NULL);
546 up(&lli->lli_och_sem);
547 ll_file_data_put(fd);
551 LASSERT(*och_usecount == 0);
552 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
554 ll_file_data_put(fd);
555 GOTO(out_och_free, rc = -ENOMEM);
558 if (!it->d.lustre.it_disposition) {
/* O_CHECK_STALE guards the ESTALE-prone NFS reopen path */
559 it->it_flags |= O_CHECK_STALE;
560 rc = ll_intent_file_open(file, NULL, 0, it);
561 it->it_flags &= ~O_CHECK_STALE;
563 ll_file_data_put(fd);
564 GOTO(out_och_free, rc);
567 /* Got some error? Release the request */
568 if (it->d.lustre.it_status < 0) {
569 req = it->d.lustre.it_data;
570 ptlrpc_req_finished(req);
572 md_set_lock_data(ll_i2sbi(inode)->ll_md_exp,
573 &it->d.lustre.it_lock_handle,
574 file->f_dentry->d_inode);
576 req = it->d.lustre.it_data;
578 /* md_intent_lock() didn't get a request ref if there was an
579 * open error, so don't do cleanup on the request here
581 /* XXX (green): Should not we bail out on any error here, not
582 * just open error? */
583 rc = it_open_error(DISP_OPEN_OPEN, it);
585 ll_file_data_put(fd);
586 GOTO(out_och_free, rc);
589 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
590 rc = ll_local_open(file, it, fd, *och_p);
592 up(&lli->lli_och_sem);
593 ll_file_data_put(fd);
594 GOTO(out_och_free, rc);
597 up(&lli->lli_och_sem);
599 /* Must do this outside lli_och_sem lock to prevent deadlock where
600 different kind of OPEN lock for this same inode gets cancelled
601 by ldlm_cancel_lru */
602 if (!S_ISREG(inode->i_mode))
609 if (file->f_flags & O_LOV_DELAY_CREATE ||
610 !(file->f_mode & FMODE_WRITE)) {
611 CDEBUG(D_INODE, "object creation was delayed\n");
615 file->f_flags &= ~O_LOV_DELAY_CREATE;
618 ptlrpc_req_finished(req);
620 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
624 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
625 *och_p = NULL; /* OBD_FREE writes some magic there */
628 up(&lli->lli_och_sem);
634 /* Fills the obdo with the attributes for the inode defined by lsm */
/*
 * Fetch OST attributes (size, blocks, times, ...) for the inode's stripe
 * objects via an async getattr on a private request set, then fold the
 * returned values back into the inode with obdo_refresh_inode().
 * NOTE(review): lines are elided in this excerpt (obdo setup, RETURNs,
 * error paths); comments describe only the visible code.
 */
635 int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
637 struct ptlrpc_request_set *set;
638 struct ll_inode_info *lli = ll_i2info(inode);
639 struct lov_stripe_md *lsm = lli->lli_smd;
641 struct obd_info oinfo = { { { 0 } } };
645 LASSERT(lsm != NULL);
649 oinfo.oi_oa->o_id = lsm->lsm_object_id;
650 oinfo.oi_oa->o_gr = lsm->lsm_object_gr;
651 oinfo.oi_oa->o_mode = S_IFREG;
652 oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE |
653 OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
654 OBD_MD_FLBLKSZ | OBD_MD_FLATIME |
655 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
657 oinfo.oi_capa = ll_mdscapa_get(inode);
659 set = ptlrpc_prep_set();
661 CERROR("can't allocate ptlrpc set\n");
664 rc = obd_getattr_async(ll_i2dtexp(inode), &oinfo, set);
666 rc = ptlrpc_set_wait(set);
667 ptlrpc_set_destroy(set);
669 capa_put(oinfo.oi_capa);
/* only trust the fields the OSTs actually own */
673 oinfo.oi_oa->o_valid &= (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ |
674 OBD_MD_FLATIME | OBD_MD_FLMTIME |
675 OBD_MD_FLCTIME | OBD_MD_FLSIZE);
677 obdo_refresh_inode(inode, oinfo.oi_oa, oinfo.oi_oa->o_valid);
678 CDEBUG(D_INODE, "objid "LPX64" size %Lu, blocks %lu, blksize %lu\n",
679 lli->lli_smd->lsm_object_id, i_size_read(inode),
680 inode->i_blocks, inode->i_blksize);
/*
 * Strip setuid (and conditionally setgid) bits from the inode's mode if
 * any are set and the caller lacks CAP_FSETID -- the usual "remove suid
 * on write" semantic.
 * NOTE(review): lines are elided in this excerpt (the mode declaration and
 * the body that applies the change); comments describe only visible code.
 */
684 static inline void ll_remove_suid(struct inode *inode)
688 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
689 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
691 /* was any of the uid bits set? */
692 mode &= inode->i_mode;
693 if (mode && !capable(CAP_FSETID)) {
694 inode->i_mode &= ~mode;
695 // XXX careful here - we cannot change the size
/*
 * Map a per-stripe DLM extent lock back to the stripe index within the
 * file's lov_stripe_md.  Single-striped files short-circuit to stripe 0;
 * otherwise the LOV is queried via obd_get_info("lock_to_stripe").
 * Sanity-checks that the lock's resource matches the stripe's object
 * id/group; returns -ELDLM_NO_LOCK_DATA on mismatch.
 * NOTE(review): lines are elided here (the key struct header, the 'check:'
 * label, final RETURN); comments describe only the visible code.
 */
699 static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
701 struct ll_inode_info *lli = ll_i2info(inode);
702 struct lov_stripe_md *lsm = lli->lli_smd;
703 struct obd_export *exp = ll_i2dtexp(inode);
706 struct ldlm_lock *lock;
707 struct lov_stripe_md *lsm;
708 } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm };
709 __u32 stripe, vallen = sizeof(stripe);
713 if (lsm->lsm_stripe_count == 1)
714 GOTO(check, stripe = 0);
716 /* get our offset in the lov */
717 rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe);
719 CERROR("obd_get_info: rc = %d\n", rc);
722 LASSERT(stripe < lsm->lsm_stripe_count);
/* resource name words [0] and [2] carry the object id and group */
725 if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]||
726 lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[2]){
727 LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
728 lsm->lsm_oinfo[stripe]->loi_id,
729 lsm->lsm_oinfo[stripe]->loi_gr);
730 RETURN(-ELDLM_NO_LOCK_DATA);
736 /* Flush the page cache for an extent as it's canceled. When we're on an LOV,
737 * we get a lock cancellation for each stripe, so we have to map the obd's
738 * region back onto the stripes in the file that it held.
740 * No one can dirty the extent until we've finished our work and they can
741 * enqueue another lock. The DLM protects us from ll_file_read/write here,
742 * but other kernel actors could have pages locked.
744 * Called with the DLM lock held. */
/*
 * NOTE(review): many lines are elided in this excerpt (page locking,
 * several loop bodies, EXIT); comments below describe only visible code.
 */
745 void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm,
746 struct ldlm_lock *lock, __u32 stripe)
748 ldlm_policy_data_t tmpex;
749 unsigned long start, end, count, skip, i, j;
751 int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA;
752 struct lustre_handle lockh;
753 struct address_space *mapping = inode->i_mapping;
756 tmpex = lock->l_policy_data;
757 CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n",
758 inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end,
761 /* our locks are page granular thanks to osc_enqueue, we invalidate the
/* sanity: extent boundaries must be page aligned */
763 if ((tmpex.l_extent.start & ~CFS_PAGE_MASK) != 0 ||
764 ((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) != 0)
765 LDLM_ERROR(lock, "lock not aligned on PAGE_SIZE %lu",
767 LASSERT((tmpex.l_extent.start & ~CFS_PAGE_MASK) == 0);
768 LASSERT(((tmpex.l_extent.end + 1) & ~CFS_PAGE_MASK) == 0);
/* map stripe-object byte offsets onto file page indices */
772 start = tmpex.l_extent.start >> CFS_PAGE_SHIFT;
773 end = tmpex.l_extent.end >> CFS_PAGE_SHIFT;
774 if (lsm->lsm_stripe_count > 1) {
775 count = lsm->lsm_stripe_size >> CFS_PAGE_SHIFT;
776 skip = (lsm->lsm_stripe_count - 1) * count;
777 start += start/count * skip + stripe * count;
779 end += end/count * skip + stripe * count;
781 if (end < tmpex.l_extent.end >> CFS_PAGE_SHIFT)
784 i = i_size_read(inode) ? (__u64)(i_size_read(inode) - 1) >>
789 CDEBUG(D_INODE|D_PAGE, "walking page indices start: %lu j: %lu "
790 "count: %lu skip: %lu end: %lu%s\n", start, start % count,
791 count, skip, end, discard ? " (DISCARDING)" : "");
793 /* walk through the vmas on the inode and tear down mmaped pages that
794 * intersect with the lock. this stops immediately if there are no
795 * mmap()ed regions of the file. This is not efficient at all and
796 * should be short lived. We'll associate mmap()ed pages with the lock
797 * and will be able to find them directly */
798 for (i = start; i <= end; i += (j + skip)) {
799 j = min(count - (i % count), end - i + 1);
802 if (ll_teardown_mmaps(mapping,
803 (__u64)i << CFS_PAGE_SHIFT,
804 ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) )
808 /* this is the simplistic implementation of page eviction at
809 * cancelation. It is careful to get races with other page
810 * lockers handled correctly. fixes from bug 20 will make it
811 * more efficient by associating locks with pages and with
812 * batching writeback under the lock explicitly. */
813 for (i = start, j = start % count; i <= end;
814 j++, i++, tmpex.l_extent.start += CFS_PAGE_SIZE) {
816 CDEBUG(D_PAGE, "skip index %lu to %lu\n", i, i + skip);
822 LASSERTF(tmpex.l_extent.start< lock->l_policy_data.l_extent.end,
823 LPU64" >= "LPU64" start %lu i %lu end %lu\n",
824 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
827 if (!mapping_has_pages(mapping)) {
828 CDEBUG(D_INODE|D_PAGE, "nothing left\n");
834 page = find_get_page(mapping, i);
837 LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n",
838 i, tmpex.l_extent.start);
841 /* page->mapping to check with racing against teardown */
842 if (!discard && clear_page_dirty_for_io(page)) {
843 rc = ll_call_writepage(inode, page);
844 /* either waiting for io to complete or reacquiring
845 * the lock that the failed writepage released */
847 wait_on_page_writeback(page);
849 CERROR("writepage inode %lu(%p) of page %p "
850 "failed: %d\n", inode->i_ino, inode,
853 set_bit(AS_ENOSPC, &mapping->flags);
855 set_bit(AS_EIO, &mapping->flags);
859 tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
860 /* check to see if another DLM lock covers this page b=2765 */
861 rc2 = ldlm_lock_match(lock->l_resource->lr_namespace,
862 LDLM_FL_BLOCK_GRANTED|LDLM_FL_CBPENDING |
864 &lock->l_resource->lr_name, LDLM_EXTENT,
865 &tmpex, LCK_PR | LCK_PW, &lockh);
/* no other lock covers this page: safe to truncate it out of cache */
867 if (rc2 <= 0 && page->mapping != NULL) {
868 struct ll_async_page *llap = llap_cast_private(page);
869 /* checking again to account for writeback's
871 LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
873 ll_ra_accounting(llap, mapping);
874 ll_truncate_complete_page(page);
877 page_cache_release(page);
879 LASSERTF(tmpex.l_extent.start <=
880 (lock->l_policy_data.l_extent.end == ~0ULL ? ~0ULL :
881 lock->l_policy_data.l_extent.end + 1),
882 "loop too long "LPU64" > "LPU64" start %lu i %lu end %lu\n",
883 tmpex.l_extent.start, lock->l_policy_data.l_extent.end,
/*
 * Blocking/cancel AST for client extent locks.  On LDLM_CB_BLOCKING the
 * lock is cancelled; on LDLM_CB_CANCELING the covered pages are flushed
 * from the page cache (ll_pgcache_remove_extent) and the stripe's
 * known-minimum-size (kms) is recomputed under the stripe and resource
 * locks.
 * NOTE(review): lines are elided in this excerpt (switch header, inode
 * checks, iput/RETURN); comments describe only the visible code.
 */
888 static int ll_extent_lock_callback(struct ldlm_lock *lock,
889 struct ldlm_lock_desc *new, void *data,
892 struct lustre_handle lockh = { 0 };
/* small non-NULL pointers indicate corrupted cbdata */
896 if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
897 LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
902 case LDLM_CB_BLOCKING:
903 ldlm_lock2handle(lock, &lockh);
904 rc = ldlm_cli_cancel(&lockh);
906 CERROR("ldlm_cli_cancel failed: %d\n", rc);
908 case LDLM_CB_CANCELING: {
910 struct ll_inode_info *lli;
911 struct lov_stripe_md *lsm;
915 /* This lock wasn't granted, don't try to evict pages */
916 if (lock->l_req_mode != lock->l_granted_mode)
919 inode = ll_inode_from_lock(lock);
922 lli = ll_i2info(inode);
925 if (lli->lli_smd == NULL)
929 stripe = ll_lock_to_stripe_offset(inode, lock);
933 ll_pgcache_remove_extent(inode, lsm, lock, stripe);
/* recompute kms with this lock excluded, under both locks */
935 lov_stripe_lock(lsm);
936 lock_res_and_lock(lock);
937 kms = ldlm_extent_shift_kms(lock,
938 lsm->lsm_oinfo[stripe]->loi_kms);
940 if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
941 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
942 lsm->lsm_oinfo[stripe]->loi_kms, kms);
943 lsm->lsm_oinfo[stripe]->loi_kms = kms;
944 unlock_res_and_lock(lock);
945 lov_stripe_unlock(lsm);
/*
 * Completion AST for client-side async extent enqueues.  Blocked states
 * are not expected (LBUG).  On grant, map the lock to its stripe, update
 * the stripe's rss/kms from the lock value block (LVB), wake any waiters,
 * and drop the PR reference taken for the enqueue.
 * NOTE(review): lines are elided in this excerpt (lvb declaration, iput,
 * RETURN); comments describe only the visible code.
 */
958 int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
960 /* XXX ALLOCATE - 160 bytes */
961 struct inode *inode = ll_inode_from_lock(lock);
962 struct ll_inode_info *lli = ll_i2info(inode);
963 struct lustre_handle lockh = { 0 };
968 if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
969 LDLM_FL_BLOCK_CONV)) {
970 LBUG(); /* not expecting any blocked async locks yet */
971 LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
973 ldlm_lock_dump(D_OTHER, lock, 0);
974 ldlm_reprocess_all(lock->l_resource);
978 LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
980 stripe = ll_lock_to_stripe_offset(inode, lock);
/* LVB present: fold the server-reported size into rss/kms */
984 if (lock->l_lvb_len) {
985 struct lov_stripe_md *lsm = lli->lli_smd;
987 lvb = lock->l_lvb_data;
988 lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
990 lock_res_and_lock(lock);
991 ll_inode_size_lock(inode, 1);
992 kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
993 kms = ldlm_extent_shift_kms(NULL, kms);
994 if (lsm->lsm_oinfo[stripe].loi_kms != kms)
995 LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
996 lsm->lsm_oinfo[stripe].loi_kms, kms);
997 lsm->lsm_oinfo[stripe].loi_kms = kms;
998 ll_inode_size_unlock(inode, 1);
999 unlock_res_and_lock(lock);
1004 wake_up(&lock->l_waitq);
1006 ldlm_lock2handle(lock, &lockh);
1007 ldlm_lock_decref(&lockh, LCK_PR);
/*
 * Glimpse AST: another client is asking for this client's view of the
 * object size/times.  Locate the stripe for the lock and reply with an
 * ost_lvb built from the local kms and inode times.  -ELDLM_NO_LOCK_DATA
 * responses are normal races and are answered with an empty reply rather
 * than ptlrpc_error().
 * NOTE(review): lines are elided in this excerpt (NULL checks feeding the
 * GOTOs, out/iput labels, RETURN); comments describe only visible code.
 */
1012 static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
1014 struct ptlrpc_request *req = reqp;
1015 struct inode *inode = ll_inode_from_lock(lock);
1016 struct ll_inode_info *lli;
1017 struct lov_stripe_md *lsm;
1018 struct ost_lvb *lvb;
1020 int size[2] = { sizeof(struct ptlrpc_body), sizeof(*lvb) };
1024 GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
1025 lli = ll_i2info(inode);
1027 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1030 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1032 /* First, find out which stripe index this lock corresponds to. */
1033 stripe = ll_lock_to_stripe_offset(inode, lock);
1035 GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
1037 rc = lustre_pack_reply(req, 2, size, NULL);
1041 lvb = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*lvb));
/* report our known minimum size and current inode times */
1042 lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
1043 lvb->lvb_mtime = LTIME_S(inode->i_mtime);
1044 lvb->lvb_atime = LTIME_S(inode->i_atime);
1045 lvb->lvb_ctime = LTIME_S(inode->i_ctime);
1047 LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
1048 " atime "LPU64", mtime "LPU64", ctime "LPU64,
1049 i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
1050 lvb->lvb_atime, lvb->lvb_ctime);
1055 /* These errors are normal races, so we don't want to fill the console
1056 * with messages by calling ptlrpc_error() */
1057 if (rc == -ELDLM_NO_LOCK_DATA)
1058 lustre_pack_reply(req, 1, NULL, NULL);
1060 req->rq_status = rc;
/*
 * Merge the per-stripe lock value blocks into the inode: under the inode
 * size lock, ask the LOV for the combined size/blocks/times and write them
 * into the inode fields.
 * NOTE(review): lvb declaration and RETURN are elided in this excerpt.
 */
1064 static int ll_merge_lvb(struct inode *inode)
1066 struct ll_inode_info *lli = ll_i2info(inode);
1067 struct ll_sb_info *sbi = ll_i2sbi(inode);
1073 ll_inode_size_lock(inode, 1);
1074 inode_init_lvb(inode, &lvb);
1075 rc = obd_merge_lvb(sbi->ll_dt_exp, lli->lli_smd, &lvb, 0);
1076 i_size_write(inode, lvb.lvb_size);
1077 inode->i_blocks = lvb.lvb_blocks;
1079 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1080 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1081 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1082 ll_inode_size_unlock(inode, 1);
/*
 * Try to satisfy a size request entirely from locally cached DLM locks:
 * match a PR lock over [0, EOF]; on success merge the cached LVBs into the
 * inode and drop the matched reference via obd_cancel().
 * NOTE(review): lines are elided in this excerpt (flags declaration,
 * RETURNs, the no-match path); comments describe only the visible code.
 */
1087 int ll_local_size(struct inode *inode)
1089 ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
1090 struct ll_inode_info *lli = ll_i2info(inode);
1091 struct ll_sb_info *sbi = ll_i2sbi(inode);
1092 struct lustre_handle lockh = { 0 };
1097 if (lli->lli_smd->lsm_stripe_count == 0)
1100 rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
1101 &policy, LCK_PR, &flags, inode, &lockh);
1107 rc = ll_merge_lvb(inode);
1108 obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
/*
 * Glimpse an arbitrary stripe object (ioctl path): enqueue a PR intent
 * ("glimpse") lock over the whole object, then merge the per-stripe LVBs
 * into the caller's stat structure under the stripe lock.
 * NOTE(review): lines are elided in this excerpt (st parameter in the
 * signature, lvb declaration, RETURN); comments describe only visible code.
 */
1112 int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
1115 struct lustre_handle lockh = { 0 };
1116 struct ldlm_enqueue_info einfo = { 0 };
1117 struct obd_info oinfo = { { { 0 } } };
1123 einfo.ei_type = LDLM_EXTENT;
1124 einfo.ei_mode = LCK_PR;
1125 einfo.ei_cb_bl = ll_extent_lock_callback;
1126 einfo.ei_cb_cp = ldlm_completion_ast;
1127 einfo.ei_cb_gl = ll_glimpse_callback;
1128 einfo.ei_cbdata = NULL;
1130 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1131 oinfo.oi_lockh = &lockh;
/* HAS_INTENT makes this a glimpse, not a real revoking enqueue */
1133 oinfo.oi_flags = LDLM_FL_HAS_INTENT;
1135 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1139 CERROR("obd_enqueue returned rc %d, "
1140 "returning -EIO\n", rc);
1141 RETURN(rc > 0 ? -EIO : rc);
1144 lov_stripe_lock(lsm);
1145 memset(&lvb, 0, sizeof(lvb));
1146 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
1147 st->st_size = lvb.lvb_size;
1148 st->st_blocks = lvb.lvb_blocks;
1149 st->st_mtime = lvb.lvb_mtime;
1150 st->st_atime = lvb.lvb_atime;
1151 st->st_ctime = lvb.lvb_ctime;
1152 lov_stripe_unlock(lsm);
1157 /* NB: obd_merge_lvb will prefer locally cached writes if they extend the
1158 * file (because it prefers KMS over RSS when larger) */
/*
 * Refresh the inode size via a glimpse: skip entirely if the MDS holds the
 * authoritative size (LLIF_MDS_SIZE_LOCK) or the file has no objects;
 * otherwise enqueue a PR glimpse lock over [0, EOF] and merge the returned
 * LVBs into the inode with ll_merge_lvb().
 * NOTE(review): lines are elided in this excerpt (RETURNs after the early
 * checks); comments describe only the visible code.
 */
1159 int ll_glimpse_size(struct inode *inode, int ast_flags)
1161 struct ll_inode_info *lli = ll_i2info(inode);
1162 struct ll_sb_info *sbi = ll_i2sbi(inode);
1163 struct lustre_handle lockh = { 0 };
1164 struct ldlm_enqueue_info einfo = { 0 };
1165 struct obd_info oinfo = { { { 0 } } };
/* MDS holds the authoritative size: no OST glimpse needed */
1169 if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
1172 CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
1174 if (!lli->lli_smd) {
1175 CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
1179 /* NOTE: this looks like DLM lock request, but it may not be one. Due
1180 * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
1181 * won't revoke any conflicting DLM locks held. Instead,
1182 * ll_glimpse_callback() will be called on each client
1183 * holding a DLM lock against this file, and resulting size
1184 * will be returned for each stripe. DLM lock on [0, EOF] is
1185 * acquired only if there were no conflicting locks. */
1186 einfo.ei_type = LDLM_EXTENT;
1187 einfo.ei_mode = LCK_PR;
1188 einfo.ei_cb_bl = ll_extent_lock_callback;
1189 einfo.ei_cb_cp = ldlm_completion_ast;
1190 einfo.ei_cb_gl = ll_glimpse_callback;
1191 einfo.ei_cbdata = inode;
1193 oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
1194 oinfo.oi_lockh = &lockh;
1195 oinfo.oi_md = lli->lli_smd;
1196 oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
1198 rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
1202 CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
1203 RETURN(rc > 0 ? -EIO : rc);
1206 rc = ll_merge_lvb(inode);
1208 CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %lu\n",
1209 i_size_read(inode), inode->i_blocks);
/* Take a client extent DLM lock of @mode over @policy's extent on @inode's
 * stripes, returning the handle in @lockh, then merge the lock value block
 * (size/times) into the inode under ll_inode_size_lock().
 * NOTE(review): gaps in the embedded line numbers show elided statements
 * (trailing ast_flags parameter line, rc/lvb declarations, RETURNs). */
1214 int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
1215 struct lov_stripe_md *lsm, int mode,
1216 ldlm_policy_data_t *policy, struct lustre_handle *lockh,
1219 struct ll_sb_info *sbi = ll_i2sbi(inode);
1221 struct ldlm_enqueue_info einfo = { 0 };
1222 struct obd_info oinfo = { { { 0 } } };
/* Caller must pass a fresh handle and real stripe metadata. */
1226 LASSERT(!lustre_handle_is_used(lockh));
1227 LASSERT(lsm != NULL);
1229 /* don't drop the mmapped file to LRU */
1230 if (mapping_mapped(inode->i_mapping))
1231 ast_flags |= LDLM_FL_NO_LRU;
1233 /* XXX phil: can we do this? won't it screw the file size up? */
/* Locking is bypassed for files opened with LL_FILE_IGNORE_LOCK or on a
 * nolock mount (elided early-return body follows this condition). */
1234 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1235 (sbi->ll_flags & LL_SBI_NOLCK))
1238 CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
1239 inode->i_ino, policy->l_extent.start, policy->l_extent.end);
1241 einfo.ei_type = LDLM_EXTENT;
1242 einfo.ei_mode = mode;
1243 einfo.ei_cb_bl = ll_extent_lock_callback;
1244 einfo.ei_cb_cp = ldlm_completion_ast;
1245 einfo.ei_cb_gl = ll_glimpse_callback;
1246 einfo.ei_cbdata = inode;
1248 oinfo.oi_policy = *policy;
1249 oinfo.oi_lockh = lockh;
1251 oinfo.oi_flags = ast_flags;
1253 rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
/* The server may have granted a larger extent than requested; report the
 * actual granted policy back to the caller. */
1254 *policy = oinfo.oi_policy;
1258 ll_inode_size_lock(inode, 1);
1259 inode_init_lvb(inode, &lvb);
1260 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
/* Only a full-object [0, EOF] lock makes the merged KMS authoritative
 * enough to write it into i_size. */
1262 if (policy->l_extent.start == 0 &&
1263 policy->l_extent.end == OBD_OBJECT_EOF) {
1264 /* vmtruncate()->ll_truncate() first sets the i_size and then
1265 * the kms under both a DLM lock and the
1266 * ll_inode_size_lock(). If we don't get the
1267 * ll_inode_size_lock() here we can match the DLM lock and
1268 * reset i_size from the kms before the truncating path has
1269 * updated the kms. generic_file_write can then trust the
1270 * stale i_size when doing appending writes and effectively
1271 * cancel the result of the truncate. Getting the
1272 * ll_inode_size_lock() after the enqueue maintains the DLM
1273 * -> ll_inode_size_lock() acquiring order. */
1274 i_size_write(inode, lvb.lvb_size);
1275 CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
1276 inode->i_ino, i_size_read(inode));
/* Timestamps are always refreshed from the merged LVB. */
1280 LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
1281 LTIME_S(inode->i_atime) = lvb.lvb_atime;
1282 LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
1284 ll_inode_size_unlock(inode, 1);
/* Release an extent lock previously taken by ll_extent_lock(): cancels the
 * DLM lock identified by @lockh/@mode on @lsm's stripes via obd_cancel().
 * Mirrors ll_extent_lock()'s no-lock bypass for IGNORE_LOCK/nolck mounts. */
1289 int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
1290 struct lov_stripe_md *lsm, int mode,
1291 struct lustre_handle *lockh)
1293 struct ll_sb_info *sbi = ll_i2sbi(inode);
1297 /* XXX phil: can we do this? won't it screw the file size up? */
1298 if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
1299 (sbi->ll_flags & LL_SBI_NOLCK))
1302 rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
/* read(2) entry point: take a PR extent tree lock over the region, make
 * i_size cover the read (glimpsing if the request extends past the known
 * minimum size), then service the read through generic_file_read() in
 * chunks bounded by sbi->ll_max_rw_chunk / stripe boundaries.
 * NOTE(review): gaps in the embedded line numbers show elided statements
 * (loop/brace lines, several declarations and RETURNs). */
1307 static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
1310 struct inode *inode = file->f_dentry->d_inode;
1311 struct ll_inode_info *lli = ll_i2info(inode);
1312 struct lov_stripe_md *lsm = lli->lli_smd;
1313 struct ll_sb_info *sbi = ll_i2sbi(inode);
1314 struct ll_lock_tree tree;
1315 struct ll_lock_tree_node *node;
1317 struct ll_ra_read bead;
1320 ssize_t retval, chunk, sum = 0;
1324 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1325 inode->i_ino, inode->i_generation, inode, count, *ppos);
1326 /* "If nbyte is 0, read() will return 0 and have no other results."
1327 * -- Single Unix Spec */
1331 ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
1334 /* Read on file with no objects should return zero-filled
1335 * buffers up to file size (we can get non-zero sizes with
1336 * mknod + truncate, then opening file for read. This is a
1337 * common pattern in NFS case, it seems). Bug 6243 */
1339 /* Since there are no objects on OSTs, we have nothing to get
1340 * lock on and so we are forced to access inode->i_size
1343 /* Read beyond end of file */
1344 if (*ppos >= i_size_read(inode))
/* Clamp the request to EOF for the object-less case. */
1347 if (count > i_size_read(inode) - *ppos)
1348 count = i_size_read(inode) - *ppos;
1349 /* Make sure to correctly adjust the file pos pointer for
/* Zero-fill the user buffer; clear_user() returns bytes NOT zeroed. */
1351 notzeroed = clear_user(buf, count);
/* Chunked path: limit each locked I/O to at most one stripe-aligned
 * chunk of ll_max_rw_chunk bytes. */
1360 if (sbi->ll_max_rw_chunk != 0) {
1361 /* first, let's know the end of the current stripe */
1363 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1366 /* correct, the end is beyond the request */
1367 if (end > *ppos + count - 1)
1368 end = *ppos + count - 1;
1370 /* and chunk shouldn't be too large even if striping is wide */
1371 if (end - *ppos > sbi->ll_max_rw_chunk)
1372 end = *ppos + sbi->ll_max_rw_chunk - 1;
1374 end = *ppos + count - 1;
/* Lock the [ppos, end] extent with a PR lock tree node. */
1377 node = ll_node_from_inode(inode, *ppos, end, LCK_PR);
1379 GOTO(out, retval = PTR_ERR(node));
1382 tree.lt_fd = LUSTRE_FPRIVATE(file);
1383 rc = ll_tree_lock(&tree, node, buf, count,
1384 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1386 GOTO(out, retval = rc);
1388 ll_inode_size_lock(inode, 1);
1390 * Consistency guarantees: following possibilities exist for the
1391 * relation between region being read and real file size at this
1394 * (A): the region is completely inside of the file;
1396 * (B-x): x bytes of region are inside of the file, the rest is
1399 * (C): the region is completely outside of the file.
1401 * This classification is stable under DLM lock acquired by
1402 * ll_tree_lock() above, because to change class, other client has to
1403 * take DLM lock conflicting with our lock. Also, any updates to
1404 * ->i_size by other threads on this client are serialized by
1405 * ll_inode_size_lock(). This guarantees that short reads are handled
1406 * correctly in the face of concurrent writes and truncates.
1408 inode_init_lvb(inode, &lvb);
1409 obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
1411 if (*ppos + count - 1 > kms) {
1412 /* A glimpse is necessary to determine whether we return a
1413 * short read (B) or some zeroes at the end of the buffer (C) */
1414 ll_inode_size_unlock(inode, 1);
1415 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1417 ll_tree_unlock(&tree);
1421 /* region is within kms and, hence, within real file size (A).
1422 * We need to increase i_size to cover the read region so that
1423 * generic_file_read() will do its job, but that doesn't mean
1424 * the kms size is _correct_, it is only the _minimum_ size.
1425 * If someone does a stat they will get the correct size which
1426 * will always be >= the kms value here. b=11081 */
1427 if (i_size_read(inode) < kms)
1428 i_size_write(inode, kms);
1429 ll_inode_size_unlock(inode, 1);
1432 chunk = end - *ppos + 1;
1433 CDEBUG(D_INODE, "Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1434 inode->i_ino, chunk, *ppos, i_size_read(inode));
1436 /* turn off the kernel's read-ahead */
1437 file->f_ra.ra_pages = 0;
1439 /* initialize read-ahead window once per syscall */
1442 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1443 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1444 ll_ra_read_in(file, &bead);
/* Do the actual data copy through the page cache. */
1448 file_accessed(file);
1449 retval = generic_file_read(file, buf, chunk, ppos);
1450 ll_rw_stats_tally(sbi, current->pid, file, count, 0);
1452 ll_tree_unlock(&tree);
/* A full chunk with bytes remaining means another chunked iteration
 * (loop structure elided in this excerpt). */
1458 if (retval == chunk && count > 0)
1464 ll_ra_read_ex(file, &bead);
/* Report accumulated bytes if any chunk succeeded, else the error. */
1465 retval = (sum > 0) ? sum : retval;
1470 * Write to a file (through the page cache).
/* write(2) entry point: serialize against other writers with
 * lli_write_sem, take a PW extent tree lock ([ppos,end], or [0,EOF] for
 * O_APPEND), enforce maxbytes, then write in chunks through
 * generic_file_write().
 * NOTE(review): gaps in the embedded line numbers show elided statements
 * (ppos parameter line, loop/brace lines, RETURNs). */
1472 static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
1475 struct inode *inode = file->f_dentry->d_inode;
1476 struct ll_sb_info *sbi = ll_i2sbi(inode);
1477 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1478 struct ll_lock_tree tree;
1479 struct ll_lock_tree_node *node;
1480 loff_t maxbytes = ll_file_maxbytes(inode);
1481 loff_t lock_start, lock_end, end;
1482 ssize_t retval, chunk, sum = 0;
1486 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1487 inode->i_ino, inode->i_generation, inode, count, *ppos);
1489 SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
1491 /* POSIX, but surprised the VFS doesn't check this already */
1495 /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
1496 * called on the file, don't fail the below assertion (bug 2388). */
1497 if (file->f_flags & O_LOV_DELAY_CREATE &&
1498 ll_i2info(inode)->lli_smd == NULL)
1501 LASSERT(ll_i2info(inode)->lli_smd != NULL);
/* One writer at a time per inode on this client. */
1503 down(&ll_i2info(inode)->lli_write_sem);
1506 chunk = 0; /* just to fix gcc's warning */
1507 end = *ppos + count - 1;
1509 if (file->f_flags & O_APPEND) {
/* Appends must lock to EOF since the final offset is unknown until
 * i_size is read under the lock below. */
1511 lock_end = OBD_OBJECT_EOF;
1512 } else if (sbi->ll_max_rw_chunk != 0) {
1513 /* first, let's know the end of the current stripe */
1515 obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
1518 /* correct, the end is beyond the request */
1519 if (end > *ppos + count - 1)
1520 end = *ppos + count - 1;
1522 /* and chunk shouldn't be too large even if striping is wide */
1523 if (end - *ppos > sbi->ll_max_rw_chunk)
1524 end = *ppos + sbi->ll_max_rw_chunk - 1;
1529 lock_end = *ppos + count - 1;
1531 node = ll_node_from_inode(inode, lock_start, lock_end, LCK_PW);
1534 GOTO(out, retval = PTR_ERR(node));
1536 tree.lt_fd = LUSTRE_FPRIVATE(file);
1537 rc = ll_tree_lock(&tree, node, buf, count,
1538 file->f_flags & O_NONBLOCK ? LDLM_FL_BLOCK_NOWAIT :0);
1540 GOTO(out, retval = rc);
1542 /* This is ok, g_f_w will overwrite this under i_sem if it races
1543 * with a local truncate, it just makes our maxbyte checking easier.
1544 * The i_size value gets updated in ll_extent_lock() as a consequence
1545 * of the [0,EOF] extent lock we requested above. */
1546 if (file->f_flags & O_APPEND) {
1547 *ppos = i_size_read(inode);
1548 end = *ppos + count - 1;
/* Enforce the filesystem's maximum file size, POSIX-style: SIGXFSZ and
 * -EFBIG when starting at/after the limit, otherwise clamp the count. */
1551 if (*ppos >= maxbytes) {
1552 send_sig(SIGXFSZ, current, 0);
1553 GOTO(out_unlock, retval = -EFBIG);
1555 if (*ppos + count > maxbytes)
1556 count = maxbytes - *ppos;
1558 /* generic_file_write handles O_APPEND after getting i_mutex */
1559 chunk = end - *ppos + 1;
1560 CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
1561 inode->i_ino, chunk, *ppos);
1562 retval = generic_file_write(file, buf, chunk, ppos);
1563 ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, count, 1);
1566 ll_tree_unlock(&tree);
/* Full chunk written and bytes remain: iterate (loop lines elided). */
1573 if (retval == chunk && count > 0)
1577 up(&ll_i2info(inode)->lli_write_sem);
1579 retval = (sum > 0) ? sum : retval;
1580 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1581 retval > 0 ? retval : 0);
1586 * Send file content (through pagecache) somewhere with helper
/* sendfile entry point: same locking/kms logic as ll_file_read() but the
 * data is pushed to @actor/@target via generic_file_sendfile() instead of
 * copied to a user buffer; no chunking is done here.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (declarations, brace lines, RETURNs). */
1588 static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
1589 read_actor_t actor, void *target)
1591 struct inode *inode = in_file->f_dentry->d_inode;
1592 struct ll_inode_info *lli = ll_i2info(inode);
1593 struct lov_stripe_md *lsm = lli->lli_smd;
1594 struct ll_lock_tree tree;
1595 struct ll_lock_tree_node *node;
1597 struct ll_ra_read bead;
1602 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
1603 inode->i_ino, inode->i_generation, inode, count, *ppos);
1605 /* "If nbyte is 0, read() will return 0 and have no other results."
1606 * -- Single Unix Spec */
1610 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
1611 /* turn off the kernel's read-ahead */
1612 in_file->f_ra.ra_pages = 0;
1614 /* File with no objects, nothing to lock */
1616 RETURN(generic_file_sendfile(in_file, ppos, count, actor, target));
/* Lock the requested extent with a PR lock tree node. */
1618 node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
1620 RETURN(PTR_ERR(node));
1622 tree.lt_fd = LUSTRE_FPRIVATE(in_file);
1623 rc = ll_tree_lock(&tree, node, NULL, count,
1624 in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
1628 ll_inode_size_lock(inode, 1);
1630 * Consistency guarantees: following possibilities exist for the
1631 * relation between region being read and real file size at this
1634 * (A): the region is completely inside of the file;
1636 * (B-x): x bytes of region are inside of the file, the rest is
1639 * (C): the region is completely outside of the file.
1641 * This classification is stable under DLM lock acquired by
1642 * ll_tree_lock() above, because to change class, other client has to
1643 * take DLM lock conflicting with our lock. Also, any updates to
1644 * ->i_size by other threads on this client are serialized by
1645 * ll_inode_size_lock(). This guarantees that short reads are handled
1646 * correctly in the face of concurrent writes and truncates.
1648 inode_init_lvb(inode, &lvb);
1649 obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
1651 if (*ppos + count - 1 > kms) {
1652 /* A glimpse is necessary to determine whether we return a
1653 * short read (B) or some zeroes at the end of the buffer (C) */
1654 ll_inode_size_unlock(inode, 1);
1655 retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
1659 /* region is within kms and, hence, within real file size (A) */
1660 i_size_write(inode, kms);
1661 ll_inode_size_unlock(inode, 1);
1664 CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
1665 inode->i_ino, count, *ppos, i_size_read(inode));
/* Set up the llite read-ahead window for this transfer. */
1667 bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
1668 bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
1669 ll_ra_read_in(in_file, &bead);
1671 file_accessed(in_file);
1672 retval = generic_file_sendfile(in_file, ppos, count, actor, target);
1673 ll_ra_read_ex(in_file, &bead);
1676 ll_tree_unlock(&tree);
/* LL_IOC_RECREATE_OBJ handler: re-create a missing OST object for this
 * file using parameters copied from userspace (object id, group, OST
 * index). Requires CAP_SYS_ADMIN. A scratch copy of the stripe metadata
 * is handed to obd_create() with OBD_FL_RECREATE_OBJS set.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (arg parameter line, oa allocation, error checks, RETURN). */
1683 struct ll_inode_info *lli = ll_i2info(inode);
1684 struct obd_export *exp = ll_i2dtexp(inode);
1685 struct ll_recreate_obj ucreatp;
1686 struct obd_trans_info oti = { 0 };
1687 struct obdo *oa = NULL;
1690 struct lov_stripe_md *lsm, *lsm2;
1693 if (!capable (CAP_SYS_ADMIN))
1696 rc = copy_from_user(&ucreatp, (struct ll_recreate_obj *)arg,
1697 sizeof(struct ll_recreate_obj));
/* lli_size_sem protects lli_smd while we copy/use it. */
1705 down(&lli->lli_size_sem);
1708 GOTO(out, rc = -ENOENT);
1709 lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
1710 (lsm->lsm_stripe_count));
1712 OBD_ALLOC(lsm2, lsm_size);
1714 GOTO(out, rc = -ENOMEM);
/* Describe the object to recreate; o_nlink carries the OST index here. */
1716 oa->o_id = ucreatp.lrc_id;
1717 oa->o_gr = ucreatp.lrc_group;
1718 oa->o_nlink = ucreatp.lrc_ost_idx;
1719 oa->o_flags |= OBD_FL_RECREATE_OBJS;
1720 oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
1721 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
1722 OBD_MD_FLMTIME | OBD_MD_FLCTIME);
1724 memcpy(lsm2, lsm, lsm_size);
1725 rc = obd_create(exp, oa, &lsm2, &oti);
1727 OBD_FREE(lsm2, lsm_size);
1730 up(&lli->lli_size_sem);
/* Apply striping EA @lum to @inode by replaying an IT_OPEN intent
 * carrying the layout (ll_intent_file_open()). Fails if the file already
 * has stripe metadata. lli_size_sem serializes the check-and-set.
 * NOTE(review): embedded line-number gaps indicate elided statements. */
1735 int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
1736 int flags, struct lov_user_md *lum, int lum_size)
1738 struct ll_inode_info *lli = ll_i2info(inode);
1739 struct lov_stripe_md *lsm;
1740 struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
1744 down(&lli->lli_size_sem);
/* A layout already exists — cannot restripe an existing file. */
1747 up(&lli->lli_size_sem);
1748 CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
1753 rc = ll_intent_file_open(file, lum, lum_size, &oit);
1756 if (it_disposition(&oit, DISP_LOOKUP_NEG))
1757 GOTO(out_req_free, rc = -ENOENT);
1758 rc = oit.d.lustre.it_status;
1760 GOTO(out_req_free, rc);
/* Close the MDS open handle created by the intent open above. */
1762 ll_release_openhandle(file->f_dentry, &oit);
1765 up(&lli->lli_size_sem);
1766 ll_intent_release(&oit);
1769 ptlrpc_req_finished((struct ptlrpc_request *) oit.d.lustre.it_data);
/* Fetch the LOV EA (striping info) for @filename under directory @inode
 * from the MDS via md_getattr_name(). On success *lmmp points into the
 * reply buffer (caller keeps @request until done) and *lmm_size is set.
 * Byte-swaps the EA to host endianness and converts LOV_MAGIC_JOIN
 * layouts into a freshly allocated lov_user_md_join for userspace.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (lmmsize/rc declarations, several error branches, RETURN). */
1773 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1774 struct lov_mds_md **lmmp, int *lmm_size,
1775 struct ptlrpc_request **request)
1777 struct ll_sb_info *sbi = ll_i2sbi(inode);
1778 struct mdt_body *body;
1779 struct lov_mds_md *lmm = NULL;
1780 struct ptlrpc_request *req = NULL;
1781 struct obd_capa *oc;
1784 rc = ll_get_max_mdsize(sbi, &lmmsize);
/* Capability (if any) authorizing the MDS getattr-by-name. */
1788 oc = ll_mdscapa_get(inode);
1789 rc = md_getattr_name(sbi->ll_md_exp, ll_inode2fid(inode),
1790 oc, filename, strlen(filename) + 1,
1791 OBD_MD_FLEASIZE | OBD_MD_FLDIREA, lmmsize, &req);
1794 CDEBUG(D_INFO, "md_getattr_name failed "
1795 "on %s: rc %d\n", filename, rc);
1799 body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body));
1800 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1801 /* swabbed by mdc_getattr_name */
1802 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF));
1804 lmmsize = body->eadatasize;
/* No striping EA present (or zero-length) -> -ENODATA. */
1806 if (!(body->valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1808 GOTO(out, rc = -ENODATA);
1811 lmm = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF + 1, lmmsize);
1812 LASSERT(lmm != NULL);
1813 LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1));
1816 * This is coming from the MDS, so is probably in
1817 * little endian. We convert it to host endian before
1818 * passing it to userspace.
/* A swabbed magic means the EA is in the other endianness: convert. */
1820 if (lmm->lmm_magic == __swab32(LOV_MAGIC)) {
1821 lustre_swab_lov_user_md((struct lov_user_md *)lmm);
1822 lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm);
1823 } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) {
1824 lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm);
/* Joined files: expand the packed join layout into a per-stripe
 * lov_user_md_join with explicit extent ranges for userspace. */
1827 if (lmm->lmm_magic == LOV_MAGIC_JOIN) {
1828 struct lov_stripe_md *lsm;
1829 struct lov_user_md_join *lmj;
1830 int lmj_size, i, aindex = 0;
1832 rc = obd_unpackmd(sbi->ll_dt_exp, &lsm, lmm, lmmsize);
1834 GOTO(out, rc = -ENOMEM);
1835 rc = obd_checkmd(sbi->ll_dt_exp, sbi->ll_md_exp, lsm);
1837 GOTO(out_free_memmd, rc);
1839 lmj_size = sizeof(struct lov_user_md_join) +
1840 lsm->lsm_stripe_count *
1841 sizeof(struct lov_user_ost_data_join);
1842 OBD_ALLOC(lmj, lmj_size);
1844 GOTO(out_free_memmd, rc = -ENOMEM);
1846 memcpy(lmj, lmm, sizeof(struct lov_user_md_join));
1847 for (i = 0; i < lsm->lsm_stripe_count; i++) {
1848 struct lov_extent *lex =
1849 &lsm->lsm_array->lai_ext_array[aindex];
/* Advance to the extent covering stripe i (increment elided). */
1851 if (lex->le_loi_idx + lex->le_stripe_count <= i)
1853 CDEBUG(D_INFO, "aindex %d i %d l_extent_start "
1854 LPU64" len %d\n", aindex, i,
1855 lex->le_start, (int)lex->le_len);
1856 lmj->lmm_objects[i].l_extent_start =
/* le_len == -1 marks an extent that runs to EOF. */
1859 if ((int)lex->le_len == -1)
1860 lmj->lmm_objects[i].l_extent_end = -1;
1862 lmj->lmm_objects[i].l_extent_end =
1863 lex->le_start + lex->le_len;
1864 lmj->lmm_objects[i].l_object_id =
1865 lsm->lsm_oinfo[i]->loi_id;
1866 lmj->lmm_objects[i].l_object_gr =
1867 lsm->lsm_oinfo[i]->loi_gr;
1868 lmj->lmm_objects[i].l_ost_gen =
1869 lsm->lsm_oinfo[i]->loi_ost_gen;
1870 lmj->lmm_objects[i].l_ost_idx =
1871 lsm->lsm_oinfo[i]->loi_ost_idx;
1873 lmm = (struct lov_mds_md *)lmj;
1876 obd_free_memmd(sbi->ll_dt_exp, &lsm);
1880 *lmm_size = lmmsize;
/* LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one ost_data entry)
 * from userspace and apply it via ll_lov_setstripe_ea_info() with
 * MDS_OPEN_HAS_OBJS, i.e. the objects already exist. Admin-only. */
1888 int flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
1889 struct lov_user_md *lump;
1890 int lum_size = sizeof(struct lov_user_md) +
1891 sizeof(struct lov_user_ost_data);
1895 if (!capable (CAP_SYS_ADMIN))
1898 OBD_ALLOC(lump, lum_size);
1902 rc = copy_from_user(lump, (struct lov_user_md *)arg, lum_size);
/* Copy failed: free the temporary buffer (error return elided). */
1904 OBD_FREE(lump, lum_size);
1908 rc = ll_lov_setstripe_ea_info(inode, file, flags, lump, lum_size);
1910 OBD_FREE(lump, lum_size);
/* LL_IOC_LOV_SETSTRIPE handler: copy the requested striping from
 * userspace, create the layout, then write the resulting striping back to
 * the user's buffer via the LL_IOC_LOV_GETSTRIPE obd_iocontrol path. */
1914 static int ll_lov_setstripe(struct inode *inode, struct file *file,
1917 struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
1919 int flags = FMODE_WRITE;
1922 /* Bug 1152: copy properly when this is no longer true */
1923 LASSERT(sizeof(lum) == sizeof(*lump));
1924 LASSERT(sizeof(lum.lmm_objects[0]) == sizeof(lump->lmm_objects[0]));
1925 rc = copy_from_user(&lum, lump, sizeof(lum));
1929 rc = ll_lov_setstripe_ea_info(inode, file, flags, &lum, sizeof(lum));
/* Zero the user's stripe count first so a failed GETSTRIPE below does
 * not leave stale data in the caller's buffer. */
1931 put_user(0, &lump->lmm_stripe_count);
1932 rc = obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode),
1933 0, ll_i2info(inode)->lli_smd, lump);
/* LL_IOC_LOV_GETSTRIPE handler: hand the inode's stripe metadata to the
 * LOV layer, which formats it into the user's buffer at @arg. */
1938 static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
1940 struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
1945 return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, ll_i2dtexp(inode), 0, lsm,
/* LL_IOC_GROUP_LOCK handler: take a whole-file LCK_GROUP extent lock with
 * group id @arg and remember it in the file descriptor (fd_cwlockh).
 * While held, fd_flags gains GROUP_LOCKED|IGNORE_LOCK so normal extent
 * locking is bypassed. Fails if this fd already holds a group lock. */
1952 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1953 ldlm_policy_data_t policy = { .l_extent = { .start = 0,
1954 .end = OBD_OBJECT_EOF}};
1955 struct lustre_handle lockh = { 0 };
1956 struct ll_inode_info *lli = ll_i2info(inode);
1957 struct lov_stripe_md *lsm = lli->lli_smd;
1961 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
/* The group id is carried in the extent policy's gid field. */
1965 policy.l_extent.gid = arg;
1966 if (file->f_flags & O_NONBLOCK)
1967 flags = LDLM_FL_BLOCK_NOWAIT;
1969 rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
1973 fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
1975 memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
/* LL_IOC_GROUP_UNLOCK handler: drop the group lock taken by
 * ll_get_grouplock(). Verifies a group lock is actually held and that the
 * caller's gid matches, clears the fd flags, and cancels the lock. */
1983 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1984 struct ll_inode_info *lli = ll_i2info(inode);
1985 struct lov_stripe_md *lsm = lli->lli_smd;
1989 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1990 /* Ugh, it's already unlocked. */
1994 if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
1997 fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
1999 rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
/* Scrub the saved handle so a stale unlock can't reuse it. */
2004 memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
/* Validate a file-join request: server must advertise LL_SBI_JOIN, both
 * inodes must be regular files and distinct, and the head's size must be
 * a multiple of JOIN_FILE_ALIGN (64K). Error returns are elided here. */
2009 static int join_sanity_check(struct inode *head, struct inode *tail)
2012 if ((ll_i2sbi(head)->ll_flags & LL_SBI_JOIN) == 0) {
2013 CERROR("server do not support join \n");
2016 if (!S_ISREG(tail->i_mode) || !S_ISREG(head->i_mode)) {
2017 CERROR("tail ino %lu and ino head %lu must be regular\n",
2018 head->i_ino, tail->i_ino);
2021 if (head->i_ino == tail->i_ino) {
2022 CERROR("file %lu can not be joined to itself \n", head->i_ino);
2025 if (i_size_read(head) % JOIN_FILE_ALIGN) {
2026 CERROR("hsize %llu must be times of 64K\n", i_size_read(head));
/* Perform the MDS side of a file join: enqueue an IT_OPEN intent with
 * O_JOIN_FILE (CW inodebits lock) naming the tail file, passing the
 * head's size as opaque data. Any lock granted with the intent is
 * dropped immediately; the resulting open handle is closed via
 * ll_release_openhandle().
 * NOTE(review): embedded line-number gaps indicate elided statements. */
2032 static int join_file(struct inode *head_inode, struct file *head_filp,
2033 struct file *tail_filp)
2035 struct dentry *tail_dentry = tail_filp->f_dentry;
2036 struct lookup_intent oit = {.it_op = IT_OPEN,
2037 .it_flags = head_filp->f_flags|O_JOIN_FILE};
2038 struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CW,
2039 ll_md_blocking_ast, ldlm_completion_ast, NULL, NULL };
2041 struct lustre_handle lockh;
2042 struct md_op_data *op_data;
2047 tail_dentry = tail_filp->f_dentry;
/* The head file's current size rides along as the op_data blob. */
2049 data = i_size_read(head_inode);
2050 op_data = ll_prep_md_op_data(NULL, head_inode,
2051 tail_dentry->d_parent->d_inode,
2052 tail_dentry->d_name.name,
2053 tail_dentry->d_name.len, 0,
2054 LUSTRE_OPC_ANY, &data);
2055 if (IS_ERR(op_data))
2056 RETURN(PTR_ERR(op_data));
2058 rc = md_enqueue(ll_i2mdexp(head_inode), &einfo, &oit,
2059 op_data, &lockh, NULL, 0, 0);
2061 ll_finish_md_op_data(op_data);
2065 rc = oit.d.lustre.it_status;
2067 if (rc < 0 || it_open_error(DISP_OPEN_OPEN, &oit)) {
2068 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, &oit);
2069 ptlrpc_req_finished((struct ptlrpc_request *)
2070 oit.d.lustre.it_data);
2074 if (oit.d.lustre.it_lock_mode) { /* If we got lock - release it right
2076 ldlm_lock_decref(&lockh, oit.d.lustre.it_lock_mode);
2077 oit.d.lustre.it_lock_mode = 0;
2079 ll_release_openhandle(head_filp->f_dentry, &oit);
2081 ll_intent_release(&oit);
/* LL_IOC_JOIN handler: join the file named @filename_tail onto @head.
 * Opens the tail, EX-locks both files in ascending inode order to avoid
 * deadlock, sanity-checks the pair, asks the MDS to join them, then
 * unwinds via a cleanup_phase switch; on success the head's cached
 * stripe metadata is discarded so the new joined layout is refetched.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (phase increments, case labels, RETURN). */
2085 static int ll_file_join(struct inode *head, struct file *filp,
2086 char *filename_tail)
2088 struct inode *tail = NULL, *first = NULL, *second = NULL;
2089 struct dentry *tail_dentry;
2090 struct file *tail_filp, *first_filp, *second_filp;
2091 struct ll_lock_tree first_tree, second_tree;
2092 struct ll_lock_tree_node *first_node, *second_node;
2093 struct ll_inode_info *hlli = ll_i2info(head), *tlli;
2094 int rc = 0, cleanup_phase = 0;
2097 CDEBUG(D_VFSTRACE, "VFS Op:head=%lu/%u(%p) tail %s\n",
2098 head->i_ino, head->i_generation, head, filename_tail);
2100 tail_filp = filp_open(filename_tail, O_WRONLY, 0644);
2101 if (IS_ERR(tail_filp)) {
2102 CERROR("Can not open tail file %s", filename_tail);
2103 rc = PTR_ERR(tail_filp);
/* Hold our own inode reference on the tail for the duration. */
2106 tail = igrab(tail_filp->f_dentry->d_inode);
2108 tlli = ll_i2info(tail);
2109 tail_dentry = tail_filp->f_dentry;
2110 LASSERT(tail_dentry);
2113 /*reorder the inode for lock sequence*/
/* Lock ordering: always lock the higher inode number first. */
2114 first = head->i_ino > tail->i_ino ? head : tail;
2115 second = head->i_ino > tail->i_ino ? tail : head;
2116 first_filp = head->i_ino > tail->i_ino ? filp : tail_filp;
2117 second_filp = head->i_ino > tail->i_ino ? tail_filp : filp;
2119 CDEBUG(D_INFO, "reorder object from %lu:%lu to %lu:%lu \n",
2120 head->i_ino, tail->i_ino, first->i_ino, second->i_ino);
2121 first_node = ll_node_from_inode(first, 0, OBD_OBJECT_EOF, LCK_EX);
2122 if (IS_ERR(first_node)){
2123 rc = PTR_ERR(first_node);
2126 first_tree.lt_fd = first_filp->private_data;
2127 rc = ll_tree_lock(&first_tree, first_node, NULL, 0, 0);
2132 second_node = ll_node_from_inode(second, 0, OBD_OBJECT_EOF, LCK_EX);
2133 if (IS_ERR(second_node)){
2134 rc = PTR_ERR(second_node);
2137 second_tree.lt_fd = second_filp->private_data;
2138 rc = ll_tree_lock(&second_tree, second_node, NULL, 0, 0);
2143 rc = join_sanity_check(head, tail);
2147 rc = join_file(head, filp, tail_filp);
/* Fall-through cleanup: each phase undoes one acquisition above. */
2151 switch (cleanup_phase) {
2153 ll_tree_unlock(&second_tree);
2154 obd_cancel_unused(ll_i2dtexp(second),
2155 ll_i2info(second)->lli_smd, 0, NULL);
2157 ll_tree_unlock(&first_tree);
2158 obd_cancel_unused(ll_i2dtexp(first),
2159 ll_i2info(first)->lli_smd, 0, NULL);
2161 filp_close(tail_filp, 0);
/* On success, drop the head's stale stripe md so it is refetched. */
2164 if (head && rc == 0) {
2165 obd_free_memmd(ll_i2sbi(head)->ll_dt_exp,
2167 hlli->lli_smd = NULL;
2172 CERROR("invalid cleanup_phase %d\n", cleanup_phase);
/* Close the MDS open handle carried by intent @it (if any): fill a
 * temporary obd_client_handle from the intent and close it via
 * ll_close_inode_openhandle(). No-op for the root dentry or when the
 * intent holds no DISP_OPEN_OPEN disposition.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (och free, rc declaration, RETURN). */
2178 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2180 struct inode *inode = dentry->d_inode;
2181 struct obd_client_handle *och;
2187 /* Root ? Do nothing. */
2188 if (dentry->d_inode->i_sb->s_root == dentry)
2191 /* No open handle to close? Move away */
2192 if (!it_disposition(it, DISP_OPEN_OPEN))
2195 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2197 OBD_ALLOC(och, sizeof(*och));
2199 GOTO(out, rc = -ENOMEM);
2201 ll_och_fill(ll_i2sbi(inode)->ll_md_exp,
2202 ll_i2info(inode), it, och);
2204 rc = ll_close_inode_openhandle(ll_i2sbi(inode)->ll_md_exp,
2207 /* this one is in place of ll_file_open */
2208 ptlrpc_req_finished(it->d.lustre.it_data);
2209 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* ioctl(2) entry point for regular files: dispatches llite-specific
 * commands (flags, striping, group locks, join, ACLs, statfs) and falls
 * through to ll_iocontrol_call()/obd_iocontrol for anything else.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (switch/case/RETURN lines, several error checks). */
2213 int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
2216 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2220 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%x\n", inode->i_ino,
2221 inode->i_generation, inode, cmd);
2222 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
2224 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
2225 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
2229 case LL_IOC_GETFLAGS:
2230 /* Get the current value of the file flags */
2231 return put_user(fd->fd_flags, (int *)arg);
2232 case LL_IOC_SETFLAGS:
2233 case LL_IOC_CLRFLAGS:
2234 /* Set or clear specific file flags */
2235 /* XXX This probably needs checks to ensure the flags are
2236 * not abused, and to handle any flag side effects.
2238 if (get_user(flags, (int *) arg))
2241 if (cmd == LL_IOC_SETFLAGS) {
/* Lockless I/O is only safe for O_DIRECT files. */
2242 if ((flags & LL_FILE_IGNORE_LOCK) &&
2243 !(file->f_flags & O_DIRECT)) {
2244 CERROR("%s: unable to disable locking on "
2245 "non-O_DIRECT file\n", current->comm);
2249 fd->fd_flags |= flags;
2251 fd->fd_flags &= ~flags;
2254 case LL_IOC_LOV_SETSTRIPE:
2255 RETURN(ll_lov_setstripe(inode, file, arg));
2256 case LL_IOC_LOV_SETEA:
2257 RETURN(ll_lov_setea(inode, file, arg));
2258 case LL_IOC_LOV_GETSTRIPE:
2259 RETURN(ll_lov_getstripe(inode, arg));
2260 case LL_IOC_RECREATE_OBJ:
2261 RETURN(ll_lov_recreate_obj(inode, file, arg));
2262 case EXT3_IOC_GETFLAGS:
2263 case EXT3_IOC_SETFLAGS:
2264 RETURN(ll_iocontrol(inode, file, cmd, arg));
2265 case EXT3_IOC_GETVERSION_OLD:
2266 case EXT3_IOC_GETVERSION:
2267 RETURN(put_user(inode->i_generation, (int *)arg));
/* LL_IOC_JOIN (case label elided): copy the tail pathname in and join. */
2272 ftail = getname((const char *)arg);
2274 RETURN(PTR_ERR(ftail));
2275 rc = ll_file_join(inode, file, ftail);
2279 case LL_IOC_GROUP_LOCK:
2280 RETURN(ll_get_grouplock(inode, file, arg));
2281 case LL_IOC_GROUP_UNLOCK:
2282 RETURN(ll_put_grouplock(inode, file, arg));
2283 case IOC_OBD_STATFS:
2284 RETURN(ll_obd_statfs(inode, (void *)arg));
2286 /* We need to special case any other ioctls we want to handle,
2287 * to send them to the MDS/OST as appropriate and to properly
2288 * network encode the arg field.
2289 case EXT3_IOC_SETVERSION_OLD:
2290 case EXT3_IOC_SETVERSION:
2292 case LL_IOC_FLUSHCTX:
2293 RETURN(ll_flush_ctx(inode));
2294 case LL_IOC_GETFACL: {
2295 struct rmtacl_ioctl_data ioc;
2297 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2300 RETURN(ll_ioctl_getfacl(inode, &ioc));
2302 case LL_IOC_SETFACL: {
2303 struct rmtacl_ioctl_data ioc;
2305 if (copy_from_user(&ioc, (void *)arg, sizeof(ioc)))
2308 RETURN(ll_ioctl_setfacl(inode, &ioc));
/* Unknown command: offer it to registered handlers, then pass it down
 * to the data export. */
2314 ll_iocontrol_call(inode, file, cmd, arg, &err))
2317 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
/* llseek entry point. For SEEK_END a glimpse refreshes i_size from the
 * OSTs (non-blocking when the file is O_NONBLOCK) before the offset is
 * computed; the result is range-checked against ll_file_maxbytes().
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (retval declaration, error/return paths, #endif). */
2323 loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
2325 struct inode *inode = file->f_dentry->d_inode;
2326 struct ll_inode_info *lli = ll_i2info(inode);
2327 struct lov_stripe_md *lsm = lli->lli_smd;
/* Precompute the target for the trace message below. */
2330 retval = offset + ((origin == 2) ? i_size_read(inode) :
2331 (origin == 1) ? file->f_pos : 0);
2332 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), to=%Lu=%#Lx(%s)\n",
2333 inode->i_ino, inode->i_generation, inode, retval, retval,
2334 origin == 2 ? "SEEK_END": origin == 1 ? "SEEK_CUR" : "SEEK_SET");
2335 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
2337 if (origin == 2) { /* SEEK_END */
2338 int nonblock = 0, rc;
2340 if (file->f_flags & O_NONBLOCK)
2341 nonblock = LDLM_FL_BLOCK_NOWAIT;
2344 rc = ll_glimpse_size(inode, nonblock);
/* Read the freshly glimpsed size under the size lock. */
2349 ll_inode_size_lock(inode, 0);
2350 offset += i_size_read(inode);
2351 ll_inode_size_unlock(inode, 0);
2352 } else if (origin == 1) { /* SEEK_CUR */
2353 offset += file->f_pos;
2357 if (offset >= 0 && offset <= ll_file_maxbytes(inode)) {
2358 if (offset != file->f_pos) {
2359 file->f_pos = offset;
2360 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
2362 file->f_version = ++event;
/* fsync entry point: wait for in-flight page cache writes, surface any
 * recorded async write errors (per-inode and per-stripe), then sync
 * metadata to the MDS (md_sync) and data to the OSTs (obd_sync) over the
 * whole object [0, EOF], using capabilities where configured.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (rc/err declarations, oa allocation and free, RETURN). */
2371 int ll_fsync(struct file *file, struct dentry *dentry, int data)
2373 struct inode *inode = dentry->d_inode;
2374 struct ll_inode_info *lli = ll_i2info(inode);
2375 struct lov_stripe_md *lsm = lli->lli_smd;
2376 struct ptlrpc_request *req;
2377 struct obd_capa *oc;
2380 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino,
2381 inode->i_generation, inode);
2382 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
2384 /* fsync's caller has already called _fdata{sync,write}, we want
2385 * that IO to finish before calling the osc and mdc sync methods */
2386 rc = filemap_fdatawait(inode->i_mapping);
2388 /* catch async errors that were recorded back when async writeback
2389 * failed for pages in this mapping. */
2390 err = lli->lli_async_rc;
2391 lli->lli_async_rc = 0;
2395 err = lov_test_and_clear_async_rc(lsm);
/* Sync metadata changes to the MDS. */
2400 oc = ll_mdscapa_get(inode);
2401 err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), oc,
2407 ptlrpc_req_finished(req);
2414 RETURN(rc ? rc : -ENOMEM);
/* Describe the OST object(s) to flush. */
2416 oa->o_id = lsm->lsm_object_id;
2417 oa->o_gr = lsm->lsm_object_gr;
2418 oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2419 obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
2420 OBD_MD_FLMTIME | OBD_MD_FLCTIME |
2423 oc = ll_osscapa_get(inode, CAPA_OPC_OSS_WRITE);
2424 err = obd_sync(ll_i2sbi(inode)->ll_dt_exp, oa, lsm,
2425 0, OBD_OBJECT_EOF, oc);
/* flock/fcntl locking entry point: translate a VFS file_lock into an
 * LDLM_FLOCK enqueue against the MDS. Lock type maps to LCK_PR (read),
 * LCK_PW (write) or LCK_NL (unlock); the command selects blocking,
 * non-blocking (LDLM_FL_BLOCK_NOWAIT) or test (LDLM_FL_TEST_LOCK) mode.
 * NOTE(review): embedded line-number gaps indicate elided statements
 * (case labels, flags declaration, RETURNs). */
2435 int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
2437 struct inode *inode = file->f_dentry->d_inode;
2438 struct ll_sb_info *sbi = ll_i2sbi(inode);
/* Flock resources are named by the file's FID (seq/oid/ver). */
2439 struct ldlm_res_id res_id =
2440 { .name = { fid_seq(ll_inode2fid(inode)),
2441 fid_oid(ll_inode2fid(inode)),
2442 fid_ver(ll_inode2fid(inode)),
2444 struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL,
2445 ldlm_flock_completion_ast, NULL, file_lock };
2446 struct lustre_handle lockh = {0};
2447 ldlm_policy_data_t flock;
2452 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu file_lock=%p\n",
2453 inode->i_ino, file_lock);
2455 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
2457 if (file_lock->fl_flags & FL_FLOCK) {
2458 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
2459 /* set missing params for flock() calls */
2460 file_lock->fl_end = OFFSET_MAX;
2461 file_lock->fl_pid = current->tgid;
2463 flock.l_flock.pid = file_lock->fl_pid;
2464 flock.l_flock.start = file_lock->fl_start;
2465 flock.l_flock.end = file_lock->fl_end;
/* Map VFS lock type to DLM mode (F_RDLCK case label elided). */
2467 switch (file_lock->fl_type) {
2469 einfo.ei_mode = LCK_PR;
2472 /* An unlock request may or may not have any relation to
2473 * existing locks so we may not be able to pass a lock handle
2474 * via a normal ldlm_lock_cancel() request. The request may even
2475 * unlock a byte range in the middle of an existing lock. In
2476 * order to process an unlock request we need all of the same
2477 * information that is given with a normal read or write record
2478 * lock request. To avoid creating another ldlm unlock (cancel)
2479 * message we'll treat a LCK_NL flock request as an unlock. */
2480 einfo.ei_mode = LCK_NL;
2483 einfo.ei_mode = LCK_PW;
2486 CERROR("unknown fcntl lock type: %d\n", file_lock->fl_type);
/* Command mapping (case labels elided): non-blocking set -> NOWAIT,
 * F_GETLK-style test -> TEST_LOCK. */
2501 flags = LDLM_FL_BLOCK_NOWAIT;
2507 flags = LDLM_FL_TEST_LOCK;
2508 /* Save the old mode so that if the mode in the lock changes we
2509 * can decrement the appropriate reader or writer refcount. */
2510 file_lock->fl_type = einfo.ei_mode;
2513 CERROR("unknown fcntl lock command: %d\n", cmd);
2517 CDEBUG(D_DLMTRACE, "inode=%lu, pid=%u, flags=%#x, mode=%u, "
2518 "start="LPU64", end="LPU64"\n", inode->i_ino, flock.l_flock.pid,
2519 flags, einfo.ei_mode, flock.l_flock.start, flock.l_flock.end);
2521 rc = ldlm_cli_enqueue(sbi->ll_md_exp, NULL, &einfo, &res_id,
2522 &flock, &flags, NULL, 0, NULL, &lockh, 0);
/* Mirror the granted lock into the local VFS lock lists. */
2523 if ((file_lock->fl_flags & FL_FLOCK) && (rc == 0))
2524 ll_flock_lock_file_wait(file, file_lock, (cmd == F_SETLKW));
2525 #ifdef HAVE_F_OP_FLOCK
2526 if ((file_lock->fl_flags & FL_POSIX) && (rc == 0) &&
2527 !(flags & LDLM_FL_TEST_LOCK))
2528 posix_lock_file_wait(file, file_lock);
/* Lock method installed for -o noflock mounts. Body elided in this
 * excerpt — per the "-o noflock - to return ENOSYS" comment ahead of
 * ll_file_operations_noflock it presumably rejects all lock requests
 * with -ENOSYS; TODO confirm against the full source. */
2534 int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
/* Test whether this client already holds a granted MDS inodebits lock
 * covering \a bits on \a inode, in any of CR/CW/PR/PW modes.
 * LDLM_FL_TEST_LOCK makes the match a pure probe — no lock reference
 * is taken. NOTE(review): braces, ENTRY/RETURN and the fid/flags
 * declarations are on elided lines in this excerpt. */
2541 int ll_have_md_lock(struct inode *inode, __u64 bits)
2543 struct lustre_handle lockh;
2544 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2552 fid = &ll_i2info(inode)->lli_fid;
2553 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
/* Only consider granted locks; CBPENDING also matches locks that have
 * a cancellation pending. */
2555 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
2556 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2557 LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh)) {
/* Like ll_have_md_lock() but without LDLM_FL_TEST_LOCK: on a match the
 * lock handle is stored in \a lockh (and, presumably, a reference is
 * held on the matched lock — TODO confirm md_lock_match semantics).
 * The md_lock_match() result is kept in rc; the return statement is on
 * an elided line in this excerpt. */
2563 ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
2564 struct lustre_handle *lockh)
2566 ldlm_policy_data_t policy = { .l_inodebits = {bits}};
2572 fid = &ll_i2info(inode)->lli_fid;
2573 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
2575 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING;
2576 rc = md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS, &policy,
2577 LCK_CR|LCK_CW|LCK_PR|LCK_PW, lockh);
/* Common tail for inode revalidation: translate the getattr/intent RPC
 * result. -ENOENT on a non-regular, non-directory inode means the
 * object was already unlinked and is treated as success (the nlink
 * update itself is on an elided line in this excerpt); other errors
 * are logged. */
2581 static int ll_inode_revalidate_fini(struct inode *inode, int rc) {
2582 if (rc == -ENOENT) { /* Already unlinked. Just update nlink
2583 * and return success */
2585 /* This path cannot be hit for regular files unless in
2586 * case of obscure races, so no need to to validate
2588 if (!S_ISREG(inode->i_mode) &&
2589 !S_ISDIR(inode->i_mode))
2594 CERROR("failure %d inode %lu\n", rc, inode->i_ino);
/* Revalidate cached inode attributes against the MDS. Two paths:
 *  - server supports getattr-by-FID (OBD_CONNECT_ATTRFID): issue an
 *    IT_GETATTR intent lock and refresh the dentry/inode from it;
 *  - otherwise, if we do not already hold an UPDATE inodebits lock,
 *    fetch attributes with a plain md_getattr() RPC.
 * Finishes by glimpsing the file size from the OSTs.
 * NOTE(review): many connective lines (braces, GOTO/RETURN, error
 * paths, some call arguments) are elided in this excerpt; comments
 * describe only the visible statements. */
2602 int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
2604 struct inode *inode = dentry->d_inode;
2605 struct ptlrpc_request *req = NULL;
2606 struct ll_sb_info *sbi;
2607 struct obd_export *exp;
/* Guard for a NULL inode (condition on an elided line) — should never
 * happen in practice, hence the loud message. */
2612 CERROR("REPORT THIS LINE TO PETER\n");
2615 sbi = ll_i2sbi(inode);
2617 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n",
2618 inode->i_ino, inode->i_generation, inode, dentry->d_name.name);
2620 exp = ll_i2mdexp(inode);
/* Path 1: getattr by FID via an intent lock. */
2622 if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) {
2623 struct lookup_intent oit = { .it_op = IT_GETATTR };
2624 struct md_op_data *op_data;
2626 /* Call getattr by fid, so do not provide name at all. */
2627 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode,
2628 dentry->d_inode, NULL, 0, 0,
2629 LUSTRE_OPC_ANY, NULL);
2630 if (IS_ERR(op_data))
2631 RETURN(PTR_ERR(op_data));
/* O_CHECK_STALE asks the MDS to verify the object still exists. */
2633 oit.it_flags |= O_CHECK_STALE;
2634 rc = md_intent_lock(exp, op_data, NULL, 0,
2635 /* we are not interested in name
2638 ll_md_blocking_ast, 0);
2639 ll_finish_md_op_data(op_data);
2640 oit.it_flags &= ~O_CHECK_STALE;
2642 rc = ll_inode_revalidate_fini(inode, rc);
2646 rc = ll_revalidate_it_finish(req, DLM_REPLY_REC_OFF, &oit, dentry);
2648 ll_intent_release(&oit);
2652 /* Unlinked? Unhash dentry, so it is not picked up later by
2653 do_lookup() -> ll_revalidate_it(). We cannot use d_drop
2654 here to preserve get_cwd functionality on 2.6.
2656 if (!dentry->d_inode->i_nlink) {
2657 spin_lock(&dcache_lock);
2658 ll_drop_dentry(dentry);
2659 spin_unlock(&dcache_lock);
2662 ll_lookup_finish_locks(&oit, dentry);
/* Path 2: no ATTRFID support — only refetch when no UPDATE lock is
 * already cached locally. */
2663 } else if (!ll_have_md_lock(dentry->d_inode, MDS_INODELOCK_UPDATE)) {
2664 struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode);
2665 obd_valid valid = OBD_MD_FLGETATTR;
2666 struct obd_capa *oc;
/* Regular files also need the striping EA, sized to the max the MDS
 * may return. */
2669 if (S_ISREG(inode->i_mode)) {
2670 rc = ll_get_max_mdsize(sbi, &ealen);
2673 valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE;
2675 /* Once OBD_CONNECT_ATTRFID is not supported, we can't find one
2676 * capa for this inode. Because we only keep capas of dirs
2678 oc = ll_mdscapa_get(inode);
2679 rc = md_getattr(sbi->ll_md_exp, ll_inode2fid(inode), oc, valid,
2683 rc = ll_inode_revalidate_fini(inode, rc);
2687 rc = ll_prep_inode(&inode, req, REPLY_REC_OFF,
2693 /* if object not yet allocated, don't validate size */
2694 if (ll_i2info(inode)->lli_smd == NULL)
2697 /* ll_glimpse_size will prefer locally cached writes if they extend
2699 rc = ll_glimpse_size(inode, 0);
2702 ptlrpc_req_finished(req);
/* getattr backend: revalidate the inode against the MDS with the given
 * intent, then fill *stat from the refreshed inode fields. size/blocks
 * are read under the Lustre inode size lock for a consistent pair.
 * NOTE(review): braces, the error-return after revalidation, and
 * #else/#endif lines around the blksize choice are elided here. */
2706 int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
2707 struct lookup_intent *it, struct kstat *stat)
2709 struct inode *inode = de->d_inode;
2712 res = ll_inode_revalidate_it(de, it);
2713 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETATTR, 1);
2718 stat->dev = inode->i_sb->s_dev;
2719 stat->ino = inode->i_ino;
2720 stat->mode = inode->i_mode;
2721 stat->nlink = inode->i_nlink;
2722 stat->uid = inode->i_uid;
2723 stat->gid = inode->i_gid;
2724 stat->rdev = kdev_t_to_nr(inode->i_rdev);
2725 stat->atime = inode->i_atime;
2726 stat->mtime = inode->i_mtime;
2727 stat->ctime = inode->i_ctime;
/* Older kernels expose i_blksize directly; newer ones derive it. */
2728 #ifdef HAVE_INODE_BLKSIZE
2729 stat->blksize = inode->i_blksize;
2731 stat->blksize = 1 << inode->i_blkbits;
2734 ll_inode_size_lock(inode, 0);
2735 stat->size = i_size_read(inode);
2736 stat->blocks = inode->i_blocks;
2737 ll_inode_size_unlock(inode, 0);
/* VFS ->getattr entry point: wrap ll_getattr_it() with a fresh
 * IT_GETATTR intent. */
2741 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
2743 struct lookup_intent it = { .it_op = IT_GETATTR };
2745 return ll_getattr_it(mnt, de, &it, stat);
/* ACL callback (used with generic_permission in ll_inode_permission):
 * check \a mask against the POSIX ACL cached on the Lustre inode.
 * The ACL is duplicated under lli_lock so posix_acl_permission() runs
 * without holding the spinlock; the duplicate is released afterwards.
 * Without CONFIG_FS_POSIX_ACL the body is compiled out (the #else
 * branch is on elided lines in this excerpt). */
2749 int lustre_check_acl(struct inode *inode, int mask)
2751 #ifdef CONFIG_FS_POSIX_ACL
2752 struct ll_inode_info *lli = ll_i2info(inode);
2753 struct posix_acl *acl;
2757 spin_lock(&lli->lli_lock);
2758 acl = posix_acl_dup(lli->lli_posix_acl);
2759 spin_unlock(&lli->lli_lock);
2764 rc = posix_acl_permission(inode, acl, mask);
2765 posix_acl_release(acl);
/* ->permission for kernels >= 2.6.10: delegate to generic_permission()
 * with lustre_check_acl as the ACL callback. Remote-client mounts
 * (LL_SBI_RMT_CLIENT) have permissions checked via
 * lustre_check_remote_perm() instead. */
2773 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
2774 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2776 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2777 inode->i_ino, inode->i_generation, inode, mask);
2778 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2779 return lustre_check_remote_perm(inode, mask);
2781 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
2782 return generic_permission(inode, mask, lustre_check_acl);
/* ->permission for pre-2.6.10 kernels (#else branch; the #else/#endif
 * lines are elided here): an open-coded equivalent of
 * generic_permission() — owner/group/other mode bits with an ACL
 * fallback, then capability overrides. NOTE(review): many lines
 * (returns, the owner-bits shift, label check_capabilities, braces)
 * are elided; comments describe only the visible statements. */
2785 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
2787 int mode = inode->i_mode;
2790 CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p), mask %o\n",
2791 inode->i_ino, inode->i_generation, inode, mask);
2793 if (ll_i2sbi(inode)->ll_flags & LL_SBI_RMT_CLIENT)
2794 return lustre_check_remote_perm(inode, mask);
2796 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1);
/* Writes are refused on read-only filesystems (for file-like objects)
 * and on immutable inodes. */
2798 if ((mask & MAY_WRITE) && IS_RDONLY(inode) &&
2799 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
2801 if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
/* Owner match: owner bits apply (the owner-bit check itself is on an
 * elided line). */
2803 if (current->fsuid == inode->i_uid) {
/* Group bits insufficient for the request -> consult the ACL. */
2806 if (((mode >> 3) & mask & S_IRWXO) != mask)
2808 rc = lustre_check_acl(inode, mask);
2812 goto check_capabilities;
2816 if (in_group_p(inode->i_gid))
/* Other bits grant the request outright. */
2819 if ((mode & mask & S_IRWXO) == mask)
/* CAP_DAC_OVERRIDE: allowed unless exec is requested on a file with no
 * exec bits at all (directories are exempt from that restriction). */
2823 if (!(mask & MAY_EXEC) ||
2824 (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
2825 if (capable(CAP_DAC_OVERRIDE))
/* CAP_DAC_READ_SEARCH: read of anything, or non-write access to dirs. */
2828 if (capable(CAP_DAC_READ_SEARCH) && ((mask == MAY_READ) ||
2829 (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))))
/* Default file method table: no ->flock/->lock entries are installed,
 * so advisory locks are handled purely by the local VFS (locally
 * consistent only, as the comment says). The closing "};" is on an
 * elided line in this excerpt. */
2836 /* -o localflock - only provides locally consistent flock locks */
2837 struct file_operations ll_file_operations = {
2838 .read = ll_file_read,
2839 .write = ll_file_write,
2840 .ioctl = ll_file_ioctl,
2841 .open = ll_file_open,
2842 .release = ll_file_release,
2843 .mmap = ll_file_mmap,
2844 .llseek = ll_file_seek,
2845 .sendfile = ll_file_sendfile,
/* File method table for -o flock mounts: identical to the default
 * table but wires ll_file_flock in as ->flock (or as ->lock on kernels
 * without f_op->flock) for cluster-wide advisory locking. The #else/
 * #endif and closing "};" are on elided lines in this excerpt. */
2849 struct file_operations ll_file_operations_flock = {
2850 .read = ll_file_read,
2851 .write = ll_file_write,
2852 .ioctl = ll_file_ioctl,
2853 .open = ll_file_open,
2854 .release = ll_file_release,
2855 .mmap = ll_file_mmap,
2856 .llseek = ll_file_seek,
2857 .sendfile = ll_file_sendfile,
2859 #ifdef HAVE_F_OP_FLOCK
2860 .flock = ll_file_flock,
2862 .lock = ll_file_flock
/* File method table for -o noflock mounts: lock requests go to
 * ll_file_noflock so they are rejected (ENOSYS per the comment).
 * The #else/#endif and closing "};" are on elided lines. */
2865 /* These are for -o noflock - to return ENOSYS on flock calls */
2866 struct file_operations ll_file_operations_noflock = {
2867 .read = ll_file_read,
2868 .write = ll_file_write,
2869 .ioctl = ll_file_ioctl,
2870 .open = ll_file_open,
2871 .release = ll_file_release,
2872 .mmap = ll_file_mmap,
2873 .llseek = ll_file_seek,
2874 .sendfile = ll_file_sendfile,
2876 #ifdef HAVE_F_OP_FLOCK
2877 .flock = ll_file_noflock,
2879 .lock = ll_file_noflock
/* Inode method table for regular Lustre files. With the vendor VFS
 * intent patches, setattr is dispatched via setattr_raw; the #else
 * line and closing "};" are elided in this excerpt. */
2882 struct inode_operations ll_file_inode_operations = {
2883 #ifdef HAVE_VFS_INTENT_PATCHES
2884 .setattr_raw = ll_setattr_raw,
2886 .setattr = ll_setattr,
2887 .truncate = ll_truncate,
2888 .getattr = ll_getattr,
2889 .permission = ll_inode_permission,
2890 .setxattr = ll_setxattr,
2891 .getxattr = ll_getxattr,
2892 .listxattr = ll_listxattr,
2893 .removexattr = ll_removexattr,
/* Registry for dynamically registered ioctl handlers: a list of
 * llioc_data entries guarded by a rw_semaphore. NOTE(review): the
 * static instance name (llioc, per the initializers below) and the
 * "struct llioc_data {" line are on elided lines in this excerpt. */
2896 /* dynamic ioctl number support routins */
2897 static struct llioc_ctl_data {
2898 struct rw_semaphore ioc_sem;
2899 struct list_head ioc_head;
2901 __RWSEM_INITIALIZER(llioc.ioc_sem),
2902 CFS_LIST_HEAD_INIT(llioc.ioc_head)
/* One registered handler: the callback plus the ioctl command numbers
 * it services (iocd_cmd is a trailing variable-length array; iocd_size
 * records the full allocation size for freeing). */
2907 struct list_head iocd_list;
2908 unsigned int iocd_size;
2909 llioc_callback_t iocd_cb;
2910 unsigned int iocd_count;
2911 unsigned int iocd_cmd[0];
/* Register a dynamic ioctl handler \a cb for the \a count command
 * numbers in \a cmd. Returns an opaque cookie (the llioc_data
 * allocation itself) for ll_iocontrol_unregister(); the failure
 * returns (presumably NULL) are on elided lines in this excerpt. */
2914 void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd)
2917 struct llioc_data *in_data = NULL;
/* Validate arguments before sizing the allocation. */
2920 if (cb == NULL || cmd == NULL ||
2921 count > LLIOC_MAX_CMD || count < 0)
/* Allocation covers the struct plus count trailing command slots. */
2924 size = sizeof(*in_data) + count * sizeof(unsigned int);
2925 OBD_ALLOC(in_data, size);
2926 if (in_data == NULL)
2929 memset(in_data, 0, sizeof(*in_data));
2930 in_data->iocd_size = size;
2931 in_data->iocd_cb = cb;
2932 in_data->iocd_count = count;
2933 memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count);
/* Publish the new entry under the write lock. */
2935 down_write(&llioc.ioc_sem);
2936 list_add_tail(&in_data->iocd_list, &llioc.ioc_head);
2937 up_write(&llioc.ioc_sem);
/* Unregister and free the handler identified by the cookie \a magic;
 * warns if no matching registration exists. NOTE(review): the
 * comparison of the list entry against magic, the return after
 * OBD_FREE, and braces are on elided lines in this excerpt. */
2942 void ll_iocontrol_unregister(void *magic)
2944 struct llioc_data *tmp;
2949 down_write(&llioc.ioc_sem);
2950 list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) {
/* Cache the size before unlinking so OBD_FREE gets the right length. */
2952 unsigned int size = tmp->iocd_size;
2954 list_del(&tmp->iocd_list);
/* Safe to drop the lock before freeing: the entry is already off the
 * list and the walk ends here. */
2955 up_write(&llioc.ioc_sem);
2957 OBD_FREE(tmp, size);
2961 up_write(&llioc.ioc_sem);
2963 CWARN("didn't find iocontrol register block with magic: %p\n", magic);
2966 EXPORT_SYMBOL(ll_iocontrol_register);
2967 EXPORT_SYMBOL(ll_iocontrol_unregister);
2969 enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
2970 unsigned int cmd, unsigned long arg, int *rcp)
2972 enum llioc_iter ret = LLIOC_CONT;
2973 struct llioc_data *data;
2974 int rc = -EINVAL, i;
2976 down_read(&llioc.ioc_sem);
2977 list_for_each_entry(data, &llioc.ioc_head, iocd_list) {
2978 for (i = 0; i < data->iocd_count; i++) {
2979 if (cmd != data->iocd_cmd[i])
2982 ret = data->iocd_cb(inode, file, cmd, arg, data, &rc);
2986 if (ret == LLIOC_STOP)
2989 up_read(&llioc.ioc_sem);