vma->vm_file->f_dentry->d_inode->i_ino, \
vma->vm_file->f_dentry->d_iname, ## arg); \
-
-struct ll_lock_tree_node {
- rb_node_t lt_node;
- struct list_head lt_locked_item;
- __u64 lt_oid;
- ldlm_policy_data_t lt_policy;
- struct lustre_handle lt_lockh;
- ldlm_mode_t lt_mode;
- struct inode *lt_inode;
-};
-
-int lt_get_mmap_locks(struct ll_lock_tree *tree,
- unsigned long addr, size_t count);
-
struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
int *type);
-struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
- __u64 end, ldlm_mode_t mode)
-{
- struct ll_lock_tree_node *node;
-
- OBD_ALLOC(node, sizeof(*node));
- if (node == NULL)
- RETURN(ERR_PTR(-ENOMEM));
-
- node->lt_inode = inode;
- node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id;
- node->lt_policy.l_extent.start = start;
- node->lt_policy.l_extent.end = end;
- memset(&node->lt_lockh, 0, sizeof(node->lt_lockh));
- INIT_LIST_HEAD(&node->lt_locked_item);
- node->lt_mode = mode;
-
- return node;
-}
-
-int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two)
-{
- /* To avoid multiple fs deadlock */
- if (one->lt_inode->i_sb->s_dev < two->lt_inode->i_sb->s_dev)
- return -1;
- if (one->lt_inode->i_sb->s_dev > two->lt_inode->i_sb->s_dev)
- return 1;
-
- if (one->lt_oid < two->lt_oid)
- return -1;
- if (one->lt_oid > two->lt_oid)
- return 1;
-
- if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start)
- return -1;
- if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end)
- return 1;
-
- return 0; /* they are the same object and overlap */
-}
-
-static void lt_merge(struct ll_lock_tree_node *dst,
- struct ll_lock_tree_node *src)
-{
- dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start,
- src->lt_policy.l_extent.start);
- dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end,
- src->lt_policy.l_extent.end);
-
- /* XXX could be a real call to the dlm to find superset modes */
- if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW)
- dst->lt_mode = LCK_PW;
-}
-
-static void lt_insert(struct ll_lock_tree *tree,
- struct ll_lock_tree_node *node)
-{
- struct ll_lock_tree_node *walk;
- rb_node_t **p, *parent;
- ENTRY;
-
-restart:
- p = &tree->lt_root.rb_node;
- parent = NULL;
- while (*p) {
- parent = *p;
- walk = rb_entry(parent, struct ll_lock_tree_node, lt_node);
- switch (lt_compare(node, walk)) {
- case -1:
- p = &(*p)->rb_left;
- break;
- case 1:
- p = &(*p)->rb_right;
- break;
- case 0:
- lt_merge(node, walk);
- rb_erase(&walk->lt_node, &tree->lt_root);
- OBD_FREE(walk, sizeof(*walk));
- goto restart;
- break;
- default:
- LBUG();
- break;
- }
- }
- rb_link_node(&node->lt_node, parent, p);
- rb_insert_color(&node->lt_node, &tree->lt_root);
- EXIT;
-}
-
-static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree)
-{
- rb_node_t *rbnode;
- struct ll_lock_tree_node *node = NULL;
-
- for ( rbnode = tree->lt_root.rb_node; rbnode != NULL;
- rbnode = rbnode->rb_left) {
- if (rbnode->rb_left == NULL) {
- node = rb_entry(rbnode, struct ll_lock_tree_node,
- lt_node);
- break;
- }
- }
- RETURN(node);
-}
-
-int ll_tree_unlock(struct ll_lock_tree *tree)
-{
- struct ll_lock_tree_node *node;
- struct list_head *pos, *n;
- struct inode *inode;
- int rc = 0;
- ENTRY;
-
- list_for_each_safe(pos, n, &tree->lt_locked_list) {
- node = list_entry(pos, struct ll_lock_tree_node,
- lt_locked_item);
-
- inode = node->lt_inode;
- rc = ll_extent_unlock(tree->lt_fd, inode,
- ll_i2info(inode)->lli_smd, node->lt_mode,
- &node->lt_lockh);
- if (rc != 0) {
- /* XXX better message */
- CERROR("couldn't unlock %d\n", rc);
- }
- list_del(&node->lt_locked_item);
- OBD_FREE(node, sizeof(*node));
- }
-
- while ((node = lt_least_node(tree))) {
- rb_erase(&node->lt_node, &tree->lt_root);
- OBD_FREE(node, sizeof(*node));
- }
-
- RETURN(rc);
-}
-
-int ll_tree_lock(struct ll_lock_tree *tree,
- struct ll_lock_tree_node *first_node,
- const char *buf, size_t count, int ast_flags)
-{
- struct ll_lock_tree_node *node;
- int rc = 0;
- ENTRY;
-
- tree->lt_root.rb_node = NULL;
- INIT_LIST_HEAD(&tree->lt_locked_list);
- if (first_node != NULL)
- lt_insert(tree, first_node);
-
- /* To avoid such subtle deadlock case: client1 try to read file1 to
- * mmapped file2, on the same time, client2 try to read file2 to
- * mmapped file1.*/
- rc = lt_get_mmap_locks(tree, (unsigned long)buf, count);
- if (rc)
- GOTO(out, rc);
-
- while ((node = lt_least_node(tree))) {
- struct inode *inode = node->lt_inode;
- rc = ll_extent_lock(tree->lt_fd, inode,
- ll_i2info(inode)->lli_smd, node->lt_mode,
- &node->lt_policy, &node->lt_lockh,
- ast_flags);
- if (rc != 0)
- GOTO(out, rc);
-
- rb_erase(&node->lt_node, &tree->lt_root);
- list_add_tail(&node->lt_locked_item, &tree->lt_locked_list);
- }
- RETURN(rc);
-out:
- ll_tree_unlock(tree);
- RETURN(rc);
-}
-
-static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma)
-{
- /* we only want to hold PW locks if the mmap() can generate
- * writes back to the file and that only happens in shared
- * writable vmas */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return LCK_PW;
- return LCK_PR;
-}
-
-static void policy_from_vma(ldlm_policy_data_t *policy,
+void policy_from_vma(ldlm_policy_data_t *policy,
struct vm_area_struct *vma, unsigned long addr,
size_t count)
{
~CFS_PAGE_MASK;
}
-static struct vm_area_struct * our_vma(unsigned long addr, size_t count)
+struct vm_area_struct *our_vma(unsigned long addr, size_t count)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *ret = NULL;
RETURN(ret);
}
-int ll_region_mapped(unsigned long addr, size_t count)
-{
- return !!our_vma(addr, count);
-}
-
-int lt_get_mmap_locks(struct ll_lock_tree *tree,
- unsigned long addr, size_t count)
-{
- struct vm_area_struct *vma;
- struct ll_lock_tree_node *node;
- ldlm_policy_data_t policy;
- struct inode *inode;
- ENTRY;
-
- if (count == 0)
- RETURN(0);
-
- /* we need to look up vmas on page aligned addresses */
- count += addr & (~CFS_PAGE_MASK);
- addr &= CFS_PAGE_MASK;
-
- while ((vma = our_vma(addr, count)) != NULL) {
- LASSERT(vma->vm_file);
-
- inode = vma->vm_file->f_dentry->d_inode;
- policy_from_vma(&policy, vma, addr, count);
- node = ll_node_from_inode(inode, policy.l_extent.start,
- policy.l_extent.end,
- mode_from_vma(vma));
- if (IS_ERR(node)) {
- CERROR("not enough mem for lock_tree_node!\n");
- RETURN(-ENOMEM);
- }
- lt_insert(tree, node);
-
- if (vma->vm_end - addr >= count)
- break;
- count -= vma->vm_end - addr;
- addr = vma->vm_end;
- }
- RETURN(0);
-}
-
/**
- * Page fault handler.
+ * Lustre implementation of the vm_operations_struct::nopage() method, called
+ * by the VM to serve a page fault (in both kernel and user space).
+ *
+ * This function sets up a CIT_FAULT cl_io that does the job.
*
 * \param vma - the virtual area struct related to the page fault
 * \param address - the address where the fault was hit
 * \param type - the type of the fault
*
+ * XXX newer 2.6 kernels provide a vm_operations_struct::fault() method with
+ * slightly different semantics instead.
+ *
 * \return the allocated and filled page for the address
 * \retval NOPAGE_SIGBUS if no page exists at this address
 * \retval NOPAGE_OOM if there is not enough memory to allocate a new page
struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
int *type)
{
- struct file *filp = vma->vm_file;
- struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
- struct inode *inode = filp->f_dentry->d_inode;
- struct lustre_handle lockh = { 0 };
- ldlm_policy_data_t policy;
- ldlm_mode_t mode;
- struct page *page = NULL;
- struct ll_inode_info *lli = ll_i2info(inode);
- struct lov_stripe_md *lsm;
- struct ost_lvb lvb;
- __u64 kms, old_mtime;
- unsigned long pgoff, size, rand_read, seq_read;
- int rc = 0;
- ENTRY;
-
- if (lli->lli_smd == NULL) {
- CERROR("No lsm on fault?\n");
- RETURN(NULL);
- }
-
- ll_clear_file_contended(inode);
-
- /* start and end the lock on the first and last bytes in the page */
- policy_from_vma(&policy, vma, address, CFS_PAGE_SIZE);
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_dentry->d_inode;
+ struct lu_env *env;
+ struct cl_io *io;
+ struct page *page = NULL;
+ struct cl_env_nest nest;
+ int result;
- CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
- vma, inode->i_ino, policy.l_extent.start, policy.l_extent.end);
-
- mode = mode_from_vma(vma);
- old_mtime = LTIME_S(inode->i_mtime);
-
- lsm = lli->lli_smd;
- rc = ll_extent_lock(fd, inode, lsm, mode, &policy,
- &lockh, LDLM_FL_CBPENDING);
- if (rc != 0)
- RETURN(NULL);
-
- if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
- CWARN("binary changed. inode %lu\n", inode->i_ino);
-
- lov_stripe_lock(lsm);
- inode_init_lvb(inode, &lvb);
- obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
- kms = lvb.lvb_size;
-
- pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
- size = (kms + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-
- if (pgoff >= size) {
- lov_stripe_unlock(lsm);
- ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
- } else {
- /* XXX change inode size without ll_inode_size_lock() held!
- * there is a race condition with truncate path. (see
- * ll_extent_lock) */
- /* XXX i_size_write() is not used because it is not safe to
- * take the ll_inode_size_lock() due to a potential lock
- * inversion (bug 6077). And since it's not safe to use
- * i_size_write() without a covering mutex we do the
- * assignment directly. It is not critical that the
- * size be correct. */
- /* region is within kms and, hence, within real file size (A).
- * We need to increase i_size to cover the read region so that
- * generic_file_read() will do its job, but that doesn't mean
- * the kms size is _correct_, it is only the _minimum_ size.
- * If someone does a stat they will get the correct size which
- * will always be >= the kms value here. b=11081 */
- if (i_size_read(inode) < kms) {
- inode->i_size = kms;
- CDEBUG(D_INODE, "ino=%lu, updating i_size %llu\n",
- inode->i_ino, i_size_read(inode));
- }
- lov_stripe_unlock(lsm);
- }
+ ENTRY;
- /* If mapping is writeable, adjust kms to cover this page,
- * but do not extend kms beyond actual file size.
- * policy.l_extent.end is set to the end of the page by policy_from_vma
- * bug 10919 */
- lov_stripe_lock(lsm);
- if (mode == LCK_PW)
- obd_adjust_kms(ll_i2dtexp(inode), lsm,
- min_t(loff_t, policy.l_extent.end + 1,
- i_size_read(inode)), 0);
- lov_stripe_unlock(lsm);
-
- /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
- * the kernel will not read other pages not covered by ldlm in
- * filemap_nopage. we do our readahead in ll_readpage.
+ /*
+         * vm_operations_struct::nopage() can be called when Lustre IO is
+         * already active for the current thread, e.g., when doing a read or
+         * write against a user-level buffer that is itself mmapped from a
+         * Lustre file. To avoid stomping on the existing context, obtain a
+         * nested environment, optionally forcing allocation of a new one.
*/
- rand_read = vma->vm_flags & VM_RAND_READ;
- seq_read = vma->vm_flags & VM_SEQ_READ;
- vma->vm_flags &= ~ VM_SEQ_READ;
- vma->vm_flags |= VM_RAND_READ;
-
- page = filemap_nopage(vma, address, type);
- if (page != NOPAGE_SIGBUS && page != NOPAGE_OOM)
- LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address,
- (long)type);
- else
- CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n", address,
- (long)type);
-
- vma->vm_flags &= ~VM_RAND_READ;
- vma->vm_flags |= (rand_read | seq_read);
-
- ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
+ env = cl_env_nested_get(&nest);
+ if (!IS_ERR(env)) {
+ pgoff_t pg_offset;
+ const unsigned long writable = VM_SHARED|VM_WRITE;
+ unsigned long ra_flags;
+ struct cl_fault_io *fio;
+
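+                /*
+                 * Reuse the cl_io embedded in the environment's thread info
+                 * (cti_io) rather than allocating a fresh one, and point it
+                 * at the cl_object backing this inode; the CIT_FAULT io
+                 * operates on that object.
+                 */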
+ io = &ccc_env_info(env)->cti_io;
+ io->ci_obj = ll_i2info(inode)->lli_clob;
+ LASSERT(io->ci_obj != NULL);
+
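+                /*
+                 * Fill in the fault descriptor: ft_index is the file-relative
+                 * index of the faulting page; ft_writable is set only for
+                 * shared writable mappings, the one case where a fault can
+                 * generate writes back to the file; ft_executable records
+                 * whether this is an executable mapping.
+                 */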
+ fio = &io->u.ci_fault;
+ pg_offset = (address - vma->vm_start) >> PAGE_SHIFT;
+ fio->ft_index = pg_offset + vma->vm_pgoff;
+                fio->ft_writable = (vma->vm_flags & writable) == writable;
+                fio->ft_executable = vma->vm_flags & VM_EXEC;
+
+ /*
+ * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+ * the kernel will not read other pages not covered by ldlm in
+ * filemap_nopage. we do our readahead in ll_readpage.
+ */
+ ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+ vma->vm_flags &= ~VM_SEQ_READ;
+ vma->vm_flags |= VM_RAND_READ;
+
+ CDEBUG(D_INFO, "vm_flags: %lx (%lu %i %i)\n", vma->vm_flags,
+ fio->ft_index, fio->ft_writable, fio->ft_executable);
+
+ if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
+ struct vvp_io *vio = vvp_env_io(env);
+ struct ccc_io *cio = ccc_env_io(env);
+
+ LASSERT(cio->cui_cl.cis_io == io);
+
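+                        /*
+                         * Hand the fault parameters down to the vvp io
+                         * slice, which services the fault (via
+                         * filemap_nopage(), as the readahead comment above
+                         * notes) on behalf of the cl_io.
+                         */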
+ vio->u.fault.ft_vma = vma;
+ vio->u.fault.ft_address = address;
+ vio->u.fault.ft_type = type;
+ cio->cui_fd = LUSTRE_FPRIVATE(file);
+
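+                        /*
+                         * Run the generic cl_io loop: it takes the locks
+                         * needed for the fault and brings the page in. On
+                         * success ft_page holds the resulting cl_page;
+                         * -EFAULT and -ENOMEM are mapped onto the NOPAGE_*
+                         * codes the VM expects.
+                         */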
+ result = cl_io_loop(env, io);
+ if (result == 0) {
+ LASSERT(fio->ft_page != NULL);
+ page = cl_page_vmpage(env, fio->ft_page);
+ } else if (result == -EFAULT) {
+ page = NOPAGE_SIGBUS;
+ } else if (result == -ENOMEM) {
+ page = NOPAGE_OOM;
+ }
+                } else {
+                        result = io->ci_result;
+                }
+
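+                /*
+                 * Restore the readahead hints saved above, then release the
+                 * io and the nested environment.
+                 */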
+ vma->vm_flags &= ~VM_RAND_READ;
+ vma->vm_flags |= ra_flags;
+
+ cl_io_fini(env, io);
+ cl_env_nested_put(&nest, env);
+ }
RETURN(page);
}
-/* To avoid cancel the locks covering mmapped region for lock cache pressure,
- * we track the mapped vma count by lli_mmap_cnt.
- * ll_vm_open(): when first vma is linked, split locks from lru.
- * ll_vm_close(): when last vma is unlinked, join all this file's locks to lru.
- *
- * XXX we don't check the if the region of vma/lock for performance.
+/**
+ * To avoid cancelling the locks covering the mmapped region under lock cache
+ * pressure, we track the mapped vma count in ccc_object::cob_mmap_cnt.
*/
static void ll_vm_open(struct vm_area_struct * vma)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
- struct ll_inode_info *lli = ll_i2info(inode);
- ENTRY;
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct ccc_object *vob = cl_inode2ccc(inode);
+ ENTRY;
LASSERT(vma->vm_file);
-
- spin_lock(&lli->lli_lock);
- LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0);
-
- atomic_inc(&lli->lli_mmap_cnt);
- spin_unlock(&lli->lli_lock);
+ LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+ atomic_inc(&vob->cob_mmap_cnt);
+ EXIT;
}
+/**
+ * Dual to ll_vm_open().
+ */
static void ll_vm_close(struct vm_area_struct *vma)
{
- struct inode *inode = vma->vm_file->f_dentry->d_inode;
- struct ll_inode_info *lli = ll_i2info(inode);
- ENTRY;
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct ccc_object *vob = cl_inode2ccc(inode);
+ ENTRY;
LASSERT(vma->vm_file);
-
- spin_lock(&lli->lli_lock);
- LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0);
-
- atomic_dec(&lli->lli_mmap_cnt);
- spin_unlock(&lli->lli_lock);
+ atomic_dec(&vob->cob_mmap_cnt);
+ LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+ EXIT;
}
#ifndef HAVE_FILEMAP_POPULATE
vma->vm_ops = &ll_file_vm_ops;
vma->vm_ops->open(vma);
/* update the inode's size and mtime */
- rc = ll_glimpse_size(file->f_dentry->d_inode, 0);
+ rc = cl_glimpse_size(file->f_dentry->d_inode);
}
RETURN(rc);