X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;ds=sidebyside;f=lustre%2Fllite%2Fllite_mmap.c;h=dc40818dc018e0cb60580f1387a008a3a5ded14d;hb=3a92c850b094019e556577ec6cab5907538dcbf5;hp=9611d6b8133cfff979e00f14b93e9cc9d894542e;hpb=32b2ddf168b846ccf8c83329728905f6c5c8bbcb;p=fs%2Flustre-release.git diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index 9611d6b..dc40818 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -1,6 +1,4 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * +/* * GPL HEADER START * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -28,6 +26,8 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Whamcloud, Inc. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -57,18 +57,10 @@ #define DEBUG_SUBSYSTEM S_LLITE -//#include #include #include "llite_internal.h" #include -#define VMA_DEBUG(vma, fmt, arg...) \ - CDEBUG(D_MMAP, "vma(%p) start(%ld) end(%ld) pgoff(%ld) inode(%p) " \ - "ino(%lu) iname(%s): " fmt, vma, vma->vm_start, vma->vm_end, \ - vma->vm_pgoff, vma->vm_file->f_dentry->d_inode, \ - vma->vm_file->f_dentry->d_inode->i_ino, \ - vma->vm_file->f_dentry->d_iname, ## arg); \ - struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, int *type); @@ -119,19 +111,21 @@ struct vm_area_struct * our_vma(unsigned long addr, size_t count) * \retval EINVAL if env can't allocated * \return other error codes from cl_io_init. */ -int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret, - struct cl_env_nest *nest, pgoff_t index, unsigned long *ra_flags) +struct cl_io *ll_fault_io_init(struct vm_area_struct *vma, + struct lu_env **env_ret, + struct cl_env_nest *nest, + pgoff_t index, unsigned long *ra_flags) { struct file *file = vma->vm_file; struct inode *inode = file->f_dentry->d_inode; - const unsigned long writable = VM_SHARED|VM_WRITE; struct cl_io *io; struct cl_fault_io *fio; struct lu_env *env; ENTRY; + *env_ret = NULL; if (ll_file_nolock(file)) - RETURN(-EOPNOTSUPP); + RETURN(ERR_PTR(-EOPNOTSUPP)); /* * page fault can be called when lustre IO is @@ -141,20 +135,17 @@ int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret, * one. */ env = cl_env_nested_get(nest); - if (IS_ERR(env)) { - *env_ret = NULL; - RETURN(-EINVAL); - } + if (IS_ERR(env)) + RETURN(ERR_PTR(-EINVAL)); *env_ret = env; - io = &ccc_env_info(env)->cti_io; + io = ccc_env_thread_io(env); io->ci_obj = ll_i2info(inode)->lli_clob; LASSERT(io->ci_obj != NULL); fio = &io->u.ci_fault; fio->ft_index = index; - fio->ft_writable = (vma->vm_flags&writable) == writable; fio->ft_executable = vma->vm_flags&VM_EXEC; /* @@ -162,12 +153,13 @@ int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret, * the kernel will not read other pages not covered by ldlm in * filemap_nopage. we do our readahead in ll_readpage. 
*/ - *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + if (ra_flags != NULL) + *ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); vma->vm_flags &= ~VM_SEQ_READ; vma->vm_flags |= VM_RAND_READ; - CDEBUG(D_INFO, "vm_flags: %lx (%lu %d %d)\n", vma->vm_flags, - fio->ft_index, fio->ft_writable, fio->ft_executable); + CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags, + fio->ft_index, fio->ft_executable); if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) { struct ccc_io *cio = ccc_env_io(env); @@ -182,9 +174,100 @@ int ll_fault_io_init(struct vm_area_struct *vma, struct lu_env **env_ret, cio->cui_fd = fd; } - return io->ci_result; + return io; +} + +/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ +static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, + bool *retry) +{ + struct lu_env *env; + struct cl_io *io; + struct vvp_io *vio; + struct cl_env_nest nest; + int result; + ENTRY; + + LASSERT(vmpage != NULL); + + io = ll_fault_io_init(vma, &env, &nest, vmpage->index, NULL); + if (IS_ERR(io)) + GOTO(out, result = PTR_ERR(io)); + + result = io->ci_result; + if (result < 0) + GOTO(out, result); + + /* Don't enqueue new locks for page_mkwrite(). + * If the lock has been cancelled then page must have been + * truncated, in that case, kernel will handle it. + */ + io->ci_lockreq = CILR_PEEK; + io->u.ci_fault.ft_mkwrite = 1; + io->u.ci_fault.ft_writable = 1; + + vio = vvp_env_io(env); + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_vmpage = vmpage; + + result = cl_io_loop(env, io); + + if (result == -ENODATA) /* peek failed, no lock caching. */ + CDEBUG(D_MMAP, "race on page_mkwrite: %lx (%lu %p)\n", + vma->vm_flags, io->u.ci_fault.ft_index, vmpage); + + if (result == 0 || result == -ENODATA) { + lock_page(vmpage); + if (vmpage->mapping == NULL) { + unlock_page(vmpage); + + /* page was truncated and lock was cancelled, return + * ENODATA so that VM_FAULT_NOPAGE will be returned + * to handle_mm_fault(). */ + if (result == 0) + result = -ENODATA; + } else if (result == -ENODATA) { + /* Invalidate it if the cl_lock is being revoked. + * This piece of code is definitely needed for RHEL5, + * otherwise, SIGBUS will be wrongly returned to + * applications. */ + write_one_page(vmpage, 1); + lock_page(vmpage); + if (vmpage->mapping != NULL) { + ll_invalidate_page(vmpage); + LASSERT(vmpage->mapping == NULL); + } + unlock_page(vmpage); + } else if (!PageDirty(vmpage)) { + /* race, the page has been cleaned by ptlrpcd after + * it was unlocked, it has to be added into dirty + * cache again otherwise this soon-to-dirty page won't + * consume any grants, even worse if this page is being + * transferred because it will break RPC checksum. 
+ */ + unlock_page(vmpage); + + CDEBUG(D_MMAP, "Race on page_mkwrite %p/%lu, page has " + "been written out, retry.\n", + vmpage, vmpage->index); + + *retry = true; + result = -EAGAIN; + } + } + EXIT; + +out: + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + + CDEBUG(D_MMAP, "%s mkwrite with %d\n", cfs_current()->comm, result); + + LASSERT(ergo(result == 0, PageLocked(vmpage))); + return(result); } + #ifndef HAVE_VM_OP_FAULT /** * Lustre implementation of a vm_operations_struct::nopage() method, called by @@ -211,33 +294,34 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, unsigned long ra_flags; pgoff_t pg_offset; int result; + const unsigned long writable = VM_SHARED|VM_WRITE; ENTRY; pg_offset = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - result = ll_fault_io_init(vma, &env, &nest, pg_offset, &ra_flags); - if (env == NULL) + io = ll_fault_io_init(vma, &env, &nest, pg_offset, &ra_flags); + if (IS_ERR(io)) return NOPAGE_SIGBUS; - io = &ccc_env_info(env)->cti_io; + result = io->ci_result; if (result < 0) goto out_err; - vio = vvp_env_io(env); + io->u.ci_fault.ft_writable = (vma->vm_flags&writable) == writable; + vio = vvp_env_io(env); vio->u.fault.ft_vma = vma; vio->u.fault.nopage.ft_address = address; vio->u.fault.nopage.ft_type = type; + vio->u.fault.ft_vmpage = NULL; result = cl_io_loop(env, io); + page = vio->u.fault.ft_vmpage; + if (result != 0 && page != NULL) + page_cache_release(page); out_err: - if (result == 0) { - LASSERT(io->u.ci_fault.ft_page != NULL); - page = vio->u.fault.ft_vmpage; - } else { - if (result == -ENOMEM) - page = NOPAGE_OOM; - } + if (result == -ENOMEM) + page = NOPAGE_OOM; vma->vm_flags &= ~VM_RAND_READ; vma->vm_flags |= ra_flags; @@ -247,6 +331,7 @@ out_err: RETURN(page); } + #else /** * Lustre implementation of a vm_operations_struct::fault() method, called by @@ -259,22 +344,23 @@ out_err: * \retval VM_FAULT_ERROR on general error * \retval NOPAGE_OOM not have memory for allocate new page */ -int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) { struct lu_env *env; struct cl_io *io; struct vvp_io *vio = NULL; + struct page *vmpage; unsigned long ra_flags; struct cl_env_nest nest; int result; int fault_ret = 0; ENTRY; - result = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags); - if (env == NULL) + io = ll_fault_io_init(vma, &env, &nest, vmf->pgoff, &ra_flags); + if (IS_ERR(io)) RETURN(VM_FAULT_ERROR); - io = &ccc_env_info(env)->cti_io; + result = io->ci_result; if (result < 0) goto out_err; @@ -284,21 +370,30 @@ int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) vio->u.fault.fault.ft_vmf = vmf; result = cl_io_loop(env, io); + + vmpage = vio->u.fault.ft_vmpage; + if (result != 0 && vmpage != NULL) { + page_cache_release(vmpage); + vmf->page = NULL; + } + fault_ret = vio->u.fault.fault.ft_flags; out_err: - if (result != 0) - fault_ret |= VM_FAULT_ERROR; + if (result != 0 && fault_ret == 0) + fault_ret = VM_FAULT_ERROR; vma->vm_flags |= ra_flags; cl_io_fini(env, io); cl_env_nested_put(&nest, env); + CDEBUG(D_MMAP, "%s fault %d/%d\n", + cfs_current()->comm, fault_ret, result); RETURN(fault_ret); } -int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ll_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int count = 0; bool printed = false; @@ -333,6 +428,80 @@ restart: } #endif +#ifndef HAVE_PGMKWRITE_USE_VMFAULT +static int ll_page_mkwrite(struct vm_area_struct *vma, struct 
page *vmpage) +{ + int count = 0; + bool printed = false; + bool retry; + int result; + + do { + retry = false; + result = ll_page_mkwrite0(vma, vmpage, &retry); + + if (!printed && ++count > 16) { + CWARN("app(%s): the page %lu of file %lu is under heavy" + " contention.\n", + current->comm, page_index(vmpage), + vma->vm_file->f_dentry->d_inode->i_ino); + printed = true; + } + } while (retry); + + if (result == 0) + unlock_page(vmpage); + else if (result == -ENODATA) + result = 0; /* kernel will know truncate has happened and + * retry */ + + return result; +} +#else +static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int count = 0; + bool printed = false; + bool retry; + int result; + + do { + retry = false; + result = ll_page_mkwrite0(vma, vmf->page, &retry); + + if (!printed && ++count > 16) { + CWARN("app(%s): the page %lu of file %lu is under heavy" + " contention.\n", + current->comm, vmf->pgoff, + vma->vm_file->f_dentry->d_inode->i_ino); + printed = true; + } + } while (retry); + + switch(result) { + case 0: + LASSERT(PageLocked(vmf->page)); + result = VM_FAULT_LOCKED; + break; + case -ENODATA: + case -EFAULT: + result = VM_FAULT_NOPAGE; + break; + case -ENOMEM: + result = VM_FAULT_OOM; + break; + case -EAGAIN: + result = VM_FAULT_RETRY; + break; + default: + result = VM_FAULT_SIGBUS; + break; + } + + return result; +} +#endif + /** * To avoid cancel the locks covering mmapped region for lock cache pressure, * we track the mapped vma count in ccc_object::cob_mmap_cnt. @@ -413,6 +582,7 @@ static struct vm_operations_struct ll_file_vm_ops = { #else .fault = ll_fault, #endif + .page_mkwrite = ll_page_mkwrite, .open = ll_vm_open, .close = ll_vm_close, }; @@ -436,7 +606,7 @@ int ll_file_mmap(struct file *file, struct vm_area_struct * vma) vma->vm_ops = &ll_file_vm_ops; vma->vm_ops->open(vma); /* update the inode's size and mtime */ - rc = cl_glimpse_size(inode); + rc = ll_glimpse_size(inode); } RETURN(rc);
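
The central API change in this patch is that ll_fault_io_init() now returns the struct cl_io * itself, encoding failures with ERR_PTR() instead of returning an int while the caller fetched the io separately from ccc_env_info(). That lets the new ll_page_mkwrite0() and the reworked ll_fault0()/ll_nopage() share one setup path and distinguish "setup failed" from "io created but ci_result < 0" with a single return value. The following is a minimal, standalone userspace sketch of that ERR_PTR/IS_ERR/PTR_ERR convention only; struct cl_io_sketch and alloc_io() are hypothetical stand-ins, and the three helpers simply mirror the semantics of the kernel's <linux/err.h>, not Lustre code.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Encode a negative errno value inside a pointer. */
static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

/* Decode the errno back out of an error pointer. */
static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

/* True when the pointer is really an encoded errno (top MAX_ERRNO addresses). */
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical stand-in for the cl_io returned by ll_fault_io_init(). */
struct cl_io_sketch {
	int ci_result;
};

/* Return a valid object, or an errno encoded in the pointer - the same
 * calling convention the patch adopts for ll_fault_io_init(), so no extra
 * out-parameter is needed for the error code. */
static struct cl_io_sketch *alloc_io(int fail)
{
	struct cl_io_sketch *io;

	if (fail)
		return ERR_PTR(-EOPNOTSUPP);

	io = calloc(1, sizeof(*io));
	if (io == NULL)
		return ERR_PTR(-ENOMEM);

	return io;
}

int main(void)
{
	struct cl_io_sketch *io = alloc_io(1);

	if (IS_ERR(io)) {
		/* The caller branches on one return value, as ll_fault0()
		 * and ll_page_mkwrite0() do after this patch. */
		printf("alloc_io failed: %ld\n", PTR_ERR(io));
		return 1;
	}

	printf("io->ci_result = %d\n", io->ci_result);
	free(io);
	return 0;
}

The design point the sketch illustrates: because error pointers live in the last MAX_ERRNO addresses of the address space, a caller can tell a real allocation apart from an encoded errno without any additional flag, which is why the patch can drop the "*env_ret = NULL; return -EINVAL;" style bookkeeping from the old int-returning ll_fault_io_init().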