From adf46db962f657b74bd38db27e7b320aaee3cdd5 Mon Sep 17 00:00:00 2001 From: Sebastien Buisson Date: Thu, 20 Feb 2020 14:45:07 +0000 Subject: [PATCH] LU-12275 sec: support truncate for encrypted files Truncation of encrypted files is not a trivial operation. The page corresponding to the point where truncation occurs must be read, decrypted, zeroed after truncation point, re-encrypted and then written back. Signed-off-by: Sebastien Buisson Change-Id: I834f9372913d7051b1e0821515d3fea0873ffd78 Reviewed-on: https://review.whamcloud.com/37794 Tested-by: jenkins Reviewed-by: John L. Hammond Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/llite/file.c | 7 ++ lustre/llite/llite_lib.c | 180 +++++++++++++++++++++++++++++++++++++++++++- lustre/llite/rw.c | 13 +++- lustre/llite/vvp_io.c | 9 ++- lustre/ofd/ofd_objects.c | 11 +++ lustre/osc/osc_request.c | 7 +- lustre/osd-ldiskfs/osd_io.c | 24 +++++- lustre/osd-zfs/osd_io.c | 31 +++++++- 8 files changed, 270 insertions(+), 12 deletions(-) diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 5a543da..166b527 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -2255,6 +2255,13 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, GOTO(out, rc); rc = ll_file_getstripe(inode, arg, lum_size); + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode) && + ll_i2info(inode)->lli_clob) { + struct iattr attr = { 0 }; + + rc = cl_setattr_ost(ll_i2info(inode)->lli_clob, &attr, + OP_XVALID_FLAGS, LUSTRE_ENCRYPT_FL); + } } cl_lov_delay_create_clear(&file->f_flags); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 49fa5c0..0aaae5f 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -1744,6 +1744,162 @@ static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) RETURN(rc); } +/** + * Zero portion of page that is part of @inode. 
+ * This implies, if necessary:
+ * - taking cl_lock on range corresponding to concerned page
+ * - grabbing vm page
+ * - associating cl_page
+ * - proceeding to clio read
+ * - zeroing range in page
+ * - proceeding to cl_page flush
+ * - releasing cl_lock
+ *
+ * \param[in] inode	inode the page belongs to
+ * \param[in] index	index of the page in the inode's mapping
+ * \param[in] offset	offset in page to start zero from
+ * \param[in] len	number of bytes to zero, starting at @offset
+ *
+ * \retval 0		on success, or if the page read back is empty
+ * \retval negative	errno on failure (-EOPNOTSUPP if page is not grabbed)
+ */
+int ll_io_zero_page(struct inode *inode, pgoff_t index, pgoff_t offset,
+		    unsigned len)
+{
+	struct ll_inode_info *lli = ll_i2info(inode);
+	struct cl_object *clob = lli->lli_clob;
+	__u16 refcheck;
+	struct lu_env *env = NULL;
+	struct cl_io *io = NULL;
+	struct cl_page *clpage = NULL;
+	struct page *vmpage = NULL;
+	loff_t from = (loff_t)index << PAGE_SHIFT; /* 64-bit byte offset */
+	struct cl_lock *lock = NULL;
+	struct cl_lock_descr *descr = NULL;
+	struct cl_2queue *queue = NULL;
+	struct cl_sync_io *anchor = NULL;
+	bool holdinglock = false;
+	bool lockedbymyself = true;
+	int rc;
+
+	ENTRY;
+
+	env = cl_env_get(&refcheck);
+	if (IS_ERR(env))
+		RETURN(PTR_ERR(env));
+
+	io = vvp_env_thread_io(env);
+	io->ci_obj = clob;
+	rc = cl_io_rw_init(env, io, CIT_WRITE, from, PAGE_SIZE);
+	if (rc)
+		GOTO(putenv, rc);
+
+	lock = vvp_env_lock(env);
+	descr = &lock->cll_descr;
+	descr->cld_obj = io->ci_obj;
+	descr->cld_start = cl_index(io->ci_obj, from);
+	descr->cld_end = cl_index(io->ci_obj, from + PAGE_SIZE - 1);
+	descr->cld_mode = CLM_WRITE;
+	descr->cld_enq_flags = CEF_MUST | CEF_NONBLOCK;
+
+	/* request lock for page */
+	rc = cl_lock_request(env, io, lock);
+	/* -ECANCELED indicates a matching lock with a different extent
+	 * was already present, and -EEXIST indicates a matching lock
+	 * on exactly the same extent was already present.
+	 * In both cases it means we are covered.
+	 */
+	if (rc == -ECANCELED || rc == -EEXIST)
+		rc = 0;
+	else if (rc < 0)
+		GOTO(iofini, rc);
+	else
+		holdinglock = true;
+
+	/* grab page */
+	vmpage = grab_cache_page_nowait(inode->i_mapping, index);
+	if (vmpage == NULL)
+		GOTO(rellock, rc = -EOPNOTSUPP);
+
+	if (!PageDirty(vmpage)) {
+		/* associate cl_page */
+		clpage = cl_page_find(env, clob, vmpage->index,
+				      vmpage, CPT_CACHEABLE);
+		if (IS_ERR(clpage))
+			GOTO(pagefini, rc = PTR_ERR(clpage));
+
+		cl_page_assume(env, io, clpage);
+	}
+
+	if (!PageUptodate(vmpage) && !PageDirty(vmpage) &&
+	    !PageWriteback(vmpage)) {
+		/* read page */
+		/* set PagePrivate2 to detect special case of empty page
+		 * in osc_brw_fini_request()
+		 */
+		SetPagePrivate2(vmpage);
+		rc = ll_io_read_page(env, io, clpage, NULL);
+		if (!PagePrivate2(vmpage))
+			/* PagePrivate2 was cleared in osc_brw_fini_request()
+			 * meaning we read an empty page. In this case, in order
+			 * to avoid allocating unnecessary block in truncated
+			 * file, we must not zero and write as below. Subsequent
+			 * server-side truncate will handle things correctly.
+			 */
+			GOTO(clpfini, rc = 0);
+		ClearPagePrivate2(vmpage);
+		if (rc)
+			GOTO(clpfini, rc);
+		lockedbymyself = trylock_page(vmpage);
+		cl_page_assume(env, io, clpage);
+	}
+
+	/* zero range in page */
+	zero_user(vmpage, offset, len);
+
+	if (holdinglock && clpage) {
+		/* explicitly write newly modified page */
+		queue = &io->ci_queue;
+		cl_2queue_init(queue);
+		anchor = &vvp_env_info(env)->vti_anchor;
+		cl_sync_io_init(anchor, 1);
+		clpage->cp_sync_io = anchor;
+		cl_2queue_add(queue, clpage);
+		rc = cl_io_submit_rw(env, io, CRT_WRITE, queue);
+		if (rc)
+			GOTO(queuefini1, rc);
+		rc = cl_sync_io_wait(env, anchor, 0);
+		if (rc)
+			GOTO(queuefini2, rc);
+		cl_page_assume(env, io, clpage);
+
+queuefini2:
+		cl_2queue_discard(env, io, queue);
+queuefini1:
+		cl_2queue_disown(env, io, queue);
+		cl_2queue_fini(env, queue);
+	}
+
+clpfini:
+	if (clpage)
+		cl_page_put(env, clpage);
+pagefini:
+	if (lockedbymyself) {
+		unlock_page(vmpage);
+		put_page(vmpage);
+	}
+rellock:
+	if (holdinglock)
+		cl_lock_release(env, lock);
+iofini:
+	cl_io_fini(env, io);
+putenv:
+	if (env)
+		cl_env_put(env, &refcheck);
+
+	RETURN(rc);
+}
+
 /* If this inode has objects allocated to it (lsm != NULL), then the OST
  * object(s) determine the file size and mtime. Otherwise, the MDS will
  * keep these values until such a time that objects are allocated for it.
@@ -1875,6 +2031,8 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, GOTO(out, rc); } } else { + unsigned int flags = 0; + /* For truncate and utimes sending attributes to OSTs, * setting mtime/atime to the past will be performed * under PW [0:EOF] extent lock (new_size:EOF for @@ -1883,7 +2041,22 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, * it is necessary due to possible time * de-synchronization between MDT inode and OST objects */ - rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, 0); + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode) && + attr->ia_valid & ATTR_SIZE) { + xvalid |= OP_XVALID_FLAGS; + flags = LUSTRE_ENCRYPT_FL; + if (attr->ia_size & ~PAGE_MASK) { + pgoff_t offset = + attr->ia_size & (PAGE_SIZE - 1); + + rc = ll_io_zero_page(inode, + attr->ia_size >> PAGE_SHIFT, + offset, PAGE_SIZE - offset); + if (rc) + GOTO(out, rc); + } + } + rc = cl_setattr_ost(lli->lli_clob, attr, xvalid, flags); } } @@ -1947,6 +2120,11 @@ int ll_setattr(struct dentry *de, struct iattr *attr) { int mode = de->d_inode->i_mode; enum op_xvalid xvalid = 0; + int rc; + + rc = llcrypt_prepare_setattr(de, attr); + if (rc) + return rc; if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 29ec2b7..a78f87c 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -1449,8 +1449,8 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io, { struct inode *inode = vvp_object_inode(page->cp_obj); struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_file_data *fd = file->private_data; - struct ll_readahead_state *ras = &fd->fd_ras; + struct ll_file_data *fd = NULL; + struct ll_readahead_state *ras = NULL; struct cl_2queue *queue = &io->ci_queue; struct cl_sync_io *anchor = NULL; struct vvp_page *vpg; @@ -1460,10 +1460,15 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io, pgoff_t io_end_index; ENTRY; + if (file) { + fd = 
file->private_data; + ras = &fd->fd_ras; + } + vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); uptodate = vpg->vpg_defer_uptodate; - if (ll_readahead_enabled(sbi) && !vpg->vpg_ra_updated) { + if (ll_readahead_enabled(sbi) && !vpg->vpg_ra_updated && ras) { struct vvp_io *vio = vvp_env_io(env); enum ras_update_flags flags = 0; @@ -1490,7 +1495,7 @@ int ll_io_read_page(const struct lu_env *env, struct cl_io *io, io_start_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos); io_end_index = cl_index(io->ci_obj, io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count - 1); - if (ll_readahead_enabled(sbi)) { + if (ll_readahead_enabled(sbi) && ras) { rc2 = ll_readahead(env, io, &queue->c2_qin, ras, uptodate, file); CDEBUG(D_READA, DFID " %d pages read ahead at %lu\n", diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c index 55e21fa..85d630f 100644 --- a/lustre/llite/vvp_io.c +++ b/lustre/llite/vvp_io.c @@ -653,7 +653,14 @@ static int vvp_io_setattr_lock(const struct lu_env *env, __u32 enqflags = 0; if (cl_io_is_trunc(io)) { - if (io->u.ci_setattr.sa_attr.lvb_size == 0) + struct inode *inode = vvp_object_inode(io->ci_obj); + + /* set enqueue flags to CEF_MUST in case of encrypted file, + * to prevent lockless truncate + */ + if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) + enqflags = CEF_MUST; + else if (io->u.ci_setattr.sa_attr.lvb_size == 0) enqflags = CEF_DISCARD_DATA; } else if (cl_io_is_fallocate(io)) { lock_start = io->u.ci_setattr.sa_falloc_offset; diff --git a/lustre/ofd/ofd_objects.c b/lustre/ofd/ofd_objects.c index bb92058..4982a49 100644 --- a/lustre/ofd/ofd_objects.c +++ b/lustre/ofd/ofd_objects.c @@ -919,6 +919,17 @@ int ofd_object_punch(const struct lu_env *env, struct ofd_object *fo, oa->o_valid &= ~OBD_MD_LAYOUT_VERSION; } + if (oa->o_valid & OBD_MD_FLFLAGS && oa->o_flags & LUSTRE_ENCRYPT_FL) { + /* punch must be aware we are dealing with an encrypted file */ + struct lu_attr la = { + .la_valid = LA_FLAGS, + .la_flags = LUSTRE_ENCRYPT_FL, + }; + + 
rc = dt_attr_set(env, dob, &la, th); + if (rc) + GOTO(unlock, rc); + } rc = dt_punch(env, dob, start, OBD_OBJECT_EOF, th); if (rc) GOTO(unlock, rc); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index e87b4f0..a9d95bc 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2065,8 +2065,13 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) break; p++; } - if (p - q == PAGE_SIZE / sizeof(*p)) + if (p - q == PAGE_SIZE / sizeof(*p)) { + /* if page is empty forward info to upper layers + * (ll_io_zero_page) by clearing PagePrivate2 + */ + ClearPagePrivate2(pg->pg); continue; + } rc = llcrypt_decrypt_pagecache_blocks(pg->pg, PAGE_SIZE, 0); diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 961e10d..7880dec 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -2010,6 +2010,14 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, grow = true; i_size_write(inode, start); spin_unlock(&inode->i_lock); + /* if object holds encrypted content, we need to make sure we truncate + * on an encryption unit boundary, or subsequent reads will get + * corrupted content + */ + if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL && + start & ~LUSTRE_ENCRYPTION_MASK) + start = (start & LUSTRE_ENCRYPTION_MASK) + + LUSTRE_ENCRYPTION_UNIT_SIZE; ll_truncate_pagecache(inode, start); /* optimize grow case */ @@ -2242,15 +2250,29 @@ void osd_execute_truncate(struct osd_object *obj) return; } + size = i_size_read(inode); inode_lock(inode); + /* if object holds encrypted content, we need to make sure we truncate + * on an encryption unit boundary, or block content will get corrupted + */ + if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL && + size & ~LUSTRE_ENCRYPTION_MASK) + inode->i_size = (size & LUSTRE_ENCRYPTION_MASK) + + LUSTRE_ENCRYPTION_UNIT_SIZE; ldiskfs_truncate(inode); inode_unlock(inode); + if (inode->i_size != size) { + spin_lock(&inode->i_lock); + i_size_write(inode, size); + 
LDISKFS_I(inode)->i_disksize = size; + spin_unlock(&inode->i_lock); + osd_dirty_inode(inode, I_DIRTY_DATASYNC); + } /* * For a partial-page truncate, flush the page to disk immediately to * avoid data corruption during direct disk write. b=17397 */ - size = i_size_read(inode); if ((size & ~PAGE_MASK) == 0) return; if (osd_use_page_cache(d)) { diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index f2a72b6..b270228 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -1024,9 +1024,11 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, * dmu_tx_hold_sa() and if off < size, dmu_tx_hold_free() * called and then assigned to a transaction group. */ -static int __osd_object_punch(objset_t *os, dnode_t *dn, dmu_tx_t *tx, - uint64_t size, uint64_t off, uint64_t len) +static int __osd_object_punch(struct osd_object *obj, objset_t *os, + dmu_tx_t *tx, uint64_t off, uint64_t len) { + dnode_t *dn = obj->oo_dn; + uint64_t size = obj->oo_attr.la_size; int rc = 0; /* Assert that the transaction has been assigned to a @@ -1038,6 +1040,19 @@ static int __osd_object_punch(objset_t *os, dnode_t *dn, dmu_tx_t *tx, if (len == DMU_OBJECT_END && size == off) return 0; + /* if object holds encrypted content, we need to make sure we truncate + * on an encryption unit boundary, or subsequent reads will get + * corrupted content + */ + if (len != DMU_OBJECT_END) + len -= LUSTRE_ENCRYPTION_UNIT_SIZE - + (off & ~LUSTRE_ENCRYPTION_MASK); + if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL && + off & ~LUSTRE_ENCRYPTION_MASK) + off = (off & LUSTRE_ENCRYPTION_MASK) + + LUSTRE_ENCRYPTION_UNIT_SIZE; + + /* XXX: dnode_free_range() can be used to save on dnode lookup */ if (off < size) dmu_free_range(os, dn->dn_object, off, len, tx); @@ -1069,8 +1084,8 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, len = end - start; write_unlock(&obj->oo_attr_lock); - rc = __osd_object_punch(osd->od_os, obj->oo_dn, oh->ot_tx, - 
obj->oo_attr.la_size, start, len); + rc = __osd_object_punch(obj, osd->od_os, oh->ot_tx, start, len); + /* set new size */ if (len == DMU_OBJECT_END) { write_lock(&obj->oo_attr_lock); @@ -1100,6 +1115,14 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt, len = end - start; /* declare we'll free some blocks ... */ + /* if object holds encrypted content, we need to make sure we truncate + * on an encryption unit boundary, or subsequent reads will get + * corrupted content + */ + if (obj->oo_lma_flags & LUSTRE_ENCRYPT_FL && + start & ~LUSTRE_ENCRYPTION_MASK) + start = (start & LUSTRE_ENCRYPTION_MASK) + + LUSTRE_ENCRYPTION_UNIT_SIZE; if (start < obj->oo_attr.la_size) { read_unlock(&obj->oo_attr_lock); dmu_tx_mark_netfree(oh->ot_tx); -- 1.8.3.1