*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*/
#define DEBUG_SUBSYSTEM S_OSC
#include <libcfs/libcfs.h>
#include <linux/falloc.h>
#include <lprocfs_status.h>
-#include <lustre_debug.h>
#include <lustre_dlm.h>
#include <lustre_fid.h>
#include <lustre_ha.h>
#include <linux/falloc.h>
#include "osc_internal.h"
+#include <lnet/lnet_rdma.h>
atomic_t osc_pool_req_count;
unsigned int osc_reqpool_maxreqcount;
int rc;
ENTRY;
- /*
- * Only mode == 0 (which is standard prealloc) is supported now.
- * Punch is not supported yet.
- */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
- RETURN(-EOPNOTSUPP);
oa->o_falloc_mode = mode;
-
req = ptlrpc_request_alloc(class_exp2cliimp(exp),
&RQF_OST_FALLOCATE);
if (req == NULL)
ptlrpc_request_set_replen(req);
- req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret;
+ req->rq_interpret_reply = osc_setattr_interpret;
BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args));
sa = ptlrpc_req_async_args(sa, req);
sa->sa_oa = oa;
RETURN(0);
}
+EXPORT_SYMBOL(osc_fallocate_base);
static int osc_sync_interpret(const struct lu_env *env,
struct ptlrpc_request *req, void *args, int rc)
size_t pg_count, struct brw_page **pga,
int opc, obd_dif_csum_fn *fn,
int sector_size,
- u32 *check_sum)
+ u32 *check_sum, bool resend)
{
struct ahash_request *req;
/* Used Adler as the default checksum type on top of DIF tags */
buffer = kmap(__page);
guard_start = (__u16 *)buffer;
guard_number = PAGE_SIZE / sizeof(*guard_start);
+ CDEBUG(D_PAGE | (resend ? D_HA : 0),
+ "GRD tags per page=%u, resend=%u, bytes=%u, pages=%zu\n",
+ guard_number, resend, nob, pg_count);
+
while (nob > 0 && pg_count > 0) {
unsigned int count = pga[i]->count > nob ? nob : pga[i]->count;
guard_number - used_number,
&used, sector_size,
fn);
+ if (unlikely(resend))
+ CDEBUG(D_PAGE | D_HA,
+ "pga[%u]: used %u off %llu+%u gen checksum: %*phN\n",
+ i, used, pga[i]->off & ~PAGE_MASK, count,
+ (int)(used * sizeof(*guard_start)),
+ guard_start + used_number);
if (rc)
break;
#else /* !CONFIG_CRC_T10DIF */
#define obd_dif_ip_fn NULL
#define obd_dif_crc_fn NULL
-#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum) \
+#define osc_checksum_bulk_t10pi(name, nob, pgc, pga, opc, fn, ssize, csum, re) \
-EOPNOTSUPP
#endif /* CONFIG_CRC_T10DIF */
enum cksum_types cksum_type,
int nob, size_t pg_count,
struct brw_page **pga, int opc,
- u32 *check_sum)
+ u32 *check_sum, bool resend)
{
obd_dif_csum_fn *fn = NULL;
int sector_size = 0;
if (fn)
rc = osc_checksum_bulk_t10pi(obd_name, nob, pg_count, pga,
- opc, fn, sector_size, check_sum);
+ opc, fn, sector_size, check_sum,
+ resend);
else
rc = osc_checksum_bulk(nob, pg_count, pga, opc, cksum_type,
check_sum);
struct brw_page *pg_prev;
void *short_io_buf;
const char *obd_name = cli->cl_import->imp_obd->obd_name;
- struct inode *inode;
+ struct inode *inode = NULL;
bool directio = false;
+ bool enable_checksum = true;
ENTRY;
- inode = page2inode(pga[0]->pg);
- if (inode == NULL) {
- /* Try to get reference to inode from cl_page if we are
- * dealing with direct IO, as handled pages are not
- * actual page cache pages.
- */
- struct osc_async_page *oap = brw_page2oap(pga[0]);
- struct cl_page *clpage = oap2cl_page(oap);
+ if (pga[0]->pg) {
+ inode = page2inode(pga[0]->pg);
+ if (inode == NULL) {
+ /* Try to get reference to inode from cl_page if we are
+ * dealing with direct IO, as handled pages are not
+ * actual page cache pages.
+ */
+ struct osc_async_page *oap = brw_page2oap(pga[0]);
+ struct cl_page *clpage = oap2cl_page(oap);
- inode = clpage->cp_inode;
- if (inode)
- directio = true;
+ inode = clpage->cp_inode;
+ if (inode)
+ directio = true;
+ }
}
if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
RETURN(-ENOMEM); /* Recoverable */
}
}
+ if (lnet_is_rdma_only_page(pga[0]->pg)) {
+ enable_checksum = false;
+ short_io_size = 0;
+ }
+
/* Check if read/write is small enough to be a short io. */
if (short_io_size > cli->cl_max_short_io_bytes || niocount > 1 ||
!imp_connect_shortio(cli->cl_import))
short_io_size = 0;
+ /* If this is an empty RPC to old server, just ignore it */
+ if (!short_io_size && !pga[0]->pg) {
+ ptlrpc_request_free(req);
+ RETURN(-ENODATA);
+ }
+
req_capsule_set_size(pill, &RMF_SHORT_IO, RCL_CLIENT,
opc == OST_READ ? 0 : short_io_size);
if (opc == OST_READ)
if (osc_should_shrink_grant(cli))
osc_shrink_grant_local(cli, &body->oa);
+ if (!cli->cl_checksum || sptlrpc_flavor_has_bulk(&req->rq_flvr))
+ enable_checksum = false;
+
/* size[REQ_REC_OFF] still sizeof (*body) */
if (opc == OST_WRITE) {
- if (cli->cl_checksum &&
- !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+ if (enable_checksum) {
/* store cl_cksum_type in a local variable since
* it can be changed via lprocfs */
enum cksum_types cksum_type = cli->cl_cksum_type;
rc = osc_checksum_bulk_rw(obd_name, cksum_type,
requested_nob, page_count,
pga, OST_WRITE,
- &body->oa.o_cksum);
+ &body->oa.o_cksum, resend);
if (rc < 0) {
- CDEBUG(D_PAGE, "failed to checksum, rc = %d\n",
+ CDEBUG(D_PAGE, "failed to checksum: rc = %d\n",
rc);
GOTO(out, rc);
}
- CDEBUG(D_PAGE, "checksum at write origin: %x\n",
- body->oa.o_cksum);
+ CDEBUG(D_PAGE | (resend ? D_HA : 0),
+ "checksum at write origin: %x (%x)\n",
+ body->oa.o_cksum, cksum_type);
- /* save this in 'oa', too, for later checking */
- oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
+ /* save this in 'oa', too, for later checking */
+ oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
oa->o_flags |= obd_cksum_type_pack(obd_name,
cksum_type);
} else {
req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER,
sizeof(__u32) * niocount);
} else {
- if (cli->cl_checksum &&
- !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
+ if (enable_checksum) {
if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0)
body->oa.o_flags = 0;
body->oa.o_flags |= obd_cksum_type_pack(obd_name,
* file/fid, not during the resends/retries. */
snprintf(dbgcksum_file_name, sizeof(dbgcksum_file_name),
"%s-checksum_dump-osc-"DFID":[%llu-%llu]-%x-%x",
- (strncmp(libcfs_debug_file_path_arr, "NONE", 4) != 0 ?
- libcfs_debug_file_path_arr :
- LIBCFS_DEBUG_FILE_PATH_DEFAULT),
+ (strncmp(libcfs_debug_file_path, "NONE", 4) != 0 ?
+ libcfs_debug_file_path : LIBCFS_DEBUG_FILE_PATH_DEFAULT),
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : 0ULL,
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0,
oa->o_valid & OBD_MD_FLFID ? oa->o_parent_ver : 0,
pga[0]->off,
pga[page_count-1]->off + pga[page_count-1]->count - 1,
client_cksum, server_cksum);
+ CWARN("dumping checksum data to %s\n", dbgcksum_file_name);
filp = filp_open(dbgcksum_file_name,
O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, 0600);
if (IS_ERR(filp)) {
}
len -= rc;
buf += rc;
- CDEBUG(D_INFO, "%s: wrote %d bytes\n",
- dbgcksum_file_name, rc);
}
kunmap(pga[i]->pg);
}
if (rc)
CERROR("%s: sync returns %d\n", dbgcksum_file_name, rc);
filp_close(filp, NULL);
+
+ libcfs_debug_dumplog();
}
static int
rc = osc_checksum_bulk_t10pi(obd_name, aa->aa_requested_nob,
aa->aa_page_count, aa->aa_ppga,
OST_WRITE, fn, sector_size,
- &new_cksum);
+ &new_cksum, true);
else
rc = osc_checksum_bulk(aa->aa_requested_nob, aa->aa_page_count,
aa->aa_ppga, OST_WRITE, cksum_type,
}
}
- if (rc < aa->aa_requested_nob)
- handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
+ if (rc < aa->aa_requested_nob)
+ handle_short_read(rc, aa->aa_page_count, aa->aa_ppga);
- if (body->oa.o_valid & OBD_MD_FLCKSUM) {
- static int cksum_counter;
- u32 server_cksum = body->oa.o_cksum;
- char *via = "";
- char *router = "";
+ if (body->oa.o_valid & OBD_MD_FLCKSUM) {
+ static int cksum_counter;
+ u32 server_cksum = body->oa.o_cksum;
+ int nob = rc;
+ char *via = "";
+ char *router = "";
enum cksum_types cksum_type;
u32 o_flags = body->oa.o_valid & OBD_MD_FLFLAGS ?
body->oa.o_flags : 0;
cksum_type = obd_cksum_type_unpack(o_flags);
- rc = osc_checksum_bulk_rw(obd_name, cksum_type, rc,
+ rc = osc_checksum_bulk_rw(obd_name, cksum_type, nob,
aa->aa_page_count, aa->aa_ppga,
- OST_READ, &client_cksum);
+ OST_READ, &client_cksum, false);
if (rc < 0)
GOTO(out, rc);
if (server_cksum != client_cksum) {
struct ost_body *clbody;
+ __u32 client_cksum2;
u32 page_count = aa->aa_page_count;
+ osc_checksum_bulk_rw(obd_name, cksum_type, nob,
+ page_count, aa->aa_ppga,
+ OST_READ, &client_cksum2, true);
clbody = req_capsule_client_get(&req->rq_pill,
&RMF_OST_BODY);
if (cli->cl_checksum_dump)
LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from "
"%s%s%s inode "DFID" object "DOSTID
- " extent [%llu-%llu], client %x, "
+ " extent [%llu-%llu], client %x/%x, "
"server %x, cksum_type %x\n",
obd_name,
libcfs_nid2str(peer->nid),
aa->aa_ppga[0]->off,
aa->aa_ppga[page_count-1]->off +
aa->aa_ppga[page_count-1]->count - 1,
- client_cksum, server_cksum,
- cksum_type);
+ client_cksum, client_cksum2,
+ server_cksum, cksum_type);
cksum_counter = 0;
aa->aa_oa->o_cksum = client_cksum;
rc = -EAGAIN;
static void osc_release_ppga(struct brw_page **ppga, size_t count)
{
LASSERT(ppga != NULL);
- OBD_FREE_PTR_ARRAY(ppga, count);
+ OBD_FREE_PTR_ARRAY_LARGE(ppga, count);
}
static int brw_interpret(const struct lu_env *env,
req->rq_import->imp_obd->obd_name,
POSTID(&aa->aa_oa->o_oi), rc);
} else if (rc == -EINPROGRESS ||
- client_should_resend(aa->aa_resends, aa->aa_cli)) {
+ client_should_resend(aa->aa_resends, aa->aa_cli)) {
rc = osc_brw_redo_request(req, aa, rc);
} else {
CERROR("%s: too many resent retries for object: "
list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) {
list_del_init(&ext->oe_link);
osc_extent_finish(env, ext, 1,
- rc && req->rq_no_delay ? -EWOULDBLOCK : rc);
+ rc && req->rq_no_delay ? -EAGAIN : rc);
}
LASSERT(list_empty(&aa->aa_exts));
LASSERT(list_empty(&aa->aa_oaps));
if (mem_tight)
mpflag = memalloc_noreclaim_save();
- OBD_ALLOC_PTR_ARRAY(pga, page_count);
+ OBD_ALLOC_PTR_ARRAY_LARGE(pga, page_count);
if (pga == NULL)
GOTO(out, rc = -ENOMEM);
}
/* first page in the list */
- oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item);
+ oap = list_first_entry(&rpc_list, typeof(*oap), oap_rpc_item);
crattr = &osc_env_info(env)->oti_req_attr;
memset(crattr, 0, sizeof(*crattr));
osc_release_ppga(pga, page_count);
}
/* this should happen rarely and is pretty bad, it makes the
- * pending list not follow the dirty order */
- while (!list_empty(ext_list)) {
- ext = list_entry(ext_list->next, struct osc_extent,
- oe_link);
+ * pending list not follow the dirty order
+ */
+ while ((ext = list_first_entry_or_null(ext_list,
+ struct osc_extent,
+ oe_link)) != NULL) {
list_del_init(&ext->oe_link);
osc_extent_finish(env, ext, 0, rc);
}
RETURN(rc);
}
+/* This is to refresh our lock in face of no RPCs.
+ *
+ * Prepares a one-entry read BRW request for the object 'osc' and queues it
+ * with ptlrpcd_add_req().  The single brw_page descriptor only has .off and
+ * .count (1) set; its page pointer is left zeroed by the designated
+ * initializer, so no data page backs the request.  The obdo carries the
+ * object id taken from osc->oo_oinfo->loi_oi and has OBD_FL_NORPC set.
+ *
+ * Nothing is returned: if osc_brw_prep_request() fails, the error is
+ * dropped and no request is sent.
+ *
+ * NOTE(review): 'oa', 'bpg' and 'pga' are stack locals handed to
+ * osc_brw_prep_request() by address, and the request is only queued here,
+ * not waited for - confirm the request keeps no pointer to them once this
+ * function has returned.
+ *
+ * NOTE(review): 'start' is a pgoff_t but is stored unscaled in .off, which
+ * other code in this file treats as a byte offset (it is masked with
+ * ~PAGE_MASK) - confirm which unit callers are expected to pass.
+ */
+void osc_send_empty_rpc(struct osc_object *osc, pgoff_t start)
+{
+ struct ptlrpc_request *req;
+ struct obdo oa;
+ struct brw_page bpg = { .off = start, .count = 1};
+ struct brw_page *pga = &bpg;
+ int rc;
+
+ memset(&oa, 0, sizeof(oa));
+ oa.o_oi = osc->oo_oinfo->loi_oi;
+ oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLFLAGS;
+ /* For updated servers - don't do a read.
+  * o_flags is assigned rather than OR-ed, so OBD_FL_NORPC is the only
+  * flag set on this obdo.
+  */
+ oa.o_flags = OBD_FL_NORPC;
+
+ rc = osc_brw_prep_request(OBD_BRW_READ, osc_cli(osc), &oa, 1, &pga,
+ &req, 0);
+
+ /* If we succeeded we ship it off, if not there's no point in doing
+ * anything. Also no resends (rq_no_resend is set before queueing).
+ * No interpret callback, no commit callback.
+ */
+ if (!rc) {
+ req->rq_no_resend = 1;
+ ptlrpcd_add_req(req);
+ }
+}
+
static int osc_set_lock_data(struct ldlm_lock *lock, void *data)
{
int set = 0;
struct obd_device *obd = class_exp2obd(exp);
struct obd_statfs *msfs;
struct ptlrpc_request *req;
- struct obd_import *imp = NULL;
+ struct obd_import *imp, *imp0;
int rc;
ENTRY;
-
- /*Since the request might also come from lprocfs, so we need
- *sync this with client_disconnect_export Bug15684*/
- down_read(&obd->u.cli.cl_sem);
- if (obd->u.cli.cl_import)
- imp = class_import_get(obd->u.cli.cl_import);
- up_read(&obd->u.cli.cl_sem);
- if (!imp)
- RETURN(-ENODEV);
+ /* Since the request might also come from lprocfs, we need to
+ * sync this with client_disconnect_export (Bug 15684).
+ */
+ with_imp_locked(obd, imp0, rc)
+ imp = class_import_get(imp0);
+ if (rc)
+ RETURN(rc);
/* We could possibly pass max_age in the request (as an absolute
* timestamp or a "seconds.usec ago") so the target can avoid doing
.o_quotactl = osc_quotactl,
};
-static struct shrinker *osc_cache_shrinker;
LIST_HEAD(osc_shrink_list);
DEFINE_SPINLOCK(osc_shrink_lock);
-#ifndef HAVE_SHRINKER_COUNT
-static int osc_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+#ifdef HAVE_SHRINKER_COUNT
+static struct shrinker osc_cache_shrinker = {
+ .count_objects = osc_cache_shrink_count,
+ .scan_objects = osc_cache_shrink_scan,
+ .seeks = DEFAULT_SEEKS,
+};
+#else
+static int osc_cache_shrink(struct shrinker *shrinker,
+ struct shrink_control *sc)
{
- struct shrink_control scv = {
- .nr_to_scan = shrink_param(sc, nr_to_scan),
- .gfp_mask = shrink_param(sc, gfp_mask)
- };
- (void)osc_cache_shrink_scan(shrinker, &scv);
+ (void)osc_cache_shrink_scan(shrinker, sc);
- return osc_cache_shrink_count(shrinker, &scv);
+ return osc_cache_shrink_count(shrinker, sc);
}
+
+static struct shrinker osc_cache_shrinker = {
+ .shrink = osc_cache_shrink,
+ .seeks = DEFAULT_SEEKS,
+};
#endif
static int __init osc_init(void)
unsigned int reqpool_size;
unsigned int reqsize;
int rc;
- DEF_SHRINKER_VAR(osc_shvar, osc_cache_shrink,
- osc_cache_shrink_count, osc_cache_shrink_scan);
ENTRY;
/* print an address of _any_ initialized kernel symbol from this
if (rc)
RETURN(rc);
- rc = class_register_type(&osc_obd_ops, NULL, true, NULL,
+ rc = class_register_type(&osc_obd_ops, NULL, true,
LUSTRE_OSC_NAME, &osc_device_type);
if (rc)
GOTO(out_kmem, rc);
- osc_cache_shrinker = set_shrinker(DEFAULT_SEEKS, &osc_shvar);
+ rc = register_shrinker(&osc_cache_shrinker);
+ if (rc)
+ GOTO(out_type, rc);
/* This is obviously too much memory, only prevent overflow here */
if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0)
- GOTO(out_type, rc = -EINVAL);
+ GOTO(out_shrinker, rc = -EINVAL);
reqpool_size = osc_reqpool_mem_max << 20;
ptlrpc_add_rqs_to_pool);
if (osc_rq_pool == NULL)
- GOTO(out_type, rc = -ENOMEM);
+ GOTO(out_shrinker, rc = -ENOMEM);
rc = osc_start_grant_work();
if (rc != 0)
out_req_pool:
ptlrpc_free_rq_pool(osc_rq_pool);
+out_shrinker:
+ unregister_shrinker(&osc_cache_shrinker);
out_type:
class_unregister_type(LUSTRE_OSC_NAME);
out_kmem:
static void __exit osc_exit(void)
{
osc_stop_grant_work();
- remove_shrinker(osc_cache_shrinker);
+ unregister_shrinker(&osc_cache_shrinker);
class_unregister_type(LUSTRE_OSC_NAME);
lu_kmem_fini(osc_caches);
ptlrpc_free_rq_pool(osc_rq_pool);