X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fquota%2Fqsd_lib.c;h=6b827cbf3a03cf8a6914bd125659204de58b42ee;hp=16a5cc427dc744827b170f47e3fb4d636458ea72;hb=d527e812461baf9db2f6ed960a3b6cc12d4ab37c;hpb=318fd8d197ff607a032dac6ed9cb15922e44a53f diff --git a/lustre/quota/qsd_lib.c b/lustre/quota/qsd_lib.c index 16a5cc4..6b827cb 100644 --- a/lustre/quota/qsd_lib.c +++ b/lustre/quota/qsd_lib.c @@ -21,53 +21,313 @@ * GPL HEADER END */ /* - * Copyright (c) 2012 Whamcloud, Inc. + * Copyright (c) 2012, 2017, Intel Corporation. * Use is subject to license terms. * - * Author: Johann Lombardi - * Author: Niu Yawei + * Author: Johann Lombardi + * Author: Niu Yawei */ /* * Quota Slave Driver (QSD) management. + * + * The quota slave feature is implemented under the form of a library called + * QSD. Each OSD device should create a QSD instance via qsd_init() which will + * be used to manage quota enforcement for this device. This implies: + * - completing the reintegration procedure with the quota master (aka QMT, see + * qmt_dev.c) to retrieve the latest quota settings and space distribution. + * - managing quota locks in order to be notified of configuration changes. + * - acquiring space from the QMT when quota space for a given user/group is + * close to exhaustion. + * - allocating quota space to service threads for local request processing. + * + * Once the QSD instance created, the OSD device should invoke qsd_start() + * when recovery is completed. This notifies the QSD that we are about to + * process new requests on which quota should be strictly enforced. + * Then, qsd_op_begin/end can be used to reserve/release/pre-acquire quota space + * for/after each operation until shutdown where the QSD instance should be + * freed via qsd_fini(). */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - #define DEBUG_SUBSYSTEM S_LQUOTA +#include #include "qsd_internal.h" +struct kmem_cache *upd_kmem; + +struct lu_kmem_descr qsd_caches[] = { + { + .ckd_cache = &upd_kmem, + .ckd_name = "upd_kmem", + .ckd_size = sizeof(struct qsd_upd_rec) + }, + { + .ckd_cache = NULL + } +}; + /* define qsd thread key */ LU_KEY_INIT_FINI(qsd, struct qsd_thread_info); -LU_CONTEXT_KEY_DEFINE(qsd, LCT_MD_THREAD | LCT_DT_THREAD | LCT_LOCAL); +LU_CONTEXT_KEY_DEFINE(qsd, LCT_MD_THREAD | LCT_MG_THREAD | LCT_DT_THREAD | LCT_LOCAL); LU_KEY_INIT_GENERIC(qsd); /* some procfs helpers */ -static int lprocfs_qsd_rd_state(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int qsd_state_seq_show(struct seq_file *m, void *data) +{ + struct qsd_instance *qsd = m->private; + char enabled[5]; + + LASSERT(qsd != NULL); + + memset(enabled, 0, sizeof(enabled)); + if (qsd_type_enabled(qsd, USRQUOTA)) + strcat(enabled, "u"); + if (qsd_type_enabled(qsd, GRPQUOTA)) + strcat(enabled, "g"); + if (qsd_type_enabled(qsd, PRJQUOTA)) + strncat(enabled, "p", 1); + if (strlen(enabled) == 0) + strcat(enabled, "none"); + + /* TODO: further pool ID should be removed or + * replaced with pool Name */ + seq_printf(m, "target name: %s\n" + "pool ID: %d\n" + "type: %s\n" + "quota enabled: %s\n" + "conn to master: %s\n", + qsd->qsd_svname, 0, + qsd->qsd_is_md ? "md" : "dt", enabled, + qsd->qsd_exp_valid ? 
"setup" : "not setup yet"); + + if (qsd->qsd_prepared) { + memset(enabled, 0, sizeof(enabled)); + if (qsd->qsd_type_array[USRQUOTA]->qqi_acct_obj != NULL) + strcat(enabled, "u"); + if (qsd->qsd_type_array[GRPQUOTA]->qqi_acct_obj != NULL) + strcat(enabled, "g"); + if (qsd->qsd_type_array[PRJQUOTA]->qqi_acct_obj != NULL) + strncat(enabled, "p", 1); + if (strlen(enabled) == 0) + strcat(enabled, "none"); + seq_printf(m, "space acct: %s\n" + "user uptodate: glb[%d],slv[%d],reint[%d]\n" + "group uptodate: glb[%d],slv[%d],reint[%d]\n" + "project uptodate: glb[%d],slv[%d],reint[%d]\n", + enabled, + qsd->qsd_type_array[USRQUOTA]->qqi_glb_uptodate, + qsd->qsd_type_array[USRQUOTA]->qqi_slv_uptodate, + qsd->qsd_type_array[USRQUOTA]->qqi_reint, + qsd->qsd_type_array[GRPQUOTA]->qqi_glb_uptodate, + qsd->qsd_type_array[GRPQUOTA]->qqi_slv_uptodate, + qsd->qsd_type_array[GRPQUOTA]->qqi_reint, + qsd->qsd_type_array[PRJQUOTA]->qqi_glb_uptodate, + qsd->qsd_type_array[PRJQUOTA]->qqi_slv_uptodate, + qsd->qsd_type_array[PRJQUOTA]->qqi_reint); + } + return 0; +} +LPROC_SEQ_FOPS_RO(qsd_state); + +static int qsd_enabled_seq_show(struct seq_file *m, void *data) { - struct qsd_instance *qsd = (struct qsd_instance *)data; + struct qsd_instance *qsd = m->private; + char enabled[5]; LASSERT(qsd != NULL); - return snprintf(page, count, - "target name: %s\n" - "pool ID: %d\n" - "type: %s\n" - "quota enabled: none\n", - qsd->qsd_svname, qsd->qsd_pool_id, - qsd->qsd_is_md ? "md" : "dt"); + memset(enabled, 0, sizeof(enabled)); + if (qsd_type_enabled(qsd, USRQUOTA)) + strncat(enabled, "u", sizeof(enabled) - strlen(enabled)); + if (qsd_type_enabled(qsd, GRPQUOTA)) + strncat(enabled, "g", sizeof(enabled) - strlen(enabled)); + if (qsd_type_enabled(qsd, PRJQUOTA)) + strncat(enabled, "p", sizeof(enabled) - strlen(enabled)); + if (strlen(enabled) == 0) + strncat(enabled, "none", sizeof(enabled) - strlen(enabled)); + + seq_printf(m, "%s\n", enabled); + return 0; } +static ssize_t qsd_enabled_seq_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct qsd_instance *qsd = m->private; + char fsname[LUSTRE_MAXFSNAME + 1]; + int enabled = 0; + char valstr[5]; + int pool, rc; + + if (count > 4) + return -E2BIG; + + if (copy_from_user(valstr, buffer, count)) + GOTO(out, count = -EFAULT); + + valstr[sizeof(valstr) - 1] = 0; + if (strchr(valstr, 'u')) + enabled |= BIT(USRQUOTA); + if (strchr(valstr, 'g')) + enabled |= BIT(GRPQUOTA); + if (strchr(valstr, 'p')) + enabled |= BIT(PRJQUOTA); + + if (enabled == 0 && strcmp(valstr, "none")) + GOTO(out, count = -EINVAL); + + if (qsd->qsd_is_md) + pool = LQUOTA_RES_MD; + else + pool = LQUOTA_RES_DT; + + if (server_name2fsname(qsd->qsd_svname, fsname, NULL)) + GOTO(out, count = -EINVAL); + + rc = qsd_config(valstr, fsname, pool); + if (rc) + count = rc; +out: + return count; +} +LPROC_SEQ_FOPS(qsd_enabled); + +/* force reintegration procedure to be executed. 
+ * Used for test/debugging purpose */ +static ssize_t +lprocfs_force_reint_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct qsd_instance *qsd = m->private; + int rc = 0, qtype; + + LASSERT(qsd != NULL); + + write_lock(&qsd->qsd_lock); + if (qsd->qsd_stopping) { + /* don't mess up with shutdown procedure, it is already + * complicated enough */ + rc = -ESHUTDOWN; + } else if (!qsd->qsd_prepared) { + rc = -EAGAIN; + } else { + /* mark all indexes as stale */ + for (qtype = USRQUOTA; qtype < LL_MAXQUOTAS; qtype++) { + qsd->qsd_type_array[qtype]->qqi_glb_uptodate = false; + qsd->qsd_type_array[qtype]->qqi_slv_uptodate = false; + } + } + write_unlock(&qsd->qsd_lock); + + if (rc) + return rc; + + /* kick off reintegration */ + for (qtype = USRQUOTA; qtype < LL_MAXQUOTAS; qtype++) { + rc = qsd_start_reint_thread(qsd->qsd_type_array[qtype]); + if (rc) + break; + } + return rc == 0 ? count : rc; +} +LPROC_SEQ_FOPS_WR_ONLY(qsd, force_reint); + +static int qsd_timeout_seq_show(struct seq_file *m, void *data) +{ + struct qsd_instance *qsd = m->private; + LASSERT(qsd != NULL); + + seq_printf(m, "%d\n", qsd_wait_timeout(qsd)); + return 0; +} + +static ssize_t +qsd_timeout_seq_write(struct file *file, const char __user *buffer, + size_t count, loff_t *off) +{ + struct seq_file *m = file->private_data; + struct qsd_instance *qsd = m->private; + time64_t timeout; + int rc; + + LASSERT(qsd != NULL); + rc = kstrtoll_from_user(buffer, count, 0, &timeout); + if (rc) + return rc; + + if (timeout < 0) + return -EINVAL; + + qsd->qsd_timeout = timeout; + return count; +} +LPROC_SEQ_FOPS(qsd_timeout); + static struct lprocfs_vars lprocfs_quota_qsd_vars[] = { - { "info", lprocfs_qsd_rd_state, 0, 0}, + { .name = "info", + .fops = &qsd_state_fops }, + { .name = "enabled", + .fops = &qsd_enabled_fops }, + { .name = "force_reint", + .fops = &qsd_force_reint_fops }, + { .name = "timeout", + .fops = &qsd_timeout_fops }, { NULL } }; /* + * Callback function invoked by the OSP layer when the connection to the master + * has been set up. + * + * \param data - is a pointer to the qsd_instance + * + * \retval - 0 on success, appropriate error on failure + */ +static int qsd_conn_callback(void *data) +{ + struct qsd_instance *qsd = (struct qsd_instance *)data; + int type; + ENTRY; + + /* qsd_exp should now be valid */ + LASSERT(qsd->qsd_exp); + + qsd->qsd_ns = class_exp2obd(qsd->qsd_exp)->obd_namespace; + + write_lock(&qsd->qsd_lock); + /* notify that qsd_exp is now valid */ + qsd->qsd_exp_valid = true; + write_unlock(&qsd->qsd_lock); + + /* Now that the connection to master is setup, we can initiate the + * reintegration procedure for quota types which are enabled. + * It is worth noting that, if the qsd_instance hasn't been started + * already, then we can only complete the first two steps of the + * reintegration procedure (i.e. global lock enqueue and slave + * index transfer) since the space usage reconciliation (i.e. + * step 3) will have to wait for qsd_start() to be called */ + for (type = USRQUOTA; type < LL_MAXQUOTAS; type++) { + struct qsd_qtype_info *qqi = qsd->qsd_type_array[type]; + struct task_struct *t; + + /* qqi_reint_task can be set to NULL at any time, + * so we need to be careful. + */ + rcu_read_lock(); + t = rcu_dereference(qqi->qqi_reint_task); + if (t) + wake_up_process(t); + rcu_read_unlock(); + } + + RETURN(0); +} + +/* * Release qsd_qtype_info structure which contains data associated with a * given quota type. 
This releases the accounting objects. * It's called on OSD cleanup when the qsd instance is released. @@ -81,6 +341,7 @@ static void qsd_qtype_fini(const struct lu_env *env, struct qsd_instance *qsd, int qtype) { struct qsd_qtype_info *qqi; + int repeat = 0; ENTRY; if (qsd->qsd_type_array[qtype] == NULL) @@ -88,26 +349,58 @@ static void qsd_qtype_fini(const struct lu_env *env, struct qsd_instance *qsd, qqi = qsd->qsd_type_array[qtype]; qsd->qsd_type_array[qtype] = NULL; + /* all deferred work lists should be empty */ + LASSERT(list_empty(&qqi->qqi_deferred_glb)); + LASSERT(list_empty(&qqi->qqi_deferred_slv)); + + /* shutdown lquota site */ + if (qqi->qqi_site != NULL && !IS_ERR(qqi->qqi_site)) { + lquota_site_free(env, qqi->qqi_site); + qqi->qqi_site = NULL; + } + + /* The qqi may still be holding by global locks which are being + * canceled asynchronously (LU-4365), see the following steps: + * + * - On server umount, we try to clear all quota locks first by + * disconnecting LWP (which will invalidate import and cleanup + * all locks on it), however, if quota reint process is holding + * the global lock for reintegration at that time, global lock + * will fail to be cleared on LWP disconnection. + * + * - Umount process goes on and stops reint process, the global + * lock will be dropped on reint process exit, however, the lock + * cancel in done in asynchronous way, so the + * qsd_glb_blocking_ast() might haven't been called yet when we + * get here. + */ + while (atomic_read(&qqi->qqi_ref) > 1) { + CDEBUG(D_QUOTA, "qqi reference count %u, repeat: %d\n", + atomic_read(&qqi->qqi_ref), repeat); + repeat++; + schedule_timeout_interruptible(cfs_time_seconds(1)); + } + /* by now, all qqi users should have gone away */ - LASSERT(cfs_atomic_read(&qqi->qqi_ref) == 1); + LASSERT(atomic_read(&qqi->qqi_ref) == 1); lu_ref_fini(&qqi->qqi_reference); /* release accounting object */ if (qqi->qqi_acct_obj != NULL && !IS_ERR(qqi->qqi_acct_obj)) { - lu_object_put(env, &qqi->qqi_acct_obj->do_lu); + dt_object_put(env, qqi->qqi_acct_obj); qqi->qqi_acct_obj = NULL; } /* release slv index */ if (qqi->qqi_slv_obj != NULL && !IS_ERR(qqi->qqi_slv_obj)) { - lu_object_put(env, &qqi->qqi_slv_obj->do_lu); + dt_object_put(env, qqi->qqi_slv_obj); qqi->qqi_slv_obj = NULL; qqi->qqi_slv_ver = 0; } /* release global index */ if (qqi->qqi_glb_obj != NULL && !IS_ERR(qqi->qqi_glb_obj)) { - lu_object_put(env, &qqi->qqi_glb_obj->do_lu); + dt_object_put(env, qqi->qqi_glb_obj); qqi->qqi_glb_obj = NULL; qqi->qqi_glb_ver = 0; } @@ -116,10 +409,45 @@ static void qsd_qtype_fini(const struct lu_env *env, struct qsd_instance *qsd, EXIT; } +static const char *qtype2acct_name(int qtype) +{ + static char unknown[24]; + + switch (qtype) { + case USRQUOTA: + return "acct_user"; + case GRPQUOTA: + return "acct_group"; + case PRJQUOTA: + return "acct_project"; + } + + snprintf(unknown, sizeof(unknown), "acct_unknown_%u", qtype); + return unknown; +} + +static const char *qtype2glb_name(int qtype) +{ + static char unknown[24]; + + switch (qtype) { + case USRQUOTA: + return "limit_user"; + case GRPQUOTA: + return "limit_group"; + case PRJQUOTA: + return "limit_project"; + } + + snprintf(unknown, sizeof(unknown), "acct_unknown_%u", qtype); + return unknown; +} + /* * Allocate and initialize a qsd_qtype_info structure for quota type \qtype. * This opens the accounting object and initializes the proc file. - * It's called on OSD start when the qsd instance is created. 
+ * It's called on OSD start when the qsd_prepare() is invoked on the qsd + * instance. * * \param env - the environment passed by the caller * \param qsd - is the qsd instance which will be in charge of the new @@ -144,28 +472,29 @@ static int qsd_qtype_init(const struct lu_env *env, struct qsd_instance *qsd, if (qqi == NULL) RETURN(-ENOMEM); qsd->qsd_type_array[qtype] = qqi; - cfs_atomic_set(&qqi->qqi_ref, 1); /* referenced from qsd */ + atomic_set(&qqi->qqi_ref, 1); /* referenced from qsd */ /* set backpointer and other parameters */ qqi->qqi_qsd = qsd; qqi->qqi_qtype = qtype; lu_ref_init(&qqi->qqi_reference); - lquota_generate_fid(&qqi->qqi_fid, qsd->qsd_pool_id, QSD_RES_TYPE(qsd), - qtype); qqi->qqi_glb_uptodate = false; qqi->qqi_slv_uptodate = false; qqi->qqi_reint = false; - memset(&qqi->qqi_lockh, 0, sizeof(qqi->qqi_lockh)); - - /* open accounting object */ - LASSERT(qqi->qqi_acct_obj == NULL); - qqi->qqi_acct_obj = acct_obj_lookup(env, qsd->qsd_dev, - qtype == USRQUOTA ? ACCT_USER_OID - : ACCT_GROUP_OID); - /* don't print any error message on failure in order not to confuse - * non-OFD user (e.g. 2.3 MDT stack) */ - if (IS_ERR(qqi->qqi_acct_obj)) + INIT_LIST_HEAD(&qqi->qqi_deferred_glb); + INIT_LIST_HEAD(&qqi->qqi_deferred_slv); + lquota_generate_fid(&qqi->qqi_fid, QSD_RES_TYPE(qsd), qtype); + + /* open accounting object */ + LASSERT(qqi->qqi_acct_obj == NULL); + qqi->qqi_acct_obj = acct_obj_lookup(env, qsd->qsd_dev, qtype); + if (IS_ERR(qqi->qqi_acct_obj)) { + CDEBUG(D_QUOTA, "%s: no %s space accounting support: rc = %ld\n", + qsd->qsd_svname, qtype_name(qtype), + PTR_ERR(qqi->qqi_acct_obj)); qqi->qqi_acct_obj = NULL; + qqi->qqi_acct_failed = true; + } /* open global index copy */ LASSERT(qqi->qqi_glb_obj == NULL); @@ -195,17 +524,32 @@ static int qsd_qtype_init(const struct lu_env *env, struct qsd_instance *qsd, } qqi->qqi_slv_ver = dt_version_get(env, qqi->qqi_slv_obj); - /* register proc entry for accounting object */ - rc = lprocfs_seq_create(qsd->qsd_proc, - qtype == USRQUOTA ? "acct_user" : "acct_group", + /* allocate site */ + qqi->qqi_site = lquota_site_alloc(env, qqi, false, qtype, &qsd_lqe_ops); + if (IS_ERR(qqi->qqi_site)) { + CERROR("%s: can't allocate site "DFID" %ld\n", qsd->qsd_svname, + PFID(&qqi->qqi_fid), PTR_ERR(qqi->qqi_site)); + GOTO(out, rc = PTR_ERR(qqi->qqi_site)); + } + + /* register proc entry for accounting & global index copy objects */ + rc = lprocfs_seq_create(qsd->qsd_proc, qtype2acct_name(qtype), 0444, &lprocfs_quota_seq_fops, qqi->qqi_acct_obj); if (rc) { - CWARN("%s: can't add procfs entry for accounting file %d\n", - qsd->qsd_svname, rc); + CERROR("%s: can't add procfs entry for accounting file %d\n", + qsd->qsd_svname, rc); GOTO(out, rc); } + rc = lprocfs_seq_create(qsd->qsd_proc, qtype2glb_name(qtype), + 0444, &lprocfs_quota_seq_fops, + qqi->qqi_glb_obj); + if (rc) { + CERROR("%s: can't add procfs entry for global index copy %d\n", + qsd->qsd_svname, rc); + GOTO(out, rc); + } EXIT; out: if (rc) @@ -215,7 +559,8 @@ out: /* * Release a qsd_instance. Companion of qsd_init(). This releases all data - * structures associated with the quota slave. + * structures associated with the quota slave (on-disk objects, lquota entry + * tables, ...). * This function should be called when the OSD is shutting down. 
* * \param env - is the environment passed by the caller @@ -226,22 +571,55 @@ void qsd_fini(const struct lu_env *env, struct qsd_instance *qsd) int qtype; ENTRY; + if (unlikely(qsd == NULL)) + RETURN_EXIT; + CDEBUG(D_QUOTA, "%s: initiating QSD shutdown\n", qsd->qsd_svname); + write_lock(&qsd->qsd_lock); qsd->qsd_stopping = true; + write_unlock(&qsd->qsd_lock); /* remove qsd proc entry */ - if (qsd->qsd_proc != NULL && !IS_ERR(qsd->qsd_proc)) { + if (qsd->qsd_proc != NULL) { lprocfs_remove(&qsd->qsd_proc); qsd->qsd_proc = NULL; } + /* stop the writeback thread */ + qsd_stop_upd_thread(qsd); + + /* shutdown the reintegration threads */ + for (qtype = USRQUOTA; qtype < LL_MAXQUOTAS; qtype++) { + if (qsd->qsd_type_array[qtype] == NULL) + continue; + qsd_stop_reint_thread(qsd->qsd_type_array[qtype]); + } + + if (qsd->qsd_ns != NULL) { + qsd->qsd_ns = NULL; + } + + /* release per-filesystem information */ + if (qsd->qsd_fsinfo != NULL) { + mutex_lock(&qsd->qsd_fsinfo->qfs_mutex); + /* remove from the list of fsinfo */ + list_del_init(&qsd->qsd_link); + mutex_unlock(&qsd->qsd_fsinfo->qfs_mutex); + qsd_put_fsinfo(qsd->qsd_fsinfo); + qsd->qsd_fsinfo = NULL; + } + /* free per-quota type data */ - for (qtype = USRQUOTA; qtype < MAXQUOTAS; qtype++) + for (qtype = USRQUOTA; qtype < LL_MAXQUOTAS; qtype++) qsd_qtype_fini(env, qsd, qtype); + /* deregister connection to the quota master */ + qsd->qsd_exp_valid = false; + lustre_deregister_lwp_item(&qsd->qsd_exp); + /* release quota root directory */ - if (qsd->qsd_root != NULL && !IS_ERR(qsd->qsd_root)) { - lu_object_put(env, &qsd->qsd_root->do_lu); + if (qsd->qsd_root != NULL) { + dt_object_put(env, qsd->qsd_root); qsd->qsd_root = NULL; } @@ -252,6 +630,7 @@ void qsd_fini(const struct lu_env *env, struct qsd_instance *qsd) qsd->qsd_dev = NULL; } + CDEBUG(D_QUOTA, "%s: QSD shutdown completed\n", qsd->qsd_svname); OBD_FREE_PTR(qsd); EXIT; } @@ -259,8 +638,7 @@ EXPORT_SYMBOL(qsd_fini); /* * Create a new qsd_instance to be associated with backend osd device - * identified by \dev. For now, this function just create procfs files which - * dumps the accounting information + * identified by \dev. 
* * \param env - the environment passed by the caller * \param svname - is the service name of the OSD device creating this instance @@ -273,60 +651,86 @@ EXPORT_SYMBOL(qsd_fini); */ struct qsd_instance *qsd_init(const struct lu_env *env, char *svname, struct dt_device *dev, - cfs_proc_dir_entry_t *osd_proc) + struct proc_dir_entry *osd_proc, + bool is_md, bool excl) { + struct qsd_thread_info *qti = qsd_info(env); struct qsd_instance *qsd; - int rc, qtype; + int rc, type, idx; ENTRY; + /* only configure qsd for MDT & OST */ + type = server_name2index(svname, &idx, NULL); + if (type != LDD_F_SV_TYPE_MDT && type != LDD_F_SV_TYPE_OST) + RETURN(NULL); + /* allocate qsd instance */ OBD_ALLOC_PTR(qsd); if (qsd == NULL) RETURN(ERR_PTR(-ENOMEM)); - cfs_rwlock_init(&qsd->qsd_lock); + /* generic initializations */ + rwlock_init(&qsd->qsd_lock); + INIT_LIST_HEAD(&qsd->qsd_link); + INIT_LIST_HEAD(&qsd->qsd_upd_list); + spin_lock_init(&qsd->qsd_adjust_lock); + INIT_LIST_HEAD(&qsd->qsd_adjust_list); + qsd->qsd_prepared = false; + qsd->qsd_started = false; + qsd->qsd_is_md = is_md; + qsd->qsd_updating = false; + qsd->qsd_exclusive = excl; + /* copy service name */ - strncpy(qsd->qsd_svname, svname, MAX_OBD_NAME); + if (strlcpy(qsd->qsd_svname, svname, sizeof(qsd->qsd_svname)) + >= sizeof(qsd->qsd_svname)) + GOTO(out, rc = -E2BIG); /* grab reference on osd device */ lu_device_get(&dev->dd_lu_dev); lu_ref_add(&dev->dd_lu_dev.ld_reference, "qsd", qsd); qsd->qsd_dev = dev; - /* we only support pool ID 0 (default data or metadata pool) for the - * time being. A different pool ID could be assigned to this target via - * the configuration log in the future */ - qsd->qsd_pool_id = 0; - - /* Record whether this qsd instance is managing quota enforcement for a - * MDT (i.e. 
inode quota) or OST (block quota) */ - qsd->qsd_is_md = lu_device_is_md(dev->dd_lu_dev.ld_site->ls_top_dev); - - /* look-up on-disk directory for the quota slave */ - qsd->qsd_root = lquota_disk_dir_find_create(env, dev, NULL, QSD_DIR); - if (IS_ERR(qsd->qsd_root)) { - rc = PTR_ERR(qsd->qsd_root); - CERROR("%s: failed to create quota slave root dir (%d)\n", - svname, rc); + /* get fsname from svname */ + rc = server_name2fsname(svname, qti->qti_buf, NULL); + if (rc) { + CERROR("%s: fail to extract filesystem name\n", svname); GOTO(out, rc); } + /* look up quota setting for the filesystem the target belongs to */ + qsd->qsd_fsinfo = qsd_get_fsinfo(qti->qti_buf, 1); + if (qsd->qsd_fsinfo == NULL) { + CERROR("%s: failed to locate filesystem information\n", svname); + GOTO(out, rc = -EINVAL); + } + + /* add in the list of lquota_fsinfo */ + mutex_lock(&qsd->qsd_fsinfo->qfs_mutex); + list_add_tail(&qsd->qsd_link, &qsd->qsd_fsinfo->qfs_qsd_list); + mutex_unlock(&qsd->qsd_fsinfo->qfs_mutex); + /* register procfs directory */ - qsd->qsd_proc = lprocfs_register(QSD_DIR, osd_proc, - lprocfs_quota_qsd_vars, qsd); + if (qsd->qsd_is_md) + qsd->qsd_proc = lprocfs_register(QSD_DIR_MD, osd_proc, + lprocfs_quota_qsd_vars, qsd); + else + qsd->qsd_proc = lprocfs_register(QSD_DIR_DT, osd_proc, + lprocfs_quota_qsd_vars, qsd); + + if (type == LDD_F_SV_TYPE_MDT && qsd->qsd_is_md) + lprocfs_add_symlink(QSD_DIR, osd_proc, "./%s", QSD_DIR_MD); + else if (type == LDD_F_SV_TYPE_OST && !qsd->qsd_is_md) + lprocfs_add_symlink(QSD_DIR, osd_proc, "./%s", QSD_DIR_DT); + if (IS_ERR(qsd->qsd_proc)) { rc = PTR_ERR(qsd->qsd_proc); + qsd->qsd_proc = NULL; CERROR("%s: fail to create quota slave proc entry (%d)\n", svname, rc); GOTO(out, rc); } - - /* initialize per-quota type data */ - for (qtype = USRQUOTA; qtype < MAXQUOTAS; qtype++) { - rc = qsd_qtype_init(env, qsd, qtype); - if (rc) - GOTO(out, rc); - } + EXIT; out: if (rc) { qsd_fini(env, qsd); @@ -337,12 +741,195 @@ out: EXPORT_SYMBOL(qsd_init); /* + * Initialize on-disk structures in order to manage quota enforcement for + * the target associated with the qsd instance \qsd and starts the reintegration + * procedure for each quota type as soon as possible. + * The last step of the reintegration will be completed once qsd_start() is + * called, at which points the space reconciliation with the master will be + * executed. + * This function must be called when the server stack is fully configured, + * typically when ->ldo_prepare is called across the stack. + * + * \param env - the environment passed by the caller + * \param qsd - is qsd_instance to prepare + * + * \retval - 0 on success, appropriate error on failure + */ +int qsd_prepare(const struct lu_env *env, struct qsd_instance *qsd) +{ + struct qsd_thread_info *qti = qsd_info(env); + int qtype, rc = 0; + ENTRY; + + if (unlikely(qsd == NULL)) + RETURN(0); + + read_lock(&qsd->qsd_lock); + if (qsd->qsd_prepared) { + CERROR("%s: qsd instance already prepared\n", qsd->qsd_svname); + rc = -EALREADY; + } + read_unlock(&qsd->qsd_lock); + if (rc) + RETURN(rc); + + /* Record whether this qsd instance is managing quota enforcement for a + * MDT (i.e. 
inode quota) or OST (block quota) */ + if (qsd->qsd_is_md) + qsd->qsd_sync_threshold = LQUOTA_LEAST_QUNIT(LQUOTA_RES_MD); + else + qsd->qsd_sync_threshold = LQUOTA_LEAST_QUNIT(LQUOTA_RES_DT); + + /* look-up on-disk directory for the quota slave */ + qsd->qsd_root = lquota_disk_dir_find_create(env, qsd->qsd_dev, NULL, + QSD_DIR); + if (IS_ERR(qsd->qsd_root)) { + rc = PTR_ERR(qsd->qsd_root); + qsd->qsd_root = NULL; + CERROR("%s: failed to create quota slave root dir (%d)\n", + qsd->qsd_svname, rc); + RETURN(rc); + } + + /* initialize per-quota type data */ + for (qtype = USRQUOTA; qtype < LL_MAXQUOTAS; qtype++) { + rc = qsd_qtype_init(env, qsd, qtype); + if (rc) + RETURN(rc); + } + + /* pools successfully setup, mark the qsd as prepared */ + write_lock(&qsd->qsd_lock); + qsd->qsd_prepared = true; + write_unlock(&qsd->qsd_lock); + + if (qsd->qsd_dev->dd_rdonly) + RETURN(0); + + /* start reintegration thread for each type, if required */ + for (qtype = USRQUOTA; qtype < LL_MAXQUOTAS; qtype++) { + struct qsd_qtype_info *qqi = qsd->qsd_type_array[qtype]; + + if (qsd_type_enabled(qsd, qtype) && + qqi->qqi_acct_failed) { + LCONSOLE_ERROR("%s: can't enable quota enforcement " + "since space accounting isn't functional" + ". Please run tunefs.lustre --quota on " + "an unmounted filesystem if not done " + "already\n", qsd->qsd_svname); + continue; + } + + rc = qsd_start_reint_thread(qqi); + if (rc) { + CERROR("%s: failed to start reint thread for type %s: rc = %d\n", + qsd->qsd_svname, qtype_name(qtype), rc); + RETURN(rc); + } + } + + /* start writeback thread */ + rc = qsd_start_upd_thread(qsd); + if (rc) { + CERROR("%s: failed to start writeback thread (%d)\n", + qsd->qsd_svname, rc); + RETURN(rc); + } + + /* generate osp name */ + rc = tgt_name2lwp_name(qsd->qsd_svname, qti->qti_buf, + MTI_NAME_MAXLEN, 0); + if (rc) { + CERROR("%s: failed to generate ospname (%d)\n", + qsd->qsd_svname, rc); + RETURN(rc); + } + + /* the connection callback will start the reintegration + * procedure if quota is enabled */ + rc = lustre_register_lwp_item(qti->qti_buf, &qsd->qsd_exp, + qsd_conn_callback, (void *)qsd); + if (rc) { + CERROR("%s: fail to get connection to master (%d)\n", + qsd->qsd_svname, rc); + RETURN(rc); + } + + RETURN(0); +} +EXPORT_SYMBOL(qsd_prepare); + +/* + * Start a qsd instance. This will complete the last step of the reintegration + * procedure as soon as possible (provided that the master is reachable). + * This should be called when recovery has been completed and quota should now + * be enforced on every operations. + * + * \param env - the environment passed by the caller + * \param qsd - is the qsd instance associated with the osd device to start + */ +int qsd_start(const struct lu_env *env, struct qsd_instance *qsd) +{ + int type, rc = 0; + ENTRY; + + if (unlikely(qsd == NULL)) + RETURN(0); + + write_lock(&qsd->qsd_lock); + if (!qsd->qsd_prepared) { + CERROR("%s: can't start qsd instance since it wasn't properly " + "initialized\n", qsd->qsd_svname); + rc = -EFAULT; + } else if (qsd->qsd_started) { + CERROR("%s: qsd instance already started\n", qsd->qsd_svname); + rc = -EALREADY; + } else { + /* notify that the qsd_instance is now started */ + qsd->qsd_started = true; + } + write_unlock(&qsd->qsd_lock); + + if (rc) + RETURN(rc); + + /* Trigger the 3rd step of reintegration: If usage > granted, acquire + * up to usage; If usage < granted, release down to usage. 
*/ + for (type = USRQUOTA; type < LL_MAXQUOTAS; type++) { + struct qsd_qtype_info *qqi = qsd->qsd_type_array[type]; + struct task_struct *t; + + /* qqi_reint_task can be set to NULL at any time, + * so we need to be careful. + */ + rcu_read_lock(); + t = rcu_dereference(qqi->qqi_reint_task); + if (t) + wake_up_process(t); + rcu_read_unlock(); + } + + RETURN(rc); +} +EXPORT_SYMBOL(qsd_start); + +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)); + +/* * Global initialization performed at module load time */ int qsd_glb_init(void) { + int rc; + + rc = lu_kmem_init(qsd_caches); + if (rc) + return rc; + qsd_key_init_generic(&qsd_thread_key, NULL); lu_context_key_register(&qsd_thread_key); + lustre_register_quota_process_config(qsd_process_config); + return 0; } @@ -351,5 +938,7 @@ int qsd_glb_init(void) */ void qsd_glb_fini(void) { + lustre_register_quota_process_config(NULL); + lu_kmem_fini(qsd_caches); lu_context_key_degister(&qsd_thread_key); }
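
Usage sketch (illustrative, not part of the patch): the new header comment describes
the QSD lifecycle an OSD device is expected to follow: qsd_init() at setup,
qsd_prepare() once the stack is configured, qsd_start() after recovery,
qsd_op_begin()/qsd_op_end() around each operation, and qsd_fini() at shutdown.
The minimal C sketch below drives that lifecycle from a hypothetical OSD-side
caller. The my_osd_* names and fields and the <lustre_quota.h> include are
assumptions; only the qsd_init()/qsd_prepare()/qsd_start()/qsd_fini() prototypes
visible in this patch are used, and qsd_op_begin()/qsd_op_end() are mentioned only
in a comment because their prototypes do not appear here.

#include <lustre_quota.h>	/* assumed to declare struct qsd_instance and the qsd_* entry points */

struct my_osd_device {				/* hypothetical OSD private data */
	struct dt_device	 od_dt_dev;
	struct proc_dir_entry	*od_proc_entry;
	struct qsd_instance	*od_quota_slave;
};

/* device setup, typically run when ->ldo_prepare is called across the stack */
static int my_osd_quota_setup(const struct lu_env *env,
			      struct my_osd_device *osd,
			      char *svname, bool is_md)
{
	int rc;

	/* create the slave instance; qsd_init() returns NULL for targets
	 * that are neither MDT nor OST, which the calls below tolerate */
	osd->od_quota_slave = qsd_init(env, svname, &osd->od_dt_dev,
				       osd->od_proc_entry, is_md, false);
	if (IS_ERR(osd->od_quota_slave)) {
		rc = PTR_ERR(osd->od_quota_slave);
		osd->od_quota_slave = NULL;
		return rc;
	}

	/* set up on-disk structures and kick off reintegration; the last
	 * step (space reconciliation) waits for qsd_start() */
	rc = qsd_prepare(env, osd->od_quota_slave);
	if (rc) {
		qsd_fini(env, osd->od_quota_slave);
		osd->od_quota_slave = NULL;
	}
	return rc;
}

/* called once recovery has completed: quota is strictly enforced from now
 * on, with qsd_op_begin()/qsd_op_end() wrapping each operation (not shown) */
static int my_osd_quota_start(const struct lu_env *env,
			      struct my_osd_device *osd)
{
	return qsd_start(env, osd->od_quota_slave);
}

/* device shutdown: release the slave instance (NULL-safe per this patch) */
static void my_osd_quota_cleanup(const struct lu_env *env,
				 struct my_osd_device *osd)
{
	qsd_fini(env, osd->od_quota_slave);
	osd->od_quota_slave = NULL;
}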