X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-zfs%2Fosd_handler.c;h=8a0329851f3238e79800e59d0e1299ac703c11e0;hp=3c2abe52e3d75a9f014c2bda772a792df237da67;hb=6e6357dbf9a14aaea459f460dbe4f93e52c814d4;hpb=fd3feec5956d92561789f9db519b7cdb0e00d5ac

diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c
index 3c2abe5..8a03298 100644
--- a/lustre/osd-zfs/osd_handler.c
+++ b/lustre/osd-zfs/osd_handler.c
@@ -28,7 +28,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2011, 2012 Whamcloud, Inc.
+ * Copyright (c) 2012, Intel Corporation.
  * Use is subject to license terms.
  *
  */
@@ -131,6 +131,7 @@ static void osd_trans_commit_cb(void *cb_data, int error)
 {
 	struct osd_thandle	*oh = cb_data;
 	struct thandle		*th = &oh->ot_super;
+	struct osd_device	*osd = osd_dt_dev(th->th_dev);
 	struct lu_device	*lud = &th->th_dev->dd_lu_dev;
 	struct dt_txn_commit_cb	*dcb, *tmp;
 
@@ -151,6 +152,14 @@ static void osd_trans_commit_cb(void *cb_data, int error)
 	cfs_list_for_each_entry_safe(dcb, tmp, &oh->ot_dcb_list, dcb_linkage)
 		dcb->dcb_func(NULL, th, dcb, error);
 
+	/* Unlike ldiskfs, zfs updates space accounting at commit time.
+	 * As a consequence, op_end is called only now to inform the quota slave
+	 * component that reserved quota space is now accounted in usage and
+	 * should be released. Quota space won't be adjusted at this point since
+	 * we can't provide a suitable environment. The adjustment will be done
+	 * asynchronously by an lquota thread. */
+	qsd_op_end(NULL, osd->od_quota_slave, &oh->ot_quota_trans);
+
 	lu_device_put(lud);
 	th->th_dev = NULL;
 	lu_context_exit(&th->th_ctx);
@@ -200,11 +209,10 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
 		if (!lu_device_is_md(&d->dd_lu_dev) && rc == -ENOSPC)
 			CERROR("%s: failed to start transaction due to ENOSPC. "
 			       "Metadata overhead is underestimated or "
-			       "grant_ratio is too low.\n",
-			       osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name);
+			       "grant_ratio is too low.\n", osd->od_svname);
 		else
 			CERROR("%s: can't assign tx: rc = %d\n",
-			       osd->od_dt_dev.dd_lu_dev.ld_obd->obd_name, rc);
+			       osd->od_svname, rc);
 	} else {
 		/* add commit callback */
 		dmu_tx_callback_register(oh->ot_tx, osd_trans_commit_cb, oh);
@@ -234,10 +242,23 @@ static int osd_trans_stop(const struct lu_env *env, struct thandle *th)
 		LASSERT(oh->ot_tx);
 		dmu_tx_abort(oh->ot_tx);
 		osd_object_sa_dirty_rele(oh);
+		/* there won't be any commit, release reserved quota space now,
+		 * if any */
+		qsd_op_end(env, osd->od_quota_slave, &oh->ot_quota_trans);
 		OBD_FREE_PTR(oh);
 		RETURN(0);
 	}
 
+	/* When doing our own inode accounting, the ZAPs storing per-uid/gid
+	 * usage are updated at operation execution time, so we should call
+	 * qsd_op_end() straight away. Otherwise (for blk accounting maintained
+	 * by ZFS and when #inode is estimated from #blks) accounting is updated
+	 * at commit time and the call to qsd_op_end() must be delayed */
+	if (oh->ot_quota_trans.lqt_id_cnt > 0 &&
+			!oh->ot_quota_trans.lqt_ids[0].lqi_is_blk &&
+			!osd->od_quota_iused_est)
+		qsd_op_end(env, osd->od_quota_slave, &oh->ot_quota_trans);
+
 	rc = dt_txn_hook_stop(env, th);
 	if (rc != 0)
 		CDEBUG(D_OTHER, "%s: transaction hook failed: rc = %d\n",
@@ -278,7 +299,8 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
 	oh->ot_tx = tx;
 	CFS_INIT_LIST_HEAD(&oh->ot_dcb_list);
 	CFS_INIT_LIST_HEAD(&oh->ot_sa_list);
-	cfs_sema_init(&oh->ot_sa_lock, 1);
+	sema_init(&oh->ot_sa_lock, 1);
+	memset(&oh->ot_quota_trans, 0, sizeof(oh->ot_quota_trans));
 	th = &oh->ot_super;
 	th->th_dev = dt;
 	th->th_result = 0;
@@ -340,6 +362,8 @@ static void osd_conf_get(const struct lu_env *env,
 	param->ddp_inodespace = OSD_DNODE_EST_COUNT;
 	/* per-fragment overhead to be used by the client code */
 	param->ddp_grant_frag = udmu_blk_insert_cost();
+
+	param->ddp_mnt = NULL;
 }
 
 /*
@@ -359,6 +383,7 @@ static int osd_commit_async(const struct lu_env *env, struct dt_device *dev)
 	tx_state_t	  *tx = &dmu_objset_pool(osd->od_objset.os)->dp_tx;
 	uint64_t	   txg;
 
+	mutex_enter(&tx->tx_sync_lock);
 	txg = tx->tx_open_txg + 1;
 	if (tx->tx_quiesce_txg_waiting < txg) {
 		tx->tx_quiesce_txg_waiting = txg;
@@ -480,33 +505,31 @@ static int osd_shutdown(const struct lu_env *env, struct osd_device *o)
 	RETURN(0);
 }
 
+static void osd_xattr_changed_cb(void *arg, uint64_t newval)
+{
+	struct osd_device *osd = arg;
+
+	osd->od_xattr_in_sa = (newval == ZFS_XATTR_SA);
+}
+
 static int osd_mount(const struct lu_env *env,
 		     struct osd_device *o, struct lustre_cfg *cfg)
 {
-	char				*dev  = lustre_cfg_string(cfg, 0);
-	struct lustre_mount_info	*lmi;
-	struct lustre_sb_info		*lsi;
-	dmu_buf_t			*rootdb;
-	int				 rc;
+	struct dsl_dataset *ds;
+	char	  *dev  = lustre_cfg_string(cfg, 1);
+	dmu_buf_t *rootdb;
+	int	   rc;
 	ENTRY;
 
 	if (o->od_objset.os != NULL)
 		RETURN(0);
 
-	lmi = server_get_mount(dev);
-	if (lmi == NULL) {
-		CERROR("Unknown mount point: '%s'\n", dev);
-		RETURN(-ENODEV);
-	}
-
-	lsi = s2lsi(lmi->lmi_sb);
-	dev = lsi->lsi_lmd->lmd_dev;
-
 	if (strlen(dev) >= sizeof(o->od_mntdev))
 		RETURN(-E2BIG);
 
 	strcpy(o->od_mntdev, dev);
-	strcpy(o->od_svname, lsi->lsi_ldd->ldd_svname);
+	strncpy(o->od_svname, lustre_cfg_string(cfg, 4),
+		sizeof(o->od_svname) - 1);
 
 	rc = -udmu_objset_open(o->od_mntdev, &o->od_objset);
 	if (rc) {
@@ -514,6 +537,13 @@ static int osd_mount(const struct lu_env *env,
 		RETURN(rc);
 	}
 
+	ds = dmu_objset_ds(o->od_objset.os);
+	LASSERT(ds);
+	rc = dsl_prop_register(ds, "xattr", osd_xattr_changed_cb, o);
+	if (rc)
+		CERROR("%s: can not register xattr callback, ignore: %d\n",
+		       o->od_svname, rc);
+
 	rc = __osd_obj2dbuf(env, o->od_objset.os, o->od_objset.root,
 				&rootdb, root_tag);
 	if (rc) {
@@ -530,6 +560,15 @@ static int osd_mount(const struct lu_env *env,
 	if (rc)
 		GOTO(err, rc);
 
+	rc = lu_site_init(&o->od_site, osd2lu_dev(o));
+	if (rc)
+		GOTO(err, rc);
+	o->od_site.ls_bottom_dev = osd2lu_dev(o);
+
+	rc = lu_site_init_finish(&o->od_site);
+	if (rc)
+		GOTO(err, rc);
+
 	/* Use our own ZAP for inode accounting by default, this can be changed
 	 * via procfs to estimate the inode usage from the block usage */
 	o->od_quota_iused_est = 0;
@@ -540,6 +579,14 @@ static int osd_mount(const struct lu_env *env,
 
 	o->arc_prune_cb = arc_add_prune_callback(arc_prune_func, o);
 
+	/* initialize quota slave instance */
+	o->od_quota_slave = qsd_init(env, o->od_svname, &o->od_dt_dev,
+				     o->od_proc_entry);
+	if (IS_ERR(o->od_quota_slave)) {
+		rc = PTR_ERR(o->od_quota_slave);
+		o->od_quota_slave = NULL;
+		GOTO(err, rc);
+	}
 err:
 	RETURN(rc);
 }
@@ -587,28 +634,36 @@ out:
 	RETURN(rc);
 }
 
+static struct lu_device *osd_device_fini(const struct lu_env *env,
+					 struct lu_device *dev);
+
 static struct lu_device *osd_device_alloc(const struct lu_env *env,
-					  struct lu_device_type *t,
+					  struct lu_device_type *type,
 					  struct lustre_cfg *cfg)
 {
-	struct osd_device	*o;
-	int			 rc;
+	struct osd_device *dev;
+	int		   rc;
 
-	OBD_ALLOC_PTR(o);
-	if (o == NULL)
+	OBD_ALLOC_PTR(dev);
+	if (dev == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	rc = dt_device_init(&o->od_dt_dev, t);
+	rc = dt_device_init(&dev->od_dt_dev, type);
 	if (rc == 0) {
-		rc = osd_device_init0(env, o, cfg);
+		rc = osd_device_init0(env, dev, cfg);
+		if (rc == 0) {
+			rc = osd_mount(env, dev, cfg);
+			if (rc)
+				osd_device_fini(env, osd2lu_dev(dev));
+		}
 		if (rc)
-			dt_device_fini(&o->od_dt_dev);
+			dt_device_fini(&dev->od_dt_dev);
 	}
 
 	if (unlikely(rc != 0))
-		OBD_FREE_PTR(o);
+		OBD_FREE_PTR(dev);
 
-	return rc == 0 ? osd2lu_dev(o) : ERR_PTR(rc);
+	return rc == 0 ? osd2lu_dev(dev) : ERR_PTR(rc);
 }
 
 static struct lu_device *osd_device_free(const struct lu_env *env,
@@ -619,9 +674,13 @@ static struct lu_device *osd_device_free(const struct lu_env *env,
 
 	cleanup_capa_hash(o->od_capa_hash);
 	/* XXX: make osd top device in order to release reference */
-	/*d->ld_site->ls_top_dev = d;
+	d->ld_site->ls_top_dev = d;
 	lu_site_purge(env, d->ld_site, -1);
-	lu_site_fini(&o->od_site);*/
+	if (!cfs_hash_is_empty(d->ld_site->ls_obj_hash)) {
+		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL);
+		lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer);
+	}
+	lu_site_fini(&o->od_site);
 	dt_device_fini(&o->od_dt_dev);
 	OBD_FREE_PTR(o);
 
@@ -631,15 +690,21 @@ static struct lu_device *osd_device_free(const struct lu_env *env,
 static struct lu_device *osd_device_fini(const struct lu_env *env,
 					 struct lu_device *d)
 {
-	struct osd_device	 *o = osd_dev(d);
-	struct lustre_mount_info *lmi;
-	int rc;
+	struct osd_device *o = osd_dev(d);
+	struct dsl_dataset *ds;
+	int		   rc;
 	ENTRY;
 
 
+	osd_shutdown(env, o);
 	osd_oi_fini(env, o);
 
 	if (o->od_objset.os) {
+		ds = dmu_objset_ds(o->od_objset.os);
+		rc = dsl_prop_unregister(ds, "xattr", osd_xattr_changed_cb, o);
+		if (rc)
+			CERROR("%s: dsl_prop_unregister xattr error %d\n",
+				o->od_svname, rc);
 		arc_remove_prune_callback(o->arc_prune_cb);
 		o->arc_prune_cb = NULL;
 		osd_sync(env, lu2dt_dev(d));
@@ -655,10 +720,6 @@ static struct lu_device *osd_device_fini(const struct lu_env *env,
 	if (o->od_objset.os)
 		osd_umount(env, o);
 
-	lmi = server_get_mount_2(o->od_svname);
-	LASSERT(lmi);
-	server_put_mount(lmi->lmi_name, lmi->lmi_mnt);
-
 	RETURN(NULL);
 }
 
@@ -695,10 +756,71 @@ static int osd_process_config(const struct lu_env *env,
 
 static int osd_recovery_complete(const struct lu_env *env, struct lu_device *d)
 {
+	struct osd_device	*osd = osd_dev(d);
+	int			 rc = 0;
+	ENTRY;
+
+	if (osd->od_quota_slave == NULL)
+		RETURN(0);
+
+	/* start qsd instance on recovery completion, this notifies the quota
+	 * slave code that we are about to process new requests now */
+	rc = qsd_start(env, osd->od_quota_slave);
+	RETURN(rc);
+}
+
+/*
+ * we use exports to track all osd users
+ */
+static int osd_obd_connect(const struct lu_env *env, struct obd_export **exp,
+			   struct obd_device *obd, struct obd_uuid *cluuid,
+			   struct obd_connect_data *data, void *localdata)
+{
+	struct osd_device    *osd = osd_dev(obd->obd_lu_dev);
+	struct lustre_handle  conn;
+	int                   rc;
 	ENTRY;
+
+	CDEBUG(D_CONFIG, "connect #%d\n", osd->od_connects);
+
+	rc = class_connect(&conn, obd, cluuid);
+	if (rc)
+		RETURN(rc);
+
+	*exp = class_conn2export(&conn);
+
+	spin_lock(&osd->od_objset.lock);
+	osd->od_connects++;
+	spin_unlock(&osd->od_objset.lock);
+
 	RETURN(0);
 }
 
+/*
+ * once the last export (not counting the self-export) has disappeared,
+ * the osd can be released
+ */
+static int osd_obd_disconnect(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+	struct osd_device *osd = osd_dev(obd->obd_lu_dev);
+	int                rc, release = 0;
+	ENTRY;
+
+	/* Only disconnect the underlying layers on the final disconnect. */
+	spin_lock(&osd->od_objset.lock);
+	osd->od_connects--;
+	if (osd->od_connects == 0)
+		release = 1;
+	spin_unlock(&osd->od_objset.lock);
+
+	rc = class_disconnect(exp); /* bz 9811 */
+
+	if (rc == 0 && release)
+		class_manual_cleanup(obd);
+	RETURN(rc);
+}
+
 static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
 		       struct lu_device *dev)
 {
@@ -706,14 +828,19 @@ static int osd_prepare(const struct lu_env *env, struct lu_device *pdev,
 	int			 rc = 0;
 	ENTRY;
 
-	/* initialize quota slave instance */
-	osd->od_quota_slave = qsd_init(env, osd->od_svname, &osd->od_dt_dev,
-				       osd->od_proc_entry);
-	if (IS_ERR(osd->od_quota_slave)) {
-		rc = PTR_ERR(osd->od_quota_slave);
-		osd->od_quota_slave = NULL;
+	if (dev->ld_site && lu_device_is_md(dev->ld_site->ls_top_dev)) {
+		/* MDT/MDD still use old infrastructure to create
+		 * special files */
+		rc = llo_local_objects_setup(env, lu2md_dev(pdev),
+					     lu2dt_dev(dev));
+		if (rc)
+			RETURN(rc);
 	}
 
+	if (osd->od_quota_slave != NULL)
+		/* set up quota slave objects */
+		rc = qsd_prepare(env, osd->od_quota_slave);
+
 	RETURN(rc);
 }
 
@@ -756,6 +883,8 @@ static struct lu_device_type osd_device_type = {
 
 static struct obd_ops osd_obd_device_ops = {
 	.o_owner       = THIS_MODULE,
+	.o_connect	= osd_obd_connect,
+	.o_disconnect	= osd_obd_disconnect
 };
 
 int __init osd_init(void)