From: Tatsushi Takamura Date: Mon, 26 Aug 2019 00:12:37 +0000 (+0900) Subject: LU-12691 ldlm: obd_max_recoverable_clients is not atomic X-Git-Tag: 2.12.90~38 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=01261e7b563adc97899d962f0ba2d1b430894bf7 LU-12691 ldlm: obd_max_recoverable_clients is not atomic Originally, obd_max_recoverable_clients was never incremented concurrently. Since LU-3540, however, it can be incremented by multiple processes at the same time. Its type should therefore be atomic_t, and it should be accessed only through atomic operations. Signed-off-by: Tatsushi Takamura Change-Id: I9a67bbbfacab2e05858243f649e3a4e0d4b5d7f7 Reviewed-on: https://review.whamcloud.com/35914 Reviewed-by: Andreas Dilger Reviewed-by: Patrick Farrell Reviewed-by: Mike Pershin Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd.h b/lustre/include/obd.h index c43cafb..fcd78b1 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -664,7 +664,7 @@ struct obd_device { struct list_head obd_exports_timed; time64_t obd_eviction_timer; /* for ping evictor */ - int obd_max_recoverable_clients; + atomic_t obd_max_recoverable_clients; atomic_t obd_connected_clients; int obd_stale_clients; /* this lock protects all recovery list_heads, timer and diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 4581df0..50ce61c 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -845,7 +845,7 @@ static int target_handle_reconnect(struct lustre_handle *conn, target->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), - target->obd_max_recoverable_clients, + atomic_read(&target->obd_max_recoverable_clients), timeout / 60, timeout % 60); } else { struct target_distribute_txn_data *tdtd; @@ -1324,7 +1324,8 @@ no_export: connected = atomic_read(&target->obd_connected_clients); in_progress = atomic_read(&target->obd_lock_replay_clients); - known = target->obd_max_recoverable_clients; 
+ known = + atomic_read(&target->obd_max_recoverable_clients); stale = target->obd_stale_clients; remaining = hrtimer_expires_remaining(timer); left = ktime_divns(remaining, NSEC_PER_SEC); @@ -1480,9 +1481,10 @@ dont_check_exports: * condition. */ if (new_mds_mds_conn) - target->obd_max_recoverable_clients++; + atomic_inc(&target->obd_max_recoverable_clients); + if (atomic_inc_return(&target->obd_connected_clients) == - target->obd_max_recoverable_clients) + atomic_read(&target->obd_max_recoverable_clients)) wake_up(&target->obd_next_transno_waitq); } @@ -1643,7 +1645,7 @@ static void target_finish_recovery(struct lu_target *lut) LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients %d recovered and %d %s evicted.\n", obd->obd_name, (s64)elapsed_time / 60, (s64)elapsed_time % 60, - obd->obd_max_recoverable_clients, + atomic_read(&obd->obd_max_recoverable_clients), atomic_read(&obd->obd_connected_clients), obd->obd_stale_clients, obd->obd_stale_clients == 1 ? "was" : "were"); @@ -1805,9 +1807,11 @@ static void target_start_recovery_timer(struct obd_device *obd) obd->obd_name, obd->obd_recovery_timeout / 60, obd->obd_recovery_timeout % 60, - obd->obd_max_recoverable_clients, - (obd->obd_max_recoverable_clients == 1) ? "" : "s", - (obd->obd_max_recoverable_clients == 1) ? "s" : ""); + atomic_read(&obd->obd_max_recoverable_clients), + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? + "" : "s", + (atomic_read(&obd->obd_max_recoverable_clients) == 1) ? 
+ "s" : ""); } /** @@ -1993,7 +1997,8 @@ static int check_for_next_transno(struct lu_target *lut) CDEBUG(D_HA, "max: %d, connected: %d, completed: %d, queue_len: %d, req_transno: %llu, next_transno: %llu\n", - obd->obd_max_recoverable_clients, connected, completed, + atomic_read(&obd->obd_max_recoverable_clients), + connected, completed, queue_len, req_transno, next_transno); if (obd->obd_abort_recovery) { @@ -2307,13 +2312,15 @@ static int check_for_recovery_ready(struct lu_target *lut) CDEBUG(D_HA, "connected %d stale %d max_recoverable_clients %d abort %d expired %d\n", - clnts, obd->obd_stale_clients, obd->obd_max_recoverable_clients, + clnts, obd->obd_stale_clients, + atomic_read(&obd->obd_max_recoverable_clients), obd->obd_abort_recovery, obd->obd_recovery_expired); if (!obd->obd_abort_recovery && !obd->obd_recovery_expired) { - LASSERT(clnts <= obd->obd_max_recoverable_clients); + LASSERT(clnts <= + atomic_read(&obd->obd_max_recoverable_clients)); if (clnts + obd->obd_stale_clients < - obd->obd_max_recoverable_clients) + atomic_read(&obd->obd_max_recoverable_clients)) return 0; } @@ -2765,7 +2772,7 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) if (lut->lut_bottom->dd_rdonly) return; - if (obd->obd_max_recoverable_clients == 0) { + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { /** Update server last boot epoch */ tgt_boot_epoch_update(lut); return; @@ -2773,7 +2780,8 @@ void target_recovery_init(struct lu_target *lut, svc_handler_t handler) CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " "last_transno %llu\n", obd->obd_name, - obd->obd_max_recoverable_clients, obd->obd_last_committed); + atomic_read(&obd->obd_max_recoverable_clients), + obd->obd_last_committed); LASSERT(obd->obd_stopping == 0); obd->obd_next_recovery_transno = obd->obd_last_committed + 1; obd->obd_recovery_start = 0; diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c index 87c8e60..30e551f 
100644 --- a/lustre/obdclass/lprocfs_status_server.c +++ b/lustre/obdclass/lprocfs_status_server.c @@ -671,7 +671,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) LASSERT(obd != NULL); seq_printf(m, "status: "); - if (obd->obd_max_recoverable_clients == 0) { + if (atomic_read(&obd->obd_max_recoverable_clients) == 0) { seq_printf(m, "INACTIVE\n"); goto out; } @@ -687,9 +687,9 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds() - obd->obd_recovery_start); /* Number of clients that have completed recovery */ seq_printf(m, "completed_clients: %d/%d\n", - obd->obd_max_recoverable_clients - + atomic_read(&obd->obd_max_recoverable_clients) - obd->obd_stale_clients, - obd->obd_max_recoverable_clients); + atomic_read(&obd->obd_max_recoverable_clients)); seq_printf(m, "replayed_requests: %d\n", obd->obd_replayed_requests); seq_printf(m, "last_transno: %lld\n", @@ -745,7 +745,7 @@ int lprocfs_recovery_status_seq_show(struct seq_file *m, void *data) ktime_get_real_seconds())); seq_printf(m, "connected_clients: %d/%d\n", atomic_read(&obd->obd_connected_clients), - obd->obd_max_recoverable_clients); + atomic_read(&obd->obd_max_recoverable_clients)); /* Number of clients that have completed recovery */ seq_printf(m, "req_replay_clients: %d\n", atomic_read(&obd->obd_req_replay_clients)); diff --git a/lustre/target/tgt_lastrcvd.c b/lustre/target/tgt_lastrcvd.c index b36908b..f31a6a3 100644 --- a/lustre/target/tgt_lastrcvd.c +++ b/lustre/target/tgt_lastrcvd.c @@ -844,7 +844,7 @@ void tgt_boot_epoch_update(struct lu_target *tgt) * - there is no client to recover or the recovery was aborted */ if (!strncmp(tgt->lut_obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) && - (tgt->lut_obd->obd_max_recoverable_clients == 0 || + (atomic_read(&tgt->lut_obd->obd_max_recoverable_clients) == 0 || tgt->lut_obd->obd_abort_recovery)) tgt->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS; @@ -1565,7 +1565,7 @@ static int 
tgt_clients_data_init(const struct lu_env *env, exp->exp_connecting = 0; exp->exp_in_recovery = 0; spin_unlock(&exp->exp_lock); - obd->obd_max_recoverable_clients++; + atomic_inc(&obd->obd_max_recoverable_clients); if (tgt->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS &&