X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fldlm%2Fldlm_lib.c;h=e112c20e5b4c8206f59fcaf1c617e1c3462ce30b;hb=1ba794f6ec9e7ce7ad65fd74f170089fffc31d91;hp=4aa54a7af3438c470f7d62b9a231a504625c4a88;hpb=f073f11b860fcb42707c50b63bed2b2294ceeeba;p=fs%2Flustre-release.git diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 4aa54a7..e112c20 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -15,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -27,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2010, 2015, Intel Corporation. + * Copyright (c) 2010, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -43,6 +39,7 @@ #define DEBUG_SUBSYSTEM S_LDLM +#include #include #include #include @@ -58,71 +55,75 @@ static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, int priority, int create) { - struct ptlrpc_connection *ptlrpc_conn; - struct obd_import_conn *imp_conn = NULL, *item; - int rc = 0; - ENTRY; + struct ptlrpc_connection *ptlrpc_conn; + struct obd_import_conn *imp_conn = NULL, *item; + lnet_nid_t nid4refnet = LNET_NID_ANY; + int rc = 0; + ENTRY; - if (!create && !priority) { - CDEBUG(D_HA, "Nothing to do\n"); - RETURN(-EINVAL); - } + if (!create && !priority) { + CDEBUG(D_HA, "Nothing to do\n"); + RETURN(-EINVAL); + } - ptlrpc_conn = ptlrpc_uuid_to_connection(uuid); - if (!ptlrpc_conn) { - CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); - RETURN (-ENOENT); - } + if (imp->imp_connection && + imp->imp_connection->c_remote_uuid.uuid[0] == 0) + /* nid4refnet is used to restrict network connections */ + nid4refnet = imp->imp_connection->c_self; + ptlrpc_conn = ptlrpc_uuid_to_connection(uuid, nid4refnet); + if (!ptlrpc_conn) { + CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); + RETURN(-ENOENT); + } - if (create) { - OBD_ALLOC(imp_conn, sizeof(*imp_conn)); - if (!imp_conn) { - GOTO(out_put, rc = -ENOMEM); - } - } + if (create) { + OBD_ALLOC(imp_conn, sizeof(*imp_conn)); + if (!imp_conn) + GOTO(out_put, rc = -ENOMEM); + } spin_lock(&imp->imp_lock); list_for_each_entry(item, &imp->imp_conn_list, oic_item) { - if (obd_uuid_equals(uuid, &item->oic_uuid)) { - if (priority) { + if (obd_uuid_equals(uuid, &item->oic_uuid)) { + if (priority) { list_del(&item->oic_item); list_add(&item->oic_item, - &imp->imp_conn_list); - item->oic_last_attempt = 0; - } - CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", - imp, imp->imp_obd->obd_name, uuid->uuid, - (priority ? ", moved to head" : "")); + &imp->imp_conn_list); + item->oic_last_attempt = 0; + } + CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? ", moved to head" : "")); spin_unlock(&imp->imp_lock); - GOTO(out_free, rc = 0); - } - } + GOTO(out_free, rc = 0); + } + } /* No existing import connection found for \a uuid. */ - if (create) { - imp_conn->oic_conn = ptlrpc_conn; - imp_conn->oic_uuid = *uuid; - imp_conn->oic_last_attempt = 0; - if (priority) + if (create) { + imp_conn->oic_conn = ptlrpc_conn; + imp_conn->oic_uuid = *uuid; + imp_conn->oic_last_attempt = 0; + if (priority) list_add(&imp_conn->oic_item, &imp->imp_conn_list); - else + else list_add_tail(&imp_conn->oic_item, - &imp->imp_conn_list); - CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", - imp, imp->imp_obd->obd_name, uuid->uuid, - (priority ? "head" : "tail")); - } else { + &imp->imp_conn_list); + CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", + imp, imp->imp_obd->obd_name, uuid->uuid, + (priority ? "head" : "tail")); + } else { spin_unlock(&imp->imp_lock); GOTO(out_free, rc = -ENOENT); } spin_unlock(&imp->imp_lock); - RETURN(0); + RETURN(0); out_free: - if (imp_conn) - OBD_FREE(imp_conn, sizeof(*imp_conn)); + if (imp_conn) + OBD_FREE(imp_conn, sizeof(*imp_conn)); out_put: - ptlrpc_connection_put(ptlrpc_conn); - RETURN(rc); + ptlrpc_connection_put(ptlrpc_conn); + RETURN(rc); } int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) @@ -167,14 +168,17 @@ int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) ptlrpc_connection_put(imp->imp_connection); imp->imp_connection = NULL; - dlmexp = class_conn2export(&imp->imp_dlm_handle); - if (dlmexp && dlmexp->exp_connection) { - LASSERT(dlmexp->exp_connection == - imp_conn->oic_conn); - ptlrpc_connection_put(dlmexp->exp_connection); - dlmexp->exp_connection = NULL; - } - } + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp && dlmexp->exp_connection) { + LASSERT(dlmexp->exp_connection == + imp_conn->oic_conn); + ptlrpc_connection_put(dlmexp->exp_connection); + dlmexp->exp_connection = NULL; + } + + if (dlmexp != NULL) + class_export_put(dlmexp); + } list_del(&imp_conn->oic_item); ptlrpc_connection_put(imp_conn->oic_conn); @@ -257,6 +261,7 @@ static int osc_on_mdt(char *obdname) * 1 - client UUID * 2 - server UUID * 3 - inactive-on-startup + * 4 - restrictive net */ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) { @@ -267,6 +272,8 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) char *name = obddev->obd_type->typ_name; enum ldlm_ns_type ns_type = LDLM_NS_TYPE_UNKNOWN; char *cli_name = lustre_cfg_buf(lcfg, 0); + struct ptlrpc_connection fake_conn = { .c_self = 0, + .c_remote_uuid.uuid[0] = 0 }; int rc; ENTRY; @@ -344,12 +351,15 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) init_rwsem(&cli->cl_sem); mutex_init(&cli->cl_mgc_mutex); - cli->cl_conn_count = 0; - memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), - min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), - sizeof(server_uuid))); + cli->cl_seq = NULL; + init_rwsem(&cli->cl_seq_rwsem); + cli->cl_conn_count = 0; + memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), + min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), + sizeof(server_uuid))); cli->cl_dirty_pages = 0; + cli->cl_dirty_max_pages = 0; cli->cl_avail_grant = 0; /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ /* cl_dirty_max_pages may be changed at connect time in @@ -382,9 +392,15 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) spin_lock_init(&cli->cl_lru_list_lock); atomic_long_set(&cli->cl_unstable_count, 0); INIT_LIST_HEAD(&cli->cl_shrink_list); + INIT_LIST_HEAD(&cli->cl_grant_chain); + + INIT_LIST_HEAD(&cli->cl_flight_waiters); + cli->cl_rpcs_in_flight = 0; init_waitqueue_head(&cli->cl_destroy_waitq); atomic_set(&cli->cl_destroy_in_flight, 0); + + cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; #ifdef ENABLE_CHECKSUM /* Turn on checksumming by default. */ cli->cl_checksum = 1; @@ -393,7 +409,7 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * Set cl_chksum* to CRC32 for now to avoid returning screwed info * through procfs. */ - cli->cl_cksum_type = cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; + cli->cl_cksum_type = cli->cl_supp_cksum_types; #endif atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); @@ -401,24 +417,26 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) * from OFD after connecting. */ cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; - /* set cl_chunkbits default value to PAGE_CACHE_SHIFT, + cli->cl_max_short_io_bytes = OBD_MAX_SHORT_IO_BYTES; + + /* set cl_chunkbits default value to PAGE_SHIFT, * it will be updated at OSC connection time. */ - cli->cl_chunkbits = PAGE_CACHE_SHIFT; + cli->cl_chunkbits = PAGE_SHIFT; if (!strcmp(name, LUSTRE_MDC_NAME)) { cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) { + } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) { cli->cl_max_rpcs_in_flight = 2; - } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) { + } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) { cli->cl_max_rpcs_in_flight = 3; - } else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) { + } else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) { cli->cl_max_rpcs_in_flight = 4; } else { if (osc_on_mdt(obddev->obd_name)) cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_MAX; else cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - } + } spin_lock_init(&cli->cl_mod_rpcs_lock); spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); @@ -428,6 +446,8 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) init_waitqueue_head(&cli->cl_mod_rpcs_waitq); cli->cl_mod_tag_bitmap = NULL; + INIT_LIST_HEAD(&cli->cl_chg_dev_linkage); + if (connect_op == MDS_CONNECT) { cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1; OBD_ALLOC(cli->cl_mod_tag_bitmap, @@ -454,11 +474,26 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) LUSTRE_CFG_BUFLEN(lcfg, 1)); class_import_put(imp); - rc = client_import_add_conn(imp, &server_uuid, 1); - if (rc) { - CERROR("can't add initial connection\n"); - GOTO(err_import, rc); - } + if (lustre_cfg_buf(lcfg, 4)) { + __u32 refnet = libcfs_str2net(lustre_cfg_string(lcfg, 4)); + + if (refnet == LNET_NIDNET(LNET_NID_ANY)) { + rc = -EINVAL; + CERROR("%s: bad mount option 'network=%s': rc = %d\n", + obddev->obd_name, lustre_cfg_string(lcfg, 4), + rc); + GOTO(err_import, rc); + } + fake_conn.c_self = LNET_MKNID(refnet, 0); + imp->imp_connection = &fake_conn; + } + + rc = client_import_add_conn(imp, &server_uuid, 1); + if (rc) { + CERROR("can't add initial connection\n"); + GOTO(err_import, rc); + } + imp->imp_connection = NULL; cli->cl_import = imp; /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */ @@ -534,7 +569,6 @@ int client_connect_import(const struct lu_env *env, struct obd_connect_data *ocd; struct lustre_handle conn = { 0 }; int rc; - bool is_mdc = false; ENTRY; *exp = NULL; @@ -559,18 +593,12 @@ int client_connect_import(const struct lu_env *env, ocd = &imp->imp_connect_data; if (data) { *ocd = *data; - is_mdc = strncmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MDC_NAME, 3) == 0; - if (is_mdc) - data->ocd_connect_flags |= OBD_CONNECT_MULTIMODRPCS; imp->imp_connect_flags_orig = data->ocd_connect_flags; imp->imp_connect_flags2_orig = data->ocd_connect_flags2; } rc = ptlrpc_connect_import(imp); if (rc != 0) { - if (data && is_mdc) - data->ocd_connect_flags &= ~OBD_CONNECT_MULTIMODRPCS; LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); GOTO(out_ldlm, rc); } @@ -578,13 +606,10 @@ int client_connect_import(const struct lu_env *env, if (data) { LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == - ocd->ocd_connect_flags, "old "LPX64", new "LPX64"\n", + ocd->ocd_connect_flags, "old %#llx, new %#llx\n", data->ocd_connect_flags, ocd->ocd_connect_flags); data->ocd_connect_flags = ocd->ocd_connect_flags; - /* clear the flag as it was not set and is not known - * by upper layers */ - if (is_mdc) - data->ocd_connect_flags &= ~OBD_CONNECT_MULTIMODRPCS; + data->ocd_connect_flags2 = ocd->ocd_connect_flags2; } ptlrpc_pinger_add_import(imp); @@ -613,7 +638,7 @@ int client_disconnect_export(struct obd_export *exp) ENTRY; if (!obd) { - CERROR("invalid export for disconnect: exp %p cookie "LPX64"\n", + CERROR("invalid export for disconnect: exp %p cookie %#llx\n", exp, exp ? exp->exp_handle.h_cookie : -1); RETURN(-EINVAL); } @@ -702,7 +727,10 @@ int server_disconnect_export(struct obd_export *exp) spin_lock(&svcpt->scp_rep_lock); list_del_init(&rs->rs_exp_list); + spin_lock(&rs->rs_lock); + /* clear rs_convert_lock to make sure rs is handled and put */ + rs->rs_convert_lock = 0; ptlrpc_schedule_difficult_reply(rs); spin_unlock(&rs->rs_lock); @@ -724,17 +752,16 @@ static int target_handle_reconnect(struct lustre_handle *conn, { struct obd_device *target; struct lustre_handle *hdl; - cfs_time_t now; - cfs_time_t deadline; - int timeout; + ktime_t remaining; + s64 timeout; int rc = 0; - ENTRY; + ENTRY; hdl = &exp->exp_imp_reverse->imp_remote_handle; if (!exp->exp_connection || !lustre_handle_is_used(hdl)) { conn->cookie = exp->exp_handle.h_cookie; CDEBUG(D_HA, "connect export for UUID '%s' at %p," - " cookie "LPX64"\n", cluuid->uuid, exp, conn->cookie); + " cookie %#llx\n", cluuid->uuid, exp, conn->cookie); RETURN(0); } @@ -743,9 +770,9 @@ static int target_handle_reconnect(struct lustre_handle *conn, /* Might be a re-connect after a partition. */ if (memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { LCONSOLE_WARN("%s: already connected client %s (at %s) " - "with handle "LPX64". Rejecting client " + "with handle %#llx. Rejecting client " "with the same UUID trying to reconnect " - "with handle "LPX64"\n", target->obd_name, + "with handle %#llx\n", target->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), hdl->cookie, conn->cookie); @@ -763,24 +790,43 @@ static int target_handle_reconnect(struct lustre_handle *conn, GOTO(out_already, rc); } - now = cfs_time_current(); - deadline = cfs_timer_deadline(&target->obd_recovery_timer); - if (cfs_time_before(now, deadline)) { - timeout = cfs_duration_sec(cfs_time_sub(deadline, now)); - LCONSOLE_WARN("%s: Client %s (at %s) reconnecting," - " waiting for %d clients in recovery for" - " %d:%.02d\n", target->obd_name, - obd_uuid2str(&exp->exp_client_uuid), - obd_export_nid2str(exp), - target->obd_max_recoverable_clients, - timeout / 60, timeout % 60); + remaining = hrtimer_expires_remaining(&target->obd_recovery_timer); + timeout = ktime_divns(remaining, NSEC_PER_SEC); + if (timeout > 0) { + LCONSOLE_WARN("%s: Client %s (at %s) reconnected, waiting for %d clients in recovery for %lld:%.02lld\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), + target->obd_max_recoverable_clients, + timeout / 60, timeout % 60); } else { - timeout = cfs_duration_sec(cfs_time_sub(now, deadline)); - LCONSOLE_WARN("%s: Recovery already passed deadline" - " %d:%.02d, It is most likely due to DNE" - " recovery is failed or stuck, please wait a" - " few more minutes or abort the recovery.\n", - target->obd_name, timeout / 60, timeout % 60); + struct target_distribute_txn_data *tdtd; + int size = 0; + int count = 0; + char *buf = NULL; + + tdtd = class_exp2tgt(exp)->lut_tdtd; + if (tdtd && tdtd->tdtd_show_update_logs_retrievers) + buf = tdtd->tdtd_show_update_logs_retrievers( + tdtd->tdtd_show_retrievers_cbdata, + &size, &count); + + if (count > 0) + LCONSOLE_WARN("%s: Client %s (at %s) reconnecting, waiting for %d MDTs (%s) in recovery for %lld:%.02lld. Please wait until all MDTs recovered or you may force MDT evicition via 'lctl --device %s abort_recovery.\n", + target->obd_name, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp), count, + buf ? buf : "unknown (not enough RAM)", + (abs(timeout) + target->obd_recovery_timeout) / 60, + (abs(timeout) + target->obd_recovery_timeout) % 60, + target->obd_name); + else + LCONSOLE_WARN("%s: Recovery already passed deadline %lld:%.02lld. If you do not want to wait more, you may force taget eviction via 'lctl --device %s abort_recovery.\n", + target->obd_name, abs(timeout) / 60, + abs(timeout) % 60, target->obd_name); + + if (buf != NULL) + OBD_FREE(buf, size); } out_already: @@ -791,20 +837,6 @@ out_already: RETURN(EALREADY); } -void target_client_add_cb(struct obd_device *obd, __u64 transno, void *cb_data, - int error) -{ - struct obd_export *exp = cb_data; - - CDEBUG(D_RPCTRACE, "%s: committing for initial connect of %s\n", - obd->obd_name, exp->exp_client_uuid.uuid); - - spin_lock(&exp->exp_lock); - exp->exp_need_sync = 0; - spin_unlock(&exp->exp_lock); - class_export_cb_put(exp); -} - static void check_and_start_recovery_timer(struct obd_device *obd, struct ptlrpc_request *req, int new_client); @@ -925,16 +957,14 @@ int target_handle_connect(struct ptlrpc_request *req) * reconnect case */ struct lustre_handle conn; struct lustre_handle *tmp; - struct obd_uuid tgtuuid; struct obd_uuid cluuid; char *str; int rc = 0; char *target_start; int target_len; - bool mds_conn = false, lw_client = false; + bool mds_conn = false, lw_client = false, initial_conn = false; bool mds_mds_conn = false; bool new_mds_mds_conn = false; - bool target_referenced = false; struct obd_connect_data *data, *tmpdata; int size, tmpsize; lnet_nid_t *client_nid = NULL; @@ -948,11 +978,7 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EINVAL); } - obd_str2uuid(&tgtuuid, str); - target = class_uuid2obd(&tgtuuid); - if (!target) - target = class_name2obd(str); - + target = class_dev_by_str(str); if (!target) { deuuidify(str, NULL, &target_start, &target_len); LCONSOLE_ERROR_MSG(0x137, "%s: not available for connect " @@ -964,6 +990,9 @@ int target_handle_connect(struct ptlrpc_request *req) } spin_lock(&target->obd_dev_lock); + + target->obd_conn_inprogress++; + if (target->obd_stopping || !target->obd_set_up) { spin_unlock(&target->obd_dev_lock); @@ -985,13 +1014,6 @@ int target_handle_connect(struct ptlrpc_request *req) GOTO(out, rc = -EAGAIN); } - /* Make sure the target isn't cleaned up while we're here. Yes, - * there's still a race between the above check and our incref here. - * Really, class_uuid2obd should take the ref. */ - class_incref(target, __func__, current); - target_referenced = true; - - target->obd_conn_inprogress++; spin_unlock(&target->obd_dev_lock); str = req_capsule_client_get(&req->rq_pill, &RMF_CLUUID); @@ -1030,32 +1052,17 @@ int target_handle_connect(struct ptlrpc_request *req) */ if (!(data->ocd_connect_flags & OBD_CONNECT_FULL20)) GOTO(out, rc = -EPROTO); -#endif + /* Don't allow liblustre clients to connect. + * - testing was disabled in v2_2_50_0-61-g6a75d65 + * - building was disabled in v2_5_58_0-28-g7277179 + * - client code was deleted in v2_6_50_0-101-gcdfbc72, + * - clients were refused connect for version difference > 0.0.1.32 */ if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { - if (data->ocd_version < LUSTRE_VERSION_CODE - - LUSTRE_VERSION_ALLOWED_OFFSET || - data->ocd_version > LUSTRE_VERSION_CODE + - LUSTRE_VERSION_ALLOWED_OFFSET) { - DEBUG_REQ(D_WARNING, req, "Refusing %s (%d.%d.%d.%d) " - "libclient connection attempt", - data->ocd_version < LUSTRE_VERSION_CODE ? - "old" : "new", - OBD_OCD_VERSION_MAJOR(data->ocd_version), - OBD_OCD_VERSION_MINOR(data->ocd_version), - OBD_OCD_VERSION_PATCH(data->ocd_version), - OBD_OCD_VERSION_FIX(data->ocd_version)); - data = req_capsule_server_sized_get(&req->rq_pill, - &RMF_CONNECT_DATA, - offsetof(typeof(*data), ocd_version) + - sizeof(data->ocd_version)); - if (data) { - data->ocd_connect_flags = OBD_CONNECT_VERSION; - data->ocd_version = LUSTRE_VERSION_CODE; - } - GOTO(out, rc = -EPROTO); - } + DEBUG_REQ(D_WARNING, req, "Refusing libclient connection"); + GOTO(out, rc = -EPROTO); } +#endif /* Note: lw_client is needed in MDS-MDS failover during update log * processing, so we needs to allow lw_client to be connected at @@ -1063,6 +1070,7 @@ int target_handle_connect(struct ptlrpc_request *req) lw_client = (data->ocd_connect_flags & OBD_CONNECT_LIGHTWEIGHT) != 0; if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) { + initial_conn = true; mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS) != 0; mds_mds_conn = (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) != 0; @@ -1072,7 +1080,8 @@ int target_handle_connect(struct ptlrpc_request *req) * * Via check OBD_CONNECT_FID, we can distinguish whether * the OBD_CONNECT_MDS_MDS/OBD_CONNECT_MNE_SWAB is from - * MGC or MDT. */ + * MGC or MDT, since MGC does not use OBD_CONNECT_FID. + */ if (!lw_client && (data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && (data->ocd_connect_flags & OBD_CONNECT_FID) && @@ -1120,8 +1129,8 @@ int target_handle_connect(struct ptlrpc_request *req) class_export_put(export); export = NULL; rc = -EALREADY; - } else if ((mds_conn || lw_client || - data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && + } else if ((mds_conn || (lw_client && initial_conn) || + data->ocd_connect_flags & OBD_CONNECT_MDS_MDS) && export->exp_connection != NULL) { spin_unlock(&export->exp_lock); if (req->rq_peer.nid != export->exp_connection->c_peer.nid) { @@ -1148,6 +1157,7 @@ int target_handle_connect(struct ptlrpc_request *req) * cause namespace inconsistency */ spin_lock(&export->exp_lock); export->exp_connecting = 1; + export->exp_conn_cnt = 0; spin_unlock(&export->exp_lock); conn.cookie = export->exp_handle.h_cookie; rc = EALREADY; @@ -1157,10 +1167,8 @@ int target_handle_connect(struct ptlrpc_request *req) export = NULL; rc = 0; } - } else if (export->exp_connection != NULL && - req->rq_peer.nid != export->exp_connection->c_peer.nid && - (lustre_msg_get_op_flags(req->rq_reqmsg) & - MSG_CONNECT_INITIAL)) { + } else if (export->exp_connection != NULL && initial_conn && + req->rq_peer.nid != export->exp_connection->c_peer.nid) { spin_unlock(&export->exp_lock); /* In MDS failover we have static UUID but NID can change. */ LCONSOLE_WARN("%s: Client %s seen on new nid %s when " @@ -1191,28 +1199,29 @@ no_export: target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), atomic_read(&export->exp_refcount)); - GOTO(out, rc = -EBUSY); - } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1) { - if (!strstr(cluuid.uuid, "mdt")) - LCONSOLE_WARN("%s: Rejecting reconnect from the " - "known client %s (at %s) because it " - "is indicating it is a new client", - target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid)); - GOTO(out, rc = -EALREADY); - } else { - OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); - } + GOTO(out, rc = -EBUSY); + } else if (lustre_msg_get_conn_cnt(req->rq_reqmsg) == 1 && + rc != EALREADY) { + if (!strstr(cluuid.uuid, "mdt")) + LCONSOLE_WARN("%s: Rejecting reconnect from the " + "known client %s (at %s) because it " + "is indicating it is a new client", + target->obd_name, cluuid.uuid, + libcfs_nid2str(req->rq_peer.nid)); + GOTO(out, rc = -EALREADY); + } else { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_RECONNECT, 2 * obd_timeout); + } if (rc < 0) { GOTO(out, rc); } - CDEBUG(D_HA, "%s: connection from %s@%s %st"LPU64" exp %p cur %ld last %ld\n", - target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - target->obd_recovering ? "recovering/" : "", data->ocd_transno, - export, (long)cfs_time_current_sec(), - export ? (long)export->exp_last_request_time : 0); + CDEBUG(D_HA, "%s: connection from %s@%s %st%llu exp %p cur %lld last %lld\n", + target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + target->obd_recovering ? "recovering/" : "", data->ocd_transno, + export, ktime_get_real_seconds(), + export ? export->exp_last_request_time : 0); /* If this is the first time a client connects, reset the recovery * timer. Discard lightweight connections which might be local. */ @@ -1238,27 +1247,34 @@ no_export: /* allow "new" MDT to be connected during recovery, since we * need retrieve recovery update records from it */ if (target->obd_recovering && !lw_client && !mds_mds_conn) { - cfs_time_t t; - int c; /* connected */ - int i; /* in progress */ - int k; /* known */ - int s; /* stale/evicted */ - - c = atomic_read(&target->obd_connected_clients); - i = atomic_read(&target->obd_lock_replay_clients); - k = target->obd_max_recoverable_clients; - s = target->obd_stale_clients; - t = cfs_timer_deadline(&target->obd_recovery_timer); - t = cfs_time_sub(t, cfs_time_current()); - t = cfs_duration_sec(t); - LCONSOLE_WARN("%s: Denying connection for new client %s" - "(at %s), waiting for %d known clients " - "(%d recovered, %d in progress, and %d " - "evicted) to recover in %d:%.02d\n", + struct hrtimer *timer = &target->obd_recovery_timer; + ktime_t remaining; + s64 timeout, left; + int in_progress; + int connected; + int known; + int stale; + char *msg; + + connected = atomic_read(&target->obd_connected_clients); + in_progress = atomic_read(&target->obd_lock_replay_clients); + known = target->obd_max_recoverable_clients; + stale = target->obd_stale_clients; + remaining = hrtimer_expires_remaining(timer); + left = ktime_divns(remaining, NSEC_PER_SEC); + if (ktime_to_ns(remaining) > 0) { + msg = "to recover in"; + timeout = left; + } else { + msg = "already passed deadline"; + timeout = -left; + } + + LCONSOLE_WARN("%s: Denying connection for new client %s (at %s), waiting for %d known clients (%d recovered, %d in progress, and %d evicted) %s %lld:%.02lld\n", target->obd_name, cluuid.uuid, - libcfs_nid2str(req->rq_peer.nid), k, - c - i, i, s, (int)t / 60, - (int)t % 60); + libcfs_nid2str(req->rq_peer.nid), known, + connected - in_progress, in_progress, + stale, msg, timeout / 60, timeout % 60); rc = -EBUSY; } else { dont_check_exports: @@ -1313,37 +1329,26 @@ dont_check_exports: spin_unlock(&export->exp_lock); CDEBUG(D_RPCTRACE, "%s: %s already connected at greater " "or equal conn_cnt: %d >= %d\n", - cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), - export->exp_conn_cnt, - lustre_msg_get_conn_cnt(req->rq_reqmsg)); - - GOTO(out, rc = -EALREADY); - } - LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); - export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), + export->exp_conn_cnt, + lustre_msg_get_conn_cnt(req->rq_reqmsg)); - /* Don't evict liblustre clients for not pinging. */ - if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT) { - export->exp_libclient = 1; - spin_unlock(&export->exp_lock); - - spin_lock(&target->obd_dev_lock); - list_del_init(&export->exp_obd_chain_timed); - spin_unlock(&target->obd_dev_lock); - } else { - spin_unlock(&export->exp_lock); + GOTO(out, rc = -EALREADY); } + LASSERT(lustre_msg_get_conn_cnt(req->rq_reqmsg) > 0); + export->exp_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); + spin_unlock(&export->exp_lock); - if (export->exp_connection != NULL) { + if (export->exp_connection != NULL) { /* Check to see if connection came from another NID. */ - if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && + if ((export->exp_connection->c_peer.nid != req->rq_peer.nid) && !hlist_unhashed(&export->exp_nid_hash)) - cfs_hash_del(export->exp_obd->obd_nid_hash, - &export->exp_connection->c_peer.nid, - &export->exp_nid_hash); + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); - ptlrpc_connection_put(export->exp_connection); - } + ptlrpc_connection_put(export->exp_connection); + } export->exp_connection = ptlrpc_connection_get(req->rq_peer, req->rq_self, @@ -1417,12 +1422,11 @@ out: class_export_put(export); } - if (target_referenced == true && target != NULL) { + if (target != NULL) { spin_lock(&target->obd_dev_lock); target->obd_conn_inprogress--; spin_unlock(&target->obd_dev_lock); - - class_decref(target, __func__, current); + class_decref(target, "find", current); } req->rq_status = rc; RETURN(rc); @@ -1551,18 +1555,20 @@ static void target_finish_recovery(struct lu_target *lut) /* Only log a recovery message when recovery has occurred. */ if (obd->obd_recovery_start) { - time_t elapsed_time = max_t(time_t, 1, cfs_time_current_sec() - - obd->obd_recovery_start); - LCONSOLE_INFO("%s: Recovery over after %d:%.02d, of %d clients " + time64_t now = ktime_get_real_seconds(); + time64_t elapsed_time; + + elapsed_time = max_t(time64_t, now - obd->obd_recovery_start, 1); + LCONSOLE_INFO("%s: Recovery over after %lld:%.02lld, of %d clients " "%d recovered and %d %s evicted.\n", obd->obd_name, - (int)elapsed_time / 60, (int)elapsed_time % 60, + (s64)elapsed_time / 60, (s64)elapsed_time % 60, obd->obd_max_recoverable_clients, atomic_read(&obd->obd_connected_clients), obd->obd_stale_clients, obd->obd_stale_clients == 1 ? "was" : "were"); } - ldlm_reprocess_all_ns(obd->obd_namespace); + ldlm_reprocess_recovery_done(obd->obd_namespace); spin_lock(&obd->obd_recovery_task_lock); if (!list_empty(&obd->obd_req_replay_queue) || !list_empty(&obd->obd_lock_replay_queue) || @@ -1579,15 +1585,16 @@ static void target_finish_recovery(struct lu_target *lut) } spin_unlock(&obd->obd_recovery_task_lock); - obd->obd_recovery_end = cfs_time_current_sec(); + obd->obd_recovery_end = ktime_get_real_seconds(); /* When recovery finished, cleanup orphans on MDS and OST. */ - if (OBT(obd) && OBP(obd, postrecov)) { - int rc = OBP(obd, postrecov)(obd); - if (rc < 0) - LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", - obd->obd_name, rc); - } + if (obd->obd_type && OBP(obd, postrecov)) { + int rc = OBP(obd, postrecov)(obd); + + if (rc < 0) + LCONSOLE_WARN("%s: Post recovery failed, rc %d\n", + obd->obd_name, rc); + } EXIT; } @@ -1684,12 +1691,14 @@ EXPORT_SYMBOL(target_cleanup_recovery); /* obd_recovery_task_lock should be held */ void target_cancel_recovery_timer(struct obd_device *obd) { - CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); - cfs_timer_disarm(&obd->obd_recovery_timer); + CDEBUG(D_HA, "%s: cancel recovery timer\n", obd->obd_name); + hrtimer_cancel(&obd->obd_recovery_timer); } static void target_start_recovery_timer(struct obd_device *obd) { + ktime_t delay; + if (obd->obd_recovery_start != 0) return; @@ -1706,19 +1715,18 @@ static void target_start_recovery_timer(struct obd_device *obd) return; } - cfs_timer_arm(&obd->obd_recovery_timer, - cfs_time_shift(obd->obd_recovery_timeout)); - obd->obd_recovery_start = cfs_time_current_sec(); + delay = ktime_set(obd->obd_recovery_timeout, 0); + hrtimer_start(&obd->obd_recovery_timer, delay, HRTIMER_MODE_REL); + obd->obd_recovery_start = ktime_get_real_seconds(); spin_unlock(&obd->obd_dev_lock); - LCONSOLE_WARN("%s: Will be in recovery for at least %d:%.02d, " - "or until %d client%s reconnect%s\n", - obd->obd_name, - obd->obd_recovery_timeout / 60, - obd->obd_recovery_timeout % 60, - obd->obd_max_recoverable_clients, - (obd->obd_max_recoverable_clients == 1) ? "" : "s", - (obd->obd_max_recoverable_clients == 1) ? "s": ""); + LCONSOLE_WARN("%s: Will be in recovery for at least %llu:%02llu, or until %d client%s reconnect%s\n", + obd->obd_name, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, + obd->obd_max_recoverable_clients, + (obd->obd_max_recoverable_clients == 1) ? "" : "s", + (obd->obd_max_recoverable_clients == 1) ? "s": ""); } /** @@ -1727,43 +1735,48 @@ static void target_start_recovery_timer(struct obd_device *obd) * if @extend is true, extend recovery window to have @drt remaining at least; * otherwise, make sure the recovery timeout value is not less than @drt. */ -static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend) +static void extend_recovery_timer(struct obd_device *obd, time_t drt, + bool extend) { - cfs_time_t now; - cfs_time_t end; - cfs_duration_t left; - int to; + ktime_t left_ns; + time_t left; + time_t to; spin_lock(&obd->obd_dev_lock); if (!obd->obd_recovering || obd->obd_abort_recovery) { spin_unlock(&obd->obd_dev_lock); - return; - } - LASSERT(obd->obd_recovery_start != 0); + return; + } + LASSERT(obd->obd_recovery_start != 0); - now = cfs_time_current_sec(); - to = obd->obd_recovery_timeout; - end = obd->obd_recovery_start + to; - left = cfs_time_sub(end, now); + to = obd->obd_recovery_timeout; + left_ns = hrtimer_expires_remaining(&obd->obd_recovery_timer); + left = ktime_divns(left_ns, NSEC_PER_SEC); + if (extend && (drt > left)) + to += drt - left; + else if (!extend && (drt > to)) + to = drt; - if (extend && (drt > left)) { - to += drt - left; - } else if (!extend && (drt > to)) { - to = drt; - } + if (to > obd->obd_recovery_time_hard) { + to = obd->obd_recovery_time_hard; + CWARN("%s: extended recovery timer reaching hard limit: %ld, extend: %d\n", + obd->obd_name, to, extend); + } - if (to > obd->obd_recovery_time_hard) - to = obd->obd_recovery_time_hard; if (obd->obd_recovery_timeout < to) { - obd->obd_recovery_timeout = to; - end = obd->obd_recovery_start + to; - cfs_timer_arm(&obd->obd_recovery_timer, - cfs_time_shift(end - now)); - } + ktime_t now = ktime_get_real(); + ktime_t end; + + obd->obd_recovery_timeout = to; + end = ktime_set(obd->obd_recovery_start + to, 0); + left_ns = ktime_sub(end, now); + hrtimer_forward_now(&obd->obd_recovery_timer, left_ns); + left = ktime_divns(left_ns, NSEC_PER_MSEC); + } spin_unlock(&obd->obd_dev_lock); - CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n", - obd->obd_name, (unsigned)cfs_time_sub(end, now)); + CDEBUG(D_HA, "%s: recovery timer will expire in %ld seconds\n", + obd->obd_name, left); } /* Reset the timer with each new client connection */ @@ -1776,27 +1789,32 @@ static void extend_recovery_timer(struct obd_device *obd, int drt, bool extend) * be extended to make sure the client could be reconnected, in the * process, the timeout from the new client should be ignored. */ - static void check_and_start_recovery_timer(struct obd_device *obd, - struct ptlrpc_request *req, - int new_client) + struct ptlrpc_request *req, + int new_client) { - int service_time = lustre_msg_get_service_time(req->rq_reqmsg); - struct obd_device_target *obt = &obd->u.obt; + time_t service_time = lustre_msg_get_service_time(req->rq_reqmsg); + struct obd_device_target *obt = &obd->u.obt; - if (!new_client && service_time) - /* Teach server about old server's estimates, as first guess - * at how long new requests will take. */ + if (!new_client && service_time) + /* Teach server about old server's estimates, as first guess + * at how long new requests will take. + */ at_measured(&req->rq_rqbd->rqbd_svcpt->scp_at_estimate, - service_time); + service_time); - target_start_recovery_timer(obd); + target_start_recovery_timer(obd); /* Convert the service time to RPC timeout, - * and reuse service_time to limit stack usage. */ + * and reuse service_time to limit stack usage. + */ service_time = at_est2timeout(service_time); + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_SLUGGISH_NET) && + service_time < at_extra) + service_time = at_extra; + /* We expect other clients to timeout within service_time, then try * to reconnect, then try the failover server. The max delay between * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL. */ @@ -1878,7 +1896,7 @@ static int check_for_next_transno(struct lu_target *lut) next_transno = obd->obd_next_recovery_transno; CDEBUG(D_HA, "max: %d, connected: %d, completed: %d, queue_len: %d, " - "req_transno: "LPU64", next_transno: "LPU64"\n", + "req_transno: %llu, next_transno: %llu\n", obd->obd_max_recoverable_clients, connected, completed, queue_len, req_transno, next_transno); @@ -1890,25 +1908,25 @@ static int check_for_next_transno(struct lu_target *lut) wake_up = 1; } else if (tdtd != NULL && req != NULL && is_req_replayed_by_update(req)) { - LASSERTF(req_transno < next_transno, "req_transno "LPU64 - "next_transno"LPU64"\n", req_transno, next_transno); - CDEBUG(D_HA, "waking for duplicate req ("LPU64")\n", + LASSERTF(req_transno < next_transno, "req_transno %llu" + "next_transno%llu\n", req_transno, next_transno); + CDEBUG(D_HA, "waking for duplicate req (%llu)\n", req_transno); wake_up = 1; } else if (req_transno == next_transno || (update_transno != 0 && update_transno <= next_transno)) { - CDEBUG(D_HA, "waking for next ("LPD64")\n", next_transno); + CDEBUG(D_HA, "waking for next (%lld)\n", next_transno); wake_up = 1; } else if (queue_len > 0 && queue_len == atomic_read(&obd->obd_req_replay_clients)) { /** handle gaps occured due to lost reply or VBR */ LASSERTF(req_transno >= next_transno, - "req_transno: "LPU64", next_transno: "LPU64"\n", + "req_transno: %llu, next_transno: %llu\n", req_transno, next_transno); CDEBUG(D_HA, "%s: waking for gap in transno, VBR is %s (skip: " - LPD64", ql: %d, comp: %d, conn: %d, next: "LPD64 - ", next_update "LPD64" last_committed: "LPD64")\n", + "%lld, ql: %d, comp: %d, conn: %d, next: %lld" + ", next_update %lld last_committed: %lld)\n", obd->obd_name, obd->obd_version_recov ? "ON" : "OFF", next_transno, queue_len, completed, connected, req_transno, update_transno, obd->obd_last_committed); @@ -1919,7 +1937,7 @@ static int check_for_next_transno(struct lu_target *lut) wake_up = 1; } else if (OBD_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) { CDEBUG(D_HA, "accepting transno gaps is explicitly allowed" - " by fail_lock, waking up ("LPD64")\n", next_transno); + " by fail_lock, waking up (%lld)\n", next_transno); obd->obd_next_recovery_transno = req_transno; wake_up = 1; } @@ -1962,9 +1980,20 @@ static int target_recovery_overseer(struct lu_target *lut, { struct obd_device *obd = lut->lut_obd; struct target_distribute_txn_data *tdtd; + time64_t last = 0; + time64_t now; repeat: - if ((obd->obd_recovery_start != 0) && (cfs_time_current_sec() >= - (obd->obd_recovery_start + obd->obd_recovery_time_hard))) { + if (obd->obd_recovering && obd->obd_recovery_start == 0) { + now = ktime_get_seconds(); + if (now - last > 600) { + LCONSOLE_INFO("%s: in recovery but waiting for " + "the first client to connect\n", + obd->obd_name); + last = now; + } + } + if (obd->obd_recovery_start != 0 && ktime_get_real_seconds() >= + (obd->obd_recovery_start + obd->obd_recovery_time_hard)) { __u64 next_update_transno = 0; /* Only abort the recovery if there are no update recovery @@ -2005,7 +2034,7 @@ repeat: * clients */ abort_req_replay_queue(obd); abort_lock_replay_queue(obd); - CDEBUG(D_HA, "%s: there are still update replay ("LPX64 + CDEBUG(D_HA, "%s: there are still update replay (%#llx" ")in the queue.\n", obd->obd_name, next_update_transno); } else { @@ -2033,7 +2062,6 @@ repeat: atomic_read(&tdtd->tdtd_recovery_threads_count) == 0, &lwi); /* Then abort the update recovery list */ - dtrq_list_dump(lut->lut_tdtd, D_ERROR); dtrq_list_destroy(lut->lut_tdtd); } @@ -2132,15 +2160,15 @@ static void handle_recovery_req(struct ptlrpc_thread *thread, (void)handler(req); lu_context_exit(&thread->t_env->le_ctx); - /* don't reset timer for final stage */ - if (!exp_finished(req->rq_export)) { - int to = obd_timeout; + /* don't reset timer for final stage */ + if (!exp_finished(req->rq_export)) { + time_t to = obd_timeout; - /** - * Add request timeout to the recovery time so next request from - * this client may come in recovery time - */ - if (!AT_OFF) { + /** + * Add request timeout to the recovery time so next request from + * this client may come in recovery time + */ + if (!AT_OFF) { struct ptlrpc_service_part *svcpt; svcpt = req->rq_rqbd->rqbd_svcpt; @@ -2148,10 +2176,10 @@ static void handle_recovery_req(struct ptlrpc_thread *thread, * the client will recalculate the timeout according to * current server estimate service time, so we will * use the maxium timeout here for waiting the client - * sending the next req */ - to = max((int)at_est2timeout( - at_get(&svcpt->scp_at_estimate)), - (int)lustre_msg_get_timeout(req->rq_reqmsg)); + * sending the next req + */ + to = max_t(time_t, at_est2timeout(at_get(&svcpt->scp_at_estimate)), + lustre_msg_get_timeout(req->rq_reqmsg)); /* Add 2 net_latency, one for balance rq_deadline * (see ptl_send_rpc), one for resend the req to server, * Note: client will pack net_latency in replay req @@ -2188,12 +2216,9 @@ static int check_for_recovery_ready(struct lu_target *lut) * timer expired, and some clients got evicted */ extend_recovery_timer(obd, obd->obd_recovery_timeout, true); - CDEBUG(D_HA, "%s update recovery is not ready," - " extend recovery %d\n", obd->obd_name, - obd->obd_recovery_timeout); + CDEBUG(D_HA, "%s update recovery is not ready, extend recovery %llu\n", + obd->obd_name, obd->obd_recovery_timeout); return 0; - } else { - dtrq_list_dump(lut->lut_tdtd, D_HA); } } @@ -2261,7 +2286,7 @@ static void drop_duplicate_replay_req(struct lu_env *env, struct obd_device *obd, struct ptlrpc_request *req) { - DEBUG_REQ(D_HA, req, "remove t"LPD64" from %s because of duplicate" + DEBUG_REQ(D_HA, req, "remove t%lld from %s because of duplicate" " update records are found.\n", lustre_msg_get_transno(req->rq_reqmsg), libcfs_nid2str(req->rq_peer.nid)); @@ -2295,7 +2320,7 @@ static void replay_request_or_update(struct lu_env *env, __u64 transno; ENTRY; - CDEBUG(D_HA, "Waiting for transno "LPD64"\n", + CDEBUG(D_HA, "Waiting for transno %lld\n", obd->obd_next_recovery_transno); /* Replay all of request and update by transno */ @@ -2350,7 +2375,7 @@ static void replay_request_or_update(struct lu_env *env, } LASSERT(trd->trd_processing_task == current_pid()); - DEBUG_REQ(D_HA, req, "processing t"LPD64" from %s", + DEBUG_REQ(D_HA, req, "processing t%lld from %s", lustre_msg_get_transno(req->rq_reqmsg), libcfs_nid2str(req->rq_peer.nid)); @@ -2380,7 +2405,7 @@ static void replay_request_or_update(struct lu_env *env, extend_recovery_timer(obd, obd_timeout, true); if (rc == 0 && dtrq->dtrq_xid != 0) { - CDEBUG(D_HA, "Move x"LPU64" t"LPU64 + CDEBUG(D_HA, "Move x%llu t%llu" " to finish list\n", dtrq->dtrq_xid, dtrq->dtrq_master_transno); @@ -2443,11 +2468,10 @@ static int target_recovery_thread(void *arg) RETURN(rc); } - thread->t_env = env; - thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ - env->le_ctx.lc_thread = thread; + thread->t_env = env; + thread->t_id = -1; /* force filter_iobuf_get/put to use local buffers */ + env->le_ctx.lc_thread = thread; tgt_io_thread_init(thread); /* init thread_big_cache for IO requests */ - thread->t_watchdog = NULL; CDEBUG(D_HA, "%s: started recovery thread pid %d\n", obd->obd_name, current_pid()); @@ -2469,7 +2493,7 @@ static int target_recovery_thread(void *arg) /* next stage: replay requests or update */ delta = jiffies; - CDEBUG(D_INFO, "1: request replay stage - %d clients from t"LPU64"\n", + CDEBUG(D_INFO, "1: request replay stage - %d clients from t%llu\n", atomic_read(&obd->obd_req_replay_clients), obd->obd_next_recovery_transno); replay_request_or_update(env, lut, trd, thread); @@ -2591,39 +2615,47 @@ void target_recovery_fini(struct obd_device *obd) } EXPORT_SYMBOL(target_recovery_fini); -static void target_recovery_expired(unsigned long castmeharder) +static enum hrtimer_restart target_recovery_expired(struct hrtimer *timer) { - struct obd_device *obd = (struct obd_device *)castmeharder; - CDEBUG(D_HA, "%s: recovery timed out; %d clients are still in recovery" - " after %lds (%d clients connected)\n", + struct obd_device *obd = container_of(timer, struct obd_device, + obd_recovery_timer); + + CDEBUG(D_HA, + "%s: recovery timed out; %d clients are still in recovery after %llu seconds (%d clients connected)\n", obd->obd_name, atomic_read(&obd->obd_lock_replay_clients), - cfs_time_current_sec()- obd->obd_recovery_start, + ktime_get_real_seconds() - obd->obd_recovery_start, atomic_read(&obd->obd_connected_clients)); obd->obd_recovery_expired = 1; wake_up(&obd->obd_next_transno_waitq); + return HRTIMER_NORESTART; } void target_recovery_init(struct lu_target *lut, svc_handler_t handler) { - struct obd_device *obd = lut->lut_obd; + struct obd_device *obd = lut->lut_obd; - if (obd->obd_max_recoverable_clients == 0) { - /** Update server last boot epoch */ - tgt_boot_epoch_update(lut); - return; - } + if (lut->lut_bottom->dd_rdonly) + return; + + if (obd->obd_max_recoverable_clients == 0) { + /** Update server last boot epoch */ + tgt_boot_epoch_update(lut); + return; + } CDEBUG(D_HA, "RECOVERY: service %s, %d recoverable clients, " - "last_transno "LPU64"\n", obd->obd_name, + "last_transno %llu\n", obd->obd_name, obd->obd_max_recoverable_clients, obd->obd_last_committed); - LASSERT(obd->obd_stopping == 0); - obd->obd_next_recovery_transno = obd->obd_last_committed + 1; - obd->obd_recovery_start = 0; - obd->obd_recovery_end = 0; + LASSERT(obd->obd_stopping == 0); + obd->obd_next_recovery_transno = obd->obd_last_committed + 1; + obd->obd_recovery_start = 0; + obd->obd_recovery_end = 0; - cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd); - target_start_recovery_thread(lut, handler); + hrtimer_init(&obd->obd_recovery_timer, CLOCK_REALTIME, + HRTIMER_MODE_REL); + obd->obd_recovery_timer.function = &target_recovery_expired; + target_start_recovery_thread(lut, handler); } EXPORT_SYMBOL(target_recovery_init); @@ -2678,6 +2710,17 @@ int target_queue_recovery_request(struct ptlrpc_request *req, target_process_req_flags(obd, req); if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LOCK_REPLAY_DONE) { + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_RECOVERY_REQ_RACE))) { + if (cfs_fail_val == 1) { + cfs_race_state = 1; + cfs_fail_val = 0; + wake_up(&cfs_race_waitq); + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + } + } + /* client declares he's ready to complete recovery * so, we put the request on th final queue */ target_request_copy_get(req); @@ -2765,8 +2808,8 @@ int target_queue_recovery_request(struct ptlrpc_request *req, * Also, a resent, replayed request that has already been * handled will pass through here and be processed immediately. */ - CDEBUG(D_HA, "Next recovery transno: "LPU64 - ", current: "LPU64", replaying\n", + CDEBUG(D_HA, "Next recovery transno: %llu" + ", current: %llu, replaying\n", obd->obd_next_recovery_transno, transno); /* If the request has been replayed by update replay, then sends this @@ -2828,12 +2871,6 @@ added: RETURN(0); } -int target_handle_ping(struct ptlrpc_request *req) -{ - obd_ping(req->rq_svc_thread->t_env, req->rq_export); - return req_capsule_server_pack(&req->rq_pill); -} - void target_committed_to_req(struct ptlrpc_request *req) { struct obd_export *exp = req->rq_export; @@ -2846,7 +2883,7 @@ void target_committed_to_req(struct ptlrpc_request *req) "%d)", exp->exp_obd->obd_no_transno, req->rq_repmsg == NULL); - CDEBUG(D_INFO, "last_committed "LPU64", transno "LPU64", xid "LPU64"\n", + CDEBUG(D_INFO, "last_committed %llu, transno %llu, xid %llu\n", exp->exp_last_committed, req->rq_transno, req->rq_xid); } @@ -2952,7 +2989,7 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); spin_lock(&exp->exp_uncommitted_replies_lock); - CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n", + CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", rs->rs_transno, exp->exp_last_committed); if (rs->rs_transno > exp->exp_last_committed) { /* not committed already */ @@ -3125,10 +3162,10 @@ static inline const char *bulk2type(struct ptlrpc_request *req) int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, struct l_wait_info *lwi) { - struct ptlrpc_request *req = desc->bd_req; - time_t start = cfs_time_current_sec(); - time_t deadline; - int rc = 0; + struct ptlrpc_request *req = desc->bd_req; + time64_t start = ktime_get_real_seconds(); + time64_t deadline; + int rc = 0; ENTRY; @@ -3149,8 +3186,7 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, if (req->rq_bulk_read) rc = sptlrpc_svc_wrap_bulk(req, desc); - if ((exp->exp_connect_data.ocd_connect_flags & - OBD_CONNECT_BULK_MBITS) != 0) + if (OCD_HAS_FLAG(&exp->exp_connect_data, BULK_MBITS)) req->rq_mbits = lustre_msg_get_mbits(req->rq_reqmsg); else /* old version, bulk matchbits is rq_xid */ req->rq_mbits = req->rq_xid; @@ -3176,12 +3212,13 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, deadline = req->rq_deadline; do { - long timeoutl = deadline - cfs_time_current_sec(); - cfs_duration_t timeout = timeoutl <= 0 ? - CFS_TICK : cfs_time_seconds(timeoutl); - time_t rq_deadline; + time64_t timeoutl = deadline - ktime_get_real_seconds(); + long timeout_jiffies = timeoutl <= 0 ? + 1 : cfs_time_seconds(timeoutl); + time64_t rq_deadline; - *lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), + *lwi = LWI_TIMEOUT_INTERVAL(timeout_jiffies, + cfs_time_seconds(1), target_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc) || @@ -3191,17 +3228,17 @@ int target_bulk_io(struct obd_export *exp, struct ptlrpc_bulk_desc *desc, lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed rq_deadline. */ - rq_deadline = ACCESS_ONCE(req->rq_deadline); + rq_deadline = READ_ONCE(req->rq_deadline); deadline = start + bulk_timeout; if (deadline > rq_deadline) deadline = rq_deadline; - } while ((rc == -ETIMEDOUT) && - (deadline > cfs_time_current_sec())); + } while (rc == -ETIMEDOUT && + deadline > ktime_get_real_seconds()); if (rc == -ETIMEDOUT) { - DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %ld%+lds", + DEBUG_REQ(D_ERROR, req, "timeout on bulk %s after %lld%+llds", bulk2type(req), deadline - start, - cfs_time_current_sec() - deadline); + ktime_get_real_seconds() - deadline); ptlrpc_abort_bulk(desc); } else if (exp->exp_failed) { DEBUG_REQ(D_ERROR, req, "Eviction on bulk %s",