* File join has been disabled in this release, refer to Bugzilla 16929.
Severity : normal
+Bugzilla : 18674
+Description: client could not reconnect to OST because of an active request.
+Details : The client now retries to reconnect to the same server, if a
+ connect request failed with EBUSY or -EAGAIN. Test case 35b
+ conf-sanity.sh has been created.
+
+Severity : normal
Bugzilla : 19557
Description: Don't allow make backward step on assignin osc next id.
Details : race between allocation next id and ll_sync thread can be cause
imp_pingable:1, /* pingable */
imp_resend_replay:1, /* resend for replay */
imp_recon_bk:1, /* turn off reconnect if all failovers fail */
- imp_last_recon:1; /* internally used by above */
+ imp_last_recon:1, /* internally used by above */
+ imp_force_reconnect:1; /* import must be reconnected instead of chouse new connection */
__u32 imp_connect_op;
struct obd_connect_data imp_connect_data;
__u64 imp_connect_flags_orig;
}
/* if not found, simply choose the current one */
- if (!imp_conn) {
+ if (!imp_conn || imp->imp_force_reconnect) {
LASSERT(imp->imp_conn_current);
imp_conn = imp->imp_conn_current;
tried_all = 0;
EXIT;
}
+static int ptlrpc_busy_reconnect(int rc)
+{
+ return (rc == -EBUSY) || (rc == -EAGAIN);
+}
+
+
static int ptlrpc_connect_interpret(const struct lu_env *env,
struct ptlrpc_request *request,
void *data, int rc)
spin_unlock(&imp->imp_lock);
RETURN(0);
}
- spin_unlock(&imp->imp_lock);
- if (rc)
+ if (rc) {
+ /* if this reconnect to busy export - not need select new target
+ * for connecting*/
+ if (ptlrpc_busy_reconnect(rc))
+ imp->imp_force_reconnect = 1;
+ spin_unlock(&imp->imp_lock);
GOTO(out, rc);
+ }
LASSERT(imp->imp_conn_current);
msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
/* All imports are pingable */
- spin_lock(&imp->imp_lock);
imp->imp_pingable = 1;
+ imp->imp_force_reconnect = 0;
if (aa->pcaa_initial_connect) {
if (msg_flags & MSG_CONNECT_REPLAYABLE) {
}
run_test 34c "force umount with failed ost should be normal"
-test_35() { # bug 12459
+test_35a() { # bug 12459
setup
DBG_SAVE="`lctl get_param -n debug`"
log "Wait for RECONNECT_INTERVAL seconds (10s)"
sleep 10
- MSG="conf-sanity.sh test_35 `date +%F%kh%Mm%Ss`"
+ MSG="conf-sanity.sh test_35a `date +%F%kh%Mm%Ss`"
$LCTL clear
log "$MSG"
log "Stopping the MDT:"
[ "$NEXTCONN" != "0" ] && log "The client didn't try to reconnect to the last active server (tried ${NEXTCONN} instead)" && return 7
cleanup
}
-run_test 35 "Reconnect to the last active server first"
+run_test 35a "Reconnect to the last active server first"
+
+test_35b() { # bug 18674
+ remote_mds || { skip "local MDS" && return 0; }
+ setup
+
+ debugsave
+ $LCTL set_param debug="ha"
+ $LCTL clear
+ MSG="conf-sanity.sh test_35b `date +%F%kh%Mm%Ss`"
+ log "$MSG"
+
+ log "Set up a fake failnode for the MDS"
+ FAKENID="127.0.0.2"
+ local device=$(do_facet mds "$LCTL get_param -n devices" | \
+ awk '($3 ~ "mdt" && $4 ~ "MDT") { print $4 }' | head -1)
+ do_facet mds "$LCTL conf_param ${device}.failover.node=$FAKENID" || \
+ return 1
+
+ local at_max_saved=0
+ # adaptive timeouts may prevent seeing the issue
+ if at_is_enabled; then
+ at_max_saved=$(at_max_get mds)
+ at_max_set 0 mds client
+ fi
+
+ mkdir -p $MOUNT/testdir
+ touch $MOUNT/testdir/test
+
+ log "Injecting EBUSY on MDS"
+ # Setting OBD_FAIL_MDS_RESEND=0x136
+ do_facet mds "$LCTL set_param fail_loc=0x80000136" || return 2
+
+ log "Stat on a test file"
+ stat $MOUNT/testdir/test
+
+ log "Stop injecting EBUSY on MDS"
+ do_facet mds "$LCTL set_param fail_loc=0" || return 3
+ rm -f $MOUNT/testdir/test
+
+ log "done"
+ # restore adaptive timeout
+ [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds client
+
+ $LCTL dk $TMP/lustre-log-$TESTNAME.log
+
+ # retrieve from the log if the client has ever tried to
+ # contact the fake server after the loss of connection
+ FAILCONN=`awk "BEGIN {ret = 0;}
+ /import_select_connection.*${FSNAME}-MDT0000-mdc.* using connection/ {
+ ret = 1;
+ if (\\\$NF ~ /$FAKENID/) {
+ ret = 2;
+ exit;
+ }
+ }
+ END {print ret}" $TMP/lustre-log-$TESTNAME.log`
+
+ [ "$FAILCONN" == "0" ] && \
+ log "ERROR: The client reconnection has not been triggered" && \
+ return 4
+ [ "$FAILCONN" == "2" ] && \
+ log "ERROR: The client tried to reconnect to the failover server while the primary was busy" && \
+ return 5
+
+ cleanup
+}
+run_test 35b "Continue reconnection retries, if the active server is busy"
test_36() { # 12743
local rc