fo_writethrough_cache:1, /* writetrhough cache */
fo_syncjournal:1, /* sync journal on writes */
fo_sync_lock_cancel:2, /* sync on lock cancel */
- fo_raid_degraded:1; /* RAID device degraded */
+ fo_raid_degraded:1, /* RAID device degraded */
+ fo_mds_ost_sync:1; /**< MDS-OST orphan recovery*/
struct obd_import *fo_mdc_imp;
struct obd_uuid fo_mdc_uuid;
int (*o_llog_init)(struct obd_device *obd, struct obd_device *disk_obd,
int *idx);
int (*o_llog_finish)(struct obd_device *obd, int count);
+ int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *);
/* metadata-only methods */
int (*o_pin)(struct obd_export *, struct ll_fid *,
EXIT;
}
+static inline int obd_llog_connect(struct obd_export *exp,
+ struct llogd_conn_body *body)
+{
+ int rc;
+ ENTRY;
+ /* Thin wrapper: hands (exp, body) to the llog_connect operation that
+  * OBP() selects for exp->exp_obd and returns that operation's result.
+  * NOTE(review): OBD_CHECK_OP presumably bails out early when the device
+  * does not provide the operation - confirm against the macro. */
+ OBD_CHECK_OP(exp->exp_obd, llog_connect, 0);
+ EXP_COUNTER_INCREMENT(exp, llog_connect);
+ /* dispatch to the device-specific handler */
+ rc = OBP(exp->exp_obd, llog_connect)(exp, body);
+ RETURN(rc);
+}
+
static inline int obd_notify(struct obd_device *obd,
struct obd_device *watched,
enum obd_notify_event ev, void *data)
struct llog_process_cat_args *args = data;
struct llog_ctxt *ctxt = args->lpca_ctxt;
struct llog_handle *llh = NULL;
- void *cb = args->lpca_cb;
+ llog_cb_t cb = args->lpca_cb;
struct llog_logid logid;
int rc;
ENTRY;
}
if (cb) {
- rc = llog_cat_process(llh, (llog_cb_t)cb, NULL);
+ rc = llog_cat_process(llh, cb, NULL);
if (rc != LLOG_PROC_BREAK && rc != 0)
CERROR("llog_cat_process() failed %d\n", rc);
+ cb(llh, NULL, NULL);
} else {
CWARN("No callback function for recovery\n");
}
LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
RETURN(rc);
}
+static int filter_llog_connect(struct obd_export *exp,
+ struct llogd_conn_body *body)
+{
+ struct obd_device *obd = exp->exp_obd;
+ struct llog_ctxt *ctxt;
+ int rc;
+ ENTRY;
+ /* obdfilter handler for an llog connect request: sets fo_mds_ost_sync,
+  * looks up the llog context at body->lgdc_ctxt_idx and connects it to
+  * the log id and generation carried in body.
+  * Returns -ENOENT when no context exists at that index, otherwise the
+  * result of llog_connect().
+  * NOTE(review): body is dereferenced unconditionally below - confirm
+  * that every caller passes a validated, non-NULL body. */
+ CDEBUG(D_OTHER, "%s: Recovery from log "LPX64"/"LPX64":%x\n",
+ obd->obd_name, body->lgdc_logid.lgl_oid,
+ body->lgdc_logid.lgl_ogr, body->lgdc_logid.lgl_ogen);
+ /* raise the flag under obd_processing_task_lock */
+ spin_lock_bh(&obd->obd_processing_task_lock);
+ obd->u.filter.fo_mds_ost_sync = 1;
+ spin_unlock_bh(&obd->obd_processing_task_lock);
+ /* NOTE(review): both error paths below leave fo_mds_ost_sync at 1;
+  * nothing in this function clears it - confirm that is intended. */
+ ctxt = llog_get_context(obd, body->lgdc_ctxt_idx);
+ if (ctxt == NULL) {
+ CERROR("NULL ctxt at idx %d\n", body->lgdc_ctxt_idx);
+ RETURN(-ENOENT);
+ }
+ /* ctxt is non-NULL here; it is put back right after the connect call,
+  * whether or not the connect succeeded */
+ rc = llog_connect(ctxt, &body->lgdc_logid, &body->lgdc_gen, NULL);
+ llog_ctxt_put(ctxt);
+ if (rc != 0)
+ CERROR("%s: failed to connect rc %d idx %d\n", obd->obd_name,
+ rc, body->lgdc_ctxt_idx);
+ /* a failed connect is only logged; rc is returned unchanged */
+ RETURN(rc);
+}
+
static int filter_precleanup(struct obd_device *obd,
enum obd_cleanup_stage stage)
{
.o_preprw = filter_preprw,
.o_commitrw = filter_commitrw,
.o_llog_init = filter_llog_init,
+ .o_llog_connect = filter_llog_connect,
.o_llog_finish = filter_llog_finish,
.o_iocontrol = filter_iocontrol,
.o_health_check = filter_health_check,
if (ctxt->loc_obd->obd_stopping)
RETURN(LLOG_PROC_BREAK);
+ if (rec == NULL) {
+ cfs_spin_lock_bh(&ctxt->loc_obd->obd_processing_task_lock);
+ ctxt->loc_obd->u.filter.fo_mds_ost_sync = 0;
+ cfs_spin_unlock_bh(&ctxt->loc_obd->obd_processing_task_lock);
+ RETURN(0);
+ }
+
if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
CERROR("log is not plain\n");
RETURN(-EINVAL);
return count;
}
+static int lprocfs_filter_rd_mds_sync(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct obd_device *obd = (struct obd_device *)data;
+ LASSERT(obd != NULL);
+ /* Read handler: prints the current fo_mds_ost_sync bit as an unsigned
+  * decimal plus newline into page, bounded by count, and returns
+  * snprintf()'s result. start, off and eof are not used here. */
+ return snprintf(page, count, "%u\n", obd->u.filter.fo_mds_ost_sync);
+}
+
int lprocfs_filter_rd_degraded(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
lprocfs_filter_wr_syncjournal, 0 },
{ "sync_on_lock_cancel", lprocfs_filter_rd_sync_lock_cancel,
lprocfs_filter_wr_sync_lock_cancel, 0 },
+ { "mds_sync", lprocfs_filter_rd_mds_sync, 0, 0},
{ "degraded", lprocfs_filter_rd_degraded,
lprocfs_filter_wr_degraded, 0 },
{ 0 }
return count;
}
+static int osc_rd_destroys_in_flight(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct obd_device *obd = data; /* Read handler: data is used as the
+                                 * obd_device; the value of its
+                                 * cl_destroy_in_flight atomic counter is
+                                 * printed as "%u\n" into page, bounded by
+                                 * count, and snprintf()'s result is
+                                 * returned. start, off and eof are not
+                                 * used here. */
+ return snprintf(page, count, "%u\n",
+ atomic_read(&obd->u.cli.cl_destroy_in_flight));
+}
+
static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
{ "uuid", lprocfs_rd_uuid, 0, 0 },
{ "ping", 0, lprocfs_wr_ping, 0, 0, 0222 },
osc_wr_max_pages_per_rpc, 0 },
{ "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight,
osc_wr_max_rpcs_in_flight, 0 },
+ { "destroys_in_flight", osc_rd_destroys_in_flight, 0, 0 },
{ "max_dirty_mb", osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 },
{ "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 },
{ "cur_grant_bytes", osc_rd_cur_grant_bytes,
struct obd_trans_info *oti = &trans_info;
int should_process, fail = OBD_FAIL_OST_ALL_REPLY_NET, rc = 0;
struct obd_device *obd = NULL;
+ struct llogd_conn_body *body;
ENTRY;
LASSERT(current->journal_info == NULL);
/* FIXME - just reply status */
case LLOG_ORIGIN_CONNECT:
DEBUG_REQ(D_INODE, req, "log connect");
- rc = llog_handle_connect(req);
+ body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF,
+ sizeof(*body));
+ rc = obd_llog_connect(req->rq_export, body);
req->rq_status = rc;
rc = lustre_pack_reply(req, 1, NULL, NULL);
if (rc)
kill -USR1 $MULTIPID || return 3
wait $MULTIPID || return 4
- # drop close
+ # drop close
do_facet mds lctl set_param fail_loc=0x80000115
facet_failover mds
do_facet mds lctl set_param fail_loc=0
umount $MOUNT2
fail mds
+ wait_recovery_complete mds || error "MDS recovery isn't done"
# first 25 files should have been replayed
unlinkmany $MOUNT1/$tfile- 5 || return 2
unlinkmany $MOUNT1/$tfile-3- 5 || return 3
zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
- # give ost time to process llogs
- sleep 3
+
+ wait_mds_ost_sync || return 5
+ wait_destroy_complete || return 6
+
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
log "before $BEFOREUSED, after $AFTERUSED"
[ $AFTERUSED -ne $BEFOREUSED ] && \
}
run_test 14b "delete ost orphans if gap occured in objids due to VBR"
-test_15a() { # was test_15
+test_15a() { # was test_15
replay_barrier mds
createmany -o $MOUNT1/$tfile- 25
createmany -o $MOUNT2/$tfile-2- 1
# need to check iozone output on "complete"
local iozonelog=$TMP/${TESTSUITE}.iozone.log
rm -f $iozonelog
- cat $tmppipe | while read line ; do
+ cat $tmppipe | while read line ; do
echo "$line"
echo "$line" >>$iozonelog
done;
fi
rm -f $tmppipe
rm -f $iozonelog
- return $rc
+ return $rc
}
test_5() {
local pid=$!
echo iozone bg pid=$pid
-
+
sleep 8
fail ost1
local rc=0
get_stripe_info client $f
sync
- sleep 2 # ensure we have a fresh statfs
+ sleep 2 # ensure we have a fresh statfs
sync
#define OBD_FAIL_MDS_REINT_NET_REP 0x119
do_facet mds "lctl set_param fail_loc=0x80000119"
(( $before > $after_dd )) || return 1
rm -f $f
fail ost$((stripe_index + 1))
+ wait_recovery_complete ost$((stripe_index + 1)) || error "OST recovery isn't done"
$CHECKSTAT -t file $f && return 2 || true
sync
# let the delete happen
- sleep 5
+ wait_mds_ost_sync || return 4
+ wait_destroy_complete || return 5
after=`kbytesfree`
log "before: $before after: $after"
(( $before <= $after + 40 )) || return 3 # take OST logs into account
before=`kbytesfree`
dd if=/dev/urandom bs=4096 count=1280 of=$f || return 4
sync
- sleep 2 # ensure we have a fresh statfs
+ sleep 2 # ensure we have a fresh statfs
sync
after_dd=`kbytesfree`
log "before: $before after_dd: $after_dd"
replay_barrier ost1
rm -f $f
fail ost1
+ wait_recovery_complete ost1 || error "OST recovery isn't done"
$CHECKSTAT -t file $f && return 2 || true
sync
# let the delete happen
- sleep 2
+ wait_mds_ost_sync || return 4
+ wait_destroy_complete || return 5
after=`kbytesfree`
log "before: $before after: $after"
- (( $before <= $after + 40 )) || return 3 # take OST logs into account
+ (( $before <= $after + 40 )) || return 3 # take OST logs into account
}
run_test 7 "Fail OST before obd_destroy"
client_up || client_up || true # reconnect
fail mds # start orphan recovery
- wait_recovery_complete mds || error "MDS recovery not done"
-
- # For interop with 2.0 only:
- # FIXME just because recovery is done doesn't mean we've finished
- # orphan cleanup. Fake it with a sleep for now...
- sleep 10
-
+ wait_recovery_complete mds || error "MDS recovery isn't done"
+ wait_mds_ost_sync || return 3
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
log "before $BEFOREUSED, after $AFTERUSED"
[ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
sleep 1
TOTAL=`lctl get_param -n osc.*.kbytesavail | \
awk 'BEGIN{total=0}; {total+=$1}; END{print total}'`
- [ "$TOTAL" -eq "$TOTALPREV" ] && break
+ [ "$TOTAL" -eq "$TOTALPREV" ] && return 0
echo "Waiting delete completed ... prev: $TOTALPREV current: $TOTAL "
TOTALPREV=$TOTAL
WAIT=$(( WAIT + 1))
done
- echo "Delete completed."
+ echo "Delete is not completed in $MAX_WAIT sec"
+ return 1
}
wait_for_host() {
return 1
}
+wait_mds_ost_sync () {
+ # just because recovery is done doesn't mean we've finished
+ # orphan cleanup. Wait for llogs to get synchronized.
+ echo "Waiting for orphan cleanup..."
+ # MAX value includes time needed for MDS-OST reconnection
+ local MAX=$(( TIMEOUT * 2 ))
+ local WAIT=0
+ while [ $WAIT -lt $MAX ]; do
+ local -a sync=($(do_nodes $(comma_list $(osts_nodes)) \
+ "$LCTL get_param -n obdfilter.*.mds_sync"))
+ local con=1
+ for ((i=0; i<${#sync[@]}; i++)); do
+ [ ${sync[$i]} -eq 0 ] && continue
+ # there is a not finished MDS-OST synchronization
+ con=0
+ break;
+ done
+ sleep 2 # increase waiting time and cover statfs cache
+ [ ${con} -eq 1 ] && return 0
+ echo "Waiting $WAIT secs for $facet mds-ost sync done."
+ WAIT=$((WAIT + 2))
+ done
+ echo "$facet recovery not done in $MAX sec. $STATUS"
+ return 1
+}
+
+# Wait until no OSC on this node has destroy RPCs in flight.
+#
+# Polls osc.*.destroys_in_flight once per second for at most MAX seconds.
+# Returns 0 as soon as every reported value is 0, 1 on timeout.
+# Takes no arguments.
+wait_destroy_complete () {
+    echo "Waiting for destroy to be done..."
+    # MAX should stay small: it reflects server responsiveness.
+    # Never increase it just to make a test pass; investigate
+    # why the destroys take so long instead.
+    local MAX=5
+    local WAIT=0
+    # keep the loop index private so the caller's $i is not clobbered
+    local i
+    while [ $WAIT -lt $MAX ]; do
+        local -a RPCs=($($LCTL get_param -n osc.*.destroys_in_flight))
+        local con=1
+        for ((i=0; i<${#RPCs[@]}; i++)); do
+            [ ${RPCs[$i]} -eq 0 ] && continue
+            # there are still some destroy RPCs in flight
+            con=0
+            break;
+        done
+        sleep 1
+        [ ${con} -eq 1 ] && return 0 # done waiting
+        echo "Waiting $WAIT secs for destroys to be done."
+        WAIT=$((WAIT + 1))
+    done
+    echo "Destroys weren't done in $MAX sec."
+    return 1
+}
+
wait_exit_ST () {
local facet=$1