From 75358628445eca5030b3e2bbec5d02d25fa0ed21 Mon Sep 17 00:00:00 2001 From: "John L. Hammond" Date: Fri, 15 Dec 2017 10:14:11 -0600 Subject: [PATCH] LU-10383 hsm: refactor mdt_coordinator_cb() Split the ARS_WAITING and ARS_STARTED cases of mdt_coordinator_cb() into subfunctions, mdt_cdt_waiting_cb() and mdt_cdt_started_cb(). Test-Parameters: trivial testlist=sanity-hsm Signed-off-by: John L. Hammond Change-Id: I734e10e4db72f76a6b0de76c383ad0b03efd76d8 Reviewed-on: https://review.whamcloud.com/30552 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Faccini Bruno Reviewed-by: Dmitry Eremin Reviewed-by: Oleg Drokin --- lustre/mdt/mdt_coordinator.c | 407 ++++++++++++++++++++++--------------------- 1 file changed, 206 insertions(+), 201 deletions(-) diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index 0e5e8f4..074fdeb 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -152,228 +152,233 @@ struct hsm_thread_data { struct mdt_thread_info *cdt_mti; struct hsm_scan_request *request; }; -/** - * llog_cat_process() callback, used to: - * - find waiting request and start action - * - purge canceled and done requests - * \param env [IN] environment - * \param llh [IN] llog handle - * \param hdr [IN] llog record - * \param data [IN/OUT] cb data = struct hsm_scan_data - * \retval 0 success - * \retval -ve failure - */ -static int mdt_coordinator_cb(const struct lu_env *env, + +static int mdt_cdt_waiting_cb(const struct lu_env *env, + struct mdt_device *mdt, struct llog_handle *llh, - struct llog_rec_hdr *hdr, - void *data) + struct llog_agent_req_rec *larr, + struct hsm_scan_data *hsd) { - struct llog_agent_req_rec *larr; - struct hsm_scan_data *hsd; - struct hsm_action_item *hai; - struct mdt_device *mdt; - struct coordinator *cdt; - int rc; - ENTRY; - - hsd = data; - mdt = hsd->mti->mti_mdt; - cdt = &mdt->mdt_coordinator; + struct coordinator *cdt = &mdt->mdt_coordinator; + struct hsm_scan_request *request; + struct hsm_action_item *hai; + int i; - larr = (struct llog_agent_req_rec *)hdr; - dump_llog_agent_req_rec("mdt_coordinator_cb(): ", larr); - switch (larr->arr_status) { - case ARS_WAITING: { - int i; - struct hsm_scan_request *request; + /* Are agents full? */ + if (atomic_read(&cdt->cdt_request_count) >= cdt->cdt_max_requests) + RETURN(0); - /* Are agents full? */ - if (atomic_read(&cdt->cdt_request_count) >= - cdt->cdt_max_requests) + /* first search whether the request is found in the list we + * have built. */ + request = NULL; + for (i = 0; i < hsd->request_cnt; i++) { + if (hsd->request[i].hal->hal_compound_id == + larr->arr_compound_id) { + request = &hsd->request[i]; break; + } + } - /* first search whether the request is found in the - * list we have built. */ - request = NULL; - for (i = 0; i < hsd->request_cnt; i++) { - if (hsd->request[i].hal->hal_compound_id == - larr->arr_compound_id) { - request = &hsd->request[i]; - break; + if (!request) { + struct hsm_action_list *hal; + + if (hsd->request_cnt == hsd->max_requests) { + if (!hsd->housekeeping) { + /* The request array is full, stop + * here. There might be more known + * requests that could be merged, but + * this avoid analyzing too many llogs + * for minor gains. */ + RETURN(LLOG_PROC_BREAK); + } else { + /* Unknown request and no more room + * for a new request. Continue to scan + * to find other entries for already + * existing requests. */ + RETURN(0); } } - if (!request) { - struct hsm_action_list *hal; - - if (hsd->request_cnt == hsd->max_requests) { - if (!hsd->housekeeping) { - /* The request array is full, - * stop here. There might be - * more known requests that - * could be merged, but this - * avoid analyzing too many - * llogs for minor gains. - */ - RETURN(LLOG_PROC_BREAK); - } else { - /* Unknown request and no more room - * for a new request. Continue to scan - * to find other entries for already - * existing requests. - */ - RETURN(0); - } - } + request = &hsd->request[hsd->request_cnt]; - request = &hsd->request[hsd->request_cnt]; + /* allocates hai vector size just needs to be large + * enough */ + request->hal_sz = sizeof(*request->hal) + + cfs_size_round(MTI_NAME_MAXLEN + 1) + + 2 * cfs_size_round(larr->arr_hai.hai_len); + OBD_ALLOC(hal, request->hal_sz); + if (!hal) + RETURN(-ENOMEM); - /* allocates hai vector size just needs to be large - * enough */ - request->hal_sz = - sizeof(*request->hal) + - cfs_size_round(MTI_NAME_MAXLEN+1) + - 2 * cfs_size_round(larr->arr_hai.hai_len); - OBD_ALLOC(hal, request->hal_sz); - if (!hal) - RETURN(-ENOMEM); - hal->hal_version = HAL_VERSION; - strlcpy(hal->hal_fsname, hsd->fs_name, - MTI_NAME_MAXLEN + 1); - hal->hal_compound_id = larr->arr_compound_id; - hal->hal_archive_id = larr->arr_archive_id; - hal->hal_flags = larr->arr_flags; - hal->hal_count = 0; - request->hal_used_sz = hal_size(hal); - request->hal = hal; - hsd->request_cnt++; - hai = hai_first(hal); - } else { - /* request is known */ - /* we check if record archive num is the same as the - * known request, if not we will serve it in multiple - * time because we do not know if the agent can serve - * multiple backend - * a use case is a compound made of multiple restore - * where the files are not archived in the same backend - */ - if (larr->arr_archive_id != - request->hal->hal_archive_id) - RETURN(0); + hal->hal_version = HAL_VERSION; + strlcpy(hal->hal_fsname, hsd->fs_name, MTI_NAME_MAXLEN + 1); + hal->hal_compound_id = larr->arr_compound_id; + hal->hal_archive_id = larr->arr_archive_id; + hal->hal_flags = larr->arr_flags; + hal->hal_count = 0; + request->hal_used_sz = hal_size(hal); + request->hal = hal; + hsd->request_cnt++; + hai = hai_first(hal); + } else { + /* request is known */ + /* we check if record archive num is the same as the + * known request, if not we will serve it in multiple + * time because we do not know if the agent can serve + * multiple backend a use case is a compound made of + * multiple restore where the files are not archived + * in the same backend */ + if (larr->arr_archive_id != request->hal->hal_archive_id) + RETURN(0); - if (request->hal_sz < - request->hal_used_sz + - cfs_size_round(larr->arr_hai.hai_len)) { - /* Not enough room, need an extension */ - void *hal_buffer; - int sz; - - sz = 2 * request->hal_sz; - OBD_ALLOC(hal_buffer, sz); - if (!hal_buffer) - RETURN(-ENOMEM); - memcpy(hal_buffer, request->hal, - request->hal_used_sz); - OBD_FREE(request->hal, - request->hal_sz); - request->hal = hal_buffer; - request->hal_sz = sz; - } - hai = hai_first(request->hal); - for (i = 0; i < request->hal->hal_count; i++) - hai = hai_next(hai); - } - memcpy(hai, &larr->arr_hai, larr->arr_hai.hai_len); - hai->hai_cookie = larr->arr_hai.hai_cookie; - hai->hai_gid = larr->arr_hai.hai_gid; + if (request->hal_sz < request->hal_used_sz + + cfs_size_round(larr->arr_hai.hai_len)) { + /* Not enough room, need an extension */ + void *hal_buffer; + int sz; - request->hal_used_sz += cfs_size_round(hai->hai_len); - request->hal->hal_count++; + sz = 2 * request->hal_sz; + OBD_ALLOC(hal_buffer, sz); + if (!hal_buffer) + RETURN(-ENOMEM); + memcpy(hal_buffer, request->hal, request->hal_used_sz); + OBD_FREE(request->hal, request->hal_sz); + request->hal = hal_buffer; + request->hal_sz = sz; + } - if (hai->hai_action != HSMA_CANCEL) - cdt_agent_record_hash_add(cdt, hai->hai_cookie, - llh->lgh_hdr->llh_cat_idx, - hdr->lrh_index); - break; + hai = hai_first(request->hal); + for (i = 0; i < request->hal->hal_count; i++) + hai = hai_next(hai); } - case ARS_STARTED: { - struct hsm_progress_kernel pgs; - struct cdt_agent_req *car; - time64_t now = ktime_get_real_seconds(); - time64_t last; - if (!hsd->housekeeping) - break; + memcpy(hai, &larr->arr_hai, larr->arr_hai.hai_len); + hai->hai_cookie = larr->arr_hai.hai_cookie; + hai->hai_gid = larr->arr_hai.hai_gid; - /* we search for a running request - * error may happen if coordinator crashes or stopped - * with running request - */ - car = mdt_cdt_find_request(cdt, larr->arr_hai.hai_cookie); - if (car == NULL) { - last = larr->arr_req_change; - } else { - last = car->car_req_update; - mdt_cdt_put_request(car); - } + request->hal_used_sz += cfs_size_round(hai->hai_len); + request->hal->hal_count++; - /* test if request too long, if yes cancel it - * the same way the copy tool acknowledge a cancel request */ - if (now <= last + cdt->cdt_active_req_timeout) - RETURN(0); + if (hai->hai_action != HSMA_CANCEL) + cdt_agent_record_hash_add(cdt, hai->hai_cookie, + llh->lgh_hdr->llh_cat_idx, + larr->arr_hdr.lrh_index); - dump_llog_agent_req_rec("request timed out, start cleaning", - larr); - /* a too old cancel request just needs to be removed - * this can happen, if copy tool does not support - * cancel for other requests, we have to remove the - * running request and notify the copytool */ - pgs.hpk_fid = larr->arr_hai.hai_fid; - pgs.hpk_cookie = larr->arr_hai.hai_cookie; - pgs.hpk_extent = larr->arr_hai.hai_extent; - pgs.hpk_flags = HP_FLAG_COMPLETED; - pgs.hpk_errval = ENOSYS; - pgs.hpk_data_version = 0; - - /* update request state, but do not record in llog, to - * avoid deadlock on cdt_llog_lock */ - rc = mdt_hsm_update_request_state(hsd->mti, &pgs, 0); - if (rc) - CERROR("%s: cannot cleanup timed out request: " - DFID" for cookie %#llx action=%s\n", - mdt_obd_name(mdt), - PFID(&pgs.hpk_fid), pgs.hpk_cookie, - hsm_copytool_action2name( - larr->arr_hai.hai_action)); - - if (rc == -ENOENT) { - /* The request no longer exists, forget - * about it, and do not send a cancel request - * to the client, for which an error will be - * sent back, leading to an endless cycle of - * cancellation. */ - cdt_agent_record_hash_del(cdt, - larr->arr_hai.hai_cookie); - RETURN(LLOG_DEL_RECORD); - } + RETURN(0); +} - /* XXX A cancel request cannot be cancelled. */ - if (larr->arr_hai.hai_action == HSMA_CANCEL) - RETURN(0); +static int mdt_cdt_started_cb(const struct lu_env *env, + struct mdt_device *mdt, + struct llog_handle *llh, + struct llog_agent_req_rec *larr, + struct hsm_scan_data *hsd) +{ + struct coordinator *cdt = &mdt->mdt_coordinator; + struct hsm_progress_kernel pgs; + struct cdt_agent_req *car; + time64_t now = ktime_get_real_seconds(); + time64_t last; + int rc; - larr->arr_status = ARS_CANCELED; - larr->arr_req_change = now; - rc = llog_write(hsd->mti->mti_env, llh, hdr, hdr->lrh_index); - if (rc < 0) - CERROR("%s: cannot update agent log: rc = %d\n", - mdt_obd_name(mdt), rc); - break; + if (!hsd->housekeeping) + RETURN(0); + + /* we search for a running request + * error may happen if coordinator crashes or stopped + * with running request + */ + car = mdt_cdt_find_request(cdt, larr->arr_hai.hai_cookie); + if (car == NULL) { + last = larr->arr_req_change; + } else { + last = car->car_req_update; + mdt_cdt_put_request(car); + } + + /* test if request too long, if yes cancel it + * the same way the copy tool acknowledge a cancel request */ + if (now <= last + cdt->cdt_active_req_timeout) + RETURN(0); + + dump_llog_agent_req_rec("request timed out, start cleaning", larr); + /* a too old cancel request just needs to be removed + * this can happen, if copy tool does not support + * cancel for other requests, we have to remove the + * running request and notify the copytool */ + pgs.hpk_fid = larr->arr_hai.hai_fid; + pgs.hpk_cookie = larr->arr_hai.hai_cookie; + pgs.hpk_extent = larr->arr_hai.hai_extent; + pgs.hpk_flags = HP_FLAG_COMPLETED; + pgs.hpk_errval = ENOSYS; + pgs.hpk_data_version = 0; + + /* update request state, but do not record in llog, to + * avoid deadlock on cdt_llog_lock */ + rc = mdt_hsm_update_request_state(hsd->mti, &pgs, 0); + if (rc) + CERROR("%s: cannot cleanup timed out request: " + DFID" for cookie %#llx action=%s\n", + mdt_obd_name(mdt), + PFID(&pgs.hpk_fid), pgs.hpk_cookie, + hsm_copytool_action2name(larr->arr_hai.hai_action)); + + if (rc == -ENOENT) { + /* The request no longer exists, forget + * about it, and do not send a cancel request + * to the client, for which an error will be + * sent back, leading to an endless cycle of + * cancellation. */ + cdt_agent_record_hash_del(cdt, larr->arr_hai.hai_cookie); + RETURN(LLOG_DEL_RECORD); } - case ARS_FAILED: - case ARS_CANCELED: - case ARS_SUCCEED: + + /* XXX A cancel request cannot be cancelled. */ + if (larr->arr_hai.hai_action == HSMA_CANCEL) + RETURN(0); + + larr->arr_status = ARS_CANCELED; + larr->arr_req_change = now; + rc = llog_write(hsd->mti->mti_env, llh, &larr->arr_hdr, + larr->arr_hdr.lrh_index); + if (rc < 0) + CERROR("%s: cannot update agent log: rc = %d\n", + mdt_obd_name(mdt), rc); + + RETURN(0); +} + +/** + * llog_cat_process() callback, used to: + * - find waiting request and start action + * - purge canceled and done requests + * \param env [IN] environment + * \param llh [IN] llog handle + * \param hdr [IN] llog record + * \param data [IN/OUT] cb data = struct hsm_scan_data + * \retval 0 success + * \retval -ve failure + */ +static int mdt_coordinator_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *hdr, + void *data) +{ + struct llog_agent_req_rec *larr = (struct llog_agent_req_rec *)hdr; + struct hsm_scan_data *hsd = data; + struct mdt_device *mdt = hsd->mti->mti_mdt; + struct coordinator *cdt = &mdt->mdt_coordinator; + ENTRY; + + larr = (struct llog_agent_req_rec *)hdr; + dump_llog_agent_req_rec("mdt_coordinator_cb(): ", larr); + switch (larr->arr_status) { + case ARS_WAITING: + RETURN(mdt_cdt_waiting_cb(env, mdt, llh, larr, hsd)); + case ARS_STARTED: + RETURN(mdt_cdt_started_cb(env, mdt, llh, larr, hsd)); + default: if (!hsd->housekeeping) - break; + RETURN(0); if ((larr->arr_req_change + cdt->cdt_grace_delay) < ktime_get_real_seconds()) { @@ -381,9 +386,9 @@ static int mdt_coordinator_cb(const struct lu_env *env, larr->arr_hai.hai_cookie); RETURN(LLOG_DEL_RECORD); } - break; + + RETURN(0); } - RETURN(0); } /** -- 1.8.3.1