4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA
24 * Copyright (c) 2012, 2017, Intel Corporation.
27 * lustre/target/tgt_main.c
29 * Lustre Unified Target main initialization code
31 * Author: Mikhail Pershin <mike.pershin@intel.com>
34 #define DEBUG_SUBSYSTEM S_CLASS
37 #include "tgt_internal.h"
38 #include "../ptlrpc/ptlrpc_internal.h"
/* This must be longer than the longest string below */
#define SYNC_STATES_MAXLEN 16
/* Policy names for lut_sync_lock_cancel; indexed by the numeric policy
 * value (0 = never, 1 = blocking, 2 = always) — see the three modes
 * documented above sync_lock_cancel_store(). */
static char *sync_on_cancel_states[] = {"never",
					"blocking",
					"always" };
47 * Show policy for handling dirty data under a lock being cancelled.
49 * \param[in] kobj sysfs kobject
50 * \param[in] attr sysfs attribute
51 * \param[in] buf buffer for data
53 * \retval 0 and buffer filled with data on success
54 * \retval negative value on error
56 ssize_t sync_lock_cancel_show(struct kobject *kobj,
57 struct attribute *attr, char *buf)
59 struct obd_device *obd = container_of(kobj, struct obd_device,
61 struct lu_target *tgt = obd->u.obt.obt_lut;
63 return sprintf(buf, "%s\n",
64 sync_on_cancel_states[tgt->lut_sync_lock_cancel]);
66 EXPORT_SYMBOL(sync_lock_cancel_show);
69 * Change policy for handling dirty data under a lock being cancelled.
71 * This variable defines what action target takes upon lock cancel
72 * There are three possible modes:
73 * 1) never - never do sync upon lock cancel. This can lead to data
74 * inconsistencies if both the OST and client crash while writing a file
75 * that is also concurrently being read by another client. In these cases,
76 * this may allow the file data to "rewind" to an earlier state.
77 * 2) blocking - do sync only if there is blocking lock, e.g. if another
78 * client is trying to access this same object
79 * 3) always - do sync always
81 * \param[in] kobj kobject
82 * \param[in] attr attribute to show
83 * \param[in] buf buffer for data
84 * \param[in] count buffer size
86 * \retval \a count on success
87 * \retval negative value on error
89 ssize_t sync_lock_cancel_store(struct kobject *kobj, struct attribute *attr,
90 const char *buffer, size_t count)
92 struct obd_device *obd = container_of(kobj, struct obd_device,
94 struct lu_target *tgt = obd->u.obt.obt_lut;
98 if (count == 0 || count >= SYNC_STATES_MAXLEN)
101 for (i = 0 ; i < NUM_SYNC_ON_CANCEL_STATES; i++) {
102 if (strcmp(buffer, sync_on_cancel_states[i]) == 0) {
108 /* Legacy numeric codes */
110 int rc = kstrtoint(buffer, 0, &val);
115 if (val < 0 || val > 2)
118 spin_lock(&tgt->lut_flags_lock);
119 tgt->lut_sync_lock_cancel = val;
120 spin_unlock(&tgt->lut_flags_lock);
123 EXPORT_SYMBOL(sync_lock_cancel_store);
124 LUSTRE_RW_ATTR(sync_lock_cancel);
127 * Show maximum number of Filter Modification Data (FMD) maintained.
129 * \param[in] kobj kobject
130 * \param[in] attr attribute to show
131 * \param[in] buf buffer for data
133 * \retval 0 and buffer filled with data on success
134 * \retval negative value on error
136 ssize_t tgt_fmd_count_show(struct kobject *kobj, struct attribute *attr,
139 struct obd_device *obd = container_of(kobj, struct obd_device,
141 struct lu_target *lut = obd->u.obt.obt_lut;
143 return sprintf(buf, "%u\n", lut->lut_fmd_max_num);
147 * Change number of FMDs maintained by target.
149 * This defines how large the list of FMDs can be.
151 * \param[in] kobj kobject
152 * \param[in] attr attribute to show
153 * \param[in] buf buffer for data
154 * \param[in] count buffer size
156 * \retval \a count on success
157 * \retval negative value on error
159 ssize_t tgt_fmd_count_store(struct kobject *kobj, struct attribute *attr,
160 const char *buffer, size_t count)
162 struct obd_device *obd = container_of(kobj, struct obd_device,
164 struct lu_target *lut = obd->u.obt.obt_lut;
167 rc = kstrtoint(buffer, 0, &val);
171 if (val < 1 || val > 65536)
174 lut->lut_fmd_max_num = val;
178 LUSTRE_RW_ATTR(tgt_fmd_count);
181 * Show the maximum age of FMD data in seconds.
183 * \param[in] kobj kobject
184 * \param[in] attr attribute to show
185 * \param[in] buf buffer for data
187 * \retval 0 and buffer filled with data on success
188 * \retval negative value on error
190 ssize_t tgt_fmd_seconds_show(struct kobject *kobj, struct attribute *attr,
193 struct obd_device *obd = container_of(kobj, struct obd_device,
195 struct lu_target *lut = obd->u.obt.obt_lut;
197 return sprintf(buf, "%lld\n", lut->lut_fmd_max_age);
201 * Set the maximum age of FMD data in seconds.
203 * This defines how long FMD data stays in the FMD list.
205 * \param[in] kobj kobject
206 * \param[in] attr attribute to show
207 * \param[in] buf buffer for data
208 * \param[in] count buffer size
210 * \retval \a count on success
211 * \retval negative number on error
213 ssize_t tgt_fmd_seconds_store(struct kobject *kobj, struct attribute *attr,
214 const char *buffer, size_t count)
216 struct obd_device *obd = container_of(kobj, struct obd_device,
218 struct lu_target *lut = obd->u.obt.obt_lut;
222 rc = kstrtoll(buffer, 0, &val);
226 if (val < 1 || val > 65536) /* ~ 18 hour max */
229 lut->lut_fmd_max_age = val;
233 LUSTRE_RW_ATTR(tgt_fmd_seconds);
/* These two aliases are old names and kept for compatibility, they were
 * changed to 'tgt_fmd_count' and 'tgt_fmd_seconds'.
 * This change was made in Lustre 2.13, so these aliases can be removed
 * when back compatibility is not needed with any Lustre version prior 2.13
 */
static struct lustre_attr tgt_fmd_count_compat = __ATTR(client_cache_count,
	0644, tgt_fmd_count_show, tgt_fmd_count_store);
static struct lustre_attr tgt_fmd_seconds_compat = __ATTR(client_cache_seconds,
	0644, tgt_fmd_seconds_show, tgt_fmd_seconds_store);
245 static const struct attribute *tgt_attrs[] = {
246 &lustre_attr_sync_lock_cancel.attr,
247 &lustre_attr_tgt_fmd_count.attr,
248 &lustre_attr_tgt_fmd_seconds.attr,
249 &tgt_fmd_count_compat.attr,
250 &tgt_fmd_seconds_compat.attr,
254 int tgt_tunables_init(struct lu_target *lut)
258 rc = sysfs_create_files(&lut->lut_obd->obd_kset.kobj, tgt_attrs);
260 lut->lut_attrs = tgt_attrs;
263 EXPORT_SYMBOL(tgt_tunables_init);
265 void tgt_tunables_fini(struct lu_target *lut)
267 if (lut->lut_attrs) {
268 sysfs_remove_files(&lut->lut_obd->obd_kset.kobj,
270 lut->lut_attrs = NULL;
273 EXPORT_SYMBOL(tgt_tunables_fini);
276 * Save cross-MDT lock in lut_slc_locks.
278 * Lock R/W count is not saved, but released in unlock (not canceled remotely),
279 * instead only a refcount is taken, so that the remote MDT where the object
280 * resides can detect conflict with this lock there.
283 * \param lock cross-MDT lock to save
284 * \param transno when the transaction with this transno is committed, this lock
287 void tgt_save_slc_lock(struct lu_target *lut, struct ldlm_lock *lock,
290 spin_lock(&lut->lut_slc_locks_guard);
291 lock_res_and_lock(lock);
292 if (ldlm_is_cbpending(lock)) {
293 /* if it was canceld by server, don't save, because remote MDT
294 * will do Sync-on-Cancel. */
297 lock->l_transno = transno;
298 /* if this lock is in the list already, there are two operations
299 * both use this lock, and save it after use, so for the second
300 * one, just put the refcount. */
301 if (list_empty(&lock->l_slc_link))
302 list_add_tail(&lock->l_slc_link, &lut->lut_slc_locks);
306 unlock_res_and_lock(lock);
307 spin_unlock(&lut->lut_slc_locks_guard);
309 EXPORT_SYMBOL(tgt_save_slc_lock);
312 * Discard cross-MDT lock from lut_slc_locks.
314 * This is called upon BAST, just remove lock from lut_slc_locks and put lock
315 * refcount. The BAST will cancel this lock.
318 * \param lock cross-MDT lock to discard
320 void tgt_discard_slc_lock(struct lu_target *lut, struct ldlm_lock *lock)
322 spin_lock(&lut->lut_slc_locks_guard);
323 lock_res_and_lock(lock);
324 /* may race with tgt_cancel_slc_locks() */
325 if (lock->l_transno != 0) {
326 LASSERT(!list_empty(&lock->l_slc_link));
327 LASSERT(ldlm_is_cbpending(lock));
328 list_del_init(&lock->l_slc_link);
332 unlock_res_and_lock(lock);
333 spin_unlock(&lut->lut_slc_locks_guard);
335 EXPORT_SYMBOL(tgt_discard_slc_lock);
338 * Cancel cross-MDT locks upon transaction commit.
340 * Remove cross-MDT locks from lut_slc_locks, cancel them and put lock refcount.
343 * \param transno transaction with this number was committed.
345 void tgt_cancel_slc_locks(struct lu_target *lut, __u64 transno)
347 struct ldlm_lock *lock, *next;
349 struct lustre_handle lockh;
351 spin_lock(&lut->lut_slc_locks_guard);
352 list_for_each_entry_safe(lock, next, &lut->lut_slc_locks,
354 lock_res_and_lock(lock);
355 LASSERT(lock->l_transno != 0);
356 if (lock->l_transno > transno) {
357 unlock_res_and_lock(lock);
360 /* ouch, another operation is using it after it's saved */
361 if (lock->l_readers != 0 || lock->l_writers != 0) {
362 unlock_res_and_lock(lock);
365 /* set CBPENDING so that this lock won't be used again */
366 ldlm_set_cbpending(lock);
368 list_move(&lock->l_slc_link, &list);
369 unlock_res_and_lock(lock);
371 spin_unlock(&lut->lut_slc_locks_guard);
373 list_for_each_entry_safe(lock, next, &list, l_slc_link) {
374 list_del_init(&lock->l_slc_link);
375 ldlm_lock2handle(lock, &lockh);
376 ldlm_cli_cancel(&lockh, LCF_ASYNC);
/**
 * Initialize the common (unified) target state in \a lut for device \a obd
 * backed by \a dt: request handler slice, fail ids, sptlrpc rules, locks,
 * grant/statfs data, last_rcvd and (for MDTs) reply_data objects.
 *
 * NOTE(review): this excerpt elides interior lines (gaps in the original
 * numbering) — the signature tail, local declarations, error branches and
 * closing braces are missing here; the visible statements are unchanged.
 */
int tgt_init(const struct lu_env *env, struct lu_target *lut,
struct obd_device *obd, struct dt_device *dt,
struct tgt_opc_slice *slice, int request_fail_id,
struct dt_object_format dof;
struct tg_grants_data *tgd = &lut->lut_tgd;
struct obd_statfs *osfs;
/* reset all per-target state before any failure path can run */
lut->lut_bottom = dt;
lut->lut_last_rcvd = NULL;
lut->lut_client_bitmap = NULL;
atomic_set(&lut->lut_num_clients, 0);
atomic_set(&lut->lut_client_generation, 0);
lut->lut_reply_data = NULL;
lut->lut_reply_bitmap = NULL;
obd->u.obt.obt_lut = lut;
obd->u.obt.obt_magic = OBT_MAGIC;
/* set request handler slice and parameters */
lut->lut_slice = slice;
lut->lut_reply_fail_id = reply_fail_id;
lut->lut_request_fail_id = request_fail_id;
/* sptlrcp variables init */
rwlock_init(&lut->lut_sptlrpc_lock);
sptlrpc_rule_set_init(&lut->lut_sptlrpc_rset);
spin_lock_init(&lut->lut_flags_lock);
lut->lut_sync_lock_cancel = NEVER_SYNC_ON_CANCEL;
/* Sync-on-Cancel lock list, see tgt_save_slc_lock() above */
spin_lock_init(&lut->lut_slc_locks_guard);
INIT_LIST_HEAD(&lut->lut_slc_locks);
/* last_rcvd initialization is needed by replayable targets only */
if (!obd->obd_replayable)
/* initialize grant and statfs data in target */
dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf);
/* statfs data; age is set in the past so the first access refreshes it */
spin_lock_init(&tgd->tgd_osfs_lock);
tgd->tgd_osfs_age = ktime_get_seconds() - 1000;
tgd->tgd_osfs_unstable = 0;
tgd->tgd_statfs_inflight = 0;
tgd->tgd_osfs_inflight = 0;
/* grant accounting starts from a clean slate */
spin_lock_init(&tgd->tgd_grant_lock);
tgd->tgd_tot_dirty = 0;
tgd->tgd_tot_granted = 0;
tgd->tgd_tot_pending = 0;
tgd->tgd_grant_compat_disable = 0;
/* populate cached statfs data */
osfs = &tgt_th_info(env)->tti_u.osfs;
rc = tgt_statfs_internal(env, lut, osfs, 0, NULL);
CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut),
/* grant code relies on power-of-2 block sizes (tgd_blockbits below) */
if (!is_power_of_2(osfs->os_bsize)) {
CERROR("%s: blocksize (%d) is not a power of 2\n",
tgt_name(lut), osfs->os_bsize);
GOTO(out, rc = -EPROTO);
tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
spin_lock_init(&lut->lut_translock);
spin_lock_init(&lut->lut_client_bitmap_lock);
/* one bit per possible client slot in last_rcvd */
OBD_ALLOC(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3);
if (lut->lut_client_bitmap == NULL)
/* find or create the LAST_RCVD file under the bottom dt device */
memset(&attr, 0, sizeof(attr));
attr.la_valid = LA_MODE;
attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
dof.dof_type = dt_mode_to_dft(S_IFREG);
lu_local_obj_fid(&fid, LAST_RECV_OID);
o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr);
CERROR("%s: cannot open LAST_RCVD: rc = %d\n", tgt_name(lut),
lut->lut_last_rcvd = o;
rc = tgt_server_data_init(env, lut);
/* prepare transactions callbacks */
lut->lut_txn_cb.dtc_txn_start = tgt_txn_start_cb;
lut->lut_txn_cb.dtc_txn_stop = tgt_txn_stop_cb;
lut->lut_txn_cb.dtc_cookie = lut;
lut->lut_txn_cb.dtc_tag = LCT_DT_THREAD | LCT_MD_THREAD;
INIT_LIST_HEAD(&lut->lut_txn_cb.dtc_linkage);
dt_txn_callback_add(lut->lut_bottom, &lut->lut_txn_cb);
lut->lut_bottom->dd_lu_dev.ld_site->ls_tgt = lut;
/* FMD tuning defaults, adjustable via the sysfs attrs above */
lut->lut_fmd_max_num = LUT_FMD_MAX_NUM_DEFAULT;
lut->lut_fmd_max_age = LUT_FMD_MAX_AGE_DEFAULT;
atomic_set(&lut->lut_sync_count, 0);
/* reply_data is supported by MDT targets only for now */
if (strncmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME, 3) != 0)
OBD_ALLOC(lut->lut_reply_bitmap,
LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *));
if (lut->lut_reply_bitmap == NULL)
GOTO(out, rc = -ENOMEM);
/* find or create the REPLY_DATA file, same attrs as LAST_RCVD */
memset(&attr, 0, sizeof(attr));
attr.la_valid = LA_MODE;
attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
dof.dof_type = dt_mode_to_dft(S_IFREG);
lu_local_obj_fid(&fid, REPLY_DATA_OID);
o = dt_find_or_create(env, lut->lut_bottom, &fid, &dof, &attr);
CERROR("%s: cannot open REPLY_DATA: rc = %d\n", tgt_name(lut),
lut->lut_reply_data = o;
rc = tgt_reply_data_init(env, lut);
/* error cleanup: undo everything set up above, in reverse order */
dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb);
obd->u.obt.obt_magic = 0;
obd->u.obt.obt_lut = NULL;
if (lut->lut_last_rcvd != NULL) {
dt_object_put(env, lut->lut_last_rcvd);
lut->lut_last_rcvd = NULL;
if (lut->lut_client_bitmap != NULL)
OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3);
lut->lut_client_bitmap = NULL;
if (lut->lut_reply_data != NULL)
dt_object_put(env, lut->lut_reply_data);
lut->lut_reply_data = NULL;
if (lut->lut_reply_bitmap != NULL) {
for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) {
if (lut->lut_reply_bitmap[i] != NULL)
OBD_FREE_LARGE(lut->lut_reply_bitmap[i],
BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) *
lut->lut_reply_bitmap[i] = NULL;
OBD_FREE(lut->lut_reply_bitmap,
LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *));
lut->lut_reply_bitmap = NULL;
EXPORT_SYMBOL(tgt_init);
564 void tgt_fini(const struct lu_env *env, struct lu_target *lut)
570 if (lut->lut_lsd.lsd_feature_incompat & OBD_INCOMPAT_MULTI_RPCS &&
571 atomic_read(&lut->lut_num_clients) == 0) {
572 /* Clear MULTI RPCS incompatibility flag that prevents previous
573 * Lustre versions to mount a target with reply_data file */
574 lut->lut_lsd.lsd_feature_incompat &= ~OBD_INCOMPAT_MULTI_RPCS;
575 rc = tgt_server_data_update(env, lut, 1);
577 CERROR("%s: unable to clear MULTI RPCS "
578 "incompatibility flag\n",
579 lut->lut_obd->obd_name);
582 sptlrpc_rule_set_free(&lut->lut_sptlrpc_rset);
584 if (lut->lut_reply_data != NULL)
585 dt_object_put(env, lut->lut_reply_data);
586 lut->lut_reply_data = NULL;
587 if (lut->lut_reply_bitmap != NULL) {
588 for (i = 0; i < LUT_REPLY_SLOTS_MAX_CHUNKS; i++) {
589 if (lut->lut_reply_bitmap[i] != NULL)
590 OBD_FREE_LARGE(lut->lut_reply_bitmap[i],
591 BITS_TO_LONGS(LUT_REPLY_SLOTS_PER_CHUNK) *
593 lut->lut_reply_bitmap[i] = NULL;
595 OBD_FREE(lut->lut_reply_bitmap,
596 LUT_REPLY_SLOTS_MAX_CHUNKS * sizeof(unsigned long *));
598 lut->lut_reply_bitmap = NULL;
599 if (lut->lut_client_bitmap) {
600 OBD_FREE(lut->lut_client_bitmap, LR_MAX_CLIENTS >> 3);
601 lut->lut_client_bitmap = NULL;
603 if (lut->lut_last_rcvd) {
604 dt_txn_callback_del(lut->lut_bottom, &lut->lut_txn_cb);
605 dt_object_put(env, lut->lut_last_rcvd);
606 lut->lut_last_rcvd = NULL;
610 EXPORT_SYMBOL(tgt_fini);
612 static struct kmem_cache *tgt_thread_kmem;
613 static struct kmem_cache *tgt_session_kmem;
614 struct kmem_cache *tgt_fmd_kmem;
616 static struct lu_kmem_descr tgt_caches[] = {
618 .ckd_cache = &tgt_thread_kmem,
619 .ckd_name = "tgt_thread_kmem",
620 .ckd_size = sizeof(struct tgt_thread_info),
623 .ckd_cache = &tgt_session_kmem,
624 .ckd_name = "tgt_session_kmem",
625 .ckd_size = sizeof(struct tgt_session_info)
628 .ckd_cache = &tgt_fmd_kmem,
629 .ckd_name = "tgt_fmd_cache",
630 .ckd_size = sizeof(struct tgt_fmd_data)
638 /* context key constructor/destructor: tg_key_init, tg_key_fini */
639 static void *tgt_key_init(const struct lu_context *ctx,
640 struct lu_context_key *key)
642 struct tgt_thread_info *thread;
644 OBD_SLAB_ALLOC_PTR_GFP(thread, tgt_thread_kmem, GFP_NOFS);
646 return ERR_PTR(-ENOMEM);
651 static void tgt_key_fini(const struct lu_context *ctx,
652 struct lu_context_key *key, void *data)
654 struct tgt_thread_info *info = data;
655 struct thandle_exec_args *args = &info->tti_tea;
658 for (i = 0; i < args->ta_alloc_args; i++) {
659 if (args->ta_args[i] != NULL)
660 OBD_FREE_PTR(args->ta_args[i]);
663 if (args->ta_args != NULL)
664 OBD_FREE(args->ta_args, sizeof(args->ta_args[0]) *
665 args->ta_alloc_args);
666 OBD_SLAB_FREE_PTR(info, tgt_thread_kmem);
669 static void tgt_key_exit(const struct lu_context *ctx,
670 struct lu_context_key *key, void *data)
672 struct tgt_thread_info *tti = data;
674 tti->tti_has_trans = 0;
675 tti->tti_mult_trans = 0;
678 /* context key: tg_thread_key */
679 struct lu_context_key tgt_thread_key = {
680 .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD,
681 .lct_init = tgt_key_init,
682 .lct_fini = tgt_key_fini,
683 .lct_exit = tgt_key_exit,
686 LU_KEY_INIT_GENERIC(tgt);
688 static void *tgt_ses_key_init(const struct lu_context *ctx,
689 struct lu_context_key *key)
691 struct tgt_session_info *session;
693 OBD_SLAB_ALLOC_PTR_GFP(session, tgt_session_kmem, GFP_NOFS);
695 return ERR_PTR(-ENOMEM);
700 static void tgt_ses_key_fini(const struct lu_context *ctx,
701 struct lu_context_key *key, void *data)
703 struct tgt_session_info *session = data;
705 OBD_SLAB_FREE_PTR(session, tgt_session_kmem);
708 /* context key: tgt_session_key */
709 struct lu_context_key tgt_session_key = {
710 .lct_tags = LCT_SERVER_SESSION,
711 .lct_init = tgt_ses_key_init,
712 .lct_fini = tgt_ses_key_fini,
714 EXPORT_SYMBOL(tgt_session_key);
716 LU_KEY_INIT_GENERIC(tgt_ses);
/*
 * this page is allocated statically when the module is initializing.
 * it is used to simulate data corruptions, see ost_checksum_bulk()
 * for details. as the original pages provided by the layers below
 * can remain in the internal cache, we do not want to modify
 * them.
 */
struct page *tgt_page_to_corrupt;

/**
 * Module init: create the target slab caches, allocate the corruption
 * page and register the thread/session context keys.
 *
 * NOTE(review): lines are elided from this excerpt (gaps in the original
 * numbering) — braces, the error/return statements and possibly further
 * init calls before the final return are missing here.
 */
int tgt_mod_init(void)
/* caches must exist before any context key can allocate from them */
result = lu_kmem_init(tgt_caches);
/* NOTE(review): alloc_page() result is not checked here — presumably a
 * NULL page just disables corruption simulation; confirm in callers */
tgt_page_to_corrupt = alloc_page(GFP_KERNEL);
tgt_key_init_generic(&tgt_thread_key, NULL);
lu_context_key_register_many(&tgt_thread_key, NULL);
tgt_ses_key_init_generic(&tgt_session_key, NULL);
lu_context_key_register_many(&tgt_session_key, NULL);
/**
 * Module exit: undo tgt_mod_init() — free the corruption page, degister
 * the context keys and destroy the slab caches.
 *
 * NOTE(review): this excerpt elides lines (gaps in the original numbering);
 * braces and possibly additional teardown calls are missing here.
 */
void tgt_mod_exit(void)
if (tgt_page_to_corrupt != NULL)
put_page(tgt_page_to_corrupt);
lu_context_key_degister(&tgt_thread_key);
lu_context_key_degister(&tgt_session_key);
/* destroy caches last: keys above may free objects into them */
lu_kmem_fini(tgt_caches);