+/*
+ * Device failover thread.
+ *
+ * Loops until kiblnd_data.kib_shutdown is set.  Each pass scans
+ * kiblnd_data.kib_failed_devs for a device whose ibd_next_failover time has
+ * been reached and, if one is found, runs kiblnd_dev_failover() on it.  If
+ * nothing is due, the thread sleeps on kiblnd_data.kib_failover_waitq for
+ * cfs_time_seconds(10) when the failed-device list is empty, or for
+ * cfs_time_seconds(1) otherwise.  After a long sleep that ran to completion,
+ * every device on kiblnd_data.kib_devs that passes kiblnd_dev_can_failover()
+ * is queued on the failed-device list so that it gets checked.
+ *
+ * kiblnd_data.kib_global_lock is taken for write with
+ * cfs_write_lock_irqsave() before the loop and is dropped only around the
+ * kiblnd_dev_failover() call and around the sleep.
+ *
+ * arg is not used.  The function always returns 0, after calling
+ * kiblnd_thread_fini().
+ */
+int
+kiblnd_failover_thread(void *arg)
+{
+ cfs_rwlock_t *glock = &kiblnd_data.kib_global_lock;
+ kib_dev_t *dev;
+ cfs_waitlink_t wait;
+ unsigned long flags;
+ int rc;
+ /* the kib_dev_failover tunable is required to be non-zero here */
+ LASSERT (*kiblnd_tunables.kib_dev_failover != 0);
+
+ cfs_daemonize ("kiblnd_failover");
+ cfs_block_allsigs ();
+ /* Enter the loop holding the global lock for write; every path that
+ * reaches the loop condition again (each 'continue' and the end of
+ * the body) has the lock held. */
+ cfs_waitlink_init(&wait);
+ cfs_write_lock_irqsave(glock, flags);
+
+ while (!kiblnd_data.kib_shutdown) {
+ int do_failover = 0;
+ int long_sleep;
+ /* Pick the first queued device that is due, i.e. the current
+ * time is not before its ibd_next_failover.  'dev' is only
+ * used below when do_failover has been set. */
+ cfs_list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
+ ibd_fail_list) {
+ if (cfs_time_before(cfs_time_current(),
+ dev->ibd_next_failover))
+ continue;
+ do_failover = 1;
+ break;
+ }
+
+ if (do_failover) {
+ cfs_list_del_init(&dev->ibd_fail_list);
+ dev->ibd_failover = 1;
+ cfs_write_unlock_irqrestore(glock, flags);
+ /* The device has been dequeued and ibd_failover set above;
+ * the failover itself runs without the global lock. */
+ rc = kiblnd_dev_failover(dev);
+
+ cfs_write_lock_irqsave(glock, flags);
+ /* ibd_failover must still be set; clear it now that the
+ * lock is held again. */
+ LASSERT (dev->ibd_failover);
+ dev->ibd_failover = 0;
+ if (rc >= 0) { /* device is OK or failover succeeded: not re-queued */
+ dev->ibd_next_failover = cfs_time_shift(3);
+ continue;
+ }
+
+ /* Failover failed (rc < 0): retry later.  The delay passed to
+ * cfs_time_shift() is ibd_failed_failover capped at 10
+ * (presumably seconds -- confirm against cfs_time_shift()).
+ * The device is re-queued only if kiblnd_dev_can_failover()
+ * accepts it. */
+ dev->ibd_next_failover =
+ cfs_time_shift(min(dev->ibd_failed_failover, 10));
+ if (kiblnd_dev_can_failover(dev)) {
+ cfs_list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ }
+
+ continue;
+ }
+
+ /* no device is due: take the long sleep if nothing is queued at all */
+ long_sleep = cfs_list_empty(&kiblnd_data.kib_failed_devs);
+ /* The task state is set and the thread joins the waitq while the
+ * lock is still held, presumably so that a wake-up issued under
+ * the lock cannot be missed -- confirm against the wakers. */
+ cfs_set_current_state(CFS_TASK_INTERRUPTIBLE);
+ cfs_waitq_add(&kiblnd_data.kib_failover_waitq, &wait);
+ cfs_write_unlock_irqrestore(glock, flags);
+
+ rc = schedule_timeout(long_sleep ? cfs_time_seconds(10) :
+ cfs_time_seconds(1));
+ cfs_set_current_state(CFS_TASK_RUNNING);
+ cfs_waitq_del(&kiblnd_data.kib_failover_waitq, &wait);
+ cfs_write_lock_irqsave(glock, flags);
+ /* schedule_timeout() returns the time remaining, so rc != 0 means
+ * the thread was woken before the timeout expired.  Only a long
+ * sleep that ran to completion goes on to the routine check. */
+ if (!long_sleep || rc != 0)
+ continue;
+
+ /* The long sleep expired: routinely check all active devices.
+ * This is needed because, if there is no active connection on a
+ * device and no SEND from the local side, we could otherwise keep
+ * listening on the wrong HCA forever after a bonding failover. */
+ cfs_list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
+ if (kiblnd_dev_can_failover(dev)) {
+ cfs_list_add_tail(&dev->ibd_fail_list,
+ &kiblnd_data.kib_failed_devs);
+ }
+ }
+ }
+ /* kib_shutdown was observed: release the lock before finishing */
+ cfs_write_unlock_irqrestore(glock, flags);
+
+ kiblnd_thread_fini();
+ return 0;
+}