4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies
26 * Copyright (c) 2017, Intel Corporation.
28 * Author: Henri Doreau <henri.doreau@cea.fr>
31 #define DEBUG_SUBSYSTEM S_MDC
33 #include <linux/init.h>
34 #include <linux/kthread.h>
35 #include <linux/poll.h>
36 #include <linux/miscdevice.h>
38 #include <lustre_log.h>
39 #include <uapi/linux/lustre/lustre_ioctl.h>
41 #include "mdc_internal.h"
45 * -- Changelog delivery through character device --
/*
 * Registry of changelog character devices. The lock below is taken by
 * chlg_obd_get(), mdc_changelog_cdev_init() and mdc_changelog_cdev_finish();
 * both chlg_registered_dev_find_by_*() helpers assert that it is held.
 */
49 * Mutex to protect chlg_registered_devices below
51 static DEFINE_MUTEX(chlg_registered_dev_lock);
54 * Global linked list of all registered devices (one per MDT).
56 static LIST_HEAD(chlg_registered_devices);
/*
 * One registered changelog character device. mdc_changelog_cdev_init()
 * allocates an entry, registers it as a misc device and links it on
 * chlg_registered_devices; an OBD whose computed device name matches an
 * existing entry is attached to that entry's ced_obds list instead.
 * NOTE(review): the declarations of ced_name and ced_refs, which the rest of
 * this file uses, are not visible in this listing.
 */
59 struct chlg_registered_dev {
60 /* Device name of the form "changelog-{MDTNAME}" */
62 /* Misc device descriptor */
63 struct miscdevice ced_misc;
64 /* OBDs referencing this device (multiple mount points) */
65 struct list_head ced_obds;
66 /* Reference counter for proper deregistration */
68 /* Link within the global chlg_registered_devices */
69 struct list_head ced_link;
/*
 * Per-open-file reader state. chlg_open() initializes it and stores it in
 * file->private_data; the other file operations read it back from there.
 * The producer kthread (chlg_load) queues records on crs_rec_queue through
 * chlg_read_cat_process_cb() and chlg_read() drains them.
 * NOTE(review): crs_err, crs_eof, crs_rec_count and crs_poll are used
 * elsewhere in this file but their declarations are not visible in this
 * listing.
 */
72 struct chlg_reader_state {
73 /* Shortcut to the corresponding OBD device */
74 struct obd_device *crs_obd;
75 /* the corresponding chlg_registered_dev */
76 struct chlg_registered_dev *crs_ced;
77 /* Producer thread (if any) */
78 struct task_struct *crs_prod_task;
79 /* An error occurred that prevents reading further */
81 /* EOF, no more records available */
83 /* Desired start position (record index); lower cr_index values are skipped */
84 __u64 crs_start_offset;
85 /* Wait queue for the catalog processing thread */
86 wait_queue_head_t crs_waitq_prod;
87 /* Wait queue for the record copy threads */
88 wait_queue_head_t crs_waitq_cons;
89 /* Mutex protecting crs_rec_count and crs_rec_queue */
90 struct mutex crs_lock;
91 /* Number of items in the list */
93 /* List of prefetched chlg_rec_entry items, linked through enq_linkage */
94 struct list_head crs_rec_queue;
/* Catalog index of the last record handed to chlg_read_cat_process_cb() */
95 unsigned int crs_last_catidx;
/* Record index (lrh_index) of the last record handed to that callback */
96 unsigned int crs_last_idx;
/*
 * Queue element holding a private copy of one changelog record.
 * chlg_read_cat_process_cb() allocates it as sizeof(*enq) + len bytes and
 * enq_record_delete() frees it using the stored enq_length.
 * NOTE(review): the enq_length declaration is not visible in this listing.
 */
100 struct chlg_rec_entry {
101 /* Link within the chlg_reader_state::crs_rec_queue list */
102 struct list_head enq_linkage;
103 /* Data (enq_record) field length */
105 /* Copy of a changelog record (see struct llog_changelog_rec) */
106 struct changelog_rec enq_record[];
/*
 * The producer callback waits until crs_rec_count is below this value (or
 * the thread is asked to stop) before queuing another record.
 * NOTE(review): the enclosing declaration is not visible in this listing.
 */
110 /* Number of records to prefetch locally. */
111 CDEV_CHLG_MAX_PREFETCH = 1024,
115 * Deregister a changelog character device whose refcount has reached zero.
/*
 * Release callback handed to kref_put() on chlg_registered_dev::ced_refs by
 * chlg_open() (error path), chlg_release() and mdc_changelog_cdev_finish().
 *
 * @param[in] kref  Reference counter embedded in the device entry.
 */
117 static void chlg_dev_clear(struct kref *kref)
/* Recover the enclosing device entry from its embedded counter */
119 struct chlg_registered_dev *entry = container_of(kref,
120 struct chlg_registered_dev,
/*
 * Unlink the entry from chlg_registered_devices, then remove the misc device.
 * NOTE(review): list_del() modifies the global list, yet no acquisition of
 * chlg_registered_dev_lock is visible around the kref_put() calls in
 * chlg_open() and chlg_release() -- confirm how those paths are serialized.
 */
124 list_del(&entry->ced_link);
125 misc_deregister(&entry->ced_misc);
/*
 * Pick an OBD attached to @dev and take a reference on it.
 *
 * Under chlg_registered_dev_lock, selects the first entry of dev->ced_obds
 * and pins it with class_incref(obd, "changelog", dev). chlg_obd_put() is
 * the matching release.
 *
 * NOTE(review): the statement executed when ced_obds is empty is not visible
 * in this listing. No mutex_unlock() appears before it, so if that statement
 * returns directly the mutex stays locked -- verify, and unlock on that path.
 */
130 static inline struct obd_device* chlg_obd_get(struct chlg_registered_dev *dev)
132 struct obd_device *obd;
134 mutex_lock(&chlg_registered_dev_lock);
135 if (list_empty(&dev->ced_obds))
138 obd = list_first_entry(&dev->ced_obds, struct obd_device,
139 u.cli.cl_chg_dev_linkage);
140 class_incref(obd, "changelog", dev);
141 mutex_unlock(&chlg_registered_dev_lock);
/*
 * Drop a reference taken by chlg_obd_get(); uses the same "changelog" tag
 * and @dev argument as the class_incref() call there.
 */
145 static inline void chlg_obd_put(struct chlg_registered_dev *dev,
146 struct obd_device *obd)
148 class_decref(obd, "changelog", dev);
152 * ChangeLog catalog processing callback invoked on each record.
153 * If the current record is eligible for userland delivery, push
154 * it into the crs_rec_queue where the consumer code will fetch it.
156 * @param[in] env (unused)
157 * @param[in] llh Client-side handle used to identify the llog
158 * @param[in] hdr Header of the current llog record
159 * @param[in,out] data chlg_reader_state passed from caller
161 * @return 0 or LLOG_PROC_* control code on success, negated error on failure.
/*
 * Producer side of the record queue: invoked by llog_cat_process() (see
 * chlg_load()) with the reader state as @data. Copies each accepted record
 * into a freshly allocated chlg_rec_entry, appends it to crs_rec_queue and
 * wakes the readers.
 * NOTE(review): several lines of this function (local declarations, the
 * statements under some conditionals, the final return) are not visible in
 * this listing.
 */
163 static int chlg_read_cat_process_cb(const struct lu_env *env,
164 struct llog_handle *llh,
165 struct llog_rec_hdr *hdr, void *data)
167 struct llog_changelog_rec *rec;
168 struct chlg_reader_state *crs = data;
169 struct chlg_rec_entry *enq;
174 LASSERT(crs != NULL);
175 LASSERT(hdr != NULL);
/* The llog header is embedded in the changelog record as cr_hdr */
177 rec = container_of(hdr, struct llog_changelog_rec, cr_hdr);
/* Remember the position; chlg_load() passes it back to llog_cat_process() */
179 crs->crs_last_catidx = llh->lgh_hdr->llh_cat_idx;
180 crs->crs_last_idx = hdr->lrh_index;
182 if (rec->cr_hdr.lrh_type != CHANGELOG_REC) {
184 CERROR("%s: not a changelog rec %x/%d in llog "DFID" rc = %d\n",
185 crs->crs_obd->obd_name, rec->cr_hdr.lrh_type,
187 PFID(lu_object_fid(&llh->lgh_obj->do_lu)), rc);
191 /* Skip undesired records */
192 if (rec->cr.cr_index < crs->crs_start_offset)
195 CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID" %.*s\n",
196 rec->cr.cr_index, rec->cr.cr_type,
197 changelog_type2str(rec->cr.cr_type), rec->cr.cr_time,
198 rec->cr.cr_flags & CLF_FLAGMASK,
199 PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid),
200 rec->cr.cr_namelen, changelog_rec_name(&rec->cr));
/*
 * Throttle: sleep until the queue has room or the thread must stop.
 * NOTE(review): the wait result is not checked and crs_rec_count is read
 * here without crs_lock; an interrupted wait would fall through and queue
 * the record regardless of the prefetch limit -- confirm this is intended.
 */
202 wait_event_interruptible(crs->crs_waitq_prod,
203 crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH ||
204 kthread_should_stop());
206 if (kthread_should_stop())
207 RETURN(LLOG_PROC_BREAK);
/* Size of the private copy: record size plus the name length */
209 len = changelog_rec_size(&rec->cr) + rec->cr.cr_namelen;
210 OBD_ALLOC(enq, sizeof(*enq) + len);
214 INIT_LIST_HEAD(&enq->enq_linkage);
/* Stored so that enq_record_delete() can free the right size */
215 enq->enq_length = len;
216 memcpy(enq->enq_record, &rec->cr, len);
/* Publish the record under the queue lock, then notify the readers */
218 mutex_lock(&crs->crs_lock);
219 list_add_tail(&enq->enq_linkage, &crs->crs_rec_queue);
220 crs->crs_rec_count++;
221 mutex_unlock(&crs->crs_lock);
223 wake_up_all(&crs->crs_waitq_cons);
229 * Remove record from the list it is attached to and free it.
/*
 * Unlink @rec from whatever list it is on and free it. The freed size,
 * sizeof(*rec) + rec->enq_length, mirrors the allocation made in
 * chlg_read_cat_process_cb(). This function takes no lock itself.
 */
231 static void enq_record_delete(struct chlg_rec_entry *rec)
233 list_del(&rec->enq_linkage);
234 OBD_FREE(rec, sizeof(*rec) + rec->enq_length);
238 * Record prefetch thread entry point. Opens the changelog catalog and starts
241 * @param[in,out] args chlg_reader_state passed from caller.
242 * @return 0 on success, negated error code on failure.
/*
 * Body of the producer kthread started by chlg_open(). Pins an OBD, opens
 * and initializes the changelog catalog, and walks it with
 * chlg_read_cat_process_cb() as the per-record callback.
 * NOTE(review): many lines of this function (labels, error checks, argument
 * continuations, the return) are not visible in this listing.
 */
244 static int chlg_load(void *args)
246 struct chlg_reader_state *crs = args;
247 struct chlg_registered_dev *ced = crs->crs_ced;
248 struct obd_device *obd = NULL;
249 struct llog_ctxt *ctx = NULL;
250 struct llog_handle *llh = NULL;
/*
 * crs_last_catidx is an unsigned int, so -1 is stored as UINT_MAX.
 * Presumably a "nothing processed yet" marker for llog_cat_process() --
 * TODO confirm.
 */
254 crs->crs_last_catidx = -1;
255 crs->crs_last_idx = 0;
/* Pin one of the OBDs attached to this device for the duration of the run */
258 obd = chlg_obd_get(ced);
264 ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
266 GOTO(err_out, rc = -ENOENT);
268 rc = llog_open(NULL, ctx, &llh, NULL, CHANGELOG_CATALOG,
271 CERROR("%s: fail to open changelog catalog: rc = %d\n",
277 rc = llog_init_handle(NULL, llh,
280 LLOG_F_EXT_EXTRA_FLAGS |
281 LLOG_F_EXT_X_UIDGID |
287 CERROR("%s: fail to init llog handle: rc = %d\n",
/* Walk the catalog from the position last recorded by the callback */
292 rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs,
293 crs->crs_last_catidx, crs->crs_last_idx);
295 CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc);
/*
 * Follow mode (crs_poll is set through the OBD_IOC_CHLG_POLL ioctl): close
 * the catalog, drop the OBD reference and sleep for HZ jiffies. What follows
 * the sleep is not visible in this listing.
 * NOTE(review): class_decref() below passes crs as its third argument, while
 * the reference was taken in chlg_obd_get() with the device entry (ced) and
 * the final release goes through chlg_obd_put(ced, obd). Confirm whether the
 * mismatch matters to reference tracking.
 */
298 if (!kthread_should_stop() && crs->crs_poll) {
299 llog_cat_close(NULL, llh);
301 class_decref(obd, "changelog", crs);
302 schedule_timeout_interruptible(HZ);
/* Let blocked readers re-evaluate their wait condition */
312 wake_up_all(&crs->crs_waitq_cons);
315 llog_cat_close(NULL, llh);
321 chlg_obd_put(ced, obd);
/* Stay alive until kthread_stop() is called (chlg_release() does so) */
322 wait_event_interruptible(crs->crs_waitq_prod, kthread_should_stop());
328 * Read handler, dequeues records from the chlg_reader_state if any.
329 * No partial records are copied to userland so this function can return less
330 * data than requested (short read).
332 * @param[in] file File pointer to the character device.
333 * @param[out] buff Userland buffer where to copy the records.
334 * @param[in] count Userland buffer size.
335 * @param[out] ppos File position, updated with the index number of the next
337 * @return number of copied bytes on success, negated error code on failure.
/*
 * Consumer side of the record queue. Copies whole queued records to @buff
 * while they fit in @count, moves each copied record to a local "consumed"
 * list, advances crs_start_offset past it, and finally stores
 * crs_start_offset in *ppos.
 * NOTE(review): several lines (the rest of the parameter list, local
 * declarations including the "consumed" list, the statements under some
 * conditionals, the return) are not visible in this listing. The O_NONBLOCK
 * test and the wait condition read crs_rec_count without crs_lock.
 */
339 static ssize_t chlg_read(struct file *file, char __user *buff, size_t count,
342 struct chlg_reader_state *crs = file->private_data;
343 struct chlg_rec_entry *rec;
344 struct chlg_rec_entry *tmp;
345 size_t written_total = 0;
350 if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0) {
351 if (crs->crs_err < 0)
352 RETURN(crs->crs_err);
353 else if (crs->crs_eof)
359 rc = wait_event_interruptible(crs->crs_waitq_cons,
360 crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err);
/* Dequeue under the queue lock; copy_to_user() runs with the mutex held */
362 mutex_lock(&crs->crs_lock);
363 list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) {
364 if (written_total + rec->enq_length > count)
367 if (copy_to_user(buff, rec->enq_record, rec->enq_length)) {
372 buff += rec->enq_length;
373 written_total += rec->enq_length;
375 crs->crs_rec_count--;
376 list_move_tail(&rec->enq_linkage, &consumed);
378 crs->crs_start_offset = rec->enq_record->cr_index + 1;
380 mutex_unlock(&crs->crs_lock);
382 if (written_total > 0) {
/* Room was made in the queue: let the producer continue */
384 wake_up_all(&crs->crs_waitq_prod);
385 } else if (rc == 0) {
/* Free the copied records outside the queue lock */
389 list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage)
390 enq_record_delete(rec);
392 *ppos = crs->crs_start_offset;
398 * Jump to a given record index. Helper for chlg_llseek().
400 * @param[in,out] crs Internal reader state.
401 * @param[in] offset Desired offset (index record).
402 * @return 0 on success, negated error code on failure.
/*
 * Raise the reader's start offset to @offset and drop queued records that
 * fall below it, then wake the producer. An @offset lower than the current
 * crs_start_offset takes the early-unlock branch.
 * NOTE(review): the statements completing that branch, the one controlled by
 * the cr_index comparison and the final return are not visible in this
 * listing.
 */
404 static int chlg_set_start_offset(struct chlg_reader_state *crs, __u64 offset)
406 struct chlg_rec_entry *rec;
407 struct chlg_rec_entry *tmp;
409 mutex_lock(&crs->crs_lock);
410 if (offset < crs->crs_start_offset) {
411 mutex_unlock(&crs->crs_lock);
415 crs->crs_start_offset = offset;
/* _safe iteration: entries are unlinked and freed while walking */
416 list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) {
417 struct changelog_rec *cr = rec->enq_record;
419 if (cr->cr_index >= crs->crs_start_offset)
422 crs->crs_rec_count--;
423 enq_record_delete(rec);
426 mutex_unlock(&crs->crs_lock);
/* Queue space may have been freed: let the producer continue */
427 wake_up_all(&crs->crs_waitq_prod);
432 * Move read pointer to a certain record index, encoded as an offset.
434 * @param[in,out] file File pointer to the changelog character device
435 * @param[in] off Offset to skip, actually a record index, not byte count
436 * @param[in] whence Relative/Absolute interpretation of the offset
437 * @return the resulting position on success or negated error code on failure.
/*
 * Seek handler. The visible code computes a position relative to
 * file->f_pos, checks for positions lower than the current one, and
 * forwards the target to chlg_set_start_offset().
 * NOTE(review): the handling of @whence, the statement taken on a backward
 * seek and the return are not visible in this listing.
 */
439 static loff_t chlg_llseek(struct file *file, loff_t off, int whence)
441 struct chlg_reader_state *crs = file->private_data;
450 pos = file->f_pos + off;
457 /* We cannot go backward */
458 if (pos < file->f_pos)
461 rc = chlg_set_start_offset(crs, pos);
470 * Clear record range for a given changelog reader.
472 * @param[in] crs Current internal state.
473 * @param[in] reader Changelog reader ID (cl1, cl2...)
475 * @param[in] record Record index up to which to clear
475 * @return 0 on success, negated error code on failure.
/*
 * Send a KEY_CHANGELOG_CLEAR request through obd_set_info_async() on the
 * self export of an OBD obtained from chlg_obd_get(), then release that OBD.
 * NOTE(review): the initializer of cs and the lines between chlg_obd_get()
 * and the request are not visible in this listing, so how @reader and
 * @record are packed, and whether a NULL OBD is handled, cannot be told from
 * here.
 */
477 static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record)
479 struct obd_device *obd = NULL;
480 struct changelog_setinfo cs = {
486 obd = chlg_obd_get(crs->crs_ced);
490 rc = obd_set_info_async(NULL, obd->obd_self_export,
491 strlen(KEY_CHANGELOG_CLEAR),
492 KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL);
494 chlg_obd_put(crs->crs_ced, obd);
498 /** Maximum changelog control command size; also the size of the kernel buffer chlg_write() allocates */
499 #define CHLG_CONTROL_CMD_MAX 64
502 * Handle write() calls on the changelog character device. write() can be used
503 * to request special control operations.
505 * @param[in] file File pointer to the changelog character device
506 * @param[in] buff User supplied data (written data)
507 * @param[in] count Number of written bytes
508 * @param[in] off (unused)
509 * @return number of written bytes on success, negated error code on failure.
/*
 * Control-command entry point. Copies up to CHLG_CONTROL_CMD_MAX bytes from
 * userland into a kernel buffer of that size, parses them, and dispatches to
 * chlg_clear(). Returns @count unless rc is negative.
 * NOTE(review): local declarations, the statements under the size and
 * allocation checks, and the handling of unrecognized commands are not
 * visible in this listing.
 */
511 static ssize_t chlg_write(struct file *file, const char __user *buff,
512 size_t count, loff_t *off)
514 struct chlg_reader_state *crs = file->private_data;
521 if (count > CHLG_CONTROL_CMD_MAX)
524 OBD_ALLOC(kbuf, CHLG_CONTROL_CMD_MAX);
528 if (copy_from_user(kbuf, buff, count))
529 GOTO(out_kbuf, rc = -EFAULT);
/*
 * Force NUL termination before parsing.
 * NOTE(review): a write of exactly CHLG_CONTROL_CMD_MAX bytes passes the
 * size check above, so its last byte is overwritten here.
 */
531 kbuf[CHLG_CONTROL_CMD_MAX - 1] = '\0';
/* Recognized form: "clear:cl<reader id>:<record index>" */
533 if (sscanf(kbuf, "clear:cl%u:%llu", &reader, &record) == 2)
534 rc = chlg_clear(crs, reader, record);
540 OBD_FREE(kbuf, CHLG_CONTROL_CMD_MAX);
/* On success the whole write is reported as consumed */
541 return rc < 0 ? rc : count;
545 * Open handler, initialize internal CRS state and spawn prefetch thread if
547 * @param[in] inode Inode struct for the open character device.
548 * @param[in] file Corresponding file pointer.
549 * @return 0 on success, negated error code on failure.
/*
 * Open handler. On entry file->private_data is read as the struct miscdevice
 * of this node, from which the owning chlg_registered_dev is recovered. A
 * reference is taken on the device, the reader state is initialized, and for
 * files opened for reading a "chlg_load_thread" kthread running chlg_load()
 * is started. On success file->private_data is replaced by the reader state.
 * NOTE(review): the allocation of crs, the assignment of its crs_ced/crs_obd
 * members, the kthread_run() error check and the labels of the error path
 * are not visible in this listing.
 */
551 static int chlg_open(struct inode *inode, struct file *file)
553 struct chlg_reader_state *crs;
554 struct miscdevice *misc = file->private_data;
555 struct chlg_registered_dev *dev;
556 struct task_struct *task;
560 dev = container_of(misc, struct chlg_registered_dev, ced_misc);
/* Keep the device entry alive while this file is open */
566 kref_get(&dev->ced_refs);
/* chlg_read() tests crs_err with "< 0" and returns it, so false acts as 0 */
568 crs->crs_err = false;
569 crs->crs_eof = false;
571 mutex_init(&crs->crs_lock);
572 INIT_LIST_HEAD(&crs->crs_rec_queue);
573 init_waitqueue_head(&crs->crs_waitq_prod);
574 init_waitqueue_head(&crs->crs_waitq_cons);
/* Only readers need the prefetch thread */
576 if (file->f_mode & FMODE_READ) {
577 task = kthread_run(chlg_load, crs, "chlg_load_thread");
580 CERROR("%s: cannot start changelog thread: rc = %d\n",
584 crs->crs_prod_task = task;
587 file->private_data = crs;
/* Drops the device reference taken above */
591 kref_put(&dev->ced_refs, chlg_dev_clear);
597 * Close handler, release resources.
599 * @param[in] inode Inode struct for the open character device.
600 * @param[in] file Corresponding file pointer.
601 * @return 0 on success, negated error code on failure.
/*
 * Close handler. Stops the producer thread if one was started, frees the
 * records still queued, and drops the device reference taken in chlg_open().
 * NOTE(review): the declaration of rc, the release of crs itself and the
 * return are not visible in this listing.
 */
603 static int chlg_release(struct inode *inode, struct file *file)
605 struct chlg_reader_state *crs = file->private_data;
606 struct chlg_rec_entry *rec;
607 struct chlg_rec_entry *tmp;
/* kthread_stop() waits for chlg_load() to exit and yields its result */
610 if (crs->crs_prod_task)
611 rc = kthread_stop(crs->crs_prod_task);
/* crs_lock is not taken for this walk */
613 list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage)
614 enq_record_delete(rec);
616 kref_put(&crs->crs_ced->ced_refs, chlg_dev_clear);
623 * Poll handler, indicates whether the device is readable (new records) and
626 * @param[in] file Device file pointer.
627 * @param[in] wait (opaque)
628 * @return combination of the poll status flags.
/*
 * Poll handler. Registers the caller on crs_waitq_cons and reports
 * POLLIN | POLLRDNORM when at least one record is queued. Both steps run
 * with crs_lock held.
 * NOTE(review): the lines between the record-count test and the unlock, and
 * the return, are not visible in this listing.
 */
630 static unsigned int chlg_poll(struct file *file, poll_table *wait)
632 struct chlg_reader_state *crs = file->private_data;
633 unsigned int mask = 0;
635 mutex_lock(&crs->crs_lock);
636 poll_wait(file, &crs->crs_waitq_cons, wait);
637 if (crs->crs_rec_count > 0)
638 mask |= POLLIN | POLLRDNORM;
643 mutex_unlock(&crs->crs_lock);
/*
 * ioctl handler. For OBD_IOC_CHLG_POLL, stores the truth value of @arg in
 * crs_poll, which chlg_load() tests after each catalog pass.
 * NOTE(review): the switch statement, any other cases and the return are not
 * visible in this listing.
 */
647 static long chlg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
651 struct chlg_reader_state *crs = file->private_data;
653 case OBD_IOC_CHLG_POLL:
/* !! normalizes any non-zero argument to 1 */
654 crs->crs_poll = !!arg;
/*
 * File operations installed on each changelog misc device by
 * mdc_changelog_cdev_init().
 * NOTE(review): only some members are visible in this listing; the entries
 * for the remaining handlers defined in this file are presumably on the
 * missing lines.
 */
664 static const struct file_operations chlg_fops = {
665 .owner = THIS_MODULE,
666 .llseek = chlg_llseek,
670 .release = chlg_release,
672 .unlocked_ioctl = chlg_ioctl,
676 * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600"
677 * and returns a name of the form: "changelog-testfs-MDT0000".
/*
 * Build the character device name for @obd into @name (at most @name_len
 * bytes, as bounded by snprintf()).
 * NOTE(review): the declaration of i and the statement that truncates the
 * string at the located '-' are not visible in this listing.
 */
679 static void get_chlg_name(char *name, size_t name_len, struct obd_device *obd)
683 snprintf(name, name_len, "changelog-%s", obd->obd_name);
685 /* Find the 2nd '-' from the end and truncate on it */
686 for (i = 0; i < 2; i++) {
/* Last '-' still present in the name */
687 char *p = strrchr(name, '-');
696 * Find a changelog character device by name.
697 * All devices registered during MDC setup are listed in a global list with
698 * their names attached.
/*
 * Linear search of chlg_registered_devices for an entry whose ced_name
 * equals @name. The caller must hold chlg_registered_dev_lock (asserted).
 * NOTE(review): the return statements are not visible in this listing.
 */
700 static struct chlg_registered_dev *
701 chlg_registered_dev_find_by_name(const char *name)
703 struct chlg_registered_dev *dit;
705 LASSERT(mutex_is_locked(&chlg_registered_dev_lock));
706 list_for_each_entry(dit, &chlg_registered_devices, ced_link)
707 if (strcmp(name, dit->ced_name) == 0)
713 * Find chlg_registered_dev structure for a given OBD device.
714 * This is bad O(n^2) but for each filesystem:
715 * - N is # of MDTs times # of mount points
716 * - this only runs at shutdown
/*
 * Walk every registered device and, for each, every OBD on its ced_obds
 * list. The caller must hold chlg_registered_dev_lock (asserted).
 * NOTE(review): the comparison against @obd and the return statements are
 * not visible in this listing.
 */
718 static struct chlg_registered_dev *
719 chlg_registered_dev_find_by_obd(const struct obd_device *obd)
721 struct chlg_registered_dev *dit;
722 struct obd_device *oit;
724 LASSERT(mutex_is_locked(&chlg_registered_dev_lock));
725 list_for_each_entry(dit, &chlg_registered_devices, ced_link)
726 list_for_each_entry(oit, &dit->ced_obds,
727 u.cli.cl_chg_dev_linkage)
734 * Changelog character device initialization.
735 * Register a misc character device with a dynamic minor number, under a name
736 * of the form: 'changelog-fsname-MDTxxxx'. Reference this OBD device with it.
738 * @param[in] obd This MDC obd_device.
739 * @return 0 on success, negated error code on failure.
/*
 * Attach @obd to a changelog character device, creating the device if none
 * with the same name is registered yet.
 *
 * A candidate entry is always allocated and named first. Under
 * chlg_registered_dev_lock, the visible code then takes a reference on an
 * existing entry and links @obd on its ced_obds list, or links the candidate
 * on the global list and registers it as a misc device with a dynamic minor.
 * NOTE(review): the allocation check, the test on "exist", the
 * misc_register() error test, the out_unlock label, the release of an unused
 * candidate and the return are not visible in this listing.
 */
741 int mdc_changelog_cdev_init(struct obd_device *obd)
743 struct chlg_registered_dev *exist;
744 struct chlg_registered_dev *entry;
748 OBD_ALLOC_PTR(entry);
752 get_chlg_name(entry->ced_name, sizeof(entry->ced_name), obd);
754 entry->ced_misc.minor = MISC_DYNAMIC_MINOR;
755 entry->ced_misc.name = entry->ced_name;
756 entry->ced_misc.fops = &chlg_fops;
758 kref_init(&entry->ced_refs);
759 INIT_LIST_HEAD(&entry->ced_obds);
760 INIT_LIST_HEAD(&entry->ced_link);
762 mutex_lock(&chlg_registered_dev_lock);
763 exist = chlg_registered_dev_find_by_name(entry->ced_name);
/* Reuse path: one more reference on the existing device for this OBD */
765 kref_get(&exist->ced_refs);
766 list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &exist->ced_obds);
767 GOTO(out_unlock, rc = 0);
/* Creation path: link first, undo both links if registration fails */
770 list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds);
771 list_add_tail(&entry->ced_link, &chlg_registered_devices);
773 /* Register new character device */
774 rc = misc_register(&entry->ced_misc);
776 list_del_init(&obd->u.cli.cl_chg_dev_linkage);
777 list_del(&entry->ced_link);
778 GOTO(out_unlock, rc);
781 entry = NULL; /* prevent it from being freed below */
784 mutex_unlock(&chlg_registered_dev_lock);
791 * Release OBD, decrease reference count of the corresponding changelog device.
/*
 * Detach @obd from its changelog device: under chlg_registered_dev_lock,
 * look the device up, unlink @obd from its ced_obds list and drop one
 * reference (chlg_dev_clear() runs when the count reaches zero).
 * NOTE(review): the lookup result is passed to kref_put() with no visible
 * NULL check. If chlg_registered_dev_find_by_obd() can fail to find @obd
 * (for instance after a failed mdc_changelog_cdev_init()), this would
 * dereference an invalid pointer -- verify. The end of this function is
 * past the end of this listing.
 */
793 void mdc_changelog_cdev_finish(struct obd_device *obd)
795 struct chlg_registered_dev *dev;
798 mutex_lock(&chlg_registered_dev_lock);
799 dev = chlg_registered_dev_find_by_obd(obd);
800 list_del_init(&obd->u.cli.cl_chg_dev_linkage);
801 kref_put(&dev->ced_refs, chlg_dev_clear);
802 mutex_unlock(&chlg_registered_dev_lock);