4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2017, Commissariat a l'Energie Atomique et aux Energies
26 * Author: Henri Doreau <henri.doreau@cea.fr>
29 #define DEBUG_SUBSYSTEM S_MDC
31 #include <linux/init.h>
32 #include <linux/kthread.h>
33 #include <linux/poll.h>
34 #include <linux/miscdevice.h>
36 #include <lustre_log.h>
38 #include "mdc_internal.h"
42 * -- Changelog delivery through character device --
/*
 * Acquired in chlg_obd_get(), mdc_changelog_cdev_init() and
 * mdc_changelog_cdev_finish() around their accesses to the global
 * device list.
 */
46 * Mutex to protect chlg_registered_devices below
48 static DEFINE_MUTEX(chlg_registered_dev_lock);
/*
 * Entries are struct chlg_registered_dev chained through their ced_link
 * member: mdc_changelog_cdev_init() appends to this list and
 * chlg_dev_clear() unlinks from it.
 */
51 * Global linked list of all registered devices (one per MDT).
53 static LIST_HEAD(chlg_registered_devices);
/*
 * Bookkeeping for one registered changelog misc device. Instances are
 * allocated in mdc_changelog_cdev_init() and linked into
 * chlg_registered_devices through ced_link.
 *
 * NOTE(review): the member declarations that belong under the
 * "Device name" and "Reference counter" comments are not visible in this
 * listing (ced_name and ced_refs are both used elsewhere in the file), and
 * neither are the braces; confirm against the pristine source.
 */
56 struct chlg_registered_dev {
57 /* Device name of the form "changelog-{MDTNAME}" */
59 /* Misc device descriptor */
60 struct miscdevice ced_misc;
61 /* OBDs referencing this device (multiple mount point) */
62 struct list_head ced_obds;
63 /* Reference counter for proper deregistration */
65 /* Link within the global chlg_registered_devices */
66 struct list_head ced_link;
/*
 * Per-open state shared between the reader (chlg_read(), chlg_poll(),
 * chlg_llseek()) and the prefetch thread started by chlg_open(). It is
 * stored in file->private_data.
 *
 * NOTE(review): the declarations of the error, EOF and closed flags and of
 * the record counter (crs_err, crs_eof, crs_closed and crs_rec_count are
 * used elsewhere in the file) are not visible in this listing; only their
 * comments remain. Confirm their types against the pristine source.
 */
69 struct chlg_reader_state {
70 /* Shortcut to the corresponding OBD device */
71 struct obd_device *crs_obd;
72 /* An error occurred that prevents from reading further */
74 /* EOF, no more records available */
76 /* Userland reader closed connection */
78 /* Desired start position */
79 __u64 crs_start_offset;
80 /* Wait queue for the catalog processing thread */
81 wait_queue_head_t crs_waitq_prod;
82 /* Wait queue for the record copy threads */
83 wait_queue_head_t crs_waitq_cons;
84 /* Mutex protecting crs_rec_count and crs_rec_queue */
85 struct mutex crs_lock;
86 /* Number of item in the list */
88 /* List of prefetched enqueued_record::enq_linkage_items */
89 struct list_head crs_rec_queue;
/*
 * One prefetched changelog record waiting in crs_rec_queue. The record
 * bytes are stored in the trailing flexible array, so an entry is allocated
 * and freed as sizeof(entry) plus the stored length (see the OBD_ALLOC in
 * chlg_read_cat_process_cb() and the OBD_FREE in enq_record_delete()).
 *
 * NOTE(review): the declaration of the length member (enq_length, used
 * elsewhere in the file) is not visible in this listing.
 */
92 struct chlg_rec_entry {
93 /* Link within the chlg_reader_state::crs_rec_queue list */
94 struct list_head enq_linkage;
95 /* Data (enq_record) field length */
97 /* Copy of a changelog record (see struct llog_changelog_rec) */
98 struct changelog_rec enq_record[];
/*
 * Queue depth limit: chlg_read_cat_process_cb() waits until crs_rec_count
 * is below this value (or the reader has closed) before queuing a record.
 *
 * NOTE(review): the enclosing declaration (presumably an anonymous enum) is
 * not visible in this listing.
 */
102 /* Number of records to prefetch locally. */
103 CDEV_CHLG_MAX_PREFETCH = 1024,
/*
 * Visible flow: check the record type, skip records whose index is below
 * crs_start_offset, wait until the prefetch queue has room or the reader
 * has closed, then copy the record into a newly allocated chlg_rec_entry,
 * append it to crs_rec_queue under crs_lock and wake the consumers.
 *
 * NOTE(review): this listing does not show the braces, the declarations of
 * "len" and "rc", or the statements that follow several of the "if" lines
 * below; confirm those against the pristine source.
 */
107 * ChangeLog catalog processing callback invoked on each record.
108 * If the current record is eligible to userland delivery, push
109 * it into the crs_rec_queue where the consumer code will fetch it.
111 * @param[in] env (unused)
112 * @param[in] llh Client-side handle used to identify the llog
113 * @param[in] hdr Header of the current llog record
114 * @param[in,out] data chlg_reader_state passed from caller
116 * @return 0 or LLOG_PROC_* control code on success, negated error on failure.
118 static int chlg_read_cat_process_cb(const struct lu_env *env,
119 struct llog_handle *llh,
120 struct llog_rec_hdr *hdr, void *data)
122 struct llog_changelog_rec *rec;
123 struct chlg_reader_state *crs = data;
124 struct chlg_rec_entry *enq;
125 struct l_wait_info lwi = { 0 };
130 LASSERT(crs != NULL);
131 LASSERT(hdr != NULL);
/* hdr is the cr_hdr member embedded in the changelog llog record. */
133 rec = container_of(hdr, struct llog_changelog_rec, cr_hdr);
/* Anything that is not a CHANGELOG_REC is reported as an error. */
135 if (rec->cr_hdr.lrh_type != CHANGELOG_REC) {
137 CERROR("%s: not a changelog rec %x/%d in llog "DFID" rc = %d\n",
138 crs->crs_obd->obd_name, rec->cr_hdr.lrh_type,
140 PFID(lu_object_fid(&llh->lgh_obj->do_lu)), rc);
144 /* Skip undesired records */
145 if (rec->cr.cr_index < crs->crs_start_offset)
/* NOTE(review): the statement executed for skipped records is not shown. */
148 CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t="DFID" p="DFID" %.*s\n",
149 rec->cr.cr_index, rec->cr.cr_type,
150 changelog_type2str(rec->cr.cr_type), rec->cr.cr_time,
151 rec->cr.cr_flags & CLF_FLAGMASK,
152 PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid),
153 rec->cr.cr_namelen, changelog_rec_name(&rec->cr));
/*
 * Throttle the producer: block while CDEV_CHLG_MAX_PREFETCH records are
 * already queued, unless the reader has closed the device.
 */
155 l_wait_event(crs->crs_waitq_prod,
156 (crs->crs_rec_count < CDEV_CHLG_MAX_PREFETCH ||
157 crs->crs_closed), &lwi);
/*
 * NOTE(review): the guard for this early exit is not shown; presumably a
 * crs_closed test - confirm.
 */
160 RETURN(LLOG_PROC_BREAK);
/* Bytes to copy: changelog_rec_size() of the record plus its cr_namelen. */
162 len = changelog_rec_size(&rec->cr) + rec->cr.cr_namelen;
163 OBD_ALLOC(enq, sizeof(*enq) + len);
/* NOTE(review): the allocation failure check is not shown. */
167 INIT_LIST_HEAD(&enq->enq_linkage);
/* enq_length is what enq_record_delete() later uses to size the free. */
168 enq->enq_length = len;
169 memcpy(enq->enq_record, &rec->cr, len);
/* Queue and counter are only updated with crs_lock held. */
171 mutex_lock(&crs->crs_lock);
172 list_add_tail(&enq->enq_linkage, &crs->crs_rec_queue);
173 crs->crs_rec_count++;
174 mutex_unlock(&crs->crs_lock);
/* A record is now available: wake readers sleeping in chlg_read(). */
176 wake_up_all(&crs->crs_waitq_cons);
/*
 * Unlinks rec from the list it is on and frees it. The size passed to
 * OBD_FREE mirrors the allocation in chlg_read_cat_process_cb():
 * sizeof(*rec) plus the stored payload length.
 *
 * The function takes no lock itself; callers that still share the list
 * (e.g. chlg_set_start_offset()) hold crs_lock around the call.
 */
182 * Remove record from the list it is attached to and free it.
184 static void enq_record_delete(struct chlg_rec_entry *rec)
186 list_del(&rec->enq_linkage);
187 OBD_FREE(rec, sizeof(*rec) + rec->enq_length);
/*
 * Visible part: drains crs_rec_queue, deleting every queued record. The
 * _safe iterator is required because enq_record_delete() unlinks and frees
 * the current entry.
 *
 * NOTE(review): whatever releases the crs structure itself is not visible
 * in this listing; confirm against the pristine source.
 */
191 * Release resources associated to a changelog_reader_state instance.
193 * @param crs CRS instance to release.
195 static void crs_free(struct chlg_reader_state *crs)
197 struct chlg_rec_entry *rec;
198 struct chlg_rec_entry *tmp;
200 list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage)
201 enq_record_delete(rec);
/*
 * Thread body started by chlg_open() through kthread_run(). Visible flow:
 * obtain the LLOG_CHANGELOG_REPL_CTXT context, open and initialise the
 * changelog catalog, feed every record to chlg_read_cat_process_cb(), wake
 * the consumers, close the catalog, then wait until the reader has closed.
 *
 * NOTE(review): the conditions guarding the CERROR/GOTO lines, the
 * continuation lines of the llog_open() and CERROR() calls, the labels
 * (err_out is referenced) and the final cleanup/return are not visible in
 * this listing; confirm against the pristine source.
 */
207 * Record prefetch thread entry point. Opens the changelog catalog and starts
210 * @param[in,out] args chlg_reader_state passed from caller.
211 * @return 0 on success, negated error code on failure.
213 static int chlg_load(void *args)
215 struct chlg_reader_state *crs = args;
216 struct obd_device *obd = crs->crs_obd;
217 struct llog_ctxt *ctx = NULL;
218 struct llog_handle *llh = NULL;
219 struct l_wait_info lwi = { 0 };
223 ctx = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
/* NOTE(review): guard not shown; presumably taken when ctx is NULL. */
225 GOTO(err_out, rc = -ENOENT);
227 rc = llog_open(NULL, ctx, &llh, NULL, CHANGELOG_CATALOG,
230 CERROR("%s: fail to open changelog catalog: rc = %d\n",
/* The handle is set up as a catalog with extended job-id records. */
235 rc = llog_init_handle(NULL, llh, LLOG_F_IS_CAT|LLOG_F_EXT_JOBID, NULL);
237 CERROR("%s: fail to init llog handle: rc = %d\n",
/* Walk the whole catalog; crs is handed to the callback as its data. */
242 rc = llog_cat_process(NULL, llh, chlg_read_cat_process_cb, crs, 0, 0);
244 CERROR("%s: fail to process llog: rc = %d\n", obd->obd_name, rc);
/* Let readers blocked in chlg_read() re-evaluate their wait condition. */
250 wake_up_all(&crs->crs_waitq_cons);
253 llog_cat_close(NULL, llh);
/* Stay alive until chlg_release() sets crs_closed and wakes this queue. */
258 l_wait_event(crs->crs_waitq_prod, crs->crs_closed, &lwi);
/*
 * Visible flow: wait for records (or EOF/error), then, under crs_lock, copy
 * whole records to the user buffer for as long as the next one still fits,
 * moving each delivered entry to a private "consumed" list. After dropping
 * the lock the producer is woken if anything was delivered, the consumed
 * entries are freed, and *ppos is set to the index following the last
 * delivered record.
 *
 * NOTE(review): the braces, the remaining parameter line, the declaration
 * of "consumed", and the statements following several "if" lines are not
 * visible in this listing; confirm against the pristine source.
 */
264 * Read handler, dequeues records from the chlg_reader_state if any.
265 * No partial records are copied to userland so this function can return less
266 * data than required (short read).
268 * @param[in] file File pointer to the character device.
269 * @param[out] buff Userland buffer where to copy the records.
270 * @param[in] count Userland buffer size.
271 * @param[out] ppos File position, updated with the index number of the next
273 * @return number of copied bytes on success, negated error code on failure.
275 static ssize_t chlg_read(struct file *file, char __user *buff, size_t count,
278 struct chlg_reader_state *crs = file->private_data;
279 struct chlg_rec_entry *rec;
280 struct chlg_rec_entry *tmp;
281 struct l_wait_info lwi = { 0 };
282 ssize_t written_total = 0;
/*
 * Non-blocking open with nothing queued. crs_rec_count is read here
 * without crs_lock. NOTE(review): the statement taken in this case is not
 * shown (presumably an -EAGAIN return), and crs_eof/crs_err are not
 * consulted by this test - confirm that is intended.
 */
286 if (file->f_flags & O_NONBLOCK && crs->crs_rec_count == 0)
/* Sleep until there is something to read, or the producer hit EOF/error. */
289 l_wait_event(crs->crs_waitq_cons,
290 crs->crs_rec_count > 0 || crs->crs_eof || crs->crs_err,
293 mutex_lock(&crs->crs_lock);
294 list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) {
/* Records are never split: stop at the first one that does not fit. */
295 if (written_total + rec->enq_length > count)
298 if (copy_to_user(buff, rec->enq_record, rec->enq_length)) {
/* Report -EFAULT only when no record has been delivered yet. */
299 if (written_total == 0)
300 written_total = -EFAULT;
304 buff += rec->enq_length;
305 written_total += rec->enq_length;
307 crs->crs_rec_count--;
/* Defer the free until crs_lock has been released. */
308 list_move_tail(&rec->enq_linkage, &consumed);
/* Next position is the record index right after the one just copied. */
310 crs->crs_start_offset = rec->enq_record->cr_index + 1;
312 mutex_unlock(&crs->crs_lock);
/* Queue space was freed: let the prefetch thread continue. */
314 if (written_total > 0)
315 wake_up_all(&crs->crs_waitq_prod);
317 list_for_each_entry_safe(rec, tmp, &consumed, enq_linkage)
318 enq_record_delete(rec);
320 *ppos = crs->crs_start_offset;
322 RETURN(written_total);
/*
 * Visible flow, all under crs_lock: refuse an offset lower than the current
 * crs_start_offset, otherwise record the new start and walk the prefetch
 * queue, comparing each record index with the new start and deleting
 * queued records (with crs_rec_count decremented accordingly). The producer
 * is woken afterwards.
 *
 * NOTE(review): the value returned on the backward-seek path, the statement
 * under the "cr_index >=" test (break or continue) and the final return are
 * not visible in this listing; confirm against the pristine source.
 */
326 * Jump to a given record index. Helper for chlg_llseek().
328 * @param[in,out] crs Internal reader state.
329 * @param[in] offset Desired offset (index record).
330 * @return 0 on success, negated error code on failure.
332 static int chlg_set_start_offset(struct chlg_reader_state *crs, __u64 offset)
334 struct chlg_rec_entry *rec;
335 struct chlg_rec_entry *tmp;
337 mutex_lock(&crs->crs_lock);
/* Moving backward is rejected; the lock is dropped before leaving. */
338 if (offset < crs->crs_start_offset) {
339 mutex_unlock(&crs->crs_lock);
343 crs->crs_start_offset = offset;
344 list_for_each_entry_safe(rec, tmp, &crs->crs_rec_queue, enq_linkage) {
345 struct changelog_rec *cr = rec->enq_record;
/* Records at or past the new start are the ones the reader still wants. */
347 if (cr->cr_index >= crs->crs_start_offset)
350 crs->crs_rec_count--;
351 enq_record_delete(rec);
354 mutex_unlock(&crs->crs_lock);
/* Dropped records may have made room in the queue. */
355 wake_up_all(&crs->crs_waitq_prod);
/*
 * Visible flow: compute a target position, apply the "cannot go backward"
 * test against file->f_pos, then hand the position to
 * chlg_set_start_offset().
 *
 * NOTE(review): most of the body is not visible in this listing - the
 * declarations of "pos" and "rc", the dispatch on "whence" (the visible
 * f_pos + off assignment is presumably the relative-seek case), the value
 * returned for a backward seek, and the f_pos update/return. Confirm
 * against the pristine source.
 */
360 * Move read pointer to a certain record index, encoded as an offset.
362 * @param[in,out] file File pointer to the changelog character device
363 * @param[in] off Offset to skip, actually a record index, not byte count
364 * @param[in] whence Relative/Absolute interpretation of the offset
365 * @return the resulting position on success or negated error code on failure.
367 static loff_t chlg_llseek(struct file *file, loff_t off, int whence)
369 struct chlg_reader_state *crs = file->private_data;
378 pos = file->f_pos + off;
385 /* We cannot go backward */
386 if (pos < file->f_pos)
389 rc = chlg_set_start_offset(crs, pos);
/*
 * Builds a struct changelog_setinfo and submits it on the OBD's self export
 * through obd_set_info_async() under the KEY_CHANGELOG_CLEAR key; the
 * result of that call is returned unchanged.
 *
 * NOTE(review): the initializer fields of "cs" are not visible in this
 * listing (presumably they carry "reader" and "record" - confirm).
 */
398 * Clear record range for a given changelog reader.
400 * @param[in] crs Current internal state.
401 * @param[in] reader Changelog reader ID (cl1, cl2...)
402 * @param[in] record Record index up which to clear
403 * @return 0 on success, negated error code on failure.
405 static int chlg_clear(struct chlg_reader_state *crs, __u32 reader, __u64 record)
407 struct obd_device *obd = crs->crs_obd;
408 struct changelog_setinfo cs = {
413 return obd_set_info_async(NULL, obd->obd_self_export,
414 strlen(KEY_CHANGELOG_CLEAR),
415 KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, NULL);
/*
 * Also the size, in bytes, of the kernel buffer that chlg_write() allocates
 * for the command, and the bound it compares the write length against.
 */
418 /** Maximum changelog control command size */
419 #define CHLG_CONTROL_CMD_MAX 64
/*
 * Visible flow: bound-check the write length, copy the command into a
 * CHLG_CONTROL_CMD_MAX byte kernel buffer, force a terminator in its last
 * byte, and parse it with the "clear:cl%u:%llu" pattern; a match calls
 * chlg_clear(). A negative rc is returned as is, otherwise the full count.
 *
 * NOTE(review): the declarations of kbuf/reader/record/rc, the statements
 * under the length and allocation checks, the branch taken when sscanf()
 * does not match, and the out_kbuf label are not visible in this listing;
 * confirm against the pristine source.
 */
422 * Handle writes() into the changelog character device. Write() can be used
423 * to request special control operations.
425 * @param[in] file File pointer to the changelog character device
426 * @param[in] buff User supplied data (written data)
427 * @param[in] count Number of written bytes
428 * @param[in] off (unused)
429 * @return number of written bytes on success, negated error code on failure.
431 static ssize_t chlg_write(struct file *file, const char __user *buff,
432 size_t count, loff_t *off)
434 struct chlg_reader_state *crs = file->private_data;
/*
 * NOTE(review): the test is ">" so count == CHLG_CONTROL_CMD_MAX passes; in
 * that case the terminator store below overwrites the last byte copied from
 * userland. Consider ">=" if silent truncation is not intended.
 */
441 if (count > CHLG_CONTROL_CMD_MAX)
444 OBD_ALLOC(kbuf, CHLG_CONTROL_CMD_MAX);
448 if (copy_from_user(kbuf, buff, count))
449 GOTO(out_kbuf, rc = -EFAULT);
/* Guarantee sscanf() sees a terminated string. */
451 kbuf[CHLG_CONTROL_CMD_MAX - 1] = '\0';
/* Both the reader id and the record index must parse. */
453 if (sscanf(kbuf, "clear:cl%u:%llu", &reader, &record) == 2)
454 rc = chlg_clear(crs, reader, record);
460 OBD_FREE(kbuf, CHLG_CONTROL_CMD_MAX);
461 return rc < 0 ? rc : count;
/*
 * Walks chlg_registered_devices under chlg_registered_dev_lock looking for
 * the entry whose misc minor equals MINOR(cdev), and picks the first OBD
 * linked on that entry's ced_obds list.
 *
 * NOTE(review): the type argument of list_first_entry(), the statement that
 * ends the search after a match, and the return are not visible in this
 * listing; confirm against the pristine source.
 */
465 * Find the OBD device associated to a changelog character device.
466 * @param[in] cdev character device instance descriptor
467 * @return corresponding OBD device or NULL if none was found.
469 static struct obd_device *chlg_obd_get(dev_t cdev)
471 int minor = MINOR(cdev);
472 struct obd_device *obd = NULL;
473 struct chlg_registered_dev *curr;
475 mutex_lock(&chlg_registered_dev_lock);
476 list_for_each_entry(curr, &chlg_registered_devices, ced_link) {
477 if (curr->ced_misc.minor == minor) {
478 /* take the first available OBD device attached */
479 obd = list_first_entry(&curr->ced_obds,
481 u.cli.cl_chg_dev_linkage);
485 mutex_unlock(&chlg_registered_dev_lock);
/*
 * Visible flow: resolve the OBD from the inode's device number, reset the
 * reader flags, initialise the lock, record queue and both wait queues,
 * start the chlg_load() prefetch thread only when the file is opened for
 * reading, and publish the state in file->private_data.
 *
 * NOTE(review): the allocation of crs, the handling of a NULL obd, the
 * assignment of crs_obd and crs_start_offset, the kthread_run() failure
 * test and the error/return paths are not visible in this listing; confirm
 * against the pristine source.
 */
490 * Open handler, initialize internal CRS state and spawn prefetch thread if
492 * @param[in] inode Inode struct for the open character device.
493 * @param[in] file Corresponding file pointer.
494 * @return 0 on success, negated error code on failure.
496 static int chlg_open(struct inode *inode, struct file *file)
498 struct chlg_reader_state *crs;
499 struct obd_device *obd = chlg_obd_get(inode->i_rdev);
500 struct task_struct *task;
512 crs->crs_err = false;
513 crs->crs_eof = false;
514 crs->crs_closed = false;
516 mutex_init(&crs->crs_lock);
517 INIT_LIST_HEAD(&crs->crs_rec_queue);
518 init_waitqueue_head(&crs->crs_waitq_prod);
519 init_waitqueue_head(&crs->crs_waitq_cons);
/* Write-only opens (control commands) do not need a prefetch thread. */
521 if (file->f_mode & FMODE_READ) {
522 task = kthread_run(chlg_load, crs, "chlg_load_thread");
525 CERROR("%s: cannot start changelog thread: rc = %d\n",
531 file->private_data = crs;
/*
 * For files opened for reading, marks the reader as closed and wakes the
 * producer wait queue; chlg_load() and chlg_read_cat_process_cb() both
 * wait on that queue with crs_closed in their condition.
 *
 * NOTE(review): the branch taken when no producer thread exists (only its
 * comment remains below) and the return are not visible in this listing;
 * confirm against the pristine source.
 */
540 * Close handler, release resources.
542 * @param[in] inode Inode struct for the open character device.
543 * @param[in] file Corresponding file pointer.
544 * @return 0 on success, negated error code on failure.
546 static int chlg_release(struct inode *inode, struct file *file)
548 struct chlg_reader_state *crs = file->private_data;
550 if (file->f_mode & FMODE_READ) {
551 crs->crs_closed = true;
552 wake_up_all(&crs->crs_waitq_prod);
554 /* No producer thread, release resource ourselves */
/*
 * Registers the caller on the consumer wait queue and, with crs_lock held,
 * reports the device readable (POLLIN | POLLRDNORM) when at least one
 * record is queued.
 *
 * NOTE(review): the lines between the readable test and the unlock, and the
 * return of "mask", are not visible in this listing; confirm against the
 * pristine source.
 */
561 * Poll handler, indicates whether the device is readable (new records) and
564 * @param[in] file Device file pointer.
565 * @param[in] wait (opaque)
566 * @return combination of the poll status flags.
568 static unsigned int chlg_poll(struct file *file, poll_table *wait)
570 struct chlg_reader_state *crs = file->private_data;
571 unsigned int mask = 0;
573 mutex_lock(&crs->crs_lock);
574 poll_wait(file, &crs->crs_waitq_cons, wait);
575 if (crs->crs_rec_count > 0)
576 mask |= POLLIN | POLLRDNORM;
581 mutex_unlock(&crs->crs_lock);
/*
 * File operations installed on each changelog misc device:
 * mdc_changelog_cdev_init() stores &chlg_fops in ced_misc.fops.
 *
 * NOTE(review): only .owner, .llseek and .release are visible in this
 * listing; the remaining initializers (presumably the read, write, open and
 * poll handlers defined in this file) and the closing brace are not shown.
 * Confirm against the pristine source.
 */
585 static const struct file_operations chlg_fops = {
586 .owner = THIS_MODULE,
587 .llseek = chlg_llseek,
591 .release = chlg_release,
/*
 * Formats "changelog-" followed by obd->obd_name into name (bounded by
 * name_len), then locates the last '-' in the result on each of two loop
 * passes.
 *
 * NOTE(review): the declaration of "i" and the statement applied to "p" on
 * each pass (presumably the truncation the comment below describes, with a
 * NULL check) are not visible in this listing; confirm against the pristine
 * source.
 */
596 * This uses obd_name of the form: "testfs-MDT0000-mdc-ffff88006501600"
597 * and returns a name of the form: "changelog-testfs-MDT0000".
599 static void get_chlg_name(char *name, size_t name_len, struct obd_device *obd)
603 snprintf(name, name_len, "changelog-%s", obd->obd_name);
605 /* Find the 2nd '-' from the end and truncate on it */
606 for (i = 0; i < 2; i++) {
607 char *p = strrchr(name, '-');
/*
 * Linear search of chlg_registered_devices comparing "name" with each
 * entry's ced_name using strcmp().
 *
 * This function does not take chlg_registered_dev_lock itself; its visible
 * caller, mdc_changelog_cdev_init(), holds the lock around the call.
 *
 * NOTE(review): the return statements (on match and on miss) are not
 * visible in this listing.
 */
616 * Find a changelog character device by name.
617 * All devices registered during MDC setup are listed in a global list with
618 * their names attached.
620 static struct chlg_registered_dev *
621 chlg_registered_dev_find_by_name(const char *name)
623 struct chlg_registered_dev *dit;
625 list_for_each_entry(dit, &chlg_registered_devices, ced_link)
626 if (strcmp(name, dit->ced_name) == 0)
/*
 * Nested walk: for every registered device, iterate the OBDs linked on its
 * ced_obds list through u.cli.cl_chg_dev_linkage.
 *
 * This function does not take chlg_registered_dev_lock itself.
 * NOTE(review): its visible caller, mdc_changelog_cdev_finish(), invokes it
 * before acquiring that lock, so the walk is unprotected there - confirm
 * whether that is intended.
 *
 * NOTE(review): the comparison against "obd" and the return statements are
 * not visible in this listing.
 */
632 * Find chlg_registered_dev structure for a given OBD device.
633 * This is bad O(n^2) but for each filesystem:
634 * - N is # of MDTs times # of mount points
635 * - this only runs at shutdown
637 static struct chlg_registered_dev *
638 chlg_registered_dev_find_by_obd(const struct obd_device *obd)
640 struct chlg_registered_dev *dit;
641 struct obd_device *oit;
643 list_for_each_entry(dit, &chlg_registered_devices, ced_link)
644 list_for_each_entry(oit, &dit->ced_obds,
645 u.cli.cl_chg_dev_linkage)
/*
 * Visible flow: allocate and fill a candidate chlg_registered_dev (name,
 * dynamic minor, fops, refcount, list heads), then, under
 * chlg_registered_dev_lock, look for an already registered device with the
 * same name. If one exists, take a reference on it and attach this OBD to
 * it; otherwise register the candidate as a new misc device, attach the OBD
 * and add the candidate to the global list.
 *
 * NOTE(review): the declaration of "rc", the allocation failure check, the
 * tests guarding the "exist" branch and the misc_register() failure path,
 * the out_unlock label, and the code after the unlock (which frees "entry"
 * when it is still set, per the comment below) are not visible in this
 * listing; confirm against the pristine source.
 */
652 * Changelog character device initialization.
653 * Register a misc character device with a dynamic minor number, under a name
654 * of the form: 'changelog-fsname-MDTxxxx'. Reference this OBD device with it.
656 * @param[in] obd This MDC obd_device.
657 * @return 0 on success, negated error code on failure.
659 int mdc_changelog_cdev_init(struct obd_device *obd)
661 struct chlg_registered_dev *exist;
662 struct chlg_registered_dev *entry;
666 OBD_ALLOC_PTR(entry);
670 get_chlg_name(entry->ced_name, sizeof(entry->ced_name), obd);
672 entry->ced_misc.minor = MISC_DYNAMIC_MINOR;
/* The misc device name points into the entry, so it lives as long as it. */
673 entry->ced_misc.name = entry->ced_name;
674 entry->ced_misc.fops = &chlg_fops;
676 kref_init(&entry->ced_refs);
677 INIT_LIST_HEAD(&entry->ced_obds);
678 INIT_LIST_HEAD(&entry->ced_link);
680 mutex_lock(&chlg_registered_dev_lock);
681 exist = chlg_registered_dev_find_by_name(entry->ced_name);
/* Same device already registered: share it instead of registering again. */
683 kref_get(&exist->ced_refs);
684 list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &exist->ced_obds);
685 GOTO(out_unlock, rc = 0);
688 /* Register new character device */
689 rc = misc_register(&entry->ced_misc);
691 GOTO(out_unlock, rc);
693 list_add_tail(&obd->u.cli.cl_chg_dev_linkage, &entry->ced_obds);
694 list_add_tail(&entry->ced_link, &chlg_registered_devices);
696 entry = NULL; /* prevent it from being freed below */
699 mutex_unlock(&chlg_registered_dev_lock);
/*
 * kref release callback (passed to kref_put() in
 * mdc_changelog_cdev_finish(), which holds chlg_registered_dev_lock at that
 * point). Recovers the owning chlg_registered_dev from the embedded kref,
 * unlinks it from the global list and deregisters the misc device.
 *
 * NOTE(review): the member name closing the container_of() call and any
 * release of "entry" itself are not visible in this listing; confirm
 * against the pristine source.
 */
706 * Deregister a changelog character device whose refcount has reached zero.
708 static void chlg_dev_clear(struct kref *kref)
710 struct chlg_registered_dev *entry = container_of(kref,
711 struct chlg_registered_dev,
715 list_del(&entry->ced_link);
716 misc_deregister(&entry->ced_misc);
/*
 * Detaches this OBD from its changelog device and drops one reference on
 * that device; chlg_dev_clear() runs when the last reference goes away.
 *
 * NOTE(review): the lookup in the initializer of "dev" walks the global
 * device list before chlg_registered_dev_lock is taken, and "dev" is
 * dereferenced without a visible NULL check. Consider moving the lookup
 * under the lock and guarding the kref_put() - confirm against the
 * pristine source.
 *
 * NOTE(review): the end of this function is not visible in this listing.
 */
722 * Release OBD, decrease reference count of the corresponding changelog device.
724 void mdc_changelog_cdev_finish(struct obd_device *obd)
726 struct chlg_registered_dev *dev = chlg_registered_dev_find_by_obd(obd);
729 mutex_lock(&chlg_registered_dev_lock);
730 list_del_init(&obd->u.cli.cl_chg_dev_linkage);
731 kref_put(&dev->ced_refs, chlg_dev_clear);
732 mutex_unlock(&chlg_registered_dev_lock);