Whamcloud - gitweb
4447de0bff7f976492f86e0e710320febbcbe28f
[fs/lustre-release.git] / lustre / ofd / ofd_access_log.c
1 #include <linux/cdev.h>
2 #include <linux/circ_buf.h>
3 #include <linux/device.h>
4 #include <linux/fs.h>
5 #include <linux/kernel.h>
6 #include <linux/miscdevice.h>
7 #include <linux/module.h>
8 #include <linux/poll.h>
9 #include <linux/slab.h>
10 #include <linux/types.h>
11 #include <linux/uaccess.h>
12 #include <uapi/linux/lustre/lustre_idl.h>
13 #include <uapi/linux/lustre/lustre_access_log.h>
14 #include "ofd_internal.h"
15
16 /* OFD access logs: OST (OFD) RPC handlers log accesses by FID and
17  * PFID which are read from userspace through character device files
18  * (/dev/lustre-access-log/scratch-OST0000). Accesses are described by
19  * struct ofd_access_entry_v1. The char device implements read()
20  * (blocking and nonblocking) and poll(), along with an ioctl that
21  * returns diagnostic information on an oal device.
22  *
23  * A control device (/dev/lustre-access-log/control) supports an ioctl()
24  * plus poll() method for oal discovery. See uses of
25  * oal_control_event_count and oal_control_wait_queue for details.
26  *
27  * oal log size and entry size are restricted to powers of 2 to
28  * support circ_buf methods. See Documentation/core-api/circular-buffers.rst
29  * in the linux tree for more information.
30  *
31  * The associated struct device (*oal_device) owns the oal. The
32  * release() method of oal_device frees the oal and releases its
33  * minor. This may seem slightly more complicated than necessary but
34  * it allows the OST to be unmounted while the oal still has open file
35  * descriptors.
36  */
37
enum {
        /* One chrdev region spans every possible minor number, so a
         * single major can serve all oal devices. */
        OAL_DEV_COUNT = 1 << MINORBITS,
};
41
/* One access log per OST. The embedded oal_device owns the object:
 * its release() method (oal_device_release) frees the oal and its
 * minor once the last reference is dropped. */
struct ofd_access_log {
        char oal_name[128]; /* lustre-OST0000 */
        struct device oal_device;       /* owns this struct; see release() */
        struct cdev oal_cdev;           /* char device backing read()/poll() */
        struct circ_buf oal_circ;       /* power-of-2 ring of fixed entries */
        wait_queue_head_t oal_read_wait_queue; /* readers block here */
        spinlock_t oal_read_lock;       /* serializes consumers (tail) */
        spinlock_t oal_write_lock;      /* serializes producers (head) */
        unsigned int oal_drop_count;    /* entries discarded on full ring */
        unsigned int oal_is_closed;     /* set at OST umount; readers see EOF */
        unsigned int oal_log_size;      /* ring size in bytes, power of 2 */
        unsigned int oal_entry_size;    /* entry size in bytes, power of 2 */
};
55
56 static atomic_t oal_control_event_count = ATOMIC_INIT(0);
57 static DECLARE_WAIT_QUEUE_HEAD(oal_control_wait_queue);
58
59 static struct class *oal_log_class;
60 static unsigned int oal_log_major;
61 static DEFINE_IDR(oal_log_minor_idr); /* TODO Use ida instead. */
62 static DEFINE_SPINLOCK(oal_log_minor_lock);
63
64 bool ofd_access_log_size_is_valid(unsigned int size)
65 {
66         const unsigned int size_min = 2 * sizeof(struct ofd_access_entry_v1);
67         const unsigned int size_max = 1U << 30;
68
69         if (size == 0)
70                 return true;
71
72         return is_power_of_2(size) && size_min <= size && size <= size_max;
73 }
74
/* Record a log lifecycle event and wake any pollers of the control
 * device so userspace can rescan for oal devices. */
static void oal_control_event_inc(void)
{
        atomic_inc(&oal_control_event_count);
        wake_up(&oal_control_wait_queue);
}
80
81 static int oal_log_minor_alloc(int *pminor)
82 {
83         void *OAL_LOG_MINOR_ALLOCED = (void *)-1;
84         int minor;
85
86         idr_preload(GFP_KERNEL);
87         spin_lock(&oal_log_minor_lock);
88         minor = idr_alloc(&oal_log_minor_idr, OAL_LOG_MINOR_ALLOCED, 0,
89                         OAL_DEV_COUNT, GFP_NOWAIT);
90         spin_unlock(&oal_log_minor_lock);
91         idr_preload_end();
92
93         if (minor < 0)
94                 return minor;
95
96         *pminor = minor;
97
98         return 0;
99 }
100
/* Return a minor allocated by oal_log_minor_alloc() to the pool. */
static void oal_log_minor_free(int minor)
{
        spin_lock(&oal_log_minor_lock);
        idr_remove(&oal_log_minor_idr, minor);
        spin_unlock(&oal_log_minor_lock);
}
107
/* True when the ring holds less than one complete entry. head and
 * tail are read without locks or barriers here, so the answer may be
 * stale; callers (the read wait-queue predicate and poll, the latter
 * under oal_read_lock) tolerate that. */
static bool oal_is_empty(struct ofd_access_log *oal)
{
        return CIRC_CNT(oal->oal_circ.head,
                        oal->oal_circ.tail,
                        oal->oal_log_size) < oal->oal_entry_size;
}
114
/* Append one entry to the log and wake any sleeping readers.
 * Returns entry_size on success, -EINVAL if entry_size does not match
 * the log's configured entry size, or -EAGAIN (after bumping
 * oal_drop_count) when the ring has no room. Producers are serialized
 * by oal_write_lock; the smp_store_release() on head pairs with the
 * consumer's smp_load_acquire() in oal_read_entry(). */
static ssize_t oal_write_entry(struct ofd_access_log *oal,
                        const void *entry, size_t entry_size)
{
        struct circ_buf *circ = &oal->oal_circ;
        unsigned int head;
        unsigned int tail;
        ssize_t rc;

        if (entry_size != oal->oal_entry_size)
                return -EINVAL;

        spin_lock(&oal->oal_write_lock);
        head = circ->head;
        tail = READ_ONCE(circ->tail);

        /* CIRC_SPACE() returns the space available, 0..oal_log_size -
         * 1. It always leaves one free char, since a completely full
         * buffer would have head == tail, which is the same as empty. */
        if (CIRC_SPACE(head, tail, oal->oal_log_size) < oal->oal_entry_size) {
                oal->oal_drop_count++;
                rc = -EAGAIN;
                goto out_write_lock;
        }

        memcpy(&circ->buf[head], entry, entry_size);
        rc = entry_size;

        /* Ensure the entry is stored before we update the head. */
        smp_store_release(&circ->head,
                        (head + oal->oal_entry_size) & (oal->oal_log_size - 1));

        wake_up(&oal->oal_read_wait_queue);
out_write_lock:
        spin_unlock(&oal->oal_write_lock);

        return rc;
}
152
/* Read one entry from the log into entry_buf and return the number of
 * bytes copied. Non-blocking. When the log is empty we return -EAGAIN
 * if the OST is still mounted and 0 (EOF) otherwise. Consumers are
 * serialized by oal_read_lock.
 */
static ssize_t oal_read_entry(struct ofd_access_log *oal,
                        void *entry_buf, size_t entry_buf_size)
{
        struct circ_buf *circ = &oal->oal_circ;
        unsigned int head;
        unsigned int tail;
        ssize_t rc;

        /* XXX This method may silently truncate entries when
         * entry_buf_size is less than oal_entry_size. But that's OK
         * because you know what you are doing. */
        spin_lock(&oal->oal_read_lock);

        /* Memory barrier usage follows circular-buffers.txt: acquire
         * on head pairs with the producer's smp_store_release(). */
        head = smp_load_acquire(&circ->head);
        tail = circ->tail;

        if (!CIRC_CNT(head, tail, oal->oal_log_size)) {
                rc = oal->oal_is_closed ? 0 : -EAGAIN;
                goto out_read_lock;
        }

        /* Producers publish only whole entries, so a non-empty ring
         * always holds at least one full entry. */
        BUG_ON(CIRC_CNT(head, tail, oal->oal_log_size) < oal->oal_entry_size);

        /* Read index before reading contents at that index. */
        smp_read_barrier_depends();

        /* Extract one entry from the buffer. */
        rc = min_t(size_t, oal->oal_entry_size, entry_buf_size);
        memcpy(entry_buf, &circ->buf[tail], rc);

        /* Memory barrier usage follows circular-buffers.txt: the
         * release on tail ensures the copy above completes before the
         * slot can be reused by a producer. */
        smp_store_release(&circ->tail,
                        (tail + oal->oal_entry_size) & (oal->oal_log_size - 1));

out_read_lock:
        spin_unlock(&oal->oal_read_lock);

        return rc;
}
197
198 static int oal_file_open(struct inode *inode, struct file *filp)
199 {
200         filp->private_data = container_of(inode->i_cdev,
201                                         struct ofd_access_log, oal_cdev);
202
203         return nonseekable_open(inode, filp);
204 }
205
/* read() for an oal device. The user buffer size must be a multiple
 * of the ofd access entry size. Copies whole entries until the buffer
 * is full or the log runs dry; blocks on an empty log unless
 * O_NONBLOCK is set. Returns the number of bytes copied, 0 at EOF
 * (log closed and drained), or a negative errno. */
static ssize_t oal_file_read(struct file *filp, char __user *buf, size_t count,
                        loff_t *ppos)
{
        struct ofd_access_log *oal = filp->private_data;
        void *entry;
        size_t size = 0;
        int rc = 0;

        if (!count)
                return 0;

        if (count & (oal->oal_entry_size - 1))
                return -EINVAL;

        /* Bounce buffer: oal_read_entry() cannot copy to userspace
         * directly while holding oal_read_lock. */
        entry = kzalloc(oal->oal_entry_size, GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        while (size < count) {
                rc = oal_read_entry(oal, entry, oal->oal_entry_size);
                if (rc == -EAGAIN) {
                        if (filp->f_flags & O_NONBLOCK)
                                break;

                        rc = wait_event_interruptible(oal->oal_read_wait_queue,
                                !oal_is_empty(oal) || oal->oal_is_closed);
                        if (rc)
                                break;
                } else if (rc <= 0) {
                        break; /* closed or error */
                } else {
                        if (copy_to_user(buf, entry, oal->oal_entry_size)) {
                                rc = -EFAULT;
                                break;
                        }

                        buf += oal->oal_entry_size;
                        size += oal->oal_entry_size;
                }
        }

        kfree(entry);

        /* Report progress if any entries were copied; otherwise the
         * last status (0 at EOF, or a negative errno). */
        return size ? size : rc;
}
252
253 /* Included for test purposes. User buffer size must be a multiple of
254  * ofd access entry size. */
255 static ssize_t oal_file_write(struct file *filp, const char __user *buf,
256                         size_t count, loff_t *ppos)
257 {
258         struct ofd_access_log *oal = filp->private_data;
259         void *entry;
260         size_t size = 0;
261         ssize_t rc = 0;
262
263         if (!count)
264                 return 0;
265
266         if (count & (oal->oal_entry_size - 1))
267                 return -EINVAL;
268
269         entry = kzalloc(oal->oal_entry_size, GFP_KERNEL);
270         if (!entry)
271                 return -ENOMEM;
272
273         while (size < count) {
274                 if (copy_from_user(entry, buf, oal->oal_entry_size)) {
275                         rc = -EFAULT;
276                         break;
277                 }
278
279                 rc = oal_write_entry(oal, entry, oal->oal_entry_size);
280                 if (rc <= 0)
281                         break;
282
283                 buf += oal->oal_entry_size;
284                 size += oal->oal_entry_size;
285         }
286
287         kfree(entry);
288
289         return size > 0 ? size : rc;
290 }
291
292 unsigned int oal_file_poll(struct file *filp, struct poll_table_struct *wait)
293 {
294         struct ofd_access_log *oal = filp->private_data;
295         unsigned int mask = 0;
296
297         poll_wait(filp, &oal->oal_read_wait_queue, wait);
298
299         spin_lock(&oal->oal_read_lock);
300
301         if (!oal_is_empty(oal) || oal->oal_is_closed)
302                 mask |= POLLIN;
303
304         spin_unlock(&oal->oal_read_lock);
305
306         return mask;
307 }
308
/* LUSTRE_ACCESS_LOG_IOCTL_INFO: fill the userspace
 * struct lustre_access_log_info_v1 at arg with diagnostic state.
 * head/tail and the derived counts are sampled without locks, so the
 * values are advisory snapshots only. Returns 0 or -EFAULT. */
static long oal_ioctl_info(struct ofd_access_log *oal, unsigned long arg)
{
        struct lustre_access_log_info_v1 __user *lali;
        u32 entry_count = CIRC_CNT(oal->oal_circ.head,
                                oal->oal_circ.tail,
                                oal->oal_log_size) / oal->oal_entry_size;
        u32 entry_space = CIRC_SPACE(oal->oal_circ.head,
                                oal->oal_circ.tail,
                                oal->oal_log_size) / oal->oal_entry_size;

        lali = (struct lustre_access_log_info_v1 __user *)arg;
        BUILD_BUG_ON(sizeof(lali->lali_name) != sizeof(oal->oal_name));

        if (put_user(LUSTRE_ACCESS_LOG_VERSION_1, &lali->lali_version))
                return -EFAULT;

        if (put_user(LUSTRE_ACCESS_LOG_TYPE_OFD, &lali->lali_type))
                return -EFAULT;

        if (copy_to_user(lali->lali_name, oal->oal_name, sizeof(oal->oal_name)))
                return -EFAULT;

        if (put_user(oal->oal_log_size, &lali->lali_log_size))
                return -EFAULT;

        if (put_user(oal->oal_entry_size, &lali->lali_entry_size))
                return -EFAULT;

        if (put_user(oal->oal_circ.head, &lali->_lali_head))
                return -EFAULT;

        if (put_user(oal->oal_circ.tail, &lali->_lali_tail))
                return -EFAULT;

        if (put_user(entry_space, &lali->_lali_entry_space))
                return -EFAULT;

        if (put_user(entry_count, &lali->_lali_entry_count))
                return -EFAULT;

        if (put_user(oal->oal_drop_count, &lali->_lali_drop_count))
                return -EFAULT;

        if (put_user(oal->oal_is_closed, &lali->_lali_is_closed))
                return -EFAULT;

        return 0;
}
357
358 static long oal_file_ioctl(struct file *filp, unsigned int cmd,
359                         unsigned long arg)
360 {
361         struct ofd_access_log *oal = filp->private_data;
362
363         switch (cmd) {
364         case LUSTRE_ACCESS_LOG_IOCTL_VERSION:
365                 return LUSTRE_ACCESS_LOG_VERSION_1;
366         case LUSTRE_ACCESS_LOG_IOCTL_INFO:
367                 return oal_ioctl_info(oal, arg);
368         default:
369                 return -ENOTTY;
370         }
371 }
372
373 static const struct file_operations oal_fops = {
374         .owner = THIS_MODULE,
375         .open = &oal_file_open,
376         .unlocked_ioctl = &oal_file_ioctl,
377         .read = &oal_file_read,
378         .write = &oal_file_write,
379         .poll = &oal_file_poll,
380         .llseek = &no_llseek,
381 };
382
/* release() for oal_device, called when the last reference on the
 * device is dropped (i.e. after the OST is unmounted and the last
 * open file handle is closed): return the minor, free the ring and
 * the oal itself. */
static void oal_device_release(struct device *dev)
{
        struct ofd_access_log *oal = dev_get_drvdata(dev);

        oal_log_minor_free(MINOR(oal->oal_device.devt));
        vfree(oal->oal_circ.buf);
        kfree(oal);
}
391
/* Create the access log and char device for the OST named ofd_name.
 * size is the ring size in bytes: 0 means "no log" (returns NULL,
 * which callers must treat as success-without-log); otherwise it must
 * be a power of two, a multiple of the entry size, and fit in an
 * unsigned int. Returns the new oal or an ERR_PTR() on failure. */
struct ofd_access_log *ofd_access_log_create(const char *ofd_name, size_t size)
{
        const size_t entry_size = sizeof(struct ofd_access_entry_v1);
        struct ofd_access_log *oal;
        int minor;
        int rc;

        BUILD_BUG_ON(sizeof(oal->oal_name) != MAX_OBD_NAME);
        BUILD_BUG_ON(!is_power_of_2(entry_size));

        if (!size)
                return NULL;

        if (!is_power_of_2(size) || (size & (entry_size - 1)) ||
            (unsigned int)size != size)
                return ERR_PTR(-EINVAL);

        oal = kzalloc(sizeof(*oal), GFP_KERNEL);
        if (!oal)
                return ERR_PTR(-ENOMEM);

        strlcpy(oal->oal_name, ofd_name, sizeof(oal->oal_name));
        oal->oal_log_size = size;
        oal->oal_entry_size = entry_size;
        spin_lock_init(&oal->oal_write_lock);
        spin_lock_init(&oal->oal_read_lock);
        init_waitqueue_head(&oal->oal_read_wait_queue);

        oal->oal_circ.buf = vmalloc(oal->oal_log_size);
        if (!oal->oal_circ.buf) {
                rc = -ENOMEM;
                goto out_free;
        }

        rc = oal_log_minor_alloc(&minor);
        if (rc < 0)
                goto out_free;

        device_initialize(&oal->oal_device);
        oal->oal_device.devt = MKDEV(oal_log_major, minor);
        oal->oal_device.class = oal_log_class;
        oal->oal_device.release = &oal_device_release;
        dev_set_drvdata(&oal->oal_device, oal);
        /* '!' in the device name becomes '/' in /dev:
         * /dev/lustre-access-log/<name>. */
        rc = dev_set_name(&oal->oal_device,
                        "%s!%s", LUSTRE_ACCESS_LOG_DIR_NAME, oal->oal_name);
        if (rc < 0)
                goto out_minor;

        cdev_init(&oal->oal_cdev, &oal_fops);
        oal->oal_cdev.owner = THIS_MODULE;
        rc = cdev_device_add(&oal->oal_cdev, &oal->oal_device);
        if (rc < 0)
                goto out_device_name;

        /* Notify control-device pollers that a new oal exists. */
        oal_control_event_inc();

        return oal;

        /* NOTE(review): after device_initialize() the conventional
         * teardown is put_device(), which would invoke
         * oal_device_release(); this path instead unwinds manually
         * (name, minor, buffer, oal) — confirm no kobject state is
         * left behind on the error paths. */
out_device_name:
        kfree_const(oal->oal_device.kobj.name);
out_minor:
        oal_log_minor_free(minor);
out_free:
        vfree(oal->oal_circ.buf);
        kfree(oal);

        return ERR_PTR(rc);
}
460
/* Log one OST access (called from the OFD RPC handlers). A v1 entry
 * describing the parent FID, byte range, size, segment count and
 * direction is appended if the OST has a log and the access direction
 * is enabled in ofd_access_log_mask. The oal_write_entry() result is
 * deliberately ignored: on a full ring the entry is dropped and
 * counted in oal_drop_count. */
void ofd_access(struct ofd_device *m,
                const struct lu_fid *parent_fid,
                __u64 begin, __u64 end,
                unsigned int size,
                unsigned int segment_count,
                int rw)
{
        unsigned int flags = (rw == READ) ? OFD_ACCESS_READ : OFD_ACCESS_WRITE;

        if (m->ofd_access_log && (flags & m->ofd_access_log_mask)) {
                struct ofd_access_entry_v1 oae = {
                        .oae_parent_fid = *parent_fid,
                        .oae_begin = begin,
                        .oae_end = end,
                        .oae_time = ktime_get_real_seconds(),
                        .oae_size = size,
                        .oae_segment_count = segment_count,
                        .oae_flags = flags,
                };

                oal_write_entry(m->ofd_access_log, &oae, sizeof(oae));
        }
}
484
/* Called on OST umount to:
 * - Close the write end of the oal. This wakes any tasks sleeping in
 *   read or poll and makes all reads return zero once the log
 *   becomes empty.
 * - Delete the associated struct device and cdev, preventing new
 *   opens. Existing opens retain a reference on the oal through
 *   their reference on oal_device.
 * The oal will be freed when the last open file handle is closed. */
void ofd_access_log_delete(struct ofd_access_log *oal)
{
        if (!oal)
                return;

        oal->oal_is_closed = 1;
        wake_up_all(&oal->oal_read_wait_queue);
        cdev_device_del(&oal->oal_cdev, &oal->oal_device);
}
502
/* private_data for control device file. Holds the event count last
 * returned to this opener by LUSTRE_ACCESS_LOG_IOCTL_PRESCAN, so
 * poll() only reports readable when new events have occurred. */
struct oal_control_file {
        int ccf_event_count;
};
507
508 /* Control file usage:
509  * Open /dev/lustre-access-log/control.
510  * while (1)
511  *   Poll for readable on control FD.
512  *   Call ioctl(FD, LUSTRE_ACCESS_LOG_IOCTL_PRESCAN) to fetch event count.
513  *   Scan /dev/ or /sys/class/... for new devices.
514  */
515 static int oal_control_file_open(struct inode *inode, struct file *filp)
516 {
517         struct oal_control_file *ccf;
518         int rc;
519
520         rc = nonseekable_open(inode, filp);
521         if (rc)
522                 return rc;
523
524         /* ccf->ccf_event_count = 0 on open */
525         ccf = kzalloc(sizeof(*ccf), GFP_KERNEL);
526         if (!ccf)
527                 return -ENOMEM;
528
529         filp->private_data = ccf;
530
531         return 0;
532 }
533
534 static int oal_control_file_release(struct inode *inode, struct file *filp)
535 {
536         kfree(filp->private_data);
537         return 0;
538 }
539
540 static unsigned int oal_control_file_poll(struct file *filp, poll_table *wait)
541 {
542         struct oal_control_file *ccf = filp->private_data;
543         unsigned int mask = 0;
544
545         poll_wait(filp, &oal_control_wait_queue, wait);
546
547         if (atomic_read(&oal_control_event_count) != ccf->ccf_event_count)
548                 mask |= POLLIN;
549
550         return mask;
551 }
552
553 static long oal_control_file_ioctl(struct file *filp, unsigned int cmd,
554                                 unsigned long arg)
555 {
556         struct oal_control_file *ccf = filp->private_data;
557
558         switch (cmd) {
559         case LUSTRE_ACCESS_LOG_IOCTL_VERSION:
560                 return LUSTRE_ACCESS_LOG_VERSION_1;
561         case LUSTRE_ACCESS_LOG_IOCTL_MAJOR:
562                 return oal_log_major;
563         case LUSTRE_ACCESS_LOG_IOCTL_PRESCAN:
564                 ccf->ccf_event_count = atomic_read(&oal_control_event_count);
565                 return 0;
566         default:
567                 return -ENOTTY;
568         }
569 }
570
571 static const struct file_operations oal_control_fops = {
572         .owner = THIS_MODULE,
573         .open = &oal_control_file_open,
574         .release = &oal_control_file_release,
575         .poll = &oal_control_file_poll,
576         .unlocked_ioctl = &oal_control_file_ioctl,
577         .llseek = &noop_llseek,
578 };
579
/* Control device with a dynamic misc minor; '!' in the name becomes
 * '/' in /dev, giving /dev/lustre-access-log/control. */
static struct miscdevice oal_control_misc = {
        .minor = MISC_DYNAMIC_MINOR,
        .name = LUSTRE_ACCESS_LOG_DIR_NAME"!control",
        .fops = &oal_control_fops,
};
585
/* Module init for the access-log subsystem: register the control
 * misc device, reserve a chrdev region for oal devices, and create
 * the device class. Returns 0 or a negative errno, unwinding in
 * reverse order on failure. */
int ofd_access_log_module_init(void)
{
        dev_t dev;
        int rc;

        /* The ring arithmetic masks with (size - 1), so entries must
         * pack exactly into a power-of-2 ring. */
        BUILD_BUG_ON(!is_power_of_2(sizeof(struct ofd_access_entry_v1)));

        rc = misc_register(&oal_control_misc);
        if (rc)
                return rc;

        rc = alloc_chrdev_region(&dev, 0, OAL_DEV_COUNT,
                                LUSTRE_ACCESS_LOG_DIR_NAME);
        if (rc)
                goto out_oal_control_misc;

        oal_log_major = MAJOR(dev);

        oal_log_class = class_create(THIS_MODULE, LUSTRE_ACCESS_LOG_DIR_NAME);
        if (IS_ERR(oal_log_class)) {
                rc = PTR_ERR(oal_log_class);
                goto out_dev;
        }

        return 0;
out_dev:
        unregister_chrdev_region(dev, OAL_DEV_COUNT);
out_oal_control_misc:
        misc_deregister(&oal_control_misc);

        return rc;
}
618
/* Module exit: tear down in reverse order of ofd_access_log_module_init()
 * (class, chrdev region, minor IDR, control misc device). By this
 * point every oal has been deleted, so the IDR is expected empty. */
void ofd_access_log_module_exit(void)
{
        class_destroy(oal_log_class);
        unregister_chrdev_region(MKDEV(oal_log_major, 0), OAL_DEV_COUNT);
        idr_destroy(&oal_log_minor_idr);
        misc_deregister(&oal_control_misc);
}