Whamcloud - gitweb
LU-4020 hsm: allow copytool event monitoring with JSON
[fs/lustre-release.git] / lustre / utils / liblustreapi_hsm.c
index dcce44e..3f1b8b9 100644 (file)
@@ -46,6 +46,7 @@
 #include <sys/syscall.h>
 #include <fnmatch.h>
 #include <glob.h>
+#include <signal.h>
 #ifdef HAVE_LINUX_UNISTD_H
 #include <linux/unistd.h>
 #else
@@ -82,6 +83,553 @@ struct hsm_copyaction_private {
 
 #include <libcfs/libcfs.h>
 
+enum ct_progress_type {
+       CT_START        = 0,
+       CT_RUNNING      = 50,
+       CT_FINISH       = 100,
+       CT_CANCEL       = 150,
+       CT_ERROR        = 175
+};
+
+enum ct_event {
+       CT_REGISTER             = 1,
+       CT_UNREGISTER           = 2,
+       CT_ARCHIVE_START        = HSMA_ARCHIVE,
+       CT_ARCHIVE_RUNNING      = HSMA_ARCHIVE + CT_RUNNING,
+       CT_ARCHIVE_FINISH       = HSMA_ARCHIVE + CT_FINISH,
+       CT_ARCHIVE_CANCEL       = HSMA_ARCHIVE + CT_CANCEL,
+       CT_ARCHIVE_ERROR        = HSMA_ARCHIVE + CT_ERROR,
+       CT_RESTORE_START        = HSMA_RESTORE,
+       CT_RESTORE_RUNNING      = HSMA_RESTORE + CT_RUNNING,
+       CT_RESTORE_FINISH       = HSMA_RESTORE + CT_FINISH,
+       CT_RESTORE_CANCEL       = HSMA_RESTORE + CT_CANCEL,
+       CT_RESTORE_ERROR        = HSMA_RESTORE + CT_ERROR,
+       CT_REMOVE_START         = HSMA_REMOVE,
+       CT_REMOVE_RUNNING       = HSMA_REMOVE + CT_RUNNING,
+       CT_REMOVE_FINISH        = HSMA_REMOVE + CT_FINISH,
+       CT_REMOVE_CANCEL        = HSMA_REMOVE + CT_CANCEL,
+       CT_REMOVE_ERROR         = HSMA_REMOVE + CT_ERROR,
+       CT_EVENT_MAX
+};
+
+/* initialized in llapi_hsm_register_event_fifo() */
+FILE *llapi_hsm_event_fp;
+
+static inline const char *llapi_hsm_ct_ev2str(int type)
+{
+       switch (type) {
+       case CT_REGISTER:
+               return "REGISTER";
+       case CT_UNREGISTER:
+               return "UNREGISTER";
+       case CT_ARCHIVE_START:
+               return "ARCHIVE_START";
+       case CT_ARCHIVE_RUNNING:
+               return "ARCHIVE_RUNNING";
+       case CT_ARCHIVE_FINISH:
+               return "ARCHIVE_FINISH";
+       case CT_ARCHIVE_CANCEL:
+               return "ARCHIVE_CANCEL";
+       case CT_ARCHIVE_ERROR:
+               return "ARCHIVE_ERROR";
+       case CT_RESTORE_START:
+               return "RESTORE_START";
+       case CT_RESTORE_RUNNING:
+               return "RESTORE_RUNNING";
+       case CT_RESTORE_FINISH:
+               return "RESTORE_FINISH";
+       case CT_RESTORE_CANCEL:
+               return "RESTORE_CANCEL";
+       case CT_RESTORE_ERROR:
+               return "RESTORE_ERROR";
+       case CT_REMOVE_START:
+               return "REMOVE_START";
+       case CT_REMOVE_RUNNING:
+               return "REMOVE_RUNNING";
+       case CT_REMOVE_FINISH:
+               return "REMOVE_FINISH";
+       case CT_REMOVE_CANCEL:
+               return "REMOVE_CANCEL";
+       case CT_REMOVE_ERROR:
+               return "REMOVE_ERROR";
+       default:
+               llapi_err_noerrno(LLAPI_MSG_ERROR,
+                                 "Unknown event type: %d", type);
+               return NULL;
+       }
+}
+
+/**
+ * Writes a JSON event to the monitor FIFO. Noop if no FIFO has been
+ * registered.
+ *
+ * \param event              A list of llapi_json_items comprising a
+ *                           single JSON-formatted event.
+ *
+ * \retval 0 on success.
+ * \retval -errno on error.
+ */
+int llapi_hsm_write_json_event(struct llapi_json_item_list **event)
+{
+       int                             rc;
+       char                            time_string[40];
+       time_t                          event_time = time(0);
+       struct tm                       time_components;
+       struct llapi_json_item_list     *json_items;
+
+       /* Noop unless the event fp was initialized */
+       if (llapi_hsm_event_fp == NULL)
+               return 0;
+
+       if (event == NULL || *event == NULL)
+               return -EINVAL;
+
+       json_items = *event;
+
+       localtime_r(&event_time, &time_components);
+
+       if (strftime(time_string, sizeof(time_string), "%Y-%m-%d %T %z",
+                    &time_components) == 0) {
+               rc = -EINVAL;
+               llapi_error(LLAPI_MSG_ERROR, rc, "strftime() failed");
+               return rc;
+       }
+
+       rc = llapi_json_add_item(&json_items, "event_time", LLAPI_JSON_STRING,
+                                time_string);
+       if (rc < 0) {
+               llapi_error(LLAPI_MSG_ERROR, -rc, "error in "
+                           "llapi_json_add_item()");
+               return rc;
+       }
+
+       rc = llapi_json_write_list(event, llapi_hsm_event_fp);
+       if (rc < 0) {
+               /* Ignore write failures due to missing reader. */
+               if (rc == -EPIPE)
+                       return 0;
+
+               /* Skip llapi_error() here because there's no point
+                * in creating a JSON-formatted error message about
+                * failing to write a JSON-formatted message.
+                */
+               fprintf(stderr,
+                       "\nFATAL ERROR IN llapi_hsm_write_list(): rc %d", rc);
+               return rc;
+       }
+
+       return 0;
+}
+
+/**
+ * Hook for llapi_hsm_copytool_register and llapi_hsm_copytool_unregister
+ * to generate JSON events suitable for consumption by a copytool
+ * monitoring process.
+ *
+ * \param priv               Opaque private control structure.
+ * \param event_type         The type of event (register or unregister).
+ *
+ * \retval 0 on success.
+ * \retval -errno on error.
+ */
+int llapi_hsm_log_ct_registration(struct hsm_copytool_private **priv,
+                                 __u32 event_type)
+{
+       int                             rc;
+       char                            agent_uuid[UUID_MAX];
+       struct hsm_copytool_private     *ct;
+       struct llapi_json_item_list     *json_items;
+
+       if (priv == NULL || *priv == NULL)
+               return -EINVAL;
+
+       ct = *priv;
+       if (ct->magic != CT_PRIV_MAGIC)
+               return -EINVAL;
+
+       if (event_type != CT_REGISTER && event_type != CT_UNREGISTER)
+               return -EINVAL;
+
+       rc = llapi_json_init_list(&json_items);
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_get_agent_uuid(ct->mnt, agent_uuid, sizeof(agent_uuid));
+       if (rc < 0)
+               goto err;
+       llapi_chomp_string(agent_uuid);
+
+       rc = llapi_json_add_item(&json_items, "uuid", LLAPI_JSON_STRING,
+                                agent_uuid);
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_json_add_item(&json_items, "mount_point", LLAPI_JSON_STRING,
+                                ct->mnt);
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_json_add_item(&json_items, "archive", LLAPI_JSON_INTEGER,
+                                &ct->archives);
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_json_add_item(&json_items, "event_type", LLAPI_JSON_STRING,
+                                (char *)llapi_hsm_ct_ev2str(event_type));
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_hsm_write_json_event(&json_items);
+       if (rc < 0)
+               goto err;
+
+       goto out_free;
+
+err:
+       llapi_error(LLAPI_MSG_ERROR, rc, "error in "
+                   "llapi_hsm_log_ct_registration()");
+
+out_free:
+       if (json_items != NULL)
+               llapi_json_destroy_list(&json_items);
+
+       return rc;
+}
+
+/**
+ * Given a copytool progress update, construct a JSON event suitable for
+ * consumption by a copytool monitoring process.
+ *
+ * Examples of various events generated here and written by
+ * llapi_hsm_write_json_event:
+ *
+ * Copytool registration and deregistration:
+ * {"event_time": "2014-02-26 14:58:01 -0500", "event_type": "REGISTER", "archive": 0, "mount_point": "/mnt/lustre", "uuid": "80379a60-1f8a-743f-daf2-307cde793ec2"}
+ * {"event_time": "2014-02-26 14:58:01 -0500", "event_type": "UNREGISTER", "archive": 0, "mount_point": "/mnt/lustre", "uuid": "80379a60-1f8a-743f-daf2-307cde793ec2"}
+ *
+ * An archive action, start to completion:
+ * {"event_time": "2014-02-26 14:50:13 -0500", "event_type": "ARCHIVE_START", "total_bytes": 0, "lustre_path": "d71.sanity-hsm/f71.sanity-hsm", "source_fid": "0x2000013a1:0x2:0x0", "data_fid": "0x2000013a1:0x2:0x0"}
+ * {"event_time": "2014-02-26 14:50:18 -0500", "event_type": "ARCHIVE_RUNNING", "current_bytes": 5242880, "total_bytes": 39000000, "lustre_path": "d71.sanity-hsm/f71.sanity-hsm", "source_fid": "0x2000013a1:0x2:0x0", "data_fid": "0x2000013a1:0x2:0x0"}
+ * {"event_time": "2014-02-26 14:50:50 -0500", "event_type": "ARCHIVE_FINISH", "source_fid": "0x2000013a1:0x2:0x0", "data_fid": "0x2000013a1:0x2:0x0"}
+ *
+ * A log message:
+ * {"event_time": "2014-02-26 14:50:13 -0500", "event_type": "LOGGED_MESSAGE", "level": "INFO", "message": "lhsmtool_posix[59401]: copytool fs=lustre archive#=2 item_count=1"}
+ *
+ * \param hcp                Opaque action handle returned by
+ *                           llapi_hsm_action_start.
+ * \param hai                The hsm_action_item describing the request.
+ * \param progress_type      The ct_progress_type describing the update.
+ * \param total              The total expected bytes for the request.
+ * \param current            The current copied byte count for the request.
+ *
+ * \retval 0 on success.
+ * \retval -errno on error.
+ */
+int llapi_hsm_log_ct_progress(struct hsm_copyaction_private **phcp,
+                   const struct hsm_action_item *hai, __u32 progress_type,
+                   __u64 total, __u64 current)
+{
+       int                             rc;
+       int                             linkno = 0;
+       long long                       recno = -1;
+       char                            lustre_path[PATH_MAX];
+       char                            strfid[FID_NOBRACE_LEN + 1];
+       struct hsm_copyaction_private   *hcp;
+       struct llapi_json_item_list     *json_items;
+
+       if (phcp == NULL || *phcp == NULL)
+               return -EINVAL;
+
+       hcp = *phcp;
+
+       rc = llapi_json_init_list(&json_items);
+       if (rc < 0)
+               goto err;
+
+       snprintf(strfid, sizeof(strfid), DFID_NOBRACE, PFID(&hai->hai_dfid));
+       rc = llapi_json_add_item(&json_items, "data_fid",
+                                LLAPI_JSON_STRING, strfid);
+       if (rc < 0)
+               goto err;
+
+       snprintf(strfid, sizeof(strfid), DFID_NOBRACE, PFID(&hai->hai_fid));
+       rc = llapi_json_add_item(&json_items, "source_fid",
+                                LLAPI_JSON_STRING, strfid);
+       if (rc < 0)
+               goto err;
+
+       if (hcp->copy.hc_errval == ECANCELED) {
+               progress_type = CT_CANCEL;
+               goto cancel;
+       }
+
+       if (hcp->copy.hc_errval != 0) {
+               progress_type = CT_ERROR;
+
+               rc = llapi_json_add_item(&json_items, "errno",
+                                        LLAPI_JSON_INTEGER,
+                                        &hcp->copy.hc_errval);
+               if (rc < 0)
+                       goto err;
+
+               rc = llapi_json_add_item(&json_items, "error",
+                                        LLAPI_JSON_STRING,
+                                        strerror(hcp->copy.hc_errval));
+               if (rc < 0)
+                       goto err;
+
+               goto cancel;
+       }
+
+       /* lustre_path isn't available after a restore completes */
+       /* total_bytes isn't available after a restore or archive completes */
+       if (progress_type != CT_FINISH) {
+               rc = llapi_fid2path(hcp->ct_priv->mnt, strfid, lustre_path,
+                                   sizeof(lustre_path), &recno, &linkno);
+               if (rc < 0)
+                       goto err;
+
+               rc = llapi_json_add_item(&json_items, "lustre_path",
+                                        LLAPI_JSON_STRING, lustre_path);
+               if (rc < 0)
+                       goto err;
+
+               rc = llapi_json_add_item(&json_items, "total_bytes",
+                                        LLAPI_JSON_BIGNUM, &total);
+               if (rc < 0)
+                       goto err;
+       }
+
+       if (progress_type == CT_RUNNING)
+               rc = llapi_json_add_item(&json_items, "current_bytes",
+                                        LLAPI_JSON_BIGNUM, &current);
+               if (rc < 0)
+                       goto err;
+
+cancel:
+       rc = llapi_json_add_item(&json_items, "event_type", LLAPI_JSON_STRING,
+                                (char *)llapi_hsm_ct_ev2str(hai->hai_action +
+                                                            progress_type));
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_hsm_write_json_event(&json_items);
+       if (rc < 0)
+               goto err;
+
+       goto out_free;
+
+err:
+       llapi_error(LLAPI_MSG_ERROR, rc, "error in "
+                   "llapi_hsm_log_ct_progress()");
+
+out_free:
+       if (json_items != NULL)
+               llapi_json_destroy_list(&json_items);
+
+       return rc;
+}
+
+/**
+ * Given a path to a FIFO, create a filehandle for nonblocking writes to it.
+ * Intended to be used for copytool monitoring processes that read an
+ * event stream from the FIFO. Events written in the absence of a reader
+ * are lost.
+ *
+ * \param path               Path to monitor FIFO.
+ *
+ * \retval 0 on success.
+ * \retval -errno on error.
+ */
+int llapi_hsm_register_event_fifo(char *path)
+{
+       int read_fd, write_fd;
+       struct stat statbuf;
+
+       /* Create the FIFO if necessary. */
+       if ((mkfifo(path, 0644) < 0) && (errno != EEXIST)) {
+               llapi_error(LLAPI_MSG_ERROR, errno, "mkfifo(%s) failed", path);
+               return -errno;
+       }
+       if (errno == EEXIST) {
+               if (stat(path, &statbuf) < 0) {
+                       llapi_error(LLAPI_MSG_ERROR, errno, "mkfifo(%s) failed",
+                                   path);
+                       return -errno;
+               }
+               if (!S_ISFIFO(statbuf.st_mode) ||
+                   ((statbuf.st_mode & 0777) != 0644)) {
+                       llapi_error(LLAPI_MSG_ERROR, errno, "%s exists but is "
+                                   "not a pipe or has a wrong mode", path);
+                       return -errno;
+               }
+       }
+
+       /* Open the FIFO for read so that the subsequent open for write
+        * doesn't immediately fail. */
+       read_fd = open(path, O_RDONLY | O_NONBLOCK);
+       if (read_fd < 0) {
+               llapi_error(LLAPI_MSG_ERROR, errno,
+                           "cannot open(%s) for read", path);
+               return -errno;
+       }
+
+       /* Open the FIFO for writes, but don't block on waiting
+        * for a reader. */
+       write_fd = open(path, O_WRONLY | O_NONBLOCK);
+       if (write_fd < 0) {
+               llapi_error(LLAPI_MSG_ERROR, errno,
+                           "cannot open(%s) for write", path);
+               return -errno;
+       }
+
+       /* Now close the reader. An external monitoring process can
+        * now open the FIFO for reads. If no reader comes along the
+        * events are lost. NOTE: Only one reader at a time! */
+       close(read_fd);
+
+       llapi_hsm_event_fp = fdopen(write_fd, "w");
+       if (llapi_hsm_event_fp == NULL) {
+               llapi_error(LLAPI_MSG_ERROR, errno,
+                           "cannot fdopen(%s) for write", path);
+               return -errno;
+       }
+
+       /* Ignore SIGPIPEs -- can occur if the reader goes away. */
+       signal(SIGPIPE, SIG_IGN);
+
+       /* Don't buffer the event stream. */
+       setbuf(llapi_hsm_event_fp, NULL);
+
+       return 0;
+}
+
+/**
+ * Given a path to a FIFO, close its filehandle and delete the FIFO.
+ *
+ * \param path               Path to monitor FIFO.
+ *
+ * \retval 0 on success.
+ * \retval -errno on error.
+ */
+int llapi_hsm_unregister_event_fifo(char *path)
+{
+       /* Noop unless the event fp was initialized */
+       if (llapi_hsm_event_fp == NULL)
+               return 0;
+
+       if (fclose(llapi_hsm_event_fp) != 0)
+               return -errno;
+
+       unlink(path);
+
+       llapi_hsm_event_fp = NULL;
+
+       return 0;
+}
+
+/**
+ * Custom logging callback to be used when a monitoring FIFO has been
+ * registered. Formats log entries as JSON events suitable for
+ * consumption by a copytool monitoring process.
+ *
+ * \param level              The message loglevel.
+ * \param _rc                The returncode associated with the message.
+ * \param fmt                The message format string.
+ * \param args               Arguments to be formatted by the format string.
+ *
+ * \retval None.
+ */
+void llapi_hsm_log_error(enum llapi_message_level level, int _rc,
+                        const char *fmt, va_list args)
+{
+       int                             rc;
+       int                             msg_len;
+       int                             real_level;
+       char                            *msg = NULL;
+       va_list                         args2;
+       struct llapi_json_item_list     *json_items;
+
+       /* Noop unless the event fp was initialized */
+       if (llapi_hsm_event_fp == NULL)
+               return;
+
+       rc = llapi_json_init_list(&json_items);
+       if (rc < 0)
+               goto err;
+
+       if ((level & LLAPI_MSG_NO_ERRNO) == 0) {
+               rc = llapi_json_add_item(&json_items, "errno",
+                                        LLAPI_JSON_INTEGER,
+                                        &_rc);
+               if (rc < 0)
+                       goto err;
+
+               rc = llapi_json_add_item(&json_items, "error",
+                                        LLAPI_JSON_STRING,
+                                        strerror(abs(_rc)));
+               if (rc < 0)
+                       goto err;
+       }
+
+       va_copy(args2, args);
+       msg_len = vsnprintf(NULL, 0, fmt, args2) + 1;
+       va_end(args2);
+       if (msg_len >= 0) {
+               msg = (char *) alloca(msg_len);
+               if (msg == NULL) {
+                       rc = -ENOMEM;
+                       goto err;
+               }
+
+               rc = vsnprintf(msg, msg_len, fmt, args);
+               if (rc < 0)
+                       goto err;
+
+               rc = llapi_json_add_item(&json_items, "message",
+                                        LLAPI_JSON_STRING,
+                                        msg);
+               if (rc < 0)
+                       goto err;
+       } else {
+               rc = llapi_json_add_item(&json_items, "message",
+                                        LLAPI_JSON_STRING,
+                                        "INTERNAL ERROR: message failed");
+               if (rc < 0)
+                       goto err;
+       }
+
+       real_level = level & LLAPI_MSG_NO_ERRNO;
+       real_level = real_level > 0 ? level - LLAPI_MSG_NO_ERRNO : level;
+
+       rc = llapi_json_add_item(&json_items, "level", LLAPI_JSON_STRING,
+                                (void *)llapi_msg_level2str(real_level));
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_json_add_item(&json_items, "event_type", LLAPI_JSON_STRING,
+                                "LOGGED_MESSAGE");
+       if (rc < 0)
+               goto err;
+
+       rc = llapi_hsm_write_json_event(&json_items);
+       if (rc < 0)
+               goto err;
+
+       goto out_free;
+
+err:
+       /* Write directly to stderr to avoid llapi_error, which now
+        * emits JSON event messages. */
+       fprintf(stderr, "\nFATAL ERROR IN llapi_hsm_log_error(): rc %d,", rc);
+
+out_free:
+       if (json_items != NULL)
+               llapi_json_destroy_list(&json_items);
+
+       return;
+}
+
 /** Register a copytool
  * \param[out] priv Opaque private control structure
  * \param mnt Lustre filesystem mount point
@@ -166,6 +714,8 @@ int llapi_hsm_copytool_register(struct hsm_copytool_private **priv,
                rc = 0;
        }
 
+       llapi_hsm_log_ct_registration(&ct, CT_REGISTER);
+
        /* Only the kernel reference keeps the write side open */
        close(ct->kuc.lk_wfd);
        ct->kuc.lk_wfd = LK_NOFD;
@@ -217,6 +767,8 @@ int llapi_hsm_copytool_unregister(struct hsm_copytool_private **priv)
        /* Shut down the kernelcomms */
        libcfs_ukuc_stop(&ct->kuc);
 
+       llapi_hsm_log_ct_registration(&ct, CT_UNREGISTER);
+
        close(ct->open_by_fid_fd);
        close(ct->mnt_fd);
        free(ct->mnt);
@@ -483,6 +1035,8 @@ int llapi_hsm_action_begin(struct hsm_copyaction_private **phcp,
                goto err_out;
        }
 
+       llapi_hsm_log_ct_progress(&hcp, hai, CT_START, 0, 0);
+
 ok_out:
        hcp->magic = CP_PRIV_MAGIC;
        *phcp = hcp;
@@ -563,6 +1117,8 @@ end:
                goto err_cleanup;
        }
 
+       llapi_hsm_log_ct_progress(&hcp, hai, CT_FINISH, 0, 0);
+
 err_cleanup:
        if (!(hcp->data_fd < 0))
                close(hcp->data_fd);
@@ -576,11 +1132,13 @@ err_cleanup:
 /** Notify a progress in processing an HSM action.
  * \param hdl[in,out]   handle returned by llapi_hsm_action_start.
  * \param he[in]        the range of copied data (for copy actions).
+ * \param total[in]     the expected total of copied data (for copy actions).
  * \param hp_flags[in]  HSM progress flags.
  * \return 0 on success.
  */
 int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp,
-                             const struct hsm_extent *he, int hp_flags)
+                             const struct hsm_extent *he, __u64 total,
+                             int hp_flags)
 {
        int                      rc;
        struct hsm_progress      hp;
@@ -607,6 +1165,8 @@ int llapi_hsm_action_progress(struct hsm_copyaction_private *hcp,
        if (rc < 0)
                rc = -errno;
 
+       llapi_hsm_log_ct_progress(&hcp, hai, CT_RUNNING, total, he->length);
+
        return rc;
 }