On Mon, Jun 18, 2018 at 1:26 AM, QI Fuli <qi.fuli(a)jp.fujitsu.com> wrote:
Ndctl monitor is used for monitoring the smart events of nvdimm
DIMMs.
When a smart event fires, monitor will output the notifications which
including dimm health status and event infomations to syslog, stderr
or a logfile by setting [--logfile] option. The notifications follow
json format and can be consumed by log collectors like Fluentd.
DIMMsto monitor can be selected by [--dimm] [--region] [--namespace]
[--bus] options, and event type can be filtered by [--dimm-event] option,
these options support mutiple space-seperated arguments.
Ndctl monitor can be forked as a daemon by using [--daemon] option, like:
# ndctl monitor --daemon --logfile /var/log/ndctl/monitor.log
This is looking good, just some some small cleanups / refactoring below:
Signed-off-by: QI Fuli <qi.fuli(a)jp.fujitsu.com>
---
builtin.h | 1 +
ndctl/Makefile.am | 3 +-
ndctl/monitor.c | 532 ++++++++++++++++++++++++++++++++++++++++++++++
ndctl/ndctl.c | 1 +
4 files changed, 536 insertions(+), 1 deletion(-)
create mode 100644 ndctl/monitor.c
diff --git a/builtin.h b/builtin.h
index d3cc723..675a6ce 100644
--- a/builtin.h
+++ b/builtin.h
@@ -39,6 +39,7 @@ int cmd_inject_error(int argc, const char **argv, void *ctx);
int cmd_wait_scrub(int argc, const char **argv, void *ctx);
int cmd_start_scrub(int argc, const char **argv, void *ctx);
int cmd_list(int argc, const char **argv, void *ctx);
+int cmd_monitor(int argc, const char **argv, void *ctx);
#ifdef ENABLE_TEST
int cmd_test(int argc, const char **argv, void *ctx);
#endif
diff --git a/ndctl/Makefile.am b/ndctl/Makefile.am
index d22a379..7dbf223 100644
--- a/ndctl/Makefile.am
+++ b/ndctl/Makefile.am
@@ -16,7 +16,8 @@ ndctl_SOURCES = ndctl.c \
util/json-smart.c \
util/json-firmware.c \
inject-error.c \
- inject-smart.c
+ inject-smart.c \
+ monitor.c
if ENABLE_DESTRUCTIVE
ndctl_SOURCES += ../test/blk_namespaces.c \
diff --git a/ndctl/monitor.c b/ndctl/monitor.c
new file mode 100644
index 0000000..12aa499
--- /dev/null
+++ b/ndctl/monitor.c
@@ -0,0 +1,532 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2018, FUJITSU LIMITED. All rights reserved. */
+
+#include <stdio.h>
+#include <json-c/json.h>
+#include <libgen.h>
+#include <dirent.h>
+#include <util/log.h>
+#include <util/json.h>
+#include <util/filter.h>
+#include <util/util.h>
+#include <util/parse-options.h>
+#include <util/strbuf.h>
+#include <ndctl/lib/private.h>
+#include <ndctl/libndctl.h>
+#include <sys/stat.h>
+#include <sys/epoll.h>
+#define BUF_SIZE 2048
+
+#define DIMM_SPARES_REMAINING (1 << 0)
+#define DIMM_MEDIA_TEMPERATURE (1 << 1)
+#define DIMM_CTRL_TEMPERATURE (1 << 2)
+#define DIMM_HEALTH_STATE (1 << 3)
+#define DIMM_UNCLEAN_SHUTDOWN (1 << 4)
+
+static enum log_destination {
+ LOG_DESTINATION_STDERR = 1,
+ LOG_DESTINATION_SYSLOG = 2,
+ LOG_DESTINATION_FILE = 3,
+} log_destination = LOG_DESTINATION_SYSLOG;
+
+static struct {
+ const char *logfile;
+ const char *dimm_event;
+ bool daemon;
+} monitor;
+
+struct monitor_dimm {
+ struct ndctl_dimm *dimm;
+ int health_eventfd;
+ unsigned int health;
+ unsigned int event_flags;
+ struct list_node list;
+};
+
+struct monitor_filter_arg {
+ struct list_head dimms;
+ int maxfd_dimm;
+ int num_dimm;
+ unsigned long flags;
+};
+
+struct util_filter_params param;
+
+static int did_fail;
+
+#define fail(fmt, ...) \
+do { \
+ did_fail = 1; \
+ err((struct ndctl_ctx *)ctx, "ndctl-%s:%s:%d: " fmt, \
+ VERSION, __func__, __LINE__, ##__VA_ARGS__); \
+} while (0)
+
+static bool is_dir(char *filepath)
+{
+ DIR *dir = opendir(filepath);
+ if (dir) {
+ closedir(dir);
+ return true;
+ }
+ return false;
+}
+
+static void logreport(struct ndctl_ctx *ctx, int priority, const char *file,
+ int line, const char *fn, const char *format, va_list args)
Rather than have this single 'logreport' routine that then turns
around and looks at 'log_destination' with a switch statement, just
install the proper log function directly.
+{
+ char *log_dir;
+ FILE *f;
+ char *buf = (char *)malloc(BUF_SIZE);
No need to cast the return value from malloc() and please put a space
between variable declarations and code.
+ vsnprintf(buf, BUF_SIZE, format, args);
+
+ switch (log_destination) {
+ case LOG_DESTINATION_STDERR:
+ fprintf(stderr, "%s\n", buf);
+ goto end;
For example, we don't need this one because log_stderr is the default
that is setup by log_init() via ndctl_new().
+
+ case LOG_DESTINATION_SYSLOG:
+ syslog(priority, "%s\n", buf);
+ goto end;
...create a log_syslog()
+
+ case LOG_DESTINATION_FILE:
+ log_dir = dirname(strdup(monitor.logfile));
+ if (!log_dir) {
+ log_destination = LOG_DESTINATION_SYSLOG;
+ fail("strdup %s failed\n%s", monitor.logfile, buf);
+ goto end;
+ }
+ if (!is_dir(log_dir)) {
+ log_destination = LOG_DESTINATION_SYSLOG;
+ fail("dir: %s is required\n%s", log_dir, buf);
+ free(log_dir);
+ goto end;
+ }
+ f = fopen(monitor.logfile, "a+");
+ if (!f) {
+ log_destination = LOG_DESTINATION_SYSLOG;
+ fail("open logfile %s failed\n%s", monitor.logfile,
buf);
+ free(log_dir);
+ goto end;
+ }
+ fprintf(f, "%s\n", buf);
+ fclose(f);
+ free(log_dir);
+ goto end;
+ }
...create a log_file() routine for this case.
+end:
+ free(buf);
+ return;
+}
+
+static int get_monitor_dimm_data(struct monitor_dimm *mdimm)
+{
+ struct ndctl_cmd *cmd;
+ unsigned int alarm_flags;
+
+ cmd = ndctl_dimm_cmd_new_smart(mdimm->dimm);
+ if (!cmd)
+ goto out;
+
+ if (ndctl_cmd_submit(cmd))
+ goto out;
+
+ mdimm->health_eventfd = ndctl_dimm_get_health_eventfd(mdimm->dimm);
+
+ alarm_flags = ndctl_cmd_smart_get_alarm_flags(cmd);
+ if (alarm_flags & ND_SMART_SPARE_TRIP)
+ mdimm->event_flags |= DIMM_SPARES_REMAINING;
+ if (alarm_flags & ND_SMART_MTEMP_TRIP)
+ mdimm->event_flags |= DIMM_MEDIA_TEMPERATURE;
+ if (alarm_flags & ND_SMART_CTEMP_TRIP)
+ mdimm->event_flags |= DIMM_CTRL_TEMPERATURE;
+
+ mdimm->health = ndctl_cmd_smart_get_health(cmd);
+
+ if (ndctl_cmd_smart_get_shutdown_state(cmd))
+ mdimm->event_flags |= DIMM_UNCLEAN_SHUTDOWN;
+
+ ndctl_cmd_unref(cmd);
+ return 0;
+out:
+ if (cmd)
+ ndctl_cmd_unref(cmd);
+ return 1;
+}
+
+static struct json_object *dimm_event_to_json(struct monitor_dimm *mdimm)
+{
+ struct json_object *jevent, *jobj, *jdimm_health, *jdimm_detect;
+ struct ndctl_ctx *ctx = ndctl_dimm_get_ctx(mdimm->dimm);
+
+ jevent = json_object_new_object();
+ if (!jevent) {
+ fail("\n");
+ return NULL;
+ }
+
+ jdimm_health = json_object_new_array();
+ jdimm_detect = json_object_new_array();
+ if (!jdimm_health || !jdimm_detect) {
+ fail("\n");
+ goto out;
+ }
+
+ if (mdimm->event_flags & DIMM_SPARES_REMAINING) {
+ jobj = json_object_new_string("dimm-spares-remaining");
+ if (!jobj)
+ goto out;
+ json_object_array_add(jdimm_health, jobj);
+ }
+ if (mdimm->event_flags & DIMM_MEDIA_TEMPERATURE) {
+ jobj = json_object_new_string("dimm-media-temperature");
+ if (!jobj)
+ goto out;
+ json_object_array_add(jdimm_health, jobj);
+ }
+ if (mdimm->event_flags & DIMM_CTRL_TEMPERATURE) {
+ jobj = json_object_new_string("dimm-controller-temperature");
+ if (!jobj)
+ goto out;
+ json_object_array_add(jdimm_health, jobj);
+ }
+ if (mdimm->event_flags & DIMM_HEALTH_STATE) {
+ jobj = json_object_new_string("dimm-health-state");
+ if (!jobj)
+ goto out;
+ json_object_array_add(jdimm_health, jobj);
+ }
+ if (mdimm->event_flags & DIMM_UNCLEAN_SHUTDOWN) {
+ jobj = json_object_new_string("dimm-unclean-shutdown");
+ if (!jobj)
+ goto out;
+ json_object_array_add(jdimm_detect, jobj);
+ }
+
+ if (json_object_array_length(jdimm_health) > 0)
+ json_object_object_add(jevent, "dimm-health", jdimm_health);
+ else
+ free (jdimm_health);
+
+ if (json_object_array_length(jdimm_detect) > 0)
+ json_object_object_add(jevent, "dimm-detect", jdimm_detect);
+ else
+ free (jdimm_detect);
+
+ return jevent;
+out:
+ if (jdimm_health)
+ free(jdimm_health);
+ if (jdimm_detect)
+ free(jdimm_detect);
+ free(jevent);
+ return NULL;
+}
Is this routine trying to show which events fired? I think these
should be json-boolean types where the value is true / false and the
key is the event-type name.
+
+static int notify_dimm_event(struct monitor_dimm *mdimm)
+{
+ struct json_object *jmsg, *jdimm, *jobj;
+ struct timespec ts;
+ char timestamp[32];
+ struct ndctl_ctx *ctx = ndctl_dimm_get_ctx(mdimm->dimm);
+
+ jmsg = json_object_new_object();
+ if (!jmsg) {
+ fail("\n");
+ return -1;
+ }
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ sprintf(timestamp, "%10ld.%09ld", ts.tv_sec, ts.tv_nsec);
+ jobj = json_object_new_string(timestamp);
+ if (!jobj) {
+ fail("\n");
+ return -1;
+ }
+ json_object_object_add(jmsg, "timestamp", jobj);
+
+ jobj = json_object_new_int(getpid());
+ if (!jobj) {
+ fail("\n");
+ return -1;
+ }
+ json_object_object_add(jmsg, "pid", jobj);
+
+ jobj = dimm_event_to_json(mdimm);
+ if (!jobj) {
+ fail("\n");
+ return -1;
+ }
+ json_object_object_add(jmsg, "event", jobj);
+
+ jdimm = util_dimm_to_json(mdimm->dimm, 0);
+ if (!jdimm) {
+ fail("\n");
+ return -1;
+ }
+ json_object_object_add(jmsg, "dimm", jdimm);
+
+ jobj = util_dimm_health_to_json(mdimm->dimm);
+ if (!jobj) {
+ fail("\n");
+ return -1;
+ }
+ json_object_object_add(jdimm, "health", jobj);
+
+ notice(ctx, "%s",
+ json_object_to_json_string_ext(jmsg, JSON_C_TO_STRING_PLAIN));
+ return 0;
+}
Where is the memory released for the newly created json objects in this routine?
+
+static struct monitor_dimm *util_dimm_event_filter(struct monitor_dimm *mdimm,
+ const char *__ident)
+{
+ char *ident, *save;
+ const char *name;
+ struct monitor_dimm *__mdimm;
+ struct ndctl_ctx *ctx = ndctl_dimm_get_ctx(mdimm->dimm);
+
+ __mdimm = calloc(1, sizeof(struct monitor_dimm));
+ if (!__mdimm) {
+ fail("\n");
+ return NULL;
+ }
+
+ __mdimm->dimm = mdimm->dimm;
+ if (get_monitor_dimm_data(__mdimm)) {
+ fail("\n");
+ goto out;
+ }
+ if (mdimm->health != __mdimm->health)
+ __mdimm->event_flags |= DIMM_HEALTH_STATE;
+
+ if (!__ident)
+ return __mdimm;
+
+ ident = strdup(__ident);
+ if (!ident)
+ goto out;
+
+ for (name = strtok_r(ident, " ", &save); name;
+ name = strtok_r(NULL, " ", &save)) {
+ if (strcmp(name, "all") == 0)
+ break;
+
+ if (strcmp(name, "dimm-spares-remaining") == 0) {
+ if (__mdimm->event_flags & DIMM_SPARES_REMAINING)
+ break;
+ } else if (strcmp(name, "dimm-media-temperature") == 0) {
+ if (__mdimm->event_flags & DIMM_MEDIA_TEMPERATURE)
+ break;
+ } else if (strcmp(name, "dimm-controller-temperature") == 0) {
+ if (__mdimm->event_flags & DIMM_CTRL_TEMPERATURE)
+ break;
+ } else if (strcmp(name, "dimm-health-state") == 0) {
+ if (__mdimm->event_flags & DIMM_HEALTH_STATE)
+ break;
+ } else if (strcmp(name, "dimm-unclean-shutdown") == 0) {
+ if (__mdimm->event_flags & DIMM_UNCLEAN_SHUTDOWN)
+ break;
+ }
+ }
+ free(ident);
+
+ if (name)
+ return __mdimm;
+out:
+ free(__mdimm);
+ return NULL;
+}
+
+static bool filter_region(struct ndctl_region *region,
+ struct util_filter_ctx *ctx)
+{
+ return true;
+}
+
+static void filter_dimm(struct ndctl_dimm *dimm, struct util_filter_ctx *ctx)
+{
+ struct monitor_dimm *mdimm, *__mdimm;
+ struct monitor_filter_arg *mfa = (struct monitor_filter_arg *)ctx->arg;
+
+ if (!ndctl_dimm_is_cmd_supported(dimm, ND_CMD_SMART_THRESHOLD))
+ return;
+
+ if (!ndctl_dimm_is_cmd_supported(dimm, ND_CMD_SMART))
+ return;
+
+ mdimm = calloc(1, sizeof(struct monitor_dimm));
+ if (!mdimm)
+ return;
+
+ mdimm->dimm = dimm;
+ if (get_monitor_dimm_data(mdimm))
+ goto out;
+
+ if (mdimm->event_flags) {
+ __mdimm = util_dimm_event_filter(mdimm, monitor.dimm_event);
+ if (__mdimm) {
+ if (notify_dimm_event(__mdimm))
+ fail("notify dimm event failed\n");
+ free(__mdimm);
+ }
+ }
+
+ list_add_tail(&mfa->dimms, &mdimm->list);
+ if (mdimm->health_eventfd > mfa->maxfd_dimm)
+ mfa->maxfd_dimm = mdimm->health_eventfd;
+ mfa->num_dimm++;
+ return;
+out:
+ free(mdimm);
+ return;
+}
+
+static bool filter_bus(struct ndctl_bus *bus, struct util_filter_ctx *ctx)
+{
+ return true;
+}
+
+static int monitor_event(struct ndctl_ctx *ctx,
+ struct monitor_filter_arg *mfa)
+{
+ struct epoll_event ev, *events;
+ int nfds, epollfd, i, rc;
+ struct monitor_dimm *mdimm, *__mdimm;
+ char buf;
+
+ events = malloc(sizeof(struct epoll_event) * mfa->num_dimm);
Let's use calloc here.
+ if (!events) {
+ fail("malloc for events error\n");
+ goto out;
+ }
+ epollfd = epoll_create1(0);
+ if (epollfd == -1) {
+ fail("epoll_create1 error\n");
+ goto out;
+ }
+ list_for_each(&mfa->dimms, mdimm, list) {
+ memset(&ev, 0, sizeof(ev));
+ rc = pread(mdimm->health_eventfd, &buf, sizeof(buf), 0);
+ if (rc < 0) {
+ fail("pread error\n");
+ goto out;
+ }
+ ev.data.ptr = mdimm;
+ if (epoll_ctl(epollfd, EPOLL_CTL_ADD,
+ mdimm->health_eventfd, &ev) != 0) {
+ fail("epoll_ctl error\n");
+ goto out;
+ }
+ }
+
+ while (1) {
+ did_fail = 0;
+ nfds = epoll_wait(epollfd, events, mfa->num_dimm, -1);
+ if (nfds <= 0) {
+ fail("epoll_wait error\n");
+ goto out;
+ }
+ for (i = 0; i < nfds; i++) {
+ mdimm = events[i].data.ptr;
+ __mdimm = util_dimm_event_filter(mdimm,
+ monitor.dimm_event);
+ if (__mdimm) {
+ if (notify_dimm_event(__mdimm))
+ fail("notify dimm event failed\n");
+ free(__mdimm);
+ }
+ rc = pread(mdimm->health_eventfd, &buf, sizeof(buf), 0);
+ if (rc < 0) {
+ fail("pread error\n");
+ goto out;
+ }
+ }
+ if (did_fail)
+ goto out;
+ }
+ return 0;
+out:
+ return 1;
+}
+
+int cmd_monitor(int argc, const char **argv, void *ctx)
+{
+ int i;
+ const struct option options[] = {
+ OPT_STRING('b', "bus", ¶m.bus,
"bus-id", "filter by bus"),
+ OPT_STRING('r', "region", ¶m.region,
"region-id",
+ "filter by region"),
+ OPT_STRING('d', "dimm", ¶m.dimm,
"dimm-id",
+ "filter by dimm"),
+ OPT_STRING('n', "namespace", ¶m.namespace,
+ "namespace-id", "filter by namespace
id"),
+ OPT_FILENAME('l', "logfile", &monitor.logfile,
+ "file | syslog | stderr",
+ "where to output the monitor's
notification"),
+ OPT_BOOLEAN('f', "daemon", &monitor.daemon,
+ "run ndctl monitor as a daemon"),
+ OPT_STRING('D', "dimm-event", &monitor.dimm_event,
+ "dimm-spares-remaining | dimm-media-temperature |
dimm-controller-temperature | dimm-health-state | dimm-unclean-shutdown",
+ "filter by DIMM event type"),
+ OPT_END(),
+ };
+ const char * const u[] = {
+ "ndctl monitor [<options>]",
+ NULL
+ };
+ const char *prefix = "./";
+ struct util_filter_ctx fctx = { 0 };
+ struct monitor_filter_arg mfa = { 0 };
+
+ argc = parse_options_prefix(argc, argv, prefix, options, u, 0);
+ for (i = 0; i < argc; i++) {
+ error("unknown parameter \"%s\"\n", argv[i]);
+ }
+ if (argc)
+ usage_with_options(u, options);
+
+ ndctl_set_log_fn((struct ndctl_ctx *)ctx, logreport);
+ ndctl_set_log_priority((struct ndctl_ctx *)ctx, LOG_INFO);
+
+ if (monitor.logfile) {
+ if (strcmp(monitor.logfile, "./stderr") == 0)
+ log_destination = LOG_DESTINATION_STDERR;
+ else if (strcmp(monitor.logfile, "./syslog") == 0)
+ log_destination = LOG_DESTINATION_SYSLOG;
+ else
+ log_destination = LOG_DESTINATION_FILE;
+ }
+
+ if (monitor.daemon) {
+ if (daemon(0, 0) != 0) {
+ fail("daemon start failed\n");
+ goto out;
+ }
+ info((struct ndctl_ctx *)ctx, "ndctl monitor daemon
started\n");
+ }
+
+ fctx.filter_bus = filter_bus;
+ fctx.filter_dimm = filter_dimm;
+ fctx.filter_region = filter_region;
+ fctx.filter_namespace = NULL;
+ fctx.arg = &mfa;
+ list_head_init(&mfa.dimms);
+ mfa.num_dimm = 0;
+ mfa.maxfd_dimm = -1;
+ mfa.flags = 0;
+
+ if (util_filter_walk(ctx, &fctx, ¶m))
+ goto out;
+
+ if (!mfa.num_dimm) {
+ fail("no dimms can be monitored\n");
+ goto out;
+ }
+
+ if (monitor_event(ctx, &mfa))
+ goto out;
+
+ return 0;
+out:
+ return 1;
+}
diff --git a/ndctl/ndctl.c b/ndctl/ndctl.c
index 7daadeb..73dabfa 100644
--- a/ndctl/ndctl.c
+++ b/ndctl/ndctl.c
@@ -89,6 +89,7 @@ static struct cmd_struct commands[] = {
{ "wait-scrub", cmd_wait_scrub },
{ "start-scrub", cmd_start_scrub },
{ "list", cmd_list },
+ { "monitor", cmd_monitor},
{ "help", cmd_help },
#ifdef ENABLE_TEST
{ "test", cmd_test },
--
2.17.1
_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm(a)lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm