On 7/14/2021 10:17 AM, ramesh.thomas(a)intel.com wrote:
From: Ramesh Thomas <ramesh.thomas(a)intel.com>
Capture a snapshot of error condition containing cmd_status and the
associated devices. Provide APIs to return the captured details. Handles
both hw and sw error codes stored in sysfs cmd_status field of devices.
Following APIs are added:
- accfg_ctx_get_last_error;
- accfg_ctx_get_last_error_str;
- accfg_ctx_get_last_error_device;
- accfg_ctx_get_last_error_wq;
- accfg_ctx_get_last_error_group;
- accfg_ctx_get_last_error_engine;
Signed-off-by: Ramesh Thomas <ramesh.thomas(a)intel.com>
---
accfg/lib/libaccel-config.sym | 6 ++
accfg/lib/libaccfg.c | 160 ++++++++++++++++++++++++++++++----
accfg/lib/private.h | 9 ++
accfg/libaccel_config.h | 9 +-
4 files changed, 164 insertions(+), 20 deletions(-)
diff --git a/accfg/lib/libaccel-config.sym b/accfg/lib/libaccel-config.sym
index e9b54e2..f44a3ca 100644
--- a/accfg/lib/libaccel-config.sym
+++ b/accfg/lib/libaccel-config.sym
@@ -154,4 +154,10 @@ LIBACCFG_10 {
global:
accfg_wq_get_ats_disable;
accfg_wq_set_ats_disable;
+ accfg_ctx_get_last_error;
+ accfg_ctx_get_last_error_str;
+ accfg_ctx_get_last_error_device;
+ accfg_ctx_get_last_error_wq;
+ accfg_ctx_get_last_error_group;
+ accfg_ctx_get_last_error_engine;
} LIBACCFG_9;
diff --git a/accfg/lib/libaccfg.c b/accfg/lib/libaccfg.c
index bdfcdcf..03dfc92 100644
--- a/accfg/lib/libaccfg.c
+++ b/accfg/lib/libaccfg.c
@@ -27,6 +27,7 @@
#include <accfg/libaccel_config.h>
#include <fnmatch.h>
#include "private.h"
+#include <linux/idxd.h>
#define MDEV_POSTFIX "mdev_supported_types"
#define IDXD_DRIVER_BIND_PATH "/sys/bus/dsa/drivers/idxd"
@@ -70,8 +71,12 @@ ACCFG_EXPORT char *accfg_mdev_basenames[] = {
enum {
ACCFG_CMD_STATUS_MAX = 0x45,
+ ACCFG_CMD_STATUS_ERROR = 0x80010000,
};
+#define SCMD_STAT(x) (((x) & ~IDXD_SCMD_SOFTERR_MASK) >> \
+ IDXD_SCMD_SOFTERR_SHIFT)
+
const char *accfg_device_cmd_status[] = {
[0x0] = "Successful completion",
[0x1] = "Invalid command code",
@@ -105,9 +110,43 @@ const char *accfg_device_cmd_status[] = {
[0x42] = "No interrupt handle available",
[0x43] = "No interrupt handles associated with the index",
[0x44] = "No revoked handles associted with the index",
- [ACCFG_CMD_STATUS_MAX] = "",
+ [ACCFG_CMD_STATUS_MAX] = "Unknown error",
+};
+
+const char *accfg_sw_cmd_status[] = {
+ [0x01] = "Error reading cmd_status (library error)",
+ [SCMD_STAT(IDXD_SCMD_DEV_DMA_ERR)] = "DMA device registration error (driver
error)",
+ [SCMD_STAT(IDXD_SCMD_WQ_NO_GRP)] = "wq error - group not set",
+ [SCMD_STAT(IDXD_SCMD_WQ_NO_NAME)] = "wq error - name not set",
+ [SCMD_STAT(IDXD_SCMD_WQ_NO_SVM)] = "wq error - no SVM support and needed (driver
error)",
This one is a platform configuration error. Not driver error.
+ [SCMD_STAT(IDXD_SCMD_WQ_NO_THRESH)] = "wq error - threshold
not set",
+ [SCMD_STAT(IDXD_SCMD_WQ_PORTAL_ERR)] = "wq portal mapping error (driver
error)",
+ [SCMD_STAT(IDXD_SCMD_WQ_RES_ALLOC_ERR)] = "wq resource allocation error (driver
error)",
+ [SCMD_STAT(IDXD_SCMD_PERCPU_ERR)] = "wq percpu counter allocation error (driver
error)",
+ [SCMD_STAT(IDXD_SCMD_DMA_CHAN_ERR)] = "wq DMA channel registration error (driver
error)",
+ [SCMD_STAT(IDXD_SCMD_CDEV_ERR)] = "wq char dev registration error (driver
error)",
+ [SCMD_STAT(IDXD_SCMD_WQ_NO_SWQ_SUPPORT)] = "wq error - no shared wq support
(driver error)",
This one is a platform configuration error. Not driver error.
> + [SCMD_STAT(IDXD_SCMD_WQ_NONE_CONFIGURED)] = "wq error - no wqs
configured",
> + [SCMD_STAT(IDXD_SCMD_WQ_NO_SIZE)] = "wq error - size not set",
> };
>
> +static void save_last_error(struct accfg_device *device, struct accfg_wq *wq,
> + struct accfg_group *group, struct accfg_engine *engine)
> +{
> + struct accfg_ctx *ctx = device->ctx;
> +
> + if (!ctx->error_ctx)
> + ctx->error_ctx = calloc(1, sizeof(struct accfg_error_ctx));
> +
> + if (ctx->error_ctx) {
> + ctx->error_ctx->cmd_status = accfg_device_get_cmd_status(device);
> + ctx->error_ctx->device = device;
> + ctx->error_ctx->wq = wq;
> + ctx->error_ctx->group = group;
> + ctx->error_ctx->engine = engine;
> + }
> +}
> +
> static inline bool is_mdev_registered(struct accfg_device *device)
> {
> return device->mdev_path && !access(device->mdev_path, R_OK);
> @@ -251,6 +290,7 @@ static void free_context(struct accfg_ctx *ctx)
>
> list_for_each_safe(&ctx->devices, device, _b, list)
> free_device(device, &ctx->devices);
> + free(ctx->error_ctx);
> free(ctx);
> }
>
> @@ -1083,6 +1123,7 @@ ACCFG_EXPORT int accfg_create_mdev(struct accfg_device
*device,
> rc = sysfs_write_attr(ctx, mdev_path, uuid_str);
> if (rc < 0) {
> err(ctx, "create mdev failed %d\n", rc);
> + save_last_error(device, NULL, NULL, NULL);
> goto create_err;
> }
>
> @@ -1106,8 +1147,10 @@ static int accfg_device_mdev_remove(struct accfg_device
*device,
> uuid_unparse(mdev->uuid, uuid_str);
> sprintf(mdev_path, "%s/%s/remove", device->mdev_path, uuid_str);
> rc = sysfs_write_attr(ctx, mdev_path, "1");
> - if (rc < 0)
> + if (rc < 0) {
> + save_last_error(device, NULL, NULL, NULL);
> return rc;
> + }
>
> list_del(&mdev->list);
> free(mdev);
> @@ -1356,6 +1399,7 @@ ACCFG_EXPORT enum accfg_device_state accfg_device_get_state(
> err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
> __func__, device->device_path,
> strerror(errno));
> + save_last_error(device, NULL, NULL, NULL);
> return ACCFG_DEVICE_UNKNOWN;
> }
>
> @@ -1415,6 +1459,7 @@ ACCFG_EXPORT int accfg_device_get_clients(struct accfg_device
*device)
> if (sysfs_read_attr(ctx, path, buf) < 0) {
> err(ctx, "%s: retrieve clients failed '%s': %s\n",
> __func__, path, strerror(errno));
> + save_last_error(device, NULL, NULL, NULL);
> return -errno;
> }
>
> @@ -1449,6 +1494,7 @@ ACCFG_EXPORT int accfg_device_set_token_limit(struct
accfg_device *dev, int val)
> if (sysfs_write_attr(ctx, path, buf) < 0) {
> err(ctx, "%s: write failed: %s\n",
> accfg_device_get_devname(dev), strerror(errno));
> + save_last_error(dev, NULL, NULL, NULL);
> return -errno;
> }
>
> @@ -1477,8 +1523,10 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device
*device)
> return 0;
> }
>
> - if (sysfs_read_attr(ctx, path, buf) < 0)
> + if (sysfs_read_attr(ctx, path, buf) < 0) {
> + save_last_error(device, NULL, NULL, NULL);
> return 0;
> + }
>
> if (strcmp(buf, "enabled") == 0)
> return 1;
> @@ -1486,17 +1534,16 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device
*device)
> return 0;
> }
>
> -ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *device)
> +ACCFG_EXPORT unsigned int accfg_device_get_cmd_status(struct accfg_device *device)
> {
> struct accfg_ctx *ctx;
> long status;
> char *path;
> int len;
> char buf[SYSFS_ATTR_SIZE], *end_ptr;
> - int rc;
>
> if (!device)
> - return -EINVAL;
> + return ACCFG_CMD_STATUS_ERROR;
>
> ctx = accfg_device_get_ctx(device);
> path = device->device_buf;
> @@ -1505,31 +1552,96 @@ ACCFG_EXPORT int accfg_device_get_cmd_status(struct
accfg_device *device)
> if (snprintf(path, len, "%s/cmd_status", device->device_path) >=
len) {
> err(ctx, "%s: buffer too small!\n",
> accfg_device_get_devname(device));
> - return -ENOMEM;
> + return ACCFG_CMD_STATUS_ERROR;
> }
>
> - rc = sysfs_read_attr(ctx, path, buf);
> - if (rc < 0)
> - return rc;
> + if (sysfs_read_attr(ctx, path, buf))
> + return ACCFG_CMD_STATUS_ERROR;
>
> status = strtol(buf, &end_ptr, 0);
> if (errno == ERANGE || end_ptr == buf)
> - return -EIO;
> + return ACCFG_CMD_STATUS_ERROR;
>
> - return (int)status;
> + return (unsigned int)status;
> }
>
> -ACCFG_EXPORT const char * accfg_device_get_cmd_status_str(struct accfg_device
*device)
> +static const char *get_cmd_status_str(unsigned int status)
> {
> - int status;
> + unsigned int sw_status;
>
> - status = accfg_device_get_cmd_status(device);
> - if (status < 0 || status >= ACCFG_CMD_STATUS_MAX)
> - return NULL;
> + if (status & IDXD_SCMD_SOFTERR_MASK) {
> + sw_status = SCMD_STAT(status);
> + if (sw_status) {
> + if (sw_status >= (sizeof(accfg_sw_cmd_status) /
> + sizeof(char *))) {
> + status = ACCFG_CMD_STATUS_MAX;
> + goto ext;
> + }
> + return accfg_sw_cmd_status[sw_status];
> + }
> + status &= ~IDXD_SCMD_SOFTERR_MASK;
> + }
>
> + if (status > ACCFG_CMD_STATUS_MAX)
> + status = ACCFG_CMD_STATUS_MAX;
> +
> +ext:
> return accfg_device_cmd_status[status];
> }
>
> +ACCFG_EXPORT const char *accfg_device_get_cmd_status_str(struct accfg_device
*device)
> +{
> + return get_cmd_status_str(accfg_device_get_cmd_status(device));
> +}
> +
> +ACCFG_EXPORT unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->cmd_status;
> +
> + return 0;
> +}
> +
> +ACCFG_EXPORT const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx)
> +{
> + return get_cmd_status_str(accfg_ctx_get_last_error(ctx));
> +}
> +
> +ACCFG_EXPORT struct accfg_device *accfg_ctx_get_last_error_device(
> + struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->device;
> +
> + return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->wq;
> +
> + return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_group *accfg_ctx_get_last_error_group(
> + struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->group;
> +
> + return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_engine *accfg_ctx_get_last_error_engine(
> + struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->engine;
> +
> + return NULL;
> +}
> +
> /* Helper function to validate device type in the defined device array based on
> * device name */
> ACCFG_EXPORT int accfg_device_type_validate(const char *dev_name)
> @@ -1633,6 +1745,7 @@ static int accfg_device_control(struct accfg_device *device,
> rc = sysfs_write_attr(ctx, path, accfg_device_get_devname(device));
> free(path);
> if (rc < 0) {
> + save_last_error(device, NULL, NULL, NULL);
> return rc;
> }
> }
> @@ -1722,8 +1835,10 @@ ACCFG_EXPORT uint64_t accfg_group_get_available_size(
> return ULLONG_MAX;
> }
>
> - if (sysfs_read_attr(ctx, path, buf) < 0)
> + if (sysfs_read_attr(ctx, path, buf) < 0) {
> + save_last_error(group->device, NULL, group, NULL);
> return ULLONG_MAX;
> + }
>
> return strtoull(buf, NULL, 0);
> }
> @@ -1749,6 +1864,7 @@ ACCFG_EXPORT int accfg_group_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_group_get_devname(group), \
> strerror(errno)); \
> + save_last_error(group->device, NULL, group, NULL); \
> return -errno; \
> } \
> group->field = val; \
> @@ -1907,6 +2023,7 @@ ACCFG_EXPORT int accfg_wq_get_clients(struct accfg_wq *wq)
> if (sysfs_read_attr(ctx, path, buf) < 0) {
> err(ctx, "%s: retrieve clients failed: '%s': %s\n",
> __func__, wq->wq_path, strerror(errno));
> + save_last_error(wq->device, wq, NULL, NULL);
> return -errno;
> }
>
> @@ -1994,6 +2111,7 @@ static int accfg_wq_control(struct accfg_wq *wq, enum
accfg_control_flag flag,
> if (path) {
> rc = sysfs_write_attr(ctx, path, wq_name);
> if (rc < 0) {
> + save_last_error(wq->device, wq, NULL, NULL);
> free(path);
> return rc;
> }
> @@ -2038,6 +2156,7 @@ ACCFG_EXPORT enum accfg_wq_state accfg_wq_get_state(struct
accfg_wq *wq)
> if (sysfs_read_attr(ctx, path, read_state) < 0) {
> err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
> __func__, wq->wq_path, strerror(errno));
> + save_last_error(wq->device, wq, NULL, NULL);
> return ACCFG_WQ_UNKNOWN;
> }
>
> @@ -2163,6 +2282,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_wq_get_devname(wq), \
> strerror(errno)); \
> + save_last_error(wq->device, wq, NULL, NULL); \
> return -errno; \
> } \
> wq->field = val; \
> @@ -2202,6 +2322,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_wq_get_devname(wq), \
> strerror(errno)); \
> + save_last_error(wq->device, wq, NULL, NULL); \
> return -errno; \
> } \
> wq->field = val; \
> @@ -2237,6 +2358,7 @@ ACCFG_EXPORT int accfg_wq_set_str_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_wq_get_devname(wq), \
> strerror(errno)); \
> + save_last_error(wq->device, wq, NULL, NULL); \
> return -errno; \
> } \
> if (wq->field) \
> @@ -2275,6 +2397,7 @@ ACCFG_EXPORT int accfg_wq_set_str_type(struct accfg_wq *wq,
const char *val)
> err(ctx, "%s: write failed: %s\n",
> accfg_wq_get_devname(wq),
> strerror(errno));
> + save_last_error(wq->device, wq, NULL, NULL);
> return -errno;
> }
>
> @@ -2384,6 +2507,7 @@ ACCFG_EXPORT int accfg_engine_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_engine_get_devname(engine), \
> strerror(errno)); \
> + save_last_error(engine->device, NULL, NULL, engine); \
> return -errno; \
> } \
> engine->field = val; \
> diff --git a/accfg/lib/private.h b/accfg/lib/private.h
> index b28fd40..57aa1ed 100644
> --- a/accfg/lib/private.h
> +++ b/accfg/lib/private.h
> @@ -131,6 +131,14 @@ struct accfg_wq {
>
> #define ACCFG_EXPORT __attribute__ ((visibility("default")))
>
> +struct accfg_error_ctx {
> + unsigned int cmd_status;
> + struct accfg_device *device;
> + struct accfg_group *group;
> + struct accfg_wq *wq;
> + struct accfg_engine *engine;
> +};
> +
> /**
> * struct accfg_ctx - library user context to find device instances
> *
> @@ -149,6 +157,7 @@ struct accfg_ctx {
> uint64_t timeout;
> void *private_data;
> bool compat;
> + struct accfg_error_ctx *error_ctx;
> };
>
> static inline int check_udev(struct udev *udev)
> diff --git a/accfg/libaccel_config.h b/accfg/libaccel_config.h
> index c0b1f16..f8b9bf9 100644
> --- a/accfg/libaccel_config.h
> +++ b/accfg/libaccel_config.h
> @@ -194,9 +194,14 @@ unsigned int accfg_device_get_version(struct accfg_device
*device);
> int accfg_device_get_clients(struct accfg_device *device);
> int accfg_device_set_token_limit(struct accfg_device *dev, int val);
> int accfg_device_is_active(struct accfg_device *device);
> -int accfg_device_get_cmd_status(struct accfg_device *device);
> +unsigned int accfg_device_get_cmd_status(struct accfg_device *device);
> const char *accfg_device_get_cmd_status_str(struct accfg_device *device);
> -
> +unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx);
> +const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx);
> +struct accfg_device *accfg_ctx_get_last_error_device(struct accfg_ctx *ctx);
> +struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx);
> +struct accfg_group *accfg_ctx_get_last_error_group(struct accfg_ctx *ctx);
> +struct accfg_engine *accfg_ctx_get_last_error_engine(struct accfg_ctx *ctx);
> struct accfg_device_mdev;
> struct accfg_device_mdev *accfg_device_first_mdev(struct accfg_device *device);
> struct accfg_device_mdev *accfg_device_next_mdev(struct accfg_device_mdev *mdev);