On 7/9/2021 10:47 AM, ramesh.thomas(a)intel.com wrote:
From: Ramesh Thomas <ramesh.thomas(a)intel.com>
Capture a snapshot of error condition containing cmd_status and the
associated devices. Provide APIs to return the captured details. Handles
both hw and sw error codes stored in sysfs cmd_status field of devices.
Following APIs are added:
- accfg_ctx_get_last_error;
- accfg_ctx_get_last_error_str;
- accfg_ctx_get_last_error_device;
- accfg_ctx_get_last_error_wq;
- accfg_ctx_get_last_error_group;
- accfg_ctx_get_last_error_engine;
Signed-off-by: Ramesh Thomas <ramesh.thomas(a)intel.com>
---
accfg/lib/libaccel-config.sym | 10 +++
accfg/lib/libaccfg.c | 156 ++++++++++++++++++++++++++++++----
accfg/lib/private.h | 9 ++
accfg/libaccel_config.h | 9 +-
4 files changed, 164 insertions(+), 20 deletions(-)
diff --git a/accfg/lib/libaccel-config.sym b/accfg/lib/libaccel-config.sym
index aecc98b..679c597 100644
--- a/accfg/lib/libaccel-config.sym
+++ b/accfg/lib/libaccel-config.sym
@@ -149,3 +149,13 @@ LIBACCFG_9 {
global:
accfg_device_get_op_cap;
} LIBACCFG_8;
+
+LIBACCFG_10 {
+global:
+ accfg_ctx_get_last_error;
+ accfg_ctx_get_last_error_str;
+ accfg_ctx_get_last_error_device;
+ accfg_ctx_get_last_error_wq;
+ accfg_ctx_get_last_error_group;
+ accfg_ctx_get_last_error_engine;
+} LIBACCFG_9;
diff --git a/accfg/lib/libaccfg.c b/accfg/lib/libaccfg.c
index f53751e..00314b0 100644
--- a/accfg/lib/libaccfg.c
+++ b/accfg/lib/libaccfg.c
@@ -70,6 +70,7 @@ ACCFG_EXPORT char *accfg_mdev_basenames[] = {
enum {
ACCFG_CMD_STATUS_MAX = 0x45,
+ ACCFG_CMD_STATUS_ERROR = 0x80010000,
};
const char *accfg_device_cmd_status[] = {
@@ -105,9 +106,43 @@ const char *accfg_device_cmd_status[] = {
[0x42] = "No interrupt handle available",
[0x43] = "No interrupt handles associated with the index",
[0x44] = "No revoked handles associted with the index",
- [ACCFG_CMD_STATUS_MAX] = "",
+ [ACCFG_CMD_STATUS_MAX] = "Unknown error",
};
+const char *accfg_sw_cmd_status[] = {
+ [0x01] = "Error reading cmd_status",
+ [0x02] = "DMA device registration error",
+ [0x03] = "wq error - group not set",
+ [0x04] = "wq error - name not set",
+ [0x05] = "wq error - no SVM support and needed",
+ [0x06] = "wq error - threshold not set",
+ [0x07] = "wq portal mapping error",
+ [0x08] = "wq resource allocation error",
+ [0x09] = "wq percpu counter allocation error",
+ [0x0a] = "wq DMA channel registration error",
+ [0x0b] = "wq char dev registration error",
+ [0x0c] = "wq error - no shared wq support",
+ [0x0d] = "wq error - no wqs configured",
+ [0x0e] = "wq error - size not set",
+};
+
+static void save_last_error(struct accfg_device *device, struct accfg_wq *wq,
+ struct accfg_group *group, struct accfg_engine *engine)
+{
+ struct accfg_ctx *ctx = device->ctx;
+
+ if (!ctx->error_ctx)
+ ctx->error_ctx = calloc(1, sizeof(struct accfg_error_ctx));
+
+ if (ctx->error_ctx) {
+ ctx->error_ctx->cmd_status = accfg_device_get_cmd_status(device);
+ ctx->error_ctx->device = device;
+ ctx->error_ctx->wq = wq;
+ ctx->error_ctx->group = group;
+ ctx->error_ctx->engine = engine;
+ }
+}
+
static inline bool is_mdev_registered(struct accfg_device *device)
{
return device->mdev_path && !access(device->mdev_path, R_OK);
@@ -251,6 +286,7 @@ static void free_context(struct accfg_ctx *ctx)
list_for_each_safe(&ctx->devices, device, _b, list)
free_device(device, &ctx->devices);
+ free(ctx->error_ctx);
free(ctx);
}
@@ -1082,6 +1118,7 @@ ACCFG_EXPORT int accfg_create_mdev(struct accfg_device *device,
rc = sysfs_write_attr(ctx, mdev_path, uuid_str);
if (rc < 0) {
err(ctx, "create mdev failed %d\n", rc);
+ save_last_error(device, NULL, NULL, NULL);
goto create_err;
}
@@ -1105,8 +1142,10 @@ static int accfg_device_mdev_remove(struct accfg_device *device,
uuid_unparse(mdev->uuid, uuid_str);
sprintf(mdev_path, "%s/%s/remove", device->mdev_path, uuid_str);
rc = sysfs_write_attr(ctx, mdev_path, "1");
- if (rc < 0)
+ if (rc < 0) {
+ save_last_error(device, NULL, NULL, NULL);
return rc;
+ }
list_del(&mdev->list);
free(mdev);
@@ -1355,6 +1394,7 @@ ACCFG_EXPORT enum accfg_device_state accfg_device_get_state(
err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
__func__, device->device_path,
strerror(errno));
+ save_last_error(device, NULL, NULL, NULL);
return ACCFG_DEVICE_UNKNOWN;
}
@@ -1414,6 +1454,7 @@ ACCFG_EXPORT int accfg_device_get_clients(struct accfg_device
*device)
if (sysfs_read_attr(ctx, path, buf) < 0) {
err(ctx, "%s: retrieve clients failed '%s': %s\n",
__func__, path, strerror(errno));
+ save_last_error(device, NULL, NULL, NULL);
return -errno;
}
@@ -1448,6 +1489,7 @@ ACCFG_EXPORT int accfg_device_set_token_limit(struct accfg_device
*dev, int val)
if (sysfs_write_attr(ctx, path, buf) < 0) {
err(ctx, "%s: write failed: %s\n",
accfg_device_get_devname(dev), strerror(errno));
+ save_last_error(dev, NULL, NULL, NULL);
return -errno;
}
@@ -1476,8 +1518,10 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device
*device)
return 0;
}
- if (sysfs_read_attr(ctx, path, buf) < 0)
+ if (sysfs_read_attr(ctx, path, buf) < 0) {
+ save_last_error(device, NULL, NULL, NULL);
return 0;
+ }
if (strcmp(buf, "enabled") == 0)
return 1;
@@ -1485,17 +1529,16 @@ ACCFG_EXPORT int accfg_device_is_active(struct accfg_device
*device)
return 0;
}
-ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device *device)
+ACCFG_EXPORT unsigned int accfg_device_get_cmd_status(struct accfg_device *device)
{
struct accfg_ctx *ctx;
long status;
char *path;
int len;
char buf[SYSFS_ATTR_SIZE], *end_ptr;
- int rc;
if (!device)
- return -EINVAL;
+ return ACCFG_CMD_STATUS_ERROR;
ctx = accfg_device_get_ctx(device);
path = device->device_buf;
@@ -1504,31 +1547,96 @@ ACCFG_EXPORT int accfg_device_get_cmd_status(struct accfg_device
*device)
if (snprintf(path, len, "%s/cmd_status", device->device_path) >= len)
{
err(ctx, "%s: buffer too small!\n",
accfg_device_get_devname(device));
- return -ENOMEM;
+ return ACCFG_CMD_STATUS_ERROR;
}
- rc = sysfs_read_attr(ctx, path, buf);
- if (rc < 0)
- return rc;
+ if (sysfs_read_attr(ctx, path, buf))
+ return ACCFG_CMD_STATUS_ERROR;
status = strtol(buf, &end_ptr, 0);
if (errno == ERANGE || end_ptr == buf)
- return -EIO;
+ return ACCFG_CMD_STATUS_ERROR;
- return (int)status;
+ return (unsigned int)status;
}
-ACCFG_EXPORT const char * accfg_device_get_cmd_status_str(struct accfg_device *device)
+static const char *get_cmd_status_str(unsigned int status)
{
- int status;
+ unsigned int sw_status;
- status = accfg_device_get_cmd_status(device);
- if (status < 0 || status >= ACCFG_CMD_STATUS_MAX)
- return NULL;
+ if (status & 0x80000000) {
+ sw_status = (status & ~0x80000000) >> 16;
Magic number. Do I need to add a mask define to the header?
> + if (sw_status) {
> + if (sw_status >= (sizeof(accfg_sw_cmd_status) /
> + sizeof(char *))) {
> + status = ACCFG_CMD_STATUS_MAX;
> + goto ext;
> + }
> + return accfg_sw_cmd_status[sw_status];
> + }
> + status &= ~0x80000000;
> + }
> +
> + if (status > ACCFG_CMD_STATUS_MAX)
> + status = ACCFG_CMD_STATUS_MAX;
>
> +ext:
> return accfg_device_cmd_status[status];
> }
>
> +ACCFG_EXPORT const char *accfg_device_get_cmd_status_str(struct accfg_device
*device)
> +{
> + return get_cmd_status_str(accfg_device_get_cmd_status(device));
> +}
> +
> +ACCFG_EXPORT unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->cmd_status;
> +
> + return 0;
> +}
> +
> +ACCFG_EXPORT const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx)
> +{
> + return get_cmd_status_str(accfg_ctx_get_last_error(ctx));
> +}
> +
> +ACCFG_EXPORT struct accfg_device *accfg_ctx_get_last_error_device(
> + struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->device;
> +
> + return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->wq;
> +
> + return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_group *accfg_ctx_get_last_error_group(
> + struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->group;
> +
> + return NULL;
> +}
> +
> +ACCFG_EXPORT struct accfg_engine *accfg_ctx_get_last_error_engine(
> + struct accfg_ctx *ctx)
> +{
> + if (ctx->error_ctx)
> + return ctx->error_ctx->engine;
> +
> + return NULL;
> +}
> +
> /* Helper function to validate device type in the defined device array based on
> * device name */
> ACCFG_EXPORT int accfg_device_type_validate(const char *dev_name)
> @@ -1632,6 +1740,7 @@ static int accfg_device_control(struct accfg_device *device,
> rc = sysfs_write_attr(ctx, path, accfg_device_get_devname(device));
> free(path);
> if (rc < 0) {
> + save_last_error(device, NULL, NULL, NULL);
> return rc;
> }
> }
> @@ -1721,8 +1830,10 @@ ACCFG_EXPORT uint64_t accfg_group_get_available_size(
> return ULLONG_MAX;
> }
>
> - if (sysfs_read_attr(ctx, path, buf) < 0)
> + if (sysfs_read_attr(ctx, path, buf) < 0) {
> + save_last_error(group->device, NULL, group, NULL);
> return ULLONG_MAX;
> + }
>
> return strtoull(buf, NULL, 0);
> }
> @@ -1748,6 +1859,7 @@ ACCFG_EXPORT int accfg_group_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_group_get_devname(group), \
> strerror(errno)); \
> + save_last_error(group->device, NULL, group, NULL); \
> return -errno; \
> } \
> group->field = val; \
> @@ -1906,6 +2018,7 @@ ACCFG_EXPORT int accfg_wq_get_clients(struct accfg_wq *wq)
> if (sysfs_read_attr(ctx, path, buf) < 0) {
> err(ctx, "%s: retrieve clients failed: '%s': %s\n",
> __func__, wq->wq_path, strerror(errno));
> + save_last_error(wq->device, wq, NULL, NULL);
> return -errno;
> }
>
> @@ -1993,6 +2106,7 @@ static int accfg_wq_control(struct accfg_wq *wq, enum
accfg_control_flag flag,
> if (path) {
> rc = sysfs_write_attr(ctx, path, wq_name);
> if (rc < 0) {
> + save_last_error(wq->device, wq, NULL, NULL);
> free(path);
> return rc;
> }
> @@ -2037,6 +2151,7 @@ ACCFG_EXPORT enum accfg_wq_state accfg_wq_get_state(struct
accfg_wq *wq)
> if (sysfs_read_attr(ctx, path, read_state) < 0) {
> err(ctx, "%s: sysfs_read_attr failed '%s': %s\n",
> __func__, wq->wq_path, strerror(errno));
> + save_last_error(wq->device, wq, NULL, NULL);
> return ACCFG_WQ_UNKNOWN;
> }
>
> @@ -2162,6 +2277,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_wq_get_devname(wq), \
> strerror(errno)); \
> + save_last_error(wq->device, wq, NULL, NULL); \
> return -errno; \
> } \
> wq->field = val; \
> @@ -2200,6 +2316,7 @@ ACCFG_EXPORT int accfg_wq_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_wq_get_devname(wq), \
> strerror(errno)); \
> + save_last_error(wq->device, wq, NULL, NULL); \
> return -errno; \
> } \
> wq->field = val; \
> @@ -2235,6 +2352,7 @@ ACCFG_EXPORT int accfg_wq_set_str_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_wq_get_devname(wq), \
> strerror(errno)); \
> + save_last_error(wq->device, wq, NULL, NULL); \
> return -errno; \
> } \
> if (wq->field) \
> @@ -2273,6 +2391,7 @@ ACCFG_EXPORT int accfg_wq_set_str_type(struct accfg_wq *wq,
const char *val)
> err(ctx, "%s: write failed: %s\n",
> accfg_wq_get_devname(wq),
> strerror(errno));
> + save_last_error(wq->device, wq, NULL, NULL);
> return -errno;
> }
>
> @@ -2381,6 +2500,7 @@ ACCFG_EXPORT int accfg_engine_set_##field( \
> err(ctx, "%s: write failed: %s\n", \
> accfg_engine_get_devname(engine), \
> strerror(errno)); \
> + save_last_error(engine->device, NULL, NULL, engine); \
> return -errno; \
> } \
> engine->field = val; \
> diff --git a/accfg/lib/private.h b/accfg/lib/private.h
> index 7f3162e..4a9843e 100644
> --- a/accfg/lib/private.h
> +++ b/accfg/lib/private.h
> @@ -130,6 +130,14 @@ struct accfg_wq {
>
> #define ACCFG_EXPORT __attribute__ ((visibility("default")))
>
> +struct accfg_error_ctx {
> + unsigned int cmd_status;
> + struct accfg_device *device;
> + struct accfg_group *group;
> + struct accfg_wq *wq;
> + struct accfg_engine *engine;
> +};
> +
> /**
> * struct accfg_ctx - library user context to find device instances
> *
> @@ -148,6 +156,7 @@ struct accfg_ctx {
> uint64_t timeout;
> void *private_data;
> bool compat;
> + struct accfg_error_ctx *error_ctx;
> };
>
> static inline int check_udev(struct udev *udev)
> diff --git a/accfg/libaccel_config.h b/accfg/libaccel_config.h
> index 5b37790..27d10b6 100644
> --- a/accfg/libaccel_config.h
> +++ b/accfg/libaccel_config.h
> @@ -188,9 +188,14 @@ unsigned int accfg_device_get_version(struct accfg_device
*device);
> int accfg_device_get_clients(struct accfg_device *device);
> int accfg_device_set_token_limit(struct accfg_device *dev, int val);
> int accfg_device_is_active(struct accfg_device *device);
> -int accfg_device_get_cmd_status(struct accfg_device *device);
> +unsigned int accfg_device_get_cmd_status(struct accfg_device *device);
> const char *accfg_device_get_cmd_status_str(struct accfg_device *device);
> -
> +unsigned int accfg_ctx_get_last_error(struct accfg_ctx *ctx);
> +const char *accfg_ctx_get_last_error_str(struct accfg_ctx *ctx);
> +struct accfg_device *accfg_ctx_get_last_error_device(struct accfg_ctx *ctx);
> +struct accfg_wq *accfg_ctx_get_last_error_wq(struct accfg_ctx *ctx);
> +struct accfg_group *accfg_ctx_get_last_error_group(struct accfg_ctx *ctx);
> +struct accfg_engine *accfg_ctx_get_last_error_engine(struct accfg_ctx *ctx);
> struct accfg_device_mdev;
> struct accfg_device_mdev *accfg_device_first_mdev(struct accfg_device *device);
> struct accfg_device_mdev *accfg_device_next_mdev(struct accfg_device_mdev *mdev);