From ed007ca416ea18dff58e2955a8fa5aa1570b867c Mon Sep 17 00:00:00 2001 From: David van Moolenbroek Date: Mon, 5 Dec 2011 10:53:57 +0100 Subject: [PATCH] libbdev: extended version This version of libbdev supports asynchronous communication, recovery after driver restarts, and retrying of failed transfer operations. --- common/include/minix/bdev.h | 32 ++- lib/libbdev/Makefile | 2 +- lib/libbdev/NOTES | 52 +++++ lib/libbdev/bdev.c | 394 ++++++++++++++++++++++++++++++++---- lib/libbdev/call.c | 118 +++++++++++ lib/libbdev/const.h | 10 + lib/libbdev/driver.c | 1 + lib/libbdev/ipc.c | 281 +++++++++++++++++++++++-- lib/libbdev/minor.c | 120 +++++++++++ lib/libbdev/proto.h | 18 ++ lib/libbdev/type.h | 16 ++ 11 files changed, 989 insertions(+), 55 deletions(-) create mode 100644 lib/libbdev/NOTES create mode 100644 lib/libbdev/call.c create mode 100644 lib/libbdev/minor.c create mode 100644 lib/libbdev/type.h diff --git a/common/include/minix/bdev.h b/common/include/minix/bdev.h index 9f1170eac..dce412a4c 100644 --- a/common/include/minix/bdev.h +++ b/common/include/minix/bdev.h @@ -1,8 +1,10 @@ -#ifndef __MINIX_BDEV_H -#define __MINIX_BDEV_H +#ifndef _MINIX_BDEV_H +#define _MINIX_BDEV_H +/* Common API. */ extern void bdev_driver(dev_t dev, char *label); +/* Synchronous API. */ extern int bdev_open(dev_t dev, int access); extern int bdev_close(dev_t dev); @@ -16,4 +18,28 @@ extern ssize_t bdev_scatter(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags); extern int bdev_ioctl(dev_t dev, int request, void *buf); -#endif /* __MINIX_BDEV_H */ +/* Asynchronous API. 
*/ +typedef int bdev_id_t; +typedef void *bdev_param_t; + +typedef void (*bdev_callback_t)(dev_t dev, bdev_id_t id, bdev_param_t param, + int result); + +extern void bdev_flush_asyn(dev_t dev); + +extern bdev_id_t bdev_read_asyn(dev_t dev, u64_t pos, char *buf, size_t count, + int flags, bdev_callback_t callback, bdev_param_t param); +extern bdev_id_t bdev_write_asyn(dev_t dev, u64_t pos, char *buf, size_t count, + int flags, bdev_callback_t callback, bdev_param_t param); +extern bdev_id_t bdev_gather_asyn(dev_t dev, u64_t pos, iovec_t *vec, + int count, int flags, bdev_callback_t callback, bdev_param_t param); +extern bdev_id_t bdev_scatter_asyn(dev_t dev, u64_t pos, iovec_t *vec, + int count, int flags, bdev_callback_t callback, bdev_param_t param); +extern bdev_id_t bdev_ioctl_asyn(dev_t dev, int request, void *buf, + bdev_callback_t callback, bdev_param_t param); + +extern int bdev_wait_asyn(bdev_id_t id); + +extern void bdev_reply_asyn(message *m); + +#endif /* _MINIX_BDEV_H */ diff --git a/lib/libbdev/Makefile b/lib/libbdev/Makefile index 37e4021a8..9a8a3e3b9 100644 --- a/lib/libbdev/Makefile +++ b/lib/libbdev/Makefile @@ -3,6 +3,6 @@ LIB= bdev -SRCS= bdev.c ipc.c driver.c +SRCS= bdev.c driver.c call.c ipc.c minor.c .include diff --git a/lib/libbdev/NOTES b/lib/libbdev/NOTES new file mode 100644 index 000000000..6bf60e2e9 --- /dev/null +++ b/lib/libbdev/NOTES @@ -0,0 +1,52 @@ +Development notes regarding libbdev, by David van Moolenbroek. + + +GENERAL MODEL + +This library is designed mainly for use by file servers. It essentially covers +two use cases: 1) use of the block device that contains the file system itself, +and 2) use of any block device for raw block I/O (on unmounted file systems) +performed by the root file server. In the first case, the file server is +responsible for opening and closing the block device, and recovery from a +driver restart involves reopening those minor devices. 
Regular file systems +should have one or at most two (for a separate journal) block devices open at +the same time, which is why NR_OPEN_DEVS is set to a value that is quite low. +In the second case, VFS is responsible for opening and closing the block device +(and performing IOCTLs), as well as reopening the block device on a driver +restart -- the root file server only gets raw I/O (and flush) requests. + +At this time, libbdev considers only clean crashes (a crash-only model), and +does not support recovery from behavioral errors. Protocol errors are passed to +the user process, and generally do not have an effect on the overall state of +the library. + + +RETRY MODEL + +The philosophy for recovering from driver restarts in libbdev can be formulated +as follows: we want to tolerate an unlimited number of driver restarts over a +long time, but we do not want to keep retrying individual requests across +driver restarts. As such, we do not keep track of driver restarts on a per- +driver basis, because that would mean we put a hard limit on the number of +restarts for that driver in total. Instead, there are two limits: a driver +restart limit that is kept on a per-request basis, failing only that request +when the limit is reached, and a driver restart limit that is kept during +recovery, limiting the number of restarts and eventually giving up on the +entire driver when even the recovery keeps failing (as no progress is made in +that case). + +Each transfer request also has a transfer retry count. The assumption here is +that when a transfer request returns EIO, it can be retried and possibly +succeed upon repetition. The driver restart and transfer retry counts are +tracked independently and thus the first to hit the limit will fail the +request. The behavior should be the same for synchronous and asynchronous +requests in this respect. + +It could happen that a new driver gets loaded after we have decided that the +current driver is unusable. 
This could be due to a race condition (VFS sends a +new-driver request after we've given up) or due to user interaction (the user +loads a replacement driver). The latter case may occur legitimately with raw +I/O on the root file server, so we must not mark the driver as unusable +forever. On the other hand, in the former case, we must not continue to send +I/O without first reopening the minor devices. For this reason, we do not clean +up the record of the minor devices when we mark a driver as unusable. diff --git a/lib/libbdev/bdev.c b/lib/libbdev/bdev.c index 2bd536c35..fb4dfc239 100644 --- a/lib/libbdev/bdev.c +++ b/lib/libbdev/bdev.c @@ -1,12 +1,12 @@ /* libbdev - block device interfacing library, by D.C. van Moolenbroek */ -/* This is a preliminary, bare-essentials-only version of this library. */ - #include #include #include #include +#include "const.h" +#include "type.h" #include "proto.h" void bdev_driver(dev_t dev, char *label) @@ -26,18 +26,55 @@ void bdev_driver(dev_t dev, char *label) bdev_update(dev, label); } +static int bdev_retry(int *driver_tries, int *transfer_tries, int *result) +{ +/* Return TRUE iff the call result implies that we should retry the operation. + */ + + switch (*result) { + case ERESTART: + /* We get this error internally if the driver has restarted and the + * current operation may now go through. Check the retry count for + * driver restarts first, as we don't want to keep trying forever. + */ + if (++*driver_tries < DRIVER_TRIES) + return TRUE; + + *result = EDEADSRCDST; + + break; + + case EIO: + /* The 'transfer_tries' pointer is non-NULL if this was a transfer + * request. If we get back an I/O failure, keep retrying the request + * until we hit the transfer retry limit. + */ + if (transfer_tries != NULL && ++*transfer_tries < TRANSFER_TRIES) + return TRUE; + + break; + } + + return FALSE; +} + static int bdev_opcl(int req, dev_t dev, int access) { /* Open or close the given minor device. 
*/ message m; + int r, driver_tries = 0; - memset(&m, 0, sizeof(m)); - m.m_type = req; - m.BDEV_MINOR = minor(dev); - m.BDEV_ACCESS = access; + do { + memset(&m, 0, sizeof(m)); + m.m_type = req; + m.BDEV_MINOR = minor(dev); + m.BDEV_ACCESS = access; - return bdev_sendrec(dev, &m); + r = bdev_sendrec(dev, &m); + } while (bdev_retry(&driver_tries, NULL, &r)); + + return r; } int bdev_open(dev_t dev, int access) @@ -45,8 +82,14 @@ int bdev_open(dev_t dev, int access) /* Open the given minor device. * File system usage note: typically called from mount, after bdev_driver. */ + int r; - return bdev_opcl(BDEV_OPEN, dev, access); + r = bdev_opcl(BDEV_OPEN, dev, access); + + if (r == OK) + bdev_minor_add(dev, access); + + return r; } int bdev_close(dev_t dev) @@ -54,8 +97,16 @@ int bdev_close(dev_t dev) /* Close the given minor device. * File system usage note: typically called from unmount. */ + int r; - return bdev_opcl(BDEV_CLOSE, dev, 0); + bdev_flush_asyn(dev); + + r = bdev_opcl(BDEV_CLOSE, dev, 0); + + if (r == OK) + bdev_minor_del(dev); + + return r; } static int bdev_rdwt_setup(int req, dev_t dev, u64_t pos, char *buf, @@ -93,7 +144,7 @@ static int bdev_rdwt_setup(int req, dev_t dev, u64_t pos, char *buf, return OK; } -static void bdev_rdwt_cleanup(message *m) +static void bdev_rdwt_cleanup(const message *m) { /* Clean up a single-buffer read/write request. */ @@ -104,17 +155,19 @@ static void bdev_rdwt_cleanup(message *m) static ssize_t bdev_rdwt(int req, dev_t dev, u64_t pos, char *buf, size_t count, int flags) { -/* Perform a read or write call using a single buffer. +/* Perform a synchronous read or write call using a single buffer. 
*/ message m; - int r; + int r, driver_tries = 0, transfer_tries = 0; - if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &m)) != OK) - return r; + do { + if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &m)) != OK) + break; - r = bdev_sendrec(dev, &m); + r = bdev_sendrec(dev, &m); - bdev_rdwt_cleanup(&m); + bdev_rdwt_cleanup(&m); + } while (bdev_retry(&driver_tries, &transfer_tries, &r)); return r; } @@ -182,7 +235,7 @@ static int bdev_vrdwt_setup(int req, dev_t dev, u64_t pos, iovec_t *vec, return OK; } -static void bdev_vrdwt_cleanup(message *m, iovec_s_t *gvec) +static void bdev_vrdwt_cleanup(const message *m, iovec_s_t *gvec) { /* Clean up a vectored read/write request. */ @@ -200,25 +253,28 @@ static void bdev_vrdwt_cleanup(message *m, iovec_s_t *gvec) static ssize_t bdev_vrdwt(int req, dev_t dev, u64_t pos, iovec_t *vec, int count, int flags) { -/* Perform a read or write call using a vector of buffers. +/* Perform a synchronous read or write call using a vector of buffers. */ iovec_s_t gvec[NR_IOREQS]; message m; - int r; + int r, driver_tries = 0, transfer_tries = 0; - if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &m, gvec)) != OK) - return r; + do { + if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &m, + gvec)) != OK) + break; - r = bdev_sendrec(dev, &m); + r = bdev_sendrec(dev, &m); - bdev_vrdwt_cleanup(&m, gvec); + bdev_vrdwt_cleanup(&m, gvec); + } while (bdev_retry(&driver_tries, &transfer_tries, &r)); return r; } ssize_t bdev_read(dev_t dev, u64_t pos, char *buf, size_t count, int flags) { -/* Perform a read call into a single buffer. +/* Perform a synchronous read call into a single buffer. */ return bdev_rdwt(BDEV_READ, dev, pos, buf, count, flags); @@ -226,7 +282,7 @@ ssize_t bdev_read(dev_t dev, u64_t pos, char *buf, size_t count, int flags) ssize_t bdev_write(dev_t dev, u64_t pos, char *buf, size_t count, int flags) { -/* Perform a write call from a single buffer. 
+/* Perform a synchronous write call from a single buffer. */ return bdev_rdwt(BDEV_WRITE, dev, pos, buf, count, flags); @@ -234,7 +290,7 @@ ssize_t bdev_write(dev_t dev, u64_t pos, char *buf, size_t count, int flags) ssize_t bdev_gather(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags) { -/* Perform a read call into a vector of buffers. +/* Perform a synchronous read call into a vector of buffers. */ return bdev_vrdwt(BDEV_GATHER, dev, pos, vec, count, flags); @@ -242,7 +298,7 @@ ssize_t bdev_gather(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags) ssize_t bdev_scatter(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags) { -/* Perform a write call from a vector of buffers. +/* Perform a synchronous write call from a vector of buffers. */ return bdev_vrdwt(BDEV_SCATTER, dev, pos, vec, count, flags); @@ -286,7 +342,7 @@ static int bdev_ioctl_setup(dev_t dev, int request, void *buf, message *m) return OK; } -static void bdev_ioctl_cleanup(message *m) +static void bdev_ioctl_cleanup(const message *m) { /* Clean up an I/O control request. */ @@ -296,17 +352,287 @@ static void bdev_ioctl_cleanup(message *m) int bdev_ioctl(dev_t dev, int request, void *buf) { -/* Perform an I/O control request. +/* Perform a synchronous I/O control request. */ message m; - int r; + int r, driver_tries = 0; - if ((r = bdev_ioctl_setup(dev, request, buf, &m)) != OK) - return r; + do { + if ((r = bdev_ioctl_setup(dev, request, buf, &m)) != OK) + break; - r = bdev_sendrec(dev, &m); + r = bdev_sendrec(dev, &m); - bdev_ioctl_cleanup(&m); + bdev_ioctl_cleanup(&m); + } while (bdev_retry(&driver_tries, NULL, &r)); return r; } + +void bdev_flush_asyn(dev_t dev) +{ +/* Flush all ongoing asynchronous requests to the given minor device. This + * involves blocking until all I/O for it has completed. + * File system usage note: typically called from flush. 
+ */ + bdev_call_t *call; + + while ((call = bdev_call_find(dev)) != NULL) + (void) bdev_wait_asyn(call->id); +} + +static bdev_id_t bdev_rdwt_asyn(int req, dev_t dev, u64_t pos, char *buf, + size_t count, int flags, bdev_callback_t callback, bdev_param_t param) +{ +/* Perform an asynchronous read or write call using a single buffer. + */ + bdev_call_t *call; + int r; + + if ((call = bdev_call_alloc(1)) == NULL) + return ENOMEM; + + if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &call->msg)) != + OK) { + bdev_call_free(call); + + return r; + } + + if ((r = bdev_senda(dev, &call->msg, call->id)) != OK) { + bdev_rdwt_cleanup(&call->msg); + + bdev_call_free(call); + + return r; + } + + call->dev = dev; + call->callback = callback; + call->param = param; + call->driver_tries = 0; + call->transfer_tries = 0; + call->vec[0].iov_addr = (vir_bytes) buf; + call->vec[0].iov_size = count; + + return call->id; +} + +static bdev_id_t bdev_vrdwt_asyn(int req, dev_t dev, u64_t pos, iovec_t *vec, + int count, int flags, bdev_callback_t callback, bdev_param_t param) +{ +/* Perform an asynchronous read or write call using a vector of buffers. + */ + bdev_call_t *call; + int r; + + if ((call = bdev_call_alloc(count)) == NULL) + return ENOMEM; + + if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &call->msg, + call->gvec)) != OK) { + bdev_call_free(call); + + return r; + } + + if ((r = bdev_senda(dev, &call->msg, call->id)) != OK) { + bdev_vrdwt_cleanup(&call->msg, call->gvec); + + bdev_call_free(call); + + return r; + } + + call->dev = dev; + call->callback = callback; + call->param = param; + call->driver_tries = 0; + call->transfer_tries = 0; + memcpy(call->vec, vec, sizeof(vec[0]) * count); + + return call->id; +} + +bdev_id_t bdev_read_asyn(dev_t dev, u64_t pos, char *buf, size_t count, + int flags, bdev_callback_t callback, bdev_param_t param) +{ +/* Perform an asynchronous read call into a single buffer. 
+ */ + + return bdev_rdwt_asyn(BDEV_READ, dev, pos, buf, count, flags, callback, + param); +} + +bdev_id_t bdev_write_asyn(dev_t dev, u64_t pos, char *buf, size_t count, + int flags, bdev_callback_t callback, bdev_param_t param) +{ +/* Perform an asynchronous write call from a single buffer. + */ + + return bdev_rdwt_asyn(BDEV_WRITE, dev, pos, buf, count, flags, callback, + param); +} + +bdev_id_t bdev_gather_asyn(dev_t dev, u64_t pos, iovec_t *vec, int count, + int flags, bdev_callback_t callback, bdev_param_t param) +{ +/* Perform an asynchronous read call into a vector of buffers. + */ + + return bdev_vrdwt_asyn(BDEV_GATHER, dev, pos, vec, count, flags, callback, + param); +} + +bdev_id_t bdev_scatter_asyn(dev_t dev, u64_t pos, iovec_t *vec, int count, + int flags, bdev_callback_t callback, bdev_param_t param) +{ +/* Perform an asynchronous write call from a vector of buffers. + */ + + return bdev_vrdwt_asyn(BDEV_SCATTER, dev, pos, vec, count, flags, callback, + param); +} + +bdev_id_t bdev_ioctl_asyn(dev_t dev, int request, void *buf, + bdev_callback_t callback, bdev_param_t param) +{ +/* Perform an asynchronous I/O control request. + */ + bdev_call_t *call; + int r; + + if ((call = bdev_call_alloc(1)) == NULL) + return ENOMEM; + + if ((r = bdev_ioctl_setup(dev, request, buf, &call->msg)) != OK) { + bdev_call_free(call); + + return r; + } + + if ((r = bdev_senda(dev, &call->msg, call->id)) != OK) { + bdev_ioctl_cleanup(&call->msg); + + bdev_call_free(call); + + return r; + } + + call->dev = dev; + call->callback = callback; + call->param = param; + call->driver_tries = 0; + call->vec[0].iov_addr = (vir_bytes) buf; + + return call->id; +} + +void bdev_callback_asyn(bdev_call_t *call, int result) +{ +/* Perform the callback for an asynchronous request, with the given result. + * Clean up the call structure afterwards. + */ + + /* If this was a transfer request and the result is EIO, we may want to retry + * the request first. 
+ */ + switch (call->msg.m_type) { + case BDEV_READ: + case BDEV_WRITE: + case BDEV_GATHER: + case BDEV_SCATTER: + if (result == EIO && ++call->transfer_tries < TRANSFER_TRIES) { + result = bdev_senda(call->dev, &call->msg, call->id); + + if (result == OK) + return; + } + } + + /* Clean up. */ + switch (call->msg.m_type) { + case BDEV_READ: + case BDEV_WRITE: + bdev_rdwt_cleanup(&call->msg); + + break; + + case BDEV_GATHER: + case BDEV_SCATTER: + bdev_vrdwt_cleanup(&call->msg, call->gvec); + + break; + + case BDEV_IOCTL: + bdev_ioctl_cleanup(&call->msg); + + break; + + default: + assert(0); + } + + /* Call the callback function. */ + /* FIXME: we assume all reasonable ssize_t values can be stored in an int. */ + call->callback(call->dev, call->id, call->param, result); + + /* Free up the call structure. */ + bdev_call_free(call); +} + +int bdev_restart_asyn(bdev_call_t *call) +{ +/* The driver for the given call has restarted, and may now have a new + * endpoint. Recreate and resend the request for the given call. + */ + int type, r = OK; + + /* Update and check the retry limit for driver restarts first. */ + if (++call->driver_tries >= DRIVER_TRIES) + return EDEADSRCDST; + + /* Recreate all grants for the new endpoint. 
*/ + type = call->msg.m_type; + + switch (type) { + case BDEV_READ: + case BDEV_WRITE: + bdev_rdwt_cleanup(&call->msg); + + r = bdev_rdwt_setup(type, call->dev, + make64(call->msg.BDEV_POS_LO, call->msg.BDEV_POS_HI), + (char *) call->vec[0].iov_addr, call->msg.BDEV_COUNT, + call->msg.BDEV_FLAGS, &call->msg); + + break; + + case BDEV_GATHER: + case BDEV_SCATTER: + bdev_vrdwt_cleanup(&call->msg, call->gvec); + + r = bdev_vrdwt_setup(type, call->dev, + make64(call->msg.BDEV_POS_LO, call->msg.BDEV_POS_HI), + call->vec, call->msg.BDEV_COUNT, call->msg.BDEV_FLAGS, + &call->msg, call->gvec); + + break; + + case BDEV_IOCTL: + bdev_ioctl_cleanup(&call->msg); + + r = bdev_ioctl_setup(call->dev, call->msg.BDEV_REQUEST, + (char *) call->vec[0].iov_addr, &call->msg); + + break; + + default: + assert(0); + } + + if (r != OK) + return r; + + /* Try to resend the request. */ + return bdev_senda(call->dev, &call->msg, call->id); +} diff --git a/lib/libbdev/call.c b/lib/libbdev/call.c new file mode 100644 index 000000000..2a33791fa --- /dev/null +++ b/lib/libbdev/call.c @@ -0,0 +1,118 @@ +/* libbdev - asynchronous call structure management */ + +#include +#include +#include + +#include "const.h" +#include "type.h" +#include "proto.h" + +static bdev_call_t *calls[NR_CALLS]; + +bdev_call_t *bdev_call_alloc(int count) +{ +/* Allocate a call structure. + */ + bdev_call_t *call; + bdev_id_t id; + + for (id = 0; id < NR_CALLS; id++) + if (calls[id] == NULL) + break; + + if (id == NR_CALLS) + return NULL; + + call = malloc(sizeof(bdev_call_t) + + sizeof(call->gvec[0]) * (count - 1) + + sizeof(call->vec[0]) * count); + + if (call == NULL) + return NULL; + + call->id = id; + call->vec = (iovec_t *) &call->gvec[count]; + + calls[id] = call; + + return call; +} + +void bdev_call_free(bdev_call_t *call) +{ +/* Free a call structure. 
+ */ + + assert(calls[call->id] == call); + + calls[call->id] = NULL; + + free(call); +} + +bdev_call_t *bdev_call_get(bdev_id_t id) +{ +/* Retrieve a call structure by request number. + */ + + if (id < 0 || id >= NR_CALLS) + return NULL; + + return calls[id]; +} + +bdev_call_t *bdev_call_find(dev_t dev) +{ +/* Find the first asynchronous request for the given device, if any. + */ + bdev_id_t id; + + for (id = 0; id < NR_CALLS; id++) + if (calls[id] != NULL && calls[id]->dev == dev) + return calls[id]; + + return NULL; +} + +bdev_call_t *bdev_call_iter_maj(dev_t dev, bdev_call_t *call, + bdev_call_t **next) +{ +/* Iterate over all asynchronous requests for a major device. This function + * must be safe even if the returned call structure is freed. + */ + bdev_id_t id; + int major; + + major = major(dev); + + /* If this is the first invocation, find the first match. Otherwise, take the + * call we found to be next in the last invocation, which may be NULL. + */ + if (call == NULL) { + for (id = 0; id < NR_CALLS; id++) + if (calls[id] != NULL && major(calls[id]->dev) == major) + break; + + if (id == NR_CALLS) + return NULL; + + call = calls[id]; + } else { + if ((call = *next) == NULL) + return NULL; + } + + /* Look for the next match, if any. 
*/ + *next = NULL; + + for (id = call->id + 1; id < NR_CALLS; id++) { + if (calls[id] != NULL && major(calls[id]->dev) == major) { + *next = calls[id]; + + break; + } + } + + return call; +} diff --git a/lib/libbdev/const.h b/lib/libbdev/const.h index 8f88835f5..89fab89f0 100644 --- a/lib/libbdev/const.h +++ b/lib/libbdev/const.h @@ -1,7 +1,17 @@ #ifndef _BDEV_CONST_H #define _BDEV_CONST_H +#define NR_CALLS 256 /* maximum number of concurrent async calls */ + +#define NO_ID (-1) /* ID for synchronous requests */ + #define DS_NR_TRIES 100 /* number of times to check endpoint in DS */ #define DS_DELAY 50000 /* delay time (us) between DS checks */ +#define DRIVER_TRIES 10 /* after so many tries, give up on a driver */ +#define RECOVER_TRIES 2 /* tolerated nr of restarts during recovery */ +#define TRANSFER_TRIES 5 /* number of times to try transfers on EIO */ + +#define NR_OPEN_DEVS 4 /* maximum different opened minor devices */ + #endif /* _BDEV_CONST_H */ diff --git a/lib/libbdev/driver.c b/lib/libbdev/driver.c index b7bd187a9..efe2fa2a2 100644 --- a/lib/libbdev/driver.c +++ b/lib/libbdev/driver.c @@ -6,6 +6,7 @@ #include #include "const.h" +#include "type.h" #include "proto.h" static struct { diff --git a/lib/libbdev/ipc.c b/lib/libbdev/ipc.c index 3cbf64380..9a81ca588 100644 --- a/lib/libbdev/ipc.c +++ b/lib/libbdev/ipc.c @@ -4,6 +4,8 @@ #include #include +#include "const.h" +#include "type.h" #include "proto.h" static void bdev_cancel(dev_t dev) @@ -11,14 +13,83 @@ static void bdev_cancel(dev_t dev) /* Recovering the driver for the given device has failed repeatedly. Mark it as * permanently unusable, and clean up any associated calls and resources. */ + bdev_call_t *call, *next; - printf("bdev: driver for major %d (endpoint %d) crashed\n", - major(dev), bdev_driver_get(dev)); + printf("bdev: giving up on major %d\n", major(dev)); + + /* Cancel all pending asynchronous requests. 
*/ + call = NULL; + + while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL) + bdev_callback_asyn(call, EDEADSRCDST); /* Mark the driver as unusable. */ bdev_driver_clear(dev); } +static int bdev_recover(dev_t dev, int update_endpt) +{ +/* The IPC subsystem has signaled an error communicating to the driver + * associated with the given device. Try to recover. If 'update_endpt' is set, + * we need to find the new endpoint of the driver first. Return TRUE iff + * recovery has been successful. + */ + bdev_call_t *call, *next; + endpoint_t endpt; + int r, nr_tries; + + printf("bdev: recovering from a driver crash on major %d\n", major(dev)); + + for (nr_tries = 0; nr_tries < RECOVER_TRIES; nr_tries++) { + /* First update the endpoint, if necessary. */ + if (update_endpt) + (void) bdev_driver_update(dev); + + if ((endpt = bdev_driver_get(dev)) == NONE) + break; + + /* If anything goes wrong, update the endpoint again next time. */ + update_endpt = TRUE; + + /* Reopen all minor devices on the new driver. */ + if ((r = bdev_minor_reopen(dev)) != OK) { + /* If the driver died again, we may give it another try. */ + if (r == EDEADSRCDST) + continue; + + /* If another error occurred, we cannot continue using the + * driver as is, but we also cannot force it to restart. + */ + break; + } + + /* Resend all asynchronous requests. */ + call = NULL; + + while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL) { + /* It is not strictly necessary that we manage to reissue all + * asynchronous requests successfully. We can fail them on an + * individual basis here, without affecting the overall + * recovery. Note that we will never get new IPC failures here. + */ + if ((r = bdev_restart_asyn(call)) != OK) + bdev_callback_asyn(call, r); + } + + /* Recovery seems successful. We can now reissue the current + * synchronous request (if any), and continue normal operation. 
+ */ + printf("bdev: recovery successful, new driver is at %d\n", endpt); + + return TRUE; + } + + /* Recovery failed repeatedly. Give up on this driver. */ + bdev_cancel(dev); + + return FALSE; +} + void bdev_update(dev_t dev, char *label) { /* Set the endpoint for a driver. Perform recovery if necessary. @@ -32,13 +103,40 @@ void bdev_update(dev_t dev, char *label) /* If updating the driver causes an endpoint change, we need to perform * recovery, but not update the endpoint yet again. */ + if (old_endpt != NONE && old_endpt != endpt) + bdev_recover(dev, FALSE /*update_endpt*/); +} + +int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t id) +{ +/* Send an asynchronous request for the given device. This function will never + * get any new IPC errors sending to the driver. If sending an asynchronous + * request fails, we will find out through other ways later. + */ + endpoint_t endpt; + message m; + int r; + + /* If we have no usable driver endpoint, fail instantly. */ + if ((endpt = bdev_driver_get(dev)) == NONE) + return EDEADSRCDST; + + m = *m_orig; + m.BDEV_ID = id; + + r = asynsend(endpt, &m); + + if (r != OK) + printf("bdev: asynsend to driver (%d) failed (%d)\n", endpt, r); + + return r; } int bdev_sendrec(dev_t dev, const message *m_orig) { -/* Send a request to the given device, and wait for the reply. +/* Send a synchronous request for the given device, and wait for the reply. + * Return ERESTART if the caller should try to reissue the request. */ - static long id = 0; endpoint_t endpt; message m; int r; @@ -49,15 +147,19 @@ int bdev_sendrec(dev_t dev, const message *m_orig) /* Send the request and block until we receive a reply. */ m = *m_orig; - m.BDEV_ID = ++id; + m.BDEV_ID = NO_ID; r = sendrec(endpt, &m); - /* This version of libbdev does not support recovery. Forget the driver. */ + /* If communication failed, the driver has died. We assume it will be + * restarted soon after, so we attempt recovery. 
Upon success, we let the + * caller reissue the synchronous request. + */ if (r == EDEADSRCDST) { - bdev_cancel(dev); + if (!bdev_recover(dev, TRUE /*update_endpt*/)) + return EDEADSRCDST; - return EDEADSRCDST; + return ERESTART; } if (r != OK) { @@ -68,22 +170,167 @@ int bdev_sendrec(dev_t dev, const message *m_orig) if (m.m_type != BDEV_REPLY) { printf("bdev: driver (%d) sent weird response (%d)\n", endpt, m.m_type); - return EIO; + return EINVAL; } - /* ERESTART signifies a driver restart. Again, we do not support this yet. */ + /* The protocol contract states that no asynchronous reply can satisfy a + * synchronous SENDREC call, so we can never get an asynchronous reply here. + */ + if (m.BDEV_ID != NO_ID) { + printf("bdev: driver (%d) sent invalid ID (%ld)\n", endpt, m.BDEV_ID); + return EINVAL; + } + + /* Unless the caller is misusing libbdev, we will only get ERESTART if we + * have managed to resend a raw block I/O request to the driver after a + * restart, but before VFS has had a chance to reopen the associated device + * first. This is highly exceptional, and hard to deal with correctly. We + * take the easiest route: sleep for a while so that VFS can reopen the + * device, and then resend the request. If the call keeps failing, the caller + * will eventually give up. + */ if (m.BDEV_STATUS == ERESTART) { - bdev_cancel(dev); + printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n", + endpt); - return EDEADSRCDST; - } + micro_delay(1000); - if (m.BDEV_ID != id) { - printf("bdev: driver (%d) sent invalid response (%ld)\n", - endpt, m.BDEV_ID); - return EIO; + return ERESTART; } /* Return the result of our request. */ return m.BDEV_STATUS; } + +static int bdev_receive(dev_t dev, message *m) +{ +/* Receive one valid message. + */ + endpoint_t endpt; + int r, nr_tries = 0; + + for (;;) { + /* Retrieve and check the driver endpoint on every try, as it will + * change with each driver restart. 
+ */ + if ((endpt = bdev_driver_get(dev)) == NONE) + return EDEADSRCDST; + + r = sef_receive(endpt, m); + + if (r == EDEADSRCDST) { + /* If we reached the maximum number of retries, give up. */ + if (++nr_tries == DRIVER_TRIES) + break; + + /* Attempt recovery. If successful, all asynchronous requests + * will have been resent, and we can retry receiving a reply. + */ + if (!bdev_recover(dev, TRUE /*update_endpt*/)) + return EDEADSRCDST; + + continue; + } + + if (r != OK) { + printf("bdev: IPC to driver (%d) failed (%d)\n", endpt, r); + + return r; + } + + if (m->m_type != BDEV_REPLY) { + printf("bdev: driver (%d) sent weird response (%d)\n", + endpt, m->m_type); + return EINVAL; + } + + /* The caller is responsible for checking the ID and status. */ + return OK; + } + + /* All tries failed, even though all recovery attempts succeeded. In this + * case, we let the caller recheck whether it wants to keep calling us, + * returning ERESTART to indicate we can be called again but did not actually + * receive a message. + */ + return ERESTART; +} + +void bdev_reply_asyn(message *m) +{ +/* A reply has come in from a disk driver. Match it to its asynchronous call by ID, verify the sender, and then either restart the call (on an ERESTART status) or pass the reply status to bdev_callback_asyn(). Replies with an unknown ID or from an unexpected endpoint are dropped. + */ + bdev_call_t *call; + endpoint_t endpt; + bdev_id_t id; + int r; + + /* This is a requirement for the caller: only BDEV_REPLY messages may be passed in. */ + assert(m->m_type == BDEV_REPLY); + + /* Get the corresponding asynchronous call structure. */ + id = m->BDEV_ID; + + if ((call = bdev_call_get(id)) == NULL) { + printf("bdev: driver (%d) replied to unknown request (%ld)\n", + m->m_source, m->BDEV_ID); + return; + } + + /* Make sure the reply was sent from the right endpoint, that is, the one bdev_driver_get() returns for the device of the call. */ + endpt = bdev_driver_get(call->dev); + + if (m->m_source != endpt) { + /* If the endpoint is NONE, this may be a stray reply; drop it without a warning in that case. */ + if (endpt != NONE) + printf("bdev: driver (%d) replied to request not sent to it\n", + m->m_source); + return; + } + + /* See the ERESTART comment in bdev_sendrec(). 
*/ + if (m->BDEV_STATUS == ERESTART) { + printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n", + endpt); + + micro_delay(1000); /* NOTE(review): presumably 1000 microseconds - confirm the unit */ + + if ((r = bdev_restart_asyn(call)) != OK) /* could not restart: report the error through the callback instead */ + bdev_callback_asyn(call, r); + + return; + } + + bdev_callback_asyn(call, m->BDEV_STATUS); +} + +int bdev_wait_asyn(bdev_id_t id) +{ +/* Wait for an asynchronous request to complete. Return ENOENT if the ID does not refer to a known call, the error from bdev_receive() if receiving fails with anything other than ERESTART, and OK once bdev_call_get() no longer finds the call. Every reply received in the meantime is handed to bdev_reply_asyn(), also if it belongs to another call. + */ + bdev_call_t *call; + dev_t dev; + message m; + int r; + + if ((call = bdev_call_get(id)) == NULL) + return ENOENT; + + dev = call->dev; + + do { + if ((r = bdev_receive(dev, &m)) != OK && r != ERESTART) + return r; + + /* Processing the reply will free up the call structure as a side + * effect. If we repeatedly get ERESTART, we will repeatedly resend the + * asynchronous request, which will then eventually hit the retry limit + * and we will break out of the loop. + */ + if (r == OK) + bdev_reply_asyn(&m); + + } while (bdev_call_get(id) != NULL); + + return OK; +} diff --git a/lib/libbdev/minor.c b/lib/libbdev/minor.c new file mode 100644 index 000000000..519da33eb --- /dev/null +++ b/lib/libbdev/minor.c @@ -0,0 +1,120 @@ +/* libbdev - tracking and reopening of opened minor devices */ + +#include +#include +#include + +#include "const.h" +#include "type.h" +#include "proto.h" + +static struct { + dev_t dev; /* tracked device number, or NO_DEV if the slot is free */ + int count; /* number of opens recorded for this device */ + int access; /* bitwise OR of the access flags of all recorded opens */ +} open_dev[NR_OPEN_DEVS] = { { NO_DEV, 0, 0 } }; /* NOTE(review): only slot 0 is explicitly set to NO_DEV, the other slots are zero-filled; this relies on NO_DEV being 0 - confirm */ + +int bdev_minor_reopen(dev_t dev) +{ +/* Reopen all minor devices on a major device. This function duplicates some + * code from elsewhere, because in this case we must avoid performing recovery. + * FIXME: if reopening fails with a non-IPC error, we should attempt to close + * all minors that we did manage to reopen so far, or they might stay open + * forever. Return OK if every reopen succeeded, or the first error encountered otherwise.
+ */ + endpoint_t endpt; + message m; + int i, j, r, major; + + major = major(dev); /* NOTE(review): the local variable shares its name with major(); this only works if major() is a function-like macro - confirm */ + endpt = bdev_driver_get(dev); + + assert(endpt != NONE); + + for (i = 0; i < NR_OPEN_DEVS; i++) { + if (major(open_dev[i].dev) != major) + continue; + + /* Each minor device may have been opened multiple times. Send an open + * request for each time that it was opened before. We could reopen it + * just once, but then we'd have to keep a shadow open count as well. + */ + for (j = 0; j < open_dev[i].count; j++) { + memset(&m, 0, sizeof(m)); + m.m_type = BDEV_OPEN; + m.BDEV_MINOR = minor(open_dev[i].dev); + m.BDEV_ACCESS = open_dev[i].access; + m.BDEV_ID = NO_ID; /* the reply is expected to carry NO_ID as well, see the check below */ + + if ((r = sendrec(endpt, &m)) != OK) { + printf("bdev: IPC to driver (%d) failed (%d)\n", + endpt, r); + return r; + } + + if (m.m_type != BDEV_REPLY) { + printf("bdev: driver (%d) sent weird response (%d)\n", + endpt, m.m_type); + return EINVAL; + } + + if (m.BDEV_ID != NO_ID) { + printf("bdev: driver (%d) sent invalid ID (%ld)\n", + endpt, m.BDEV_ID); + return EINVAL; + } + + if ((r = m.BDEV_STATUS) != OK) { + printf("bdev: driver (%d) failed device reopen (%d)\n", + endpt, r); + return r; + } + } + } + + return OK; +} + +void bdev_minor_add(dev_t dev, int access) +{ +/* Increase the reference count of the given minor device, merging the given access flags into the recorded ones. If the device is not tracked yet, claim the first free slot for it; if no slot is free, print a warning and leave the device untracked. + */ + int i, free = -1; /* free: index of the first unused slot seen so far, or -1 */ + + for (i = 0; i < NR_OPEN_DEVS; i++) { + if (open_dev[i].dev == dev) { + open_dev[i].count++; + open_dev[i].access |= access; + + return; + } + + if (free < 0 && open_dev[i].dev == NO_DEV) + free = i; + } + + if (free < 0) { + printf("bdev: too many open devices, increase NR_OPEN_DEVS\n"); + return; + } + + open_dev[free].dev = dev; + open_dev[free].count = 1; + open_dev[free].access = access; +} + +void bdev_minor_del(dev_t dev) +{ +/* Decrease the reference count of the given minor device, if present. Once the count drops to zero, the slot is marked free again by setting it to NO_DEV.
+ */ + int i; + + for (i = 0; i < NR_OPEN_DEVS; i++) { + if (open_dev[i].dev == dev) { + if (!--open_dev[i].count) + open_dev[i].dev = NO_DEV; + + break; + } + } +} diff --git a/lib/libbdev/proto.h b/lib/libbdev/proto.h index 45392c053..24f11d089 100644 --- a/lib/libbdev/proto.h +++ b/lib/libbdev/proto.h @@ -1,6 +1,10 @@ #ifndef _BDEV_PROTO_H #define _BDEV_PROTO_H +/* bdev.c */ +extern void bdev_callback_asyn(bdev_call_t *call, int result); +extern int bdev_restart_asyn(bdev_call_t *call); + /* driver.c */ extern void bdev_driver_init(void); extern void bdev_driver_clear(dev_t dev); @@ -8,8 +12,22 @@ extern endpoint_t bdev_driver_set(dev_t dev, char *label); extern endpoint_t bdev_driver_get(dev_t dev); extern endpoint_t bdev_driver_update(dev_t dev); +/* call.c */ +extern bdev_call_t *bdev_call_alloc(int count); +extern void bdev_call_free(bdev_call_t *call); +extern bdev_call_t *bdev_call_get(bdev_id_t id); +extern bdev_call_t *bdev_call_find(dev_t dev); +extern bdev_call_t *bdev_call_iter_maj(dev_t dev, bdev_call_t *last, + bdev_call_t **next); + /* ipc.c */ extern void bdev_update(dev_t dev, char *label); +extern int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t num); extern int bdev_sendrec(dev_t dev, const message *m_orig); +/* minor.c */ +extern int bdev_minor_reopen(dev_t dev); +extern void bdev_minor_add(dev_t dev, int access); +extern void bdev_minor_del(dev_t dev); + #endif /* _BDEV_PROTO_H */ diff --git a/lib/libbdev/type.h b/lib/libbdev/type.h new file mode 100644 index 000000000..b764ac1fd --- /dev/null +++ b/lib/libbdev/type.h @@ -0,0 +1,16 @@ +#ifndef _BDEV_TYPE_H +#define _BDEV_TYPE_H + +typedef struct { /* bookkeeping for one asynchronous call, as looked up by bdev_call_get() */ + bdev_id_t id; /* call ID */ + dev_t dev; /* target device number */ + message msg; /* request message */ + bdev_callback_t callback; /* callback function */ + bdev_param_t param; /* callback parameter */ + int driver_tries; /* times retried on driver restarts */ + int transfer_tries; /* times retried on transfer errors */ + iovec_t *vec; 
/* original vector */ + iovec_s_t gvec[1]; /* grant vector; NOTE(review): presumably over-allocated by bdev_call_alloc(count) to hold more than one element - confirm */ +} bdev_call_t; + +#endif /* _BDEV_TYPE_H */