libbdev: extended version

This version of libbdev supports asynchronous communication,
recovery after driver restarts, and retrying of failed transfer
operations.
This commit is contained in:
David van Moolenbroek 2011-12-05 10:53:57 +01:00
parent 9221586f37
commit ed007ca416
11 changed files with 989 additions and 55 deletions

View file

@ -1,8 +1,10 @@
#ifndef __MINIX_BDEV_H #ifndef _MINIX_BDEV_H
#define __MINIX_BDEV_H #define _MINIX_BDEV_H
/* Common API. */
extern void bdev_driver(dev_t dev, char *label); extern void bdev_driver(dev_t dev, char *label);
/* Synchronous API. */
extern int bdev_open(dev_t dev, int access); extern int bdev_open(dev_t dev, int access);
extern int bdev_close(dev_t dev); extern int bdev_close(dev_t dev);
@ -16,4 +18,28 @@ extern ssize_t bdev_scatter(dev_t dev, u64_t pos, iovec_t *vec, int count,
int flags); int flags);
extern int bdev_ioctl(dev_t dev, int request, void *buf); extern int bdev_ioctl(dev_t dev, int request, void *buf);
#endif /* __MINIX_BDEV_H */ /* Asynchronous API. */
/* Identifier of an outstanding asynchronous request, as returned by the
 * asynchronous submission calls declared below.
 */
typedef int bdev_id_t;
/* Caller-supplied value that is handed back unchanged to the callback. */
typedef void *bdev_param_t;
/* Completion callback. It is invoked with the device and ID of the request,
 * the parameter that was given when the request was submitted, and the result
 * of the request.
 */
typedef void (*bdev_callback_t)(dev_t dev, bdev_id_t id, bdev_param_t param,
int result);
/* Block until no asynchronous requests remain for the given minor device. */
extern void bdev_flush_asyn(dev_t dev);
/* Submission calls. Each returns the ID of the new request on success, or an
 * error code (ENOMEM if no call structure could be allocated, otherwise the
 * error from setting up or sending the request) on failure. The callback is
 * only invoked for requests that were submitted successfully.
 */
extern bdev_id_t bdev_read_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
int flags, bdev_callback_t callback, bdev_param_t param);
extern bdev_id_t bdev_write_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
int flags, bdev_callback_t callback, bdev_param_t param);
extern bdev_id_t bdev_gather_asyn(dev_t dev, u64_t pos, iovec_t *vec,
int count, int flags, bdev_callback_t callback, bdev_param_t param);
extern bdev_id_t bdev_scatter_asyn(dev_t dev, u64_t pos, iovec_t *vec,
int count, int flags, bdev_callback_t callback, bdev_param_t param);
extern bdev_id_t bdev_ioctl_asyn(dev_t dev, int request, void *buf,
bdev_callback_t callback, bdev_param_t param);
/* Block until the request with the given ID has completed. Returns ENOENT if
 * the ID does not refer to an outstanding request, OK once the request is
 * gone, or an error if receiving a reply from the driver failed.
 */
extern int bdev_wait_asyn(bdev_id_t id);
/* Process a reply message from a driver. The caller must only pass in
 * messages of type BDEV_REPLY.
 */
extern void bdev_reply_asyn(message *m);
#endif /* _MINIX_BDEV_H */

View file

@ -3,6 +3,6 @@
LIB= bdev LIB= bdev
SRCS= bdev.c ipc.c driver.c SRCS= bdev.c driver.c call.c ipc.c minor.c
.include <bsd.lib.mk> .include <bsd.lib.mk>

52
lib/libbdev/NOTES Normal file
View file

@ -0,0 +1,52 @@
Development notes regarding libbdev, by David van Moolenbroek.
GENERAL MODEL
This library is designed mainly for use by file servers. It essentially covers
two use cases: 1) use of the block device that contains the file system itself,
and 2) use of any block device for raw block I/O (on unmounted file systems)
performed by the root file server. In the first case, the file server is
responsible for opening and closing the block device, and recovery from a
driver restart involves reopening those minor devices. Regular file systems
should have one or at most two (for a separate journal) block devices open at
the same time, which is why NR_OPEN_DEVS is set to a value that is quite low.
In the second case, VFS is responsible for opening and closing the block device
(and performing IOCTLs), as well as reopening the block device on a driver
restart -- the root file server only gets raw I/O (and flush) requests.
At this time, libbdev considers only clean crashes (a crash-only model), and
does not support recovery from behavioral errors. Protocol errors are passed to
the user process, and generally do not have an effect on the overall state of
the library.
RETRY MODEL
The philosophy for recovering from driver restarts in libbdev can be formulated
as follows: we want to tolerate an unlimited number of driver restarts over a
long time, but we do not want to keep retrying individual requests across
driver restarts. As such, we do not keep track of driver restarts on a per-
driver basis, because that would mean we put a hard limit on the number of
restarts for that driver in total. Instead, there are two limits: a driver
restart limit that is kept on a per-request basis, failing only that request
when the limit is reached, and a driver restart limit that is kept during
recovery, limiting the number of restarts and eventually giving up on the
entire driver when even the recovery keeps failing (as no progress is made in
that case).
Each transfer request also has a transfer retry count. The assumption here is
that when a transfer request returns EIO, it can be retried and possibly
succeed upon repetition. The driver restart and transfer retry counts are
tracked independently and thus the first to hit the limit will fail the
request. The behavior should be the same for synchronous and asynchronous
requests in this respect.
It could happen that a new driver gets loaded after we have decided that the
current driver is unusable. This could be due to a race condition (VFS sends a
new-driver request after we've given up) or due to user interaction (the user
loads a replacement driver). The latter case may occur legitimately with raw
I/O on the root file server, so we must not mark the driver as unusable
forever. On the other hand, in the former case, we must not continue to send
I/O without first reopening the minor devices. For this reason, we do not clean
up the record of the minor devices when we mark a driver as unusable.

View file

@ -1,12 +1,12 @@
/* libbdev - block device interfacing library, by D.C. van Moolenbroek */ /* libbdev - block device interfacing library, by D.C. van Moolenbroek */
/* This is a preliminary, bare-essentials-only version of this library. */
#include <minix/drivers.h> #include <minix/drivers.h>
#include <minix/bdev.h> #include <minix/bdev.h>
#include <minix/ioctl.h> #include <minix/ioctl.h>
#include <assert.h> #include <assert.h>
#include "const.h"
#include "type.h"
#include "proto.h" #include "proto.h"
void bdev_driver(dev_t dev, char *label) void bdev_driver(dev_t dev, char *label)
@ -26,18 +26,55 @@ void bdev_driver(dev_t dev, char *label)
bdev_update(dev, label); bdev_update(dev, label);
} }
static int bdev_retry(int *driver_tries, int *transfer_tries, int *result)
{
/* Decide whether the operation that produced '*result' should be reissued.
 * Return TRUE if so, and FALSE if the caller should give up and report
 * '*result'. The retry counters are updated as a side effect. For requests
 * that are not transfers, 'transfer_tries' is NULL. When the driver restart
 * limit is reached, '*result' is replaced with EDEADSRCDST.
 */
  int status;

  status = *result;

  if (status == ERESTART) {
	/* The driver has restarted and the operation may succeed when
	 * reissued. Count the restart against this request, so that we do
	 * not keep trying forever.
	 */
	*driver_tries += 1;

	if (*driver_tries < DRIVER_TRIES)
		return TRUE;

	*result = EDEADSRCDST;

	return FALSE;
  }

  if (status == EIO && transfer_tries != NULL) {
	/* An I/O failure on a transfer request: try again until the
	 * transfer retry limit is hit.
	 */
	*transfer_tries += 1;

	if (*transfer_tries < TRANSFER_TRIES)
		return TRUE;
  }

  return FALSE;
}
static int bdev_opcl(int req, dev_t dev, int access) static int bdev_opcl(int req, dev_t dev, int access)
{ {
/* Open or close the given minor device. /* Open or close the given minor device.
*/ */
message m; message m;
int r, driver_tries = 0;
memset(&m, 0, sizeof(m)); do {
m.m_type = req; memset(&m, 0, sizeof(m));
m.BDEV_MINOR = minor(dev); m.m_type = req;
m.BDEV_ACCESS = access; m.BDEV_MINOR = minor(dev);
m.BDEV_ACCESS = access;
return bdev_sendrec(dev, &m); r = bdev_sendrec(dev, &m);
} while (bdev_retry(&driver_tries, NULL, &r));
return r;
} }
int bdev_open(dev_t dev, int access) int bdev_open(dev_t dev, int access)
@ -45,8 +82,14 @@ int bdev_open(dev_t dev, int access)
/* Open the given minor device. /* Open the given minor device.
* File system usage note: typically called from mount, after bdev_driver. * File system usage note: typically called from mount, after bdev_driver.
*/ */
int r;
return bdev_opcl(BDEV_OPEN, dev, access); r = bdev_opcl(BDEV_OPEN, dev, access);
if (r == OK)
bdev_minor_add(dev, access);
return r;
} }
int bdev_close(dev_t dev) int bdev_close(dev_t dev)
@ -54,8 +97,16 @@ int bdev_close(dev_t dev)
/* Close the given minor device. /* Close the given minor device.
* File system usage note: typically called from unmount. * File system usage note: typically called from unmount.
*/ */
int r;
return bdev_opcl(BDEV_CLOSE, dev, 0); bdev_flush_asyn(dev);
r = bdev_opcl(BDEV_CLOSE, dev, 0);
if (r == OK)
bdev_minor_del(dev);
return r;
} }
static int bdev_rdwt_setup(int req, dev_t dev, u64_t pos, char *buf, static int bdev_rdwt_setup(int req, dev_t dev, u64_t pos, char *buf,
@ -93,7 +144,7 @@ static int bdev_rdwt_setup(int req, dev_t dev, u64_t pos, char *buf,
return OK; return OK;
} }
static void bdev_rdwt_cleanup(message *m) static void bdev_rdwt_cleanup(const message *m)
{ {
/* Clean up a single-buffer read/write request. /* Clean up a single-buffer read/write request.
*/ */
@ -104,17 +155,19 @@ static void bdev_rdwt_cleanup(message *m)
static ssize_t bdev_rdwt(int req, dev_t dev, u64_t pos, char *buf, static ssize_t bdev_rdwt(int req, dev_t dev, u64_t pos, char *buf,
size_t count, int flags) size_t count, int flags)
{ {
/* Perform a read or write call using a single buffer. /* Perform a synchronous read or write call using a single buffer.
*/ */
message m; message m;
int r; int r, driver_tries = 0, transfer_tries = 0;
if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &m)) != OK) do {
return r; if ((r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &m)) != OK)
break;
r = bdev_sendrec(dev, &m); r = bdev_sendrec(dev, &m);
bdev_rdwt_cleanup(&m); bdev_rdwt_cleanup(&m);
} while (bdev_retry(&driver_tries, &transfer_tries, &r));
return r; return r;
} }
@ -182,7 +235,7 @@ static int bdev_vrdwt_setup(int req, dev_t dev, u64_t pos, iovec_t *vec,
return OK; return OK;
} }
static void bdev_vrdwt_cleanup(message *m, iovec_s_t *gvec) static void bdev_vrdwt_cleanup(const message *m, iovec_s_t *gvec)
{ {
/* Clean up a vectored read/write request. /* Clean up a vectored read/write request.
*/ */
@ -200,25 +253,28 @@ static void bdev_vrdwt_cleanup(message *m, iovec_s_t *gvec)
static ssize_t bdev_vrdwt(int req, dev_t dev, u64_t pos, iovec_t *vec, static ssize_t bdev_vrdwt(int req, dev_t dev, u64_t pos, iovec_t *vec,
int count, int flags) int count, int flags)
{ {
/* Perform a read or write call using a vector of buffers. /* Perform a synchronous read or write call using a vector of buffers.
*/ */
iovec_s_t gvec[NR_IOREQS]; iovec_s_t gvec[NR_IOREQS];
message m; message m;
int r; int r, driver_tries = 0, transfer_tries = 0;
if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &m, gvec)) != OK) do {
return r; if ((r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &m,
gvec)) != OK)
break;
r = bdev_sendrec(dev, &m); r = bdev_sendrec(dev, &m);
bdev_vrdwt_cleanup(&m, gvec); bdev_vrdwt_cleanup(&m, gvec);
} while (bdev_retry(&driver_tries, &transfer_tries, &r));
return r; return r;
} }
ssize_t bdev_read(dev_t dev, u64_t pos, char *buf, size_t count, int flags) ssize_t bdev_read(dev_t dev, u64_t pos, char *buf, size_t count, int flags)
{ {
/* Perform a read call into a single buffer. /* Perform a synchronous read call into a single buffer.
*/ */
return bdev_rdwt(BDEV_READ, dev, pos, buf, count, flags); return bdev_rdwt(BDEV_READ, dev, pos, buf, count, flags);
@ -226,7 +282,7 @@ ssize_t bdev_read(dev_t dev, u64_t pos, char *buf, size_t count, int flags)
ssize_t bdev_write(dev_t dev, u64_t pos, char *buf, size_t count, int flags) ssize_t bdev_write(dev_t dev, u64_t pos, char *buf, size_t count, int flags)
{ {
/* Perform a write call from a single buffer. /* Perform a synchronous write call from a single buffer.
*/ */
return bdev_rdwt(BDEV_WRITE, dev, pos, buf, count, flags); return bdev_rdwt(BDEV_WRITE, dev, pos, buf, count, flags);
@ -234,7 +290,7 @@ ssize_t bdev_write(dev_t dev, u64_t pos, char *buf, size_t count, int flags)
ssize_t bdev_gather(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags) ssize_t bdev_gather(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags)
{ {
/* Perform a read call into a vector of buffers. /* Perform a synchronous read call into a vector of buffers.
*/ */
return bdev_vrdwt(BDEV_GATHER, dev, pos, vec, count, flags); return bdev_vrdwt(BDEV_GATHER, dev, pos, vec, count, flags);
@ -242,7 +298,7 @@ ssize_t bdev_gather(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags)
ssize_t bdev_scatter(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags) ssize_t bdev_scatter(dev_t dev, u64_t pos, iovec_t *vec, int count, int flags)
{ {
/* Perform a write call from a vector of buffers. /* Perform a synchronous write call from a vector of buffers.
*/ */
return bdev_vrdwt(BDEV_SCATTER, dev, pos, vec, count, flags); return bdev_vrdwt(BDEV_SCATTER, dev, pos, vec, count, flags);
@ -286,7 +342,7 @@ static int bdev_ioctl_setup(dev_t dev, int request, void *buf, message *m)
return OK; return OK;
} }
static void bdev_ioctl_cleanup(message *m) static void bdev_ioctl_cleanup(const message *m)
{ {
/* Clean up an I/O control request. /* Clean up an I/O control request.
*/ */
@ -296,17 +352,287 @@ static void bdev_ioctl_cleanup(message *m)
int bdev_ioctl(dev_t dev, int request, void *buf) int bdev_ioctl(dev_t dev, int request, void *buf)
{ {
/* Perform an I/O control request. /* Perform a synchronous I/O control request.
*/ */
message m; message m;
int r; int r, driver_tries = 0;
if ((r = bdev_ioctl_setup(dev, request, buf, &m)) != OK) do {
return r; if ((r = bdev_ioctl_setup(dev, request, buf, &m)) != OK)
break;
r = bdev_sendrec(dev, &m); r = bdev_sendrec(dev, &m);
bdev_ioctl_cleanup(&m); bdev_ioctl_cleanup(&m);
} while (bdev_retry(&driver_tries, NULL, &r));
return r; return r;
} }
void bdev_flush_asyn(dev_t dev)
{
/* Wait for every outstanding asynchronous request on the given minor device.
 * This blocks until no call structure for the device is left.
 * File system usage note: typically called from flush.
 */
  bdev_call_t *pending;

  for (;;) {
	pending = bdev_call_find(dev);

	if (pending == NULL)
		break;

	/* The result is of no interest here; we look for the next call. */
	(void) bdev_wait_asyn(pending->id);
  }
}
static bdev_id_t bdev_rdwt_asyn(int req, dev_t dev, u64_t pos, char *buf,
  size_t count, int flags, bdev_callback_t callback, bdev_param_t param)
{
/* Submit an asynchronous single-buffer read or write request. Return the ID
 * of the new request, or an error code if it could not be submitted.
 */
  bdev_call_t *call;
  int r;

  call = bdev_call_alloc(1);

  if (call == NULL)
	return ENOMEM;

  /* Build the request message and try to send it off. If sending fails, the
   * resources set up for the message must be released again.
   */
  r = bdev_rdwt_setup(req, dev, pos, buf, count, flags, &call->msg);

  if (r == OK) {
	r = bdev_senda(dev, &call->msg, call->id);

	if (r != OK)
		bdev_rdwt_cleanup(&call->msg);
  }

  if (r != OK) {
	bdev_call_free(call);

	return r;
  }

  /* The request is on its way. Save what is needed to complete or restart
   * it later; the buffer address and size are kept in the first vector slot.
   */
  call->vec[0].iov_addr = (vir_bytes) buf;
  call->vec[0].iov_size = count;
  call->driver_tries = 0;
  call->transfer_tries = 0;
  call->callback = callback;
  call->param = param;
  call->dev = dev;

  return call->id;
}
static bdev_id_t bdev_vrdwt_asyn(int req, dev_t dev, u64_t pos, iovec_t *vec,
  int count, int flags, bdev_callback_t callback, bdev_param_t param)
{
/* Submit an asynchronous vectored read or write request. Return the ID of the
 * new request, or an error code if it could not be submitted.
 */
  bdev_call_t *call;
  int r;

  call = bdev_call_alloc(count);

  if (call == NULL)
	return ENOMEM;

  /* Build the request message and try to send it off. If sending fails, the
   * resources set up for the message must be released again.
   */
  r = bdev_vrdwt_setup(req, dev, pos, vec, count, flags, &call->msg,
	call->gvec);

  if (r == OK) {
	r = bdev_senda(dev, &call->msg, call->id);

	if (r != OK)
		bdev_vrdwt_cleanup(&call->msg, call->gvec);
  }

  if (r != OK) {
	bdev_call_free(call);

	return r;
  }

  /* The request is on its way. Save what is needed to complete or restart
   * it later, including a private copy of the caller's I/O vector.
   */
  call->driver_tries = 0;
  call->transfer_tries = 0;
  call->callback = callback;
  call->param = param;
  call->dev = dev;

  memcpy(call->vec, vec, sizeof(vec[0]) * count);

  return call->id;
}
bdev_id_t bdev_read_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
  int flags, bdev_callback_t callback, bdev_param_t param)
{
/* Submit an asynchronous read request that reads into a single buffer.
 */
  bdev_id_t id;

  id = bdev_rdwt_asyn(BDEV_READ, dev, pos, buf, count, flags, callback,
	param);

  return id;
}
bdev_id_t bdev_write_asyn(dev_t dev, u64_t pos, char *buf, size_t count,
  int flags, bdev_callback_t callback, bdev_param_t param)
{
/* Submit an asynchronous write request that writes from a single buffer.
 */
  bdev_id_t id;

  id = bdev_rdwt_asyn(BDEV_WRITE, dev, pos, buf, count, flags, callback,
	param);

  return id;
}
bdev_id_t bdev_gather_asyn(dev_t dev, u64_t pos, iovec_t *vec, int count,
  int flags, bdev_callback_t callback, bdev_param_t param)
{
/* Submit an asynchronous read request that reads into a vector of buffers.
 */
  bdev_id_t id;

  id = bdev_vrdwt_asyn(BDEV_GATHER, dev, pos, vec, count, flags, callback,
	param);

  return id;
}
bdev_id_t bdev_scatter_asyn(dev_t dev, u64_t pos, iovec_t *vec, int count,
  int flags, bdev_callback_t callback, bdev_param_t param)
{
/* Submit an asynchronous write request that writes from a vector of buffers.
 */
  bdev_id_t id;

  id = bdev_vrdwt_asyn(BDEV_SCATTER, dev, pos, vec, count, flags, callback,
	param);

  return id;
}
bdev_id_t bdev_ioctl_asyn(dev_t dev, int request, void *buf,
  bdev_callback_t callback, bdev_param_t param)
{
/* Submit an asynchronous I/O control request. Return the ID of the new
 * request, or an error code if it could not be submitted.
 */
  bdev_call_t *call;
  int r;

  call = bdev_call_alloc(1);

  if (call == NULL)
	return ENOMEM;

  /* Build the request message and try to send it off. If sending fails, the
   * resources set up for the message must be released again.
   */
  r = bdev_ioctl_setup(dev, request, buf, &call->msg);

  if (r == OK) {
	r = bdev_senda(dev, &call->msg, call->id);

	if (r != OK)
		bdev_ioctl_cleanup(&call->msg);
  }

  if (r != OK) {
	bdev_call_free(call);

	return r;
  }

  /* The request is on its way. Save what is needed to complete or restart
   * it later; the buffer address is kept in the first vector slot. The
   * transfer retry counter is not set, as bdev_callback_asyn() looks at it
   * for transfer requests only.
   */
  call->vec[0].iov_addr = (vir_bytes) buf;
  call->driver_tries = 0;
  call->callback = callback;
  call->param = param;
  call->dev = dev;

  return call->id;
}
void bdev_callback_asyn(bdev_call_t *call, int result)
{
/* Complete an asynchronous request with the given result: release the
 * resources of its message, invoke the user callback, and free the call
 * structure. A transfer request that failed with EIO may be resent instead,
 * in which case the call stays alive and nothing else happens here.
 */
  int type, is_single, is_vectored;

  type = call->msg.m_type;
  is_single = (type == BDEV_READ || type == BDEV_WRITE);
  is_vectored = (type == BDEV_GATHER || type == BDEV_SCATTER);

  /* Retry failed transfers until the transfer retry limit is hit. If the
   * resend itself fails, its error becomes the result of the request.
   */
  if ((is_single || is_vectored) && result == EIO) {
	call->transfer_tries++;

	if (call->transfer_tries < TRANSFER_TRIES) {
		result = bdev_senda(call->dev, &call->msg, call->id);

		if (result == OK)
			return;
	}
  }

  /* Clean up, according to the type of the request. */
  if (is_single)
	bdev_rdwt_cleanup(&call->msg);
  else if (is_vectored)
	bdev_vrdwt_cleanup(&call->msg, call->gvec);
  else if (type == BDEV_IOCTL)
	bdev_ioctl_cleanup(&call->msg);
  else
	assert(0);

  /* Call the callback function. */
  /* FIXME: we assume all reasonable ssize_t values can be stored in an int. */
  call->callback(call->dev, call->id, call->param, result);

  /* Free up the call structure. */
  bdev_call_free(call);
}
int bdev_restart_asyn(bdev_call_t *call)
{
/* The driver for the given call has restarted, and may now have a new
 * endpoint. Recreate and resend the request for the given call.
 *
 * Return OK if the request was resent, EDEADSRCDST if this request has hit
 * its driver restart limit, or otherwise the error from recreating or
 * resending the request. This function never completes or frees the call
 * itself; on failure, the visible callers pass the returned error to
 * bdev_callback_asyn().
 */
int type, r = OK;
/* Update and check the retry limit for driver restarts first. */
if (++call->driver_tries >= DRIVER_TRIES)
return EDEADSRCDST;
/* Recreate all grants for the new endpoint. */
type = call->msg.m_type;
/* For each request type, the old message is cleaned up first and then set up
 * again in place. The position, count, flags and request code are taken from
 * the old message and passed by value, while the same message is also handed
 * to the setup routine as its output. The buffer address (or I/O vector) comes
 * from the copy saved in the call structure when the request was submitted.
 *
 * NOTE(review): if the setup routine fails here, the cleanup routine has
 * already run on call->msg, and the callers' subsequent bdev_callback_asyn()
 * call runs the same cleanup routine on call->msg once more. Confirm that the
 * setup routines leave the message in a state for which that is harmless.
 */
switch (type) {
case BDEV_READ:
case BDEV_WRITE:
bdev_rdwt_cleanup(&call->msg);
r = bdev_rdwt_setup(type, call->dev,
make64(call->msg.BDEV_POS_LO, call->msg.BDEV_POS_HI),
(char *) call->vec[0].iov_addr, call->msg.BDEV_COUNT,
call->msg.BDEV_FLAGS, &call->msg);
break;
case BDEV_GATHER:
case BDEV_SCATTER:
bdev_vrdwt_cleanup(&call->msg, call->gvec);
r = bdev_vrdwt_setup(type, call->dev,
make64(call->msg.BDEV_POS_LO, call->msg.BDEV_POS_HI),
call->vec, call->msg.BDEV_COUNT, call->msg.BDEV_FLAGS,
&call->msg, call->gvec);
break;
case BDEV_IOCTL:
bdev_ioctl_cleanup(&call->msg);
r = bdev_ioctl_setup(call->dev, call->msg.BDEV_REQUEST,
(char *) call->vec[0].iov_addr, &call->msg);
break;
default:
/* No other request types are submitted asynchronously. */
assert(0);
}
if (r != OK)
return r;
/* Try to resend the request. */
return bdev_senda(call->dev, &call->msg, call->id);
}

118
lib/libbdev/call.c Normal file
View file

@ -0,0 +1,118 @@
/* libbdev - asynchronous call structure management */
#include <minix/drivers.h>
#include <minix/bdev.h>
#include <assert.h>
#include "const.h"
#include "type.h"
#include "proto.h"
/* Table of outstanding asynchronous calls, indexed by request ID. A slot that
 * is NULL is free.
 */
static bdev_call_t *calls[NR_CALLS];
bdev_call_t *bdev_call_alloc(int count)
{
/* Allocate a call structure with room for 'count' I/O vector elements, and
 * register it under a free request ID. Return the new structure with its 'id'
 * and 'vec' fields set, or NULL if the count is negative, all request IDs are
 * in use, or memory allocation fails. All other fields are left for the
 * caller to fill in.
 */
  bdev_call_t *call;
  bdev_id_t id;
  size_t nr_extra;

  /* A negative count would wrap around in the size computation below, and
   * could result in an allocation that is too small for the structure.
   */
  if (count < 0)
	return NULL;

  for (id = 0; id < NR_CALLS; id++)
	if (calls[id] == NULL)
		break;

  if (id == NR_CALLS)
	return NULL;

  /* The size computation treats the structure itself as providing room for
   * one grant vector element, so only the elements beyond the first need
   * extra space. Never allocate less than the structure itself, not even for
   * a count of zero. The caller's I/O vector copy is placed directly behind
   * the grant vector.
   */
  nr_extra = (count > 1) ? (size_t) count - 1 : 0;

  call = malloc(sizeof(*call) +
	sizeof(call->gvec[0]) * nr_extra +
	sizeof(call->vec[0]) * (size_t) count);

  if (call == NULL)
	return NULL;

  call->id = id;
  call->vec = (iovec_t *) &call->gvec[count];

  calls[id] = call;

  return call;
}
void bdev_call_free(bdev_call_t *call)
{
/* Unregister the given call structure from the call table, and release its
 * memory. The structure must currently be registered under its own ID.
 */
  bdev_id_t slot;

  slot = call->id;

  assert(calls[slot] == call);

  calls[slot] = NULL;

  free(call);
}
bdev_call_t *bdev_call_get(bdev_id_t id)
{
/* Look up a call structure by request ID. Return NULL if the ID is out of
 * range or no call is registered under it.
 */

  if (id >= 0 && id < NR_CALLS)
	return calls[id];

  return NULL;
}
bdev_call_t *bdev_call_find(dev_t dev)
{
/* Return the asynchronous request with the lowest ID that is registered for
 * the given device, or NULL if there is none.
 */
  bdev_call_t *call;
  bdev_id_t slot;

  for (slot = 0; slot < NR_CALLS; slot++) {
	call = calls[slot];

	if (call == NULL)
		continue;

	if (call->dev == dev)
		return call;
  }

  return NULL;
}
bdev_call_t *bdev_call_iter_maj(dev_t dev, bdev_call_t *call,
  bdev_call_t **next)
{
/* Iterate over all asynchronous requests for a major device. Pass in NULL as
 * 'call' to start; on later invocations, pass in the previous return value.
 * The successor is looked up in advance and stored in '*next', and a non-NULL
 * 'call' is never dereferenced, so that the caller may free the returned call
 * structure before invoking this function again. Return NULL when done.
 */
  bdev_call_t *cur;
  bdev_id_t id;
  int maj;

  maj = major(dev);

  if (call != NULL) {
	/* Continue with the call that we found to be next during the last
	 * invocation, which may be NULL.
	 */
	cur = *next;

	if (cur == NULL)
		return NULL;
  } else {
	/* This is the first invocation: find the first match. */
	id = 0;

	while (id < NR_CALLS &&
		(calls[id] == NULL || major(calls[id]->dev) != maj))
		id++;

	if (id == NR_CALLS)
		return NULL;

	cur = calls[id];
  }

  /* Look ahead for the next match, if any. */
  *next = NULL;

  for (id = cur->id + 1; id < NR_CALLS; id++) {
	if (calls[id] == NULL)
		continue;

	if (major(calls[id]->dev) == maj) {
		*next = calls[id];

		break;
	}
  }

  return cur;
}

View file

@ -1,7 +1,17 @@
#ifndef _BDEV_CONST_H #ifndef _BDEV_CONST_H
#define _BDEV_CONST_H #define _BDEV_CONST_H
#define NR_CALLS 256 /* maximum number of concurrent async calls */
#define NO_ID (-1) /* ID for synchronous requests */
#define DS_NR_TRIES 100 /* number of times to check endpoint in DS */ #define DS_NR_TRIES 100 /* number of times to check endpoint in DS */
#define DS_DELAY 50000 /* delay time (us) between DS checks */ #define DS_DELAY 50000 /* delay time (us) between DS checks */
#define DRIVER_TRIES 10 /* after so many tries, give up on a driver */
#define RECOVER_TRIES 2 /* tolerated nr of restarts during recovery */
#define TRANSFER_TRIES 5 /* number of times to try transfers on EIO */
#define NR_OPEN_DEVS 4 /* maximum different opened minor devices */
#endif /* _BDEV_CONST_H */ #endif /* _BDEV_CONST_H */

View file

@ -6,6 +6,7 @@
#include <assert.h> #include <assert.h>
#include "const.h" #include "const.h"
#include "type.h"
#include "proto.h" #include "proto.h"
static struct { static struct {

View file

@ -4,6 +4,8 @@
#include <minix/bdev.h> #include <minix/bdev.h>
#include <assert.h> #include <assert.h>
#include "const.h"
#include "type.h"
#include "proto.h" #include "proto.h"
static void bdev_cancel(dev_t dev) static void bdev_cancel(dev_t dev)
@ -11,14 +13,83 @@ static void bdev_cancel(dev_t dev)
/* Recovering the driver for the given device has failed repeatedly. Mark it as /* Recovering the driver for the given device has failed repeatedly. Mark it as
* permanently unusable, and clean up any associated calls and resources. * permanently unusable, and clean up any associated calls and resources.
*/ */
bdev_call_t *call, *next;
printf("bdev: driver for major %d (endpoint %d) crashed\n", printf("bdev: giving up on major %d\n", major(dev));
major(dev), bdev_driver_get(dev));
/* Cancel all pending asynchronous requests. */
call = NULL;
while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL)
bdev_callback_asyn(call, EDEADSRCDST);
/* Mark the driver as unusable. */ /* Mark the driver as unusable. */
bdev_driver_clear(dev); bdev_driver_clear(dev);
} }
static int bdev_recover(dev_t dev, int update_endpt)
{
/* The IPC subsystem has signaled an error communicating to the driver
 * associated with the given device. Try to recover. If 'update_endpt' is set,
 * we need to find the new endpoint of the driver first. Return TRUE iff
 * recovery has been successful.
 *
 * Recovery consists of reopening all minor devices on the driver, and then
 * resending all asynchronous requests for it. If this function returns FALSE,
 * it has called bdev_cancel() to give up on the driver.
 */
bdev_call_t *call, *next;
endpoint_t endpt;
int r, nr_tries;
printf("bdev: recovering from a driver crash on major %d\n", major(dev));
/* Make at most RECOVER_TRIES recovery attempts. */
for (nr_tries = 0; nr_tries < RECOVER_TRIES; nr_tries++) {
/* First update the endpoint, if necessary. */
if (update_endpt)
(void) bdev_driver_update(dev);
/* Without a usable endpoint, there is nothing to recover to. */
if ((endpt = bdev_driver_get(dev)) == NONE)
break;
/* If anything goes wrong, update the endpoint again next time. */
update_endpt = TRUE;
/* Reopen all minor devices on the new driver. */
if ((r = bdev_minor_reopen(dev)) != OK) {
/* If the driver died again, we may give it another try. */
if (r == EDEADSRCDST)
continue;
/* If another error occurred, we cannot continue using the
 * driver as is, but we also cannot force it to restart.
 */
break;
}
/* Resend all asynchronous requests. */
call = NULL;
while ((call = bdev_call_iter_maj(dev, call, &next)) != NULL) {
/* It is not strictly necessary that we manage to reissue all
 * asynchronous requests successfully. We can fail them on an
 * individual basis here, without affecting the overall
 * recovery. Note that we will never get new IPC failures here.
 */
if ((r = bdev_restart_asyn(call)) != OK)
bdev_callback_asyn(call, r);
}
/* Recovery seems successful. We can now reissue the current
 * synchronous request (if any), and continue normal operation.
 */
printf("bdev: recovery successful, new driver is at %d\n", endpt);
return TRUE;
}
/* Recovery failed repeatedly. Give up on this driver. */
bdev_cancel(dev);
return FALSE;
}
void bdev_update(dev_t dev, char *label) void bdev_update(dev_t dev, char *label)
{ {
/* Set the endpoint for a driver. Perform recovery if necessary. /* Set the endpoint for a driver. Perform recovery if necessary.
@ -32,13 +103,40 @@ void bdev_update(dev_t dev, char *label)
/* If updating the driver causes an endpoint change, we need to perform /* If updating the driver causes an endpoint change, we need to perform
* recovery, but not update the endpoint yet again. * recovery, but not update the endpoint yet again.
*/ */
if (old_endpt != NONE && old_endpt != endpt)
bdev_recover(dev, FALSE /*update_endpt*/);
}
int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t id)
{
/* Send an asynchronous request for the given device. This function will never
* get any new IPC errors sending to the driver. If sending an asynchronous
* request fails, we will find out through other ways later.
*/
endpoint_t endpt;
message m;
int r;
/* If we have no usable driver endpoint, fail instantly. */
if ((endpt = bdev_driver_get(dev)) == NONE)
return EDEADSRCDST;
m = *m_orig;
m.BDEV_ID = id;
r = asynsend(endpt, &m);
if (r != OK)
printf("bdev: asynsend to driver (%d) failed (%d)\n", endpt, r);
return r;
} }
int bdev_sendrec(dev_t dev, const message *m_orig) int bdev_sendrec(dev_t dev, const message *m_orig)
{ {
/* Send a request to the given device, and wait for the reply. /* Send a synchronous request for the given device, and wait for the reply.
* Return ERESTART if the caller should try to reissue the request.
*/ */
static long id = 0;
endpoint_t endpt; endpoint_t endpt;
message m; message m;
int r; int r;
@ -49,15 +147,19 @@ int bdev_sendrec(dev_t dev, const message *m_orig)
/* Send the request and block until we receive a reply. */ /* Send the request and block until we receive a reply. */
m = *m_orig; m = *m_orig;
m.BDEV_ID = ++id; m.BDEV_ID = NO_ID;
r = sendrec(endpt, &m); r = sendrec(endpt, &m);
/* This version of libbdev does not support recovery. Forget the driver. */ /* If communication failed, the driver has died. We assume it will be
* restarted soon after, so we attempt recovery. Upon success, we let the
* caller reissue the synchronous request.
*/
if (r == EDEADSRCDST) { if (r == EDEADSRCDST) {
bdev_cancel(dev); if (!bdev_recover(dev, TRUE /*update_endpt*/))
return EDEADSRCDST;
return EDEADSRCDST; return ERESTART;
} }
if (r != OK) { if (r != OK) {
@ -68,22 +170,167 @@ int bdev_sendrec(dev_t dev, const message *m_orig)
if (m.m_type != BDEV_REPLY) { if (m.m_type != BDEV_REPLY) {
printf("bdev: driver (%d) sent weird response (%d)\n", printf("bdev: driver (%d) sent weird response (%d)\n",
endpt, m.m_type); endpt, m.m_type);
return EIO; return EINVAL;
} }
/* ERESTART signifies a driver restart. Again, we do not support this yet. */ /* The protocol contract states that no asynchronous reply can satisfy a
* synchronous SENDREC call, so we can never get an asynchronous reply here.
*/
if (m.BDEV_ID != NO_ID) {
printf("bdev: driver (%d) sent invalid ID (%ld)\n", endpt, m.BDEV_ID);
return EINVAL;
}
/* Unless the caller is misusing libbdev, we will only get ERESTART if we
* have managed to resend a raw block I/O request to the driver after a
* restart, but before VFS has had a chance to reopen the associated device
* first. This is highly exceptional, and hard to deal with correctly. We
* take the easiest route: sleep for a while so that VFS can reopen the
* device, and then resend the request. If the call keeps failing, the caller
* will eventually give up.
*/
if (m.BDEV_STATUS == ERESTART) { if (m.BDEV_STATUS == ERESTART) {
bdev_cancel(dev); printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
endpt);
return EDEADSRCDST; micro_delay(1000);
}
if (m.BDEV_ID != id) { return ERESTART;
printf("bdev: driver (%d) sent invalid response (%ld)\n",
endpt, m.BDEV_ID);
return EIO;
} }
/* Return the result of our request. */ /* Return the result of our request. */
return m.BDEV_STATUS; return m.BDEV_STATUS;
} }
static int bdev_receive(dev_t dev, message *m)
{
/* Receive one valid message from the driver for the given device.
 *
 * Return OK if a BDEV_REPLY message has been stored in 'm'. Return
 * EDEADSRCDST if there is no usable driver endpoint or recovery has failed,
 * EINVAL if the driver sent a message of another type, and ERESTART if
 * receiving failed DRIVER_TRIES times even though each recovery succeeded; in
 * that last case no message was received. Any other error from the receive
 * call is returned as is.
 */
endpoint_t endpt;
int r, nr_tries = 0;
for (;;) {
/* Retrieve and check the driver endpoint on every try, as it will
 * change with each driver restart.
 */
if ((endpt = bdev_driver_get(dev)) == NONE)
return EDEADSRCDST;
r = sef_receive(endpt, m);
/* This error is taken to mean that the driver has died. */
if (r == EDEADSRCDST) {
/* If we reached the maximum number of retries, give up. */
if (++nr_tries == DRIVER_TRIES)
break;
/* Attempt recovery. If successful, all asynchronous requests
 * will have been resent, and we can retry receiving a reply.
 */
if (!bdev_recover(dev, TRUE /*update_endpt*/))
return EDEADSRCDST;
continue;
}
if (r != OK) {
printf("bdev: IPC to driver (%d) failed (%d)\n", endpt, r);
return r;
}
if (m->m_type != BDEV_REPLY) {
printf("bdev: driver (%d) sent weird response (%d)\n",
endpt, m->m_type);
return EINVAL;
}
/* The caller is responsible for checking the ID and status. */
return OK;
}
/* All tries failed, even though all recovery attempts succeeded. In this
 * case, we let the caller recheck whether it wants to keep calling us,
 * returning ERESTART to indicate we can be called again but did not actually
 * receive a message.
 */
return ERESTART;
}
void bdev_reply_asyn(message *m)
{
/* Process a reply from a disk driver: look up the asynchronous request that
 * it belongs to, and either complete that request or, if the driver demands a
 * restart, resend it. Replies that match no request, or that do not come from
 * the current driver of the request's device, are dropped.
 */
  bdev_call_t *call;
  endpoint_t endpt;
  bdev_id_t id;
  int r;

  /* This is a requirement for the caller. */
  assert(m->m_type == BDEV_REPLY);

  /* Get the corresponding asynchronous call structure. */
  id = m->BDEV_ID;
  call = bdev_call_get(id);

  if (call == NULL) {
	printf("bdev: driver (%d) replied to unknown request (%ld)\n",
		m->m_source, m->BDEV_ID);
	return;
  }

  /* Make sure the reply was sent from the right endpoint. If we have no
   * endpoint for the driver at all, this may simply be a stray reply.
   */
  endpt = bdev_driver_get(call->dev);

  if (m->m_source != endpt) {
	if (endpt != NONE)
		printf("bdev: driver (%d) replied to request not sent to it\n",
			m->m_source);
	return;
  }

  /* The common case: hand the status to the callback and be done. */
  if (m->BDEV_STATUS != ERESTART) {
	bdev_callback_asyn(call, m->BDEV_STATUS);
	return;
  }

  /* See the ERESTART comment in bdev_sendrec(). */
  printf("bdev: got ERESTART from driver (%d), sleeping for reopen\n",
	endpt);

  micro_delay(1000);

  r = bdev_restart_asyn(call);

  if (r != OK)
	bdev_callback_asyn(call, r);
}
int bdev_wait_asyn(bdev_id_t id)
{
/* Block until the asynchronous request with the given ID has completed.
 * Return ENOENT if there is no such request, OK once the request is gone, or
 * an error if receiving from the driver failed.
 */
  bdev_call_t *call;
  message reply;
  dev_t dev;
  int status;

  call = bdev_call_get(id);

  if (call == NULL)
	return ENOENT;

  dev = call->dev;

  for (;;) {
	status = bdev_receive(dev, &reply);

	/* Processing the reply will free up the call structure as a side
	 * effect. If we repeatedly get ERESTART, we will repeatedly resend
	 * the asynchronous request, which will then eventually hit the retry
	 * limit and we will break out of the loop.
	 */
	if (status == OK)
		bdev_reply_asyn(&reply);
	else if (status != ERESTART)
		return status;

	if (bdev_call_get(id) == NULL)
		break;
  }

  return OK;
}

120
lib/libbdev/minor.c Normal file
View file

@ -0,0 +1,120 @@
/* libbdev - tracking and reopening of opened minor devices */
#include <minix/drivers.h>
#include <minix/bdev.h>
#include <assert.h>
#include "const.h"
#include "type.h"
#include "proto.h"
/* Table of minor devices that this library currently considers open. A slot
 * is free when its 'dev' field equals NO_DEV. Only the first element is
 * initialized explicitly; the remaining elements are zero-filled by the
 * language rules for static aggregates.
 * NOTE(review): this relies on NO_DEV being zero for the other slots to
 * start out free - confirm against the NO_DEV definition.
 */
static struct {
dev_t dev;	/* full device number of the open minor, or NO_DEV if free */
int count;	/* number of times this minor is currently open */
int access;	/* bitwise OR of the access flags of all opens so far */
} open_dev[NR_OPEN_DEVS] = { { NO_DEV, 0, 0 } };
int bdev_minor_reopen(dev_t dev)
{
/* Send fresh open requests for every tracked minor device that shares its
 * major number with 'dev'. The requests are sent directly with sendrec()
 * rather than through the regular request path, because no recovery may be
 * triggered from here; this is why some code is duplicated. Returns OK when
 * all reopens succeeded, or the first error encountered otherwise.
 * FIXME: if reopening fails with a non-IPC error, we should attempt to close
 * all minors that we did manage to reopen so far, or they might stay open
 * forever.
 */
  endpoint_t driver;
  message req;
  int slot, nth, status, target_major;

  target_major = major(dev);

  driver = bdev_driver_get(dev);
  assert(driver != NONE);

  for (slot = 0; slot < NR_OPEN_DEVS; slot++) {
	/* Skip slots that belong to another major device. */
	if (major(open_dev[slot].dev) != target_major)
		continue;

	/* A minor may have been opened more than once. Issue one open
	 * request per earlier open; reopening only once would require
	 * keeping a separate shadow open count.
	 */
	for (nth = 0; nth < open_dev[slot].count; nth++) {
		memset(&req, 0, sizeof(req));
		req.m_type = BDEV_OPEN;
		req.BDEV_MINOR = minor(open_dev[slot].dev);
		req.BDEV_ACCESS = open_dev[slot].access;
		req.BDEV_ID = NO_ID;

		/* The IPC call itself must succeed. */
		status = sendrec(driver, &req);
		if (status != OK) {
			printf("bdev: IPC to driver (%d) failed (%d)\n",
				driver, status);
			return status;
		}

		/* The answer must be a reply message.. */
		if (req.m_type != BDEV_REPLY) {
			printf("bdev: driver (%d) sent weird response (%d)\n",
				driver, req.m_type);
			return EINVAL;
		}

		/* ..that carries the same ID that was sent out.. */
		if (req.BDEV_ID != NO_ID) {
			printf("bdev: driver (%d) sent invalid ID (%ld)\n",
				driver, req.BDEV_ID);
			return EINVAL;
		}

		/* ..and that reports success. */
		status = req.BDEV_STATUS;
		if (status != OK) {
			printf("bdev: driver (%d) failed device reopen (%d)\n",
				driver, status);
			return status;
		}
	}
  }

  return OK;
}
void bdev_minor_add(dev_t dev, int access)
{
/* Record one more open of the given minor device. If the device is already
 * in the table, bump its open count and merge in the access flags. Otherwise
 * claim the first free slot for it. If the table is full, only a warning is
 * printed and the open is not recorded.
 */
  int idx, spare;

  spare = -1;

  for (idx = 0; idx < NR_OPEN_DEVS; idx++) {
	if (open_dev[idx].dev == dev) {
		/* Already tracked: just account for the extra open. */
		open_dev[idx].count += 1;
		open_dev[idx].access |= access;
		return;
	}

	/* Remember the first free slot seen, in case one is needed. */
	if (spare < 0 && open_dev[idx].dev == NO_DEV)
		spare = idx;
  }

  if (spare >= 0) {
	open_dev[spare].dev = dev;
	open_dev[spare].count = 1;
	open_dev[spare].access = access;
	return;
  }

  printf("bdev: too many open devices, increase NR_OPEN_DEVS\n");
}
void bdev_minor_del(dev_t dev)
{
/* Record one fewer open of the given minor device. When its open count drops
 * to zero, the table slot is marked free again. Devices that are not in the
 * table are silently ignored.
 */
  int idx;

  for (idx = 0; idx < NR_OPEN_DEVS; idx++) {
	if (open_dev[idx].dev != dev)
		continue;

	/* Found the entry; only the first match is updated. */
	open_dev[idx].count--;
	if (open_dev[idx].count == 0)
		open_dev[idx].dev = NO_DEV;
	return;
  }
}

View file

@ -1,6 +1,10 @@
#ifndef _BDEV_PROTO_H #ifndef _BDEV_PROTO_H
#define _BDEV_PROTO_H #define _BDEV_PROTO_H
/* bdev.c */
extern void bdev_callback_asyn(bdev_call_t *call, int result);
extern int bdev_restart_asyn(bdev_call_t *call);
/* driver.c */ /* driver.c */
extern void bdev_driver_init(void); extern void bdev_driver_init(void);
extern void bdev_driver_clear(dev_t dev); extern void bdev_driver_clear(dev_t dev);
@ -8,8 +12,22 @@ extern endpoint_t bdev_driver_set(dev_t dev, char *label);
extern endpoint_t bdev_driver_get(dev_t dev); extern endpoint_t bdev_driver_get(dev_t dev);
extern endpoint_t bdev_driver_update(dev_t dev); extern endpoint_t bdev_driver_update(dev_t dev);
/* call.c */
extern bdev_call_t *bdev_call_alloc(int count);
extern void bdev_call_free(bdev_call_t *call);
extern bdev_call_t *bdev_call_get(bdev_id_t id);
extern bdev_call_t *bdev_call_find(dev_t dev);
extern bdev_call_t *bdev_call_iter_maj(dev_t dev, bdev_call_t *last,
bdev_call_t **next);
/* ipc.c */ /* ipc.c */
extern void bdev_update(dev_t dev, char *label); extern void bdev_update(dev_t dev, char *label);
extern int bdev_senda(dev_t dev, const message *m_orig, bdev_id_t num);
extern int bdev_sendrec(dev_t dev, const message *m_orig); extern int bdev_sendrec(dev_t dev, const message *m_orig);
/* minor.c */
extern int bdev_minor_reopen(dev_t dev);
extern void bdev_minor_add(dev_t dev, int access);
extern void bdev_minor_del(dev_t dev);
#endif /* _BDEV_PROTO_H */ #endif /* _BDEV_PROTO_H */

16
lib/libbdev/type.h Normal file
View file

@ -0,0 +1,16 @@
#ifndef _BDEV_TYPE_H
#define _BDEV_TYPE_H
/* Bookkeeping for a single call to a block driver: identifies the request,
 * keeps the request message and the completion callback with its parameter,
 * and counts the retries performed so far.
 * NOTE(review): 'gvec' is declared with one element but is the last member;
 * presumably bdev_call_alloc(count) reserves room for 'count' entries behind
 * the structure - confirm against call.c before changing this layout.
 */
typedef struct {
bdev_id_t id; /* call ID */
dev_t dev; /* target device number */
message msg; /* request message */
bdev_callback_t callback; /* callback function */
bdev_param_t param; /* callback parameter */
int driver_tries; /* times retried on driver restarts */
int transfer_tries; /* times retried on transfer errors */
iovec_t *vec; /* original vector */
iovec_s_t gvec[1]; /* grant vector */
} bdev_call_t;
#endif /* _BDEV_TYPE_H */