minix/drivers/vnd/vnd.c
Commit e5cc85fdc4 by David van Moolenbroek: Extend dupfrom(2) into copyfd(2)
This single function allows copying file descriptors from and to other
processes, as well as closing a previously copied remote file descriptor.
It replaces the five FD-related UDS backcalls. While it limits the total
number of in-flight file descriptors to OPEN_MAX, this change greatly
improves UDS crash recovery, since all in-flight file descriptors are now
closed rather than kept open indefinitely (which caused VFS to crash on
system shutdown). With the new copyfd call, UDS becomes simpler, and the
concept of filps is no longer exposed outside of VFS.

This patch also moves the checkperms(2) stub into libminlib, thus
fully abstracting away message details of VFS communication from UDS.
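
As an illustration, a driver can now obtain a duplicate of a user process's
file descriptor with a single call. The following is a minimal sketch that
mirrors the VNDIOCSET handler in the code below; it assumes the prototype
int copyfd(endpoint_t endpt, int fd, int what) with the COPYFD_FROM,
COPYFD_TO, and COPYFD_CLOSE operations described above, and the names
user_endpt and user_fd are placeholders:

	/* Copy the user process's file descriptor into our own process. */
	if ((fd = copyfd(user_endpt, user_fd, COPYFD_FROM)) == -1)
		return -errno;	/* the copy failed; errno says why */

	/* ... use fd locally; a regular close(fd) releases it again ... */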

Change-Id: Idd32ad390a566143c8ef66955e5ae2c221cff966
2014-03-01 09:04:58 +01:00


/* VNode Disk driver, by D.C. van Moolenbroek <david@minix3.org> */
#include <minix/drivers.h>
#include <minix/blockdriver.h>
#include <minix/drvlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
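
/* Size of the intermediate I/O transfer buffer, in bytes. */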
#define VND_BUF_SIZE 65536

static struct {
	int fd;			/* file descriptor for the underlying file */
	int openct;		/* number of times the device is open */
	int exiting;		/* exit after the last close? */
	int rdonly;		/* is the device set up read-only? */
	dev_t dev;		/* device on which the file resides */
	ino_t ino;		/* inode number of the file */
	struct device part[DEV_PER_DRIVE];	/* partition bases and sizes */
	struct device subpart[SUB_PER_DRIVE];	/* same for subpartitions */
	struct part_geom geom;	/* geometry information */
	char *buf;		/* intermediate I/O transfer buffer */
} state;
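
/* Instance number of this driver, also the unit number for VNDIOCGET. */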
static unsigned int instance;

static int vnd_open(devminor_t, int);
static int vnd_close(devminor_t);
static int vnd_transfer(devminor_t, int, u64_t, endpoint_t, iovec_t *,
	unsigned int, int);
static int vnd_ioctl(devminor_t, unsigned long, endpoint_t, cp_grant_id_t,
	endpoint_t);
static struct device *vnd_part(devminor_t);
static void vnd_geometry(devminor_t, struct part_geom *);

static struct blockdriver vnd_dtab = {
	.bdr_type	= BLOCKDRIVER_TYPE_DISK,
	.bdr_open	= vnd_open,
	.bdr_close	= vnd_close,
	.bdr_transfer	= vnd_transfer,
	.bdr_ioctl	= vnd_ioctl,
	.bdr_part	= vnd_part,
	.bdr_geometry	= vnd_geometry
};

/*
 * Parse partition tables.
 */
static void
vnd_partition(void)
{
	memset(state.part, 0, sizeof(state.part));
	memset(state.subpart, 0, sizeof(state.subpart));

	state.part[0].dv_size = state.geom.size;

	partition(&vnd_dtab, 0, P_PRIMARY, FALSE /*atapi*/);
}

/*
 * Open a device.
 */
static int
vnd_open(devminor_t minor, int access)
{
	/* No sub/partition devices are available before initialization. */
	if (state.fd == -1 && minor != 0)
		return ENXIO;
	else if (state.fd != -1 && vnd_part(minor) == NULL)
		return ENXIO;

	/*
	 * If the device either is not configured or configured as read-only,
	 * block open calls that request write permission.  This is what user-
	 * land expects, although it does mean that vnconfig(8) has to open the
	 * device as read-only in order to (un)configure it.
	 */
	if (access & BDEV_W_BIT) {
		if (state.fd == -1)
			return ENXIO;
		if (state.rdonly)
			return EACCES;
	}

	/*
	 * Userland expects that if the device is opened after having been
	 * fully closed, partition tables are (re)parsed.  Since we already
	 * parse partition tables upon initialization, we could skip this for
	 * the first open, but that would introduce more state.
	 */
	if (state.fd != -1 && state.openct == 0) {
		vnd_partition();

		/* Make sure our target device didn't just disappear. */
		if (vnd_part(minor) == NULL)
			return ENXIO;
	}

	state.openct++;

	return OK;
}

/*
 * Close a device.
 */
static int
vnd_close(devminor_t UNUSED(minor))
{
	if (state.openct == 0) {
		printf("VND%u: closing already-closed device\n", instance);
		return EINVAL;
	}

	state.openct--;

	/* If a shutdown is pending and the device is now fully closed, go. */
	if (state.exiting && state.openct == 0)
		blockdriver_terminate();

	return OK;
}

/*
 * Copy a number of bytes from or to the caller, to or from the intermediate
 * buffer.  If the given endpoint is SELF, a local memory copy must be made.
 */
static int
vnd_copy(iovec_s_t *iov, size_t iov_off, size_t bytes, endpoint_t endpt,
	int do_write)
{
	struct vscp_vec vvec[SCPVEC_NR], *vvp;
	size_t off, chunk;
	int count;
	char *ptr;

	assert(bytes > 0 && bytes <= VND_BUF_SIZE);

	vvp = vvec;
	count = 0;

	for (off = 0; off < bytes; off += chunk) {
		chunk = MIN(bytes - off, iov->iov_size - iov_off);

		if (endpt == SELF) {
			/* Local copy: the grant field holds a local address. */
			ptr = (char *) iov->iov_grant + iov_off;

			if (do_write)
				memcpy(&state.buf[off], ptr, chunk);
			else
				memcpy(ptr, &state.buf[off], chunk);
		} else {
			/* Remote copy: queue a vectored safecopy element. */
			assert(count < SCPVEC_NR); /* SCPVEC_NR >= NR_IOREQS */

			vvp->v_from = do_write ? endpt : SELF;
			vvp->v_to = do_write ? SELF : endpt;
			vvp->v_bytes = chunk;
			vvp->v_gid = iov->iov_grant;
			vvp->v_offset = iov_off;
			vvp->v_addr = (vir_bytes) &state.buf[off];

			vvp++;
			count++;
		}

		iov_off += chunk;

		if (iov_off == iov->iov_size) {
			iov++;
			iov_off = 0;
		}
	}

	if (endpt != SELF)
		return sys_vsafecopy(vvec, count);
	else
		return OK;
}

/*
 * Advance the given I/O vector, and the offset into its first element, by the
 * given number of bytes.
 */
static iovec_s_t *
vnd_advance(iovec_s_t *iov, size_t *iov_offp, size_t bytes)
{
	size_t iov_off;

	assert(bytes > 0 && bytes <= VND_BUF_SIZE);

	iov_off = *iov_offp;

	while (bytes > 0) {
		if (bytes >= iov->iov_size - iov_off) {
			bytes -= iov->iov_size - iov_off;

			iov++;
			iov_off = 0;
		} else {
			iov_off += bytes;
			bytes = 0;
		}
	}

	*iov_offp = iov_off;

	return iov;
}

/*
 * Perform data transfer on the selected device.
 */
static int
vnd_transfer(devminor_t minor, int do_write, u64_t position,
	endpoint_t endpt, iovec_t *iovt, unsigned int nr_req, int flags)
{
	struct device *dv;
	iovec_s_t *iov;
	size_t off, chunk, bytes, iov_off;
	ssize_t r;
	unsigned int i;

	iov = (iovec_s_t *) iovt;

	if (state.fd == -1 || (dv = vnd_part(minor)) == NULL)
		return ENXIO;

	/* Prevent write operations on devices set up as read-only. */
	if (do_write && state.rdonly)
		return EACCES;

	/* Determine the total number of bytes to transfer. */
	if (position >= dv->dv_size)
		return 0;

	bytes = 0;

	for (i = 0; i < nr_req; i++) {
		if (iov[i].iov_size == 0 || iov[i].iov_size > LONG_MAX)
			return EINVAL;

		bytes += iov[i].iov_size;

		if (bytes > LONG_MAX)
			return EINVAL;
	}

	if (bytes > dv->dv_size - position)
		bytes = dv->dv_size - position;

	position += dv->dv_base;

	/* Perform the actual transfer, in chunks if necessary. */
	iov_off = 0;

	for (off = 0; off < bytes; off += chunk) {
		chunk = MIN(bytes - off, VND_BUF_SIZE);

		assert((unsigned int) (iov - (iovec_s_t *) iovt) < nr_req);

		/* For reads, read in the data for the chunk; possibly less. */
		if (!do_write) {
			chunk = r = pread64(state.fd, state.buf, chunk,
			    position);

			if (r < 0) {
				/* Save errno before printf can clobber it. */
				r = -errno;

				printf("VND%u: pread failed (%d)\n", instance,
				    r);

				return r;
			}
			if (r == 0)
				break;
		}

		/* Copy the data for this chunk from or to the caller. */
		if ((r = vnd_copy(iov, iov_off, chunk, endpt, do_write)) < 0) {
			printf("VND%u: data copy failed (%d)\n", instance, r);

			return r;
		}

		/* For writes, write the data to the file; possibly less. */
		if (do_write) {
			chunk = r = pwrite64(state.fd, state.buf, chunk,
			    position);

			if (r <= 0) {
				if (r < 0)
					r = -errno;

				printf("VND%u: pwrite failed (%d)\n", instance,
				    r);

				return (r < 0) ? r : EIO;
			}
		}

		/* Move ahead on the I/O vector and the file position. */
		iov = vnd_advance(iov, &iov_off, chunk);

		position += chunk;
	}

	/* If force-write is requested, flush the underlying file to disk. */
	if (do_write && (flags & BDEV_FORCEWRITE))
		fsync(state.fd);

	/* Return the number of bytes transferred. */
	return off;
}

/*
 * Initialize the size and geometry for the device and any partitions.  If the
 * user provided a geometry, this will be used; otherwise, a geometry will be
 * computed.
 */
static int
vnd_layout(u64_t size, struct vnd_ioctl *vnd)
{
	u64_t sectors;

	state.geom.base = 0ULL;

	if (vnd->vnd_flags & VNDIOF_HASGEOM) {
		/*
		 * The geometry determines the accessible part of the file.
		 * The resulting size must not exceed the file size.
		 */
		state.geom.cylinders = vnd->vnd_geom.vng_ncylinders;
		state.geom.heads = vnd->vnd_geom.vng_ntracks;
		state.geom.sectors = vnd->vnd_geom.vng_nsectors;
		state.geom.size = (u64_t) state.geom.cylinders *
		    state.geom.heads * state.geom.sectors *
		    vnd->vnd_geom.vng_secsize;

		if (state.geom.size == 0 || state.geom.size > size)
			return EINVAL;
	} else {
		/*
		 * Compute a geometry: 64 heads and 32 sectors per track if
		 * the file is large enough, and a single track otherwise.
		 */
		sectors = size / SECTOR_SIZE;
		state.geom.size = sectors * SECTOR_SIZE;

		if (sectors >= 32 * 64) {
			state.geom.cylinders = sectors / (32 * 64);
			state.geom.heads = 64;
			state.geom.sectors = 32;
		} else {
			state.geom.cylinders = sectors;
			state.geom.heads = 1;
			state.geom.sectors = 1;
		}
	}

	/*
	 * Parse partition tables immediately, so that (sub)partitions can be
	 * opened right away.  The first open will perform the same procedure,
	 * but that is only necessary to match userland expectations.
	 */
	vnd_partition();

	return OK;
}

/*
 * Process I/O control requests.
 */
static int
vnd_ioctl(devminor_t UNUSED(minor), unsigned long request, endpoint_t endpt,
	cp_grant_id_t grant, endpoint_t user_endpt)
{
	struct vnd_ioctl vnd;
	struct vnd_user vnu;
	struct stat st;
	int r;

	switch (request) {
	case VNDIOCSET:
		/*
		 * The VND must not be busy.  Note that the caller has the
		 * device open to perform the IOCTL request.
		 */
		if (state.fd != -1 || state.openct != 1)
			return EBUSY;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &vnd,
		    sizeof(vnd))) != OK)
			return r;

		/*
		 * Issue a special VFS backcall that copies a file descriptor
		 * to the current process, from the user process ultimately
		 * making the IOCTL call.  The result is either a newly
		 * allocated file descriptor or an error.
		 */
		if ((state.fd = copyfd(user_endpt, vnd.vnd_fildes,
		    COPYFD_FROM)) == -1)
			return -errno;

		r = OK;

		/* The target file must be a regular file. */
		if (fstat(state.fd, &st) == -1) {
			printf("VND%u: fstat failed (%d)\n", instance, -errno);

			r = -errno;
		}

		if (r == OK && !S_ISREG(st.st_mode))
			r = EINVAL;

		/*
		 * Allocate memory for an intermediate I/O transfer buffer.
		 * In order to save on memory in the common case, the buffer
		 * is only allocated when the vnd is in use.  We use mmap
		 * instead of malloc to allow the memory to be actually freed
		 * later.
		 */
		if (r == OK) {
			state.buf = minix_mmap(NULL, VND_BUF_SIZE, PROT_READ |
			    PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
			if (state.buf == MAP_FAILED)
				r = ENOMEM;
		}

		if (r != OK) {
			close(state.fd);
			state.fd = -1;
			return r;
		}

		/* Set various device state fields. */
		state.dev = st.st_dev;
		state.ino = st.st_ino;
		state.rdonly = !!(vnd.vnd_flags & VNDIOF_READONLY);

		r = vnd_layout(st.st_size, &vnd);

		/* Upon success, return the device size to userland. */
		if (r == OK) {
			vnd.vnd_size = state.geom.size;

			r = sys_safecopyto(endpt, grant, 0, (vir_bytes) &vnd,
			    sizeof(vnd));
		}

		if (r != OK) {
			minix_munmap(state.buf, VND_BUF_SIZE);
			close(state.fd);
			state.fd = -1;
		}

		return r;

	case VNDIOCCLR:
		/* The VND can only be cleared if it has been configured. */
		if (state.fd == -1)
			return ENXIO;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes) &vnd,
		    sizeof(vnd))) != OK)
			return r;

		/* The caller has the device open to do the IOCTL request. */
		if (!(vnd.vnd_flags & VNDIOF_FORCE) && state.openct != 1)
			return EBUSY;

		/*
		 * Close the associated file descriptor immediately, but do
		 * not allow reuse until the device has been closed by the
		 * other users.
		 */
		minix_munmap(state.buf, VND_BUF_SIZE);
		close(state.fd);
		state.fd = -1;

		return OK;

	case VNDIOCGET:
		/*
		 * We need not copy in the given structure.  It would contain
		 * the requested unit number, but each driver instance
		 * provides only one unit anyway.
		 */
		memset(&vnu, 0, sizeof(vnu));

		vnu.vnu_unit = instance;

		/* Leave these fields zeroed if the device is not in use. */
		if (state.fd != -1) {
			vnu.vnu_dev = state.dev;
			vnu.vnu_ino = state.ino;
		}

		return sys_safecopyto(endpt, grant, 0, (vir_bytes) &vnu,
		    sizeof(vnu));

	case DIOCOPENCT:
		return sys_safecopyto(endpt, grant, 0,
		    (vir_bytes) &state.openct, sizeof(state.openct));

	case DIOCFLUSH:
		if (state.fd == -1)
			return ENXIO;

		fsync(state.fd);

		return OK;
	}

	return ENOTTY;
}

/*
 * Return a pointer to the partition structure for the given minor device.
 */
static struct device *
vnd_part(devminor_t minor)
{
	if (minor >= 0 && minor < DEV_PER_DRIVE)
		return &state.part[minor];
	else if ((unsigned int) (minor -= MINOR_d0p0s0) < SUB_PER_DRIVE)
		return &state.subpart[minor];
	else
		return NULL;
}

/*
 * Return geometry information.
 */
static void
vnd_geometry(devminor_t UNUSED(minor), struct part_geom *part)
{
	part->cylinders = state.geom.cylinders;
	part->heads = state.geom.heads;
	part->sectors = state.geom.sectors;
}

/*
 * Initialize the device.
 */
static int
vnd_init(int UNUSED(type), sef_init_info_t *UNUSED(info))
{
	long v;

	/*
	 * No support for crash recovery.  The driver would have no way to
	 * reacquire the file descriptor for the target file.
	 */

	/*
	 * The instance number is used for two purposes: reporting errors,
	 * and returning the proper unit number to userland in VNDIOCGET
	 * calls.
	 */
	v = 0;
	(void) env_parse("instance", "d", 0, &v, 0, 255);
	instance = (unsigned int) v;

	state.openct = 0;
	state.exiting = FALSE;
	state.fd = -1;

	return OK;
}

/*
 * Process an incoming signal.
 */
static void
vnd_signal(int signo)
{
	/* In case of a termination signal, initiate driver shutdown. */
	if (signo != SIGTERM)
		return;

	state.exiting = TRUE;

	/* Keep running until the device has been fully closed. */
	if (state.openct == 0)
		blockdriver_terminate();
}

/*
 * Set callbacks and initialize the System Event Framework (SEF).
 */
static void
vnd_startup(void)
{
	/* Register init and signal callbacks. */
	sef_setcb_init_fresh(vnd_init);
	sef_setcb_signal_handler(vnd_signal);

	/* Let SEF perform startup. */
	sef_startup();
}

/*
 * Driver task.
 */
int
main(int argc, char **argv)
{
	/* Initialize the driver. */
	env_setargs(argc, argv);
	vnd_startup();

	/* Process requests until shutdown. */
	blockdriver_task(&vnd_dtab);

	return 0;
}