minix/servers/vfs/read.c

/* This file contains the heart of the mechanism used to read (and write)
 * files. Read and write requests are split up into chunks that do not cross
 * block boundaries. Each chunk is then processed in turn. Reads on special
 * files are also detected and handled.
 *
 * The entry points into this file are
 *   do_read:      perform the READ system call by calling read_write
 *   do_getdents:  read entries from a directory (GETDENTS)
 *   read_write:   actually do the work of READ and WRITE
 */

#include "fs.h"
#include <minix/callnr.h>
#include <minix/com.h>
#include <minix/u64.h>
#include <minix/vfsif.h>
#include <assert.h>
#include <dirent.h>
#include <fcntl.h>
#include <unistd.h>
#include "file.h"
#include "param.h"
#include "scratchpad.h"
#include "vnode.h"
#include "vmnt.h"

/*===========================================================================*
 *				do_read					     *
 *===========================================================================*/
int do_read(message *UNUSED(m_out))
{
  return(do_read_write_peek(READING, job_m_in.fd,
	job_m_in.buffer, (size_t) job_m_in.nbytes));
}
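
/* Illustrative call path for a READ request (a sketch; the exact libc-side
 * message field names are an assumption based on the job_m_in fields used
 * above):
 *
 *	user process:  read(fd, buf, nbytes)
 *	libc stub:     builds a READ message carrying fd, buffer and nbytes
 *	VFS main loop: receives the message and hands it to a worker thread,
 *	               where it is accessible as job_m_in
 *	worker thread: do_read() -> do_read_write_peek() -> read_write()
 */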

/*===========================================================================*
 *				lock_bsf				     *
 *===========================================================================*/
void lock_bsf(void)
{
  struct worker_thread *org_self;

  if (mutex_trylock(&bsf_lock) == 0)
	return;

  /* The lock is held by someone else. Save this worker's global per-thread
   * state before blocking on the mutex, and restore it once the lock is ours.
   */
  org_self = worker_suspend();

  if (mutex_lock(&bsf_lock) != 0)
	panic("unable to lock block special file lock");

  worker_resume(org_self);
}

/*===========================================================================*
 *				unlock_bsf				     *
 *===========================================================================*/
void unlock_bsf(void)
{
  if (mutex_unlock(&bsf_lock) != 0)
	panic("failed to unlock block special file lock");
}

/*===========================================================================*
 *				check_bsf_lock				     *
 *===========================================================================*/
void check_bsf_lock(void)
{
  int r = mutex_trylock(&bsf_lock);

  if (r == -EBUSY)
	panic("bsf_lock locked");
  else if (r != 0)
	panic("bsf_lock weird state");

  /* r == 0: we just acquired the lock ourselves, so release it again. */
  unlock_bsf();
}
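
/* Note on scratch(): the scratch pad (scratchpad.h) provides per-process
 * storage for call parameters. The code below keeps the fd, buffer and byte
 * count there rather than only in locals; the working assumption is that this
 * lets the values survive suspension and later revival of the call.
 */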
/*===========================================================================*
 *			  actual_read_write_peek			     *
 *===========================================================================*/
int actual_read_write_peek(struct fproc *rfp, int rw_flag, int io_fd,
	char *io_buf, size_t io_nbytes)
{
/* Perform read(fd, buffer, nbytes) or write(fd, buffer, nbytes) call. */
  struct filp *f;
  tll_access_t locktype;
  int r;
  int ro = 1;

  if (rw_flag == WRITING) ro = 0;

  scratch(rfp).file.fd_nr = io_fd;
  scratch(rfp).io.io_buffer = io_buf;
  scratch(rfp).io.io_nbytes = io_nbytes;

  locktype = (rw_flag == WRITING) ? VNODE_WRITE : VNODE_READ;
  if ((f = get_filp2(rfp, scratch(rfp).file.fd_nr, locktype)) == NULL)
	return(err_code);

  assert(f->filp_count > 0);

  /* The filp must be open for reading (resp. writing). */
  if (((f->filp_mode) & (ro ? R_BIT : W_BIT)) == 0) {
	unlock_filp(f);
	return(f->filp_mode == FILP_CLOSED ? EIO : EBADF);
  }
  if (scratch(rfp).io.io_nbytes == 0) {
	unlock_filp(f);
	return(0);	/* so char special files need not check for 0 */
  }

  r = read_write(rfp, rw_flag, f, scratch(rfp).io.io_buffer,
	scratch(rfp).io.io_nbytes, who_e);

  unlock_filp(f);
  return(r);
}

/*===========================================================================*
 *			   do_read_write_peek				     *
 *===========================================================================*/
int do_read_write_peek(int rw_flag, int io_fd, char *io_buf, size_t io_nbytes)
{
  return actual_read_write_peek(fp, rw_flag, io_fd, io_buf, io_nbytes);
}

/*===========================================================================*
 *				read_write				     *
 *===========================================================================*/
int read_write(struct fproc *rfp, int rw_flag, struct filp *f,
	char *buf, size_t size, endpoint_t for_e)
{
  register struct vnode *vp;
  off_t position, res_pos;
  unsigned int cum_io, cum_io_incr, res_cum_io;
  int op, r;

  position = f->filp_pos;
  vp = f->filp_vno;
  r = OK;
  cum_io = 0;

  assert(rw_flag == READING || rw_flag == WRITING || rw_flag == PEEKING);

  if (size > SSIZE_MAX) return(EINVAL);

  op = (rw_flag == READING ? DEV_READ_S : DEV_WRITE_S);

  if (S_ISFIFO(vp->v_mode)) {		/* Pipes */
	if (rfp->fp_cum_io_partial != 0) {
		panic("VFS: read_write: fp_cum_io_partial not clear");
	}
	if (rw_flag == PEEKING) {
		printf("read_write: peek on pipe makes no sense\n");
		return(EINVAL);
	}
	r = rw_pipe(rw_flag, for_e, f, buf, size);
  } else if (S_ISCHR(vp->v_mode)) {	/* Character special files. */
	dev_t dev;
	int suspend_reopen;

	if (rw_flag == PEEKING) {
		printf("read_write: peek on char device makes no sense\n");
		return(EINVAL);
	}

	if (vp->v_sdev == NO_DEV)
		panic("VFS: read_write tries to access char dev NO_DEV");

	suspend_reopen = (f->filp_state & FS_NEEDS_REOPEN);
	dev = (dev_t) vp->v_sdev;

	r = dev_io(op, dev, for_e, buf, position, size, f->filp_flags,
		suspend_reopen);
	if (r >= 0) {
		/* This should no longer happen: all calls are asynchronous. */
		printf("VFS: I/O to device %x succeeded immediately!?\n", dev);
		cum_io = r;
		position += r;
		r = OK;
	} else if (r == SUSPEND) {
		/* FIXME: multiple read/write operations on a single filp
		 * should be serialized. They currently are not; in order to
		 * achieve a similar effect, we optimistically advance the
		 * file position here. This works under the following
		 * assumptions:
		 * - character drivers that use the seek position at all
		 *   expose a view of a statically sized range of bytes,
		 *   i.e., they are basically byte-granular block devices;
		 * - if short I/O or an error is returned, all subsequent
		 *   calls will return (respectively) EOF and an error;
		 * - the application never checks its own file seek position,
		 *   or does not care that it may end up having seeked beyond
		 *   the number of bytes it has actually read;
		 * - communication with the character driver is FIFO (this
		 *   one is actually true! whew).
		 * Many improvements are possible here, but in the end,
		 * anything short of queuing concurrent operations will be
		 * suboptimal - so we settle for this hack for now.
		 */
		position += size;
	}
  } else if (S_ISBLK(vp->v_mode)) {	/* Block special files. */
	if (vp->v_sdev == NO_DEV)
		panic("VFS: read_write tries to access block dev NO_DEV");

	lock_bsf();

	if (rw_flag == PEEKING) {
		r = req_bpeek(vp->v_bfs_e, vp->v_sdev, position, size);
	} else {
		r = req_breadwrite(vp->v_bfs_e, for_e, vp->v_sdev, position,
			size, (vir_bytes) buf, rw_flag, &res_pos, &res_cum_io);
		if (r == OK) {
			position = res_pos;
			cum_io += res_cum_io;
		}
	}

	unlock_bsf();
  } else {				/* Regular files */
	if (rw_flag == WRITING) {
		/* Check for O_APPEND flag. */
		if (f->filp_flags & O_APPEND) position = vp->v_size;
	}

	/* Issue request */
	if (rw_flag == PEEKING) {
		r = req_peek(vp->v_fs_e, vp->v_inode_nr, position, size);
	} else {
		off_t new_pos;
		r = req_readwrite(vp->v_fs_e, vp->v_inode_nr, position,
			rw_flag, for_e, (vir_bytes) buf, size, &new_pos,
			&cum_io_incr);
		if (r >= 0) {
			position = new_pos;
			cum_io += cum_io_incr;
		}
	}
  }

  /* On write, update the file size. */
  if (rw_flag == WRITING) {
	if (S_ISREG(vp->v_mode) || S_ISDIR(vp->v_mode)) {
		if (position > vp->v_size) {
			vp->v_size = position;
		}
	}
  }

  f->filp_pos = position;

  if (r == EPIPE && rw_flag == WRITING) {
	/* Process is writing, but there is no reader. Tell the kernel to
	 * generate a SIGPIPE signal.
	 */
	if (!(f->filp_flags & O_NOSIGPIPE)) {
		sys_kill(rfp->fp_endpoint, SIGPIPE);
	}
  }

  if (r == OK) {
	return(cum_io);
  }
  return(r);
}
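
/* Worked example (regular file; numbers are illustrative): reading 1000 bytes
 * at position 4090 of a 4096-byte file results in a single req_readwrite to
 * the file system, which replies with cum_io_incr = 6 and new_pos = 4096, so
 * read_write() returns the classic short read of 6 bytes. The splitting into
 * block-sized chunks described in the header comment is done by the file
 * system process; VFS forwards the request as a whole.
 */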

/*===========================================================================*
 *				do_getdents				     *
 *===========================================================================*/
int do_getdents(message *UNUSED(m_out))
{
/* Perform the getdents(fd, buf, size) system call. */
  int r = OK;
  off_t new_pos;
  register struct filp *rfilp;

  scratch(fp).file.fd_nr = job_m_in.fd;
  scratch(fp).io.io_buffer = job_m_in.buffer;
  scratch(fp).io.io_nbytes = (size_t) job_m_in.nbytes;

  /* Is the file descriptor valid? */
  if ((rfilp = get_filp(scratch(fp).file.fd_nr, VNODE_READ)) == NULL)
	return(err_code);

  if (!(rfilp->filp_mode & R_BIT))
	r = EBADF;
  else if (!S_ISDIR(rfilp->filp_vno->v_mode))
	r = EBADF;

  if (r == OK) {
	r = req_getdents(rfilp->filp_vno->v_fs_e, rfilp->filp_vno->v_inode_nr,
		rfilp->filp_pos, scratch(fp).io.io_buffer,
		scratch(fp).io.io_nbytes, &new_pos, 0);

	if (r > 0) rfilp->filp_pos = new_pos;
  }

  unlock_filp(rfilp);
  return(r);
}
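
/* Usage sketch (the exact record encoding is decided by the file system):
 * userland calls getdents(fd, buf, size) in a loop. Each call returns the
 * number of bytes of directory entries placed in buf and advances filp_pos;
 * by the usual getdents convention, a return value of 0 means the end of the
 * directory has been reached.
 */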

/*===========================================================================*
 *				rw_pipe					     *
 *===========================================================================*/
int rw_pipe(int rw_flag /* READING or WRITING */, endpoint_t usr_e,
	struct filp *f, char *buf, size_t req_size)
{
  int r, oflags, partial_pipe = 0;
  size_t size, cum_io, cum_io_incr;
  struct vnode *vp;
  off_t position, new_pos;

  /* Must make sure we're operating on a locked filp and vnode */
  assert(tll_locked_by_me(&f->filp_vno->v_lock));
  assert(mutex_trylock(&f->filp_lock) == -EDEADLK);

  oflags = f->filp_flags;
  vp = f->filp_vno;
  position = 0;	/* Not actually used */

  assert(rw_flag == READING || rw_flag == WRITING);

  /* fp->fp_cum_io_partial is only nonzero when doing partial writes */
  cum_io = fp->fp_cum_io_partial;

  r = pipe_check(f, rw_flag, oflags, req_size, 0);
  if (r <= 0) {
	if (r == SUSPEND) pipe_suspend(f, buf, req_size);
	return(r);
  }

  size = r;
  if (size < req_size) partial_pipe = 1;

  /* Truncate a read request at the number of bytes in the pipe. */
  if (rw_flag == READING && size > vp->v_size) {
	size = vp->v_size;
  }

  if (vp->v_mapfs_e == 0)
	panic("unmapped pipe");

  r = req_readwrite(vp->v_mapfs_e, vp->v_mapinode_nr, position, rw_flag, usr_e,
	(vir_bytes) buf, size, &new_pos, &cum_io_incr);

  if (r != OK) {
	return(r);
  }

  cum_io += cum_io_incr;
  buf += cum_io_incr;
  req_size -= cum_io_incr;

  vp->v_size = new_pos;

  if (partial_pipe) {
	/* On a partial write with O_NONBLOCK set, just return the byte count
	 * written so far. Otherwise (a non-atomic write with req_size larger
	 * than the pipe can currently hold), record the progress made and
	 * suspend until the remainder can be written.
	 */
	if (!(oflags & O_NONBLOCK)) {
		fp->fp_cum_io_partial = cum_io;
		pipe_suspend(f, buf, req_size);
		return(SUSPEND);
	}
  }

  fp->fp_cum_io_partial = 0;

  return(cum_io);
}
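
/* Example of the partial-write accounting above (numbers are illustrative):
 * a blocking write of 10000 bytes to a pipe with room for 7168 gets
 * size = 7168 from pipe_check(), writes those bytes, records
 * fp_cum_io_partial = 7168 and suspends. When the call is revived, the
 * remaining 2832 bytes are written, and the total of 10000 is returned to
 * the caller in one go.
 */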