/* minix/servers/vfs/open.c */
/* This file contains the procedures for creating, opening, closing, and
* seeking on files.
*
* The entry points into this file are
* do_creat: perform the CREAT system call
* do_open: perform the OPEN system call
* do_mknod: perform the MKNOD system call
* do_mkdir: perform the MKDIR system call
* do_close: perform the CLOSE system call
 * do_lseek: perform the LSEEK system call
 * do_llseek: perform the LLSEEK system call
*/
#include "fs.h"
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <minix/callnr.h>
#include <minix/com.h>
#include <minix/u64.h>
#include "file.h"
#include "fproc.h"
#include "scratchpad.h"
#include "dmap.h"
#include "lock.h"
#include "param.h"
#include <dirent.h>
#include <assert.h>
#include <minix/vfsif.h>
#include "vnode.h"
#include "vmnt.h"
#include "path.h"
char mode_map[] = {R_BIT, W_BIT, R_BIT|W_BIT, 0};
static struct vnode *new_node(struct lookup *resolve, int oflags,
	mode_t bits);
static int pipe_open(struct vnode *vp, mode_t bits, int oflags);

/*===========================================================================*
 *				do_creat				     *
 *===========================================================================*/
int do_creat()
{
/* Perform the creat(name, mode) system call.
* syscall might provide 'name' embedded in the message.
*/
2012-02-13 16:28:04 +01:00
char fullpath[PATH_MAX];
vir_bytes vname;
size_t vname_length;
mode_t open_mode;
vname = (vir_bytes) job_m_in.name;
vname_length = (size_t) job_m_in.name_length;
open_mode = (mode_t) job_m_in.mode;
if (copy_name(vname_length, fullpath) != OK) {
/* Direct copy failed, try fetching from user space */
if (fetch_name(vname, vname_length, fullpath) != OK)
return(err_code);
}
return common_open(fullpath, O_WRONLY | O_CREAT | O_TRUNC, open_mode);
}
/*===========================================================================*
 *				do_open					     *
 *===========================================================================*/
int do_open()
{
/* Perform the open(name, flags,...) system call.
* syscall might provide 'name' embedded in message when not creating file */
int create_mode; /* is really mode_t but this gives problems */
int open_mode = 0; /* is really mode_t but this gives problems */
int r = OK;
2012-02-13 16:28:04 +01:00
char fullpath[PATH_MAX];
vir_bytes vname;
size_t vname_length;
open_mode = (mode_t) job_m_in.mode;
create_mode = job_m_in.c_mode;
/* If O_CREAT is set, open has three parameters, otherwise two. */
if (open_mode & O_CREAT) {
vname = (vir_bytes) job_m_in.name1;
vname_length = (size_t) job_m_in.name1_length;
r = fetch_name(vname, vname_length, fullpath);
} else {
vname = (vir_bytes) job_m_in.name;
vname_length = (size_t) job_m_in.name_length;
create_mode = 0;
if (copy_name(vname_length, fullpath) != OK) {
/* Direct copy failed, try fetching from user space */
if (fetch_name(vname, vname_length, fullpath) != OK)
r = err_code;
}
}
if (r != OK) return(err_code); /* name was bad */
return common_open(fullpath, open_mode, create_mode);
}
/*===========================================================================*
 *				common_open				     *
 *===========================================================================*/
int common_open(char path[PATH_MAX], int oflags, mode_t omode)
{
/* Common code from do_creat and do_open. */
2012-02-13 16:28:04 +01:00
int b, r, exist = TRUE, major_dev;
dev_t dev;
mode_t bits;
2012-02-13 16:28:04 +01:00
struct filp *filp, *filp2;
2007-08-07 14:52:47 +02:00
struct vnode *vp;
struct vmnt *vmp;
2007-08-07 14:52:47 +02:00
struct dmap *dp;
2012-02-13 16:28:04 +01:00
struct lookup resolve;
/* Remap the bottom two bits of oflags. */
bits = (mode_t) mode_map[oflags & O_ACCMODE];
if (!bits) return(EINVAL);
/* See if file descriptor and filp slots are available. */
2012-02-13 16:28:04 +01:00
if ((r = get_fd(0, bits, &(scratch(fp).file.fd_nr), &filp)) != OK) return(r);
lookup_init(&resolve, path, PATH_NOFLAGS, &vmp, &vp);
/* If O_CREATE is set, try to make the file. */
if (oflags & O_CREAT) {
2012-04-25 14:44:42 +02:00
omode = I_REGULAR | (omode & ALLPERMS & fp->fp_umask);
2012-02-13 16:28:04 +01:00
vp = new_node(&resolve, oflags, omode);
r = err_code;
2012-02-13 16:28:04 +01:00
if (r == OK) exist = FALSE; /* We just created the file */
else if (r != EEXIST) { /* other error */
if (vp) unlock_vnode(vp);
unlock_filp(filp);
return(r);
}
else exist = !(oflags & O_EXCL);/* file exists, if the O_EXCL
flag is set this is an error */
} else {
/* Scan path name */
2012-02-13 16:28:04 +01:00
resolve.l_vmnt_lock = VMNT_READ;
resolve.l_vnode_lock = VNODE_OPCL;
if ((vp = eat_path(&resolve, fp)) == NULL) {
unlock_filp(filp);
return(err_code);
}
if (vmp != NULL) unlock_vmnt(vmp);
2007-01-05 17:36:55 +01:00
}
/* Claim the file descriptor and filp slot and fill them in. */
2012-02-13 16:28:04 +01:00
fp->fp_filp[scratch(fp).file.fd_nr] = filp;
FD_SET(scratch(fp).file.fd_nr, &fp->fp_filp_inuse);
filp->filp_count = 1;
filp->filp_vno = vp;
filp->filp_flags = oflags;
/* Only do the normal open code if we didn't just create the file. */
2012-02-13 16:28:04 +01:00
if (exist) {
/* Check protections. */
if ((r = forbidden(fp, vp, bits)) == OK) {
/* Opening reg. files, directories, and special files differ */
2012-04-25 14:44:42 +02:00
switch (vp->v_mode & S_IFMT) {
case S_IFREG:
2012-02-13 16:28:04 +01:00
/* Truncate regular file if O_TRUNC. */
if (oflags & O_TRUNC) {
if ((r = forbidden(fp, vp, W_BIT)) != OK)
break;
VFS: fix locking bugs .sync and fsync used unnecessarily restrictive locking type .fsync violated locking order by obtaining a vmnt lock after a filp lock .fsync contained a TOCTOU bug .new_node violated locking rules (didn't upgrade lock upon file creation) .do_pipe used unnecessarily restrictive locking type .always lock pipes exclusively; even a read operation might require to do a write on a vnode object (update pipe size) .when opening a file with O_TRUNC, upgrade vnode lock when truncating .utime used unnecessarily restrictive locking type .path parsing: .always acquire VMNT_WRITE or VMNT_EXCL on vmnt and downgrade to VMNT_READ if that was what was actually requested. This prevents the following deadlock scenario: thread A: lock_vmnt(vmp, TLL_READSER); lock_vnode(vp, TLL_READSER); upgrade_vmnt_lock(vmp, TLL_WRITE); thread B: lock_vmnt(vmp, TLL_READ); lock_vnode(vp, TLL_READSER); thread A will be stuck in upgrade_vmnt_lock and thread B is stuck in lock_vnode. This happens when, for example, thread A tries create a new node (open.c:new_node) and thread B tries to do eat_path to change dir (stadir.c:do_chdir). When the path is being resolved, a vnode is always locked with VNODE_OPCL (TLL_READSER) and then downgraded to VNODE_READ if read-only is actually requested. Thread A locks the vmnt with VMNT_WRITE (TLL_READSER) which still allows VMNT_READ locks. Thread B can't acquire a lock on the vnode because thread A has it; Thread A can't upgrade its vmnt lock to VMNT_WRITE (TLL_WRITE) because thread B has a VMNT_READ lock on it. By serializing vmnt locks during path parsing, thread B can only acquire a lock on vmp when thread A has completely finished its operation.
2012-11-30 13:49:53 +01:00
upgrade_vnode_lock(vp);
2012-02-13 16:28:04 +01:00
truncate_vnode(vp, 0);
}
break;
2012-04-25 14:44:42 +02:00
case S_IFDIR:
2012-02-13 16:28:04 +01:00
/* Directories may be read but not written. */
r = (bits & W_BIT ? EISDIR : OK);
break;
2012-04-25 14:44:42 +02:00
case S_IFCHR:
/* Invoke the driver for special processing. */
dev = (dev_t) vp->v_sdev;
/* TTY needs to know about the O_NOCTTY flag. */
r = dev_open(dev, who_e, bits | (oflags & O_NOCTTY));
2012-02-13 16:28:04 +01:00
if (r == SUSPEND) suspend(FP_BLOCKED_ON_DOPEN);
else vp = filp->filp_vno; /* Might be updated by
* dev_open/clone_opcl */
break;
2012-04-25 14:44:42 +02:00
case S_IFBLK:
2012-02-13 16:28:04 +01:00
lock_bsf();
/* Invoke the driver for special processing. */
dev = (dev_t) vp->v_sdev;
Split block/character protocols and libdriver This patch separates the character and block driver communication protocols. The old character protocol remains the same, but a new block protocol is introduced. The libdriver library is replaced by two new libraries: libchardriver and libblockdriver. Their exposed API, and drivers that use them, have been updated accordingly. Together, libbdev and libblockdriver now completely abstract away the message format used by the block protocol. As the memory driver is both a character and a block device driver, it now implements its own message loop. The most important semantic change made to the block protocol is that it is no longer possible to return both partial results and an error for a single transfer. This simplifies the interaction between the caller and the driver, as the I/O vector no longer needs to be copied back. Also, drivers are now no longer supposed to decide based on the layout of the I/O vector when a transfer should be cut short. Put simply, transfers are now supposed to either succeed completely, or result in an error. After this patch, the state of the various pieces is as follows: - block protocol: stable - libbdev API: stable for synchronous communication - libblockdriver API: needs slight revision (the drvlib/partition API in particular; the threading API will also change shortly) - character protocol: needs cleanup - libchardriver API: needs cleanup accordingly - driver restarts: largely unsupported until endpoint changes are reintroduced As a side effect, this patch eliminates several bugs, hacks, and gcc -Wall and -W warnings all over the place. It probably introduces a few new ones, too. Update warning: this patch changes the protocol between MFS and disk drivers, so in order to use old/new images, the MFS from the ramdisk must be used to mount all file systems.
2011-11-22 13:27:53 +01:00
r = bdev_open(dev, bits);
2012-02-13 16:28:04 +01:00
if (r != OK) {
unlock_bsf();
break;
}
major_dev = major(vp->v_sdev);
dp = &dmap[major_dev];
if (dp->dmap_driver == NONE) {
printf("VFS: block driver disappeared!\n");
unlock_bsf();
r = ENXIO;
break;
}
/* Check whether the device is mounted or not. If so,
* then that FS is responsible for this device.
* Otherwise we default to ROOT_FS.
*/
vp->v_bfs_e = ROOT_FS_E; /* By default */
2012-02-13 16:28:04 +01:00
for (vmp = &vmnt[0]; vmp < &vmnt[NR_MNTS]; ++vmp)
if (vmp->m_dev == vp->v_sdev &&
!(vmp->m_flags & VMNT_FORCEROOTBSF)) {
vp->v_bfs_e = vmp->m_fs_e;
2012-02-13 16:28:04 +01:00
}
2012-02-13 16:28:04 +01:00
/* Send the driver label to the file system that will
* handle the block I/O requests (even when its label
* and endpoint are known already), but only when it is
* the root file system. Other file systems will
* already have it anyway.
Split block/character protocols and libdriver This patch separates the character and block driver communication protocols. The old character protocol remains the same, but a new block protocol is introduced. The libdriver library is replaced by two new libraries: libchardriver and libblockdriver. Their exposed API, and drivers that use them, have been updated accordingly. Together, libbdev and libblockdriver now completely abstract away the message format used by the block protocol. As the memory driver is both a character and a block device driver, it now implements its own message loop. The most important semantic change made to the block protocol is that it is no longer possible to return both partial results and an error for a single transfer. This simplifies the interaction between the caller and the driver, as the I/O vector no longer needs to be copied back. Also, drivers are now no longer supposed to decide based on the layout of the I/O vector when a transfer should be cut short. Put simply, transfers are now supposed to either succeed completely, or result in an error. After this patch, the state of the various pieces is as follows: - block protocol: stable - libbdev API: stable for synchronous communication - libblockdriver API: needs slight revision (the drvlib/partition API in particular; the threading API will also change shortly) - character protocol: needs cleanup - libchardriver API: needs cleanup accordingly - driver restarts: largely unsupported until endpoint changes are reintroduced As a side effect, this patch eliminates several bugs, hacks, and gcc -Wall and -W warnings all over the place. It probably introduces a few new ones, too. Update warning: this patch changes the protocol between MFS and disk drivers, so in order to use old/new images, the MFS from the ramdisk must be used to mount all file systems.
2011-11-22 13:27:53 +01:00
*/
2012-02-13 16:28:04 +01:00
if (vp->v_bfs_e != ROOT_FS_E) {
unlock_bsf();
break;
}
if (req_newdriver(vp->v_bfs_e, vp->v_sdev,
dp->dmap_label) != OK) {
printf("VFS: error sending driver label\n");
Split block/character protocols and libdriver This patch separates the character and block driver communication protocols. The old character protocol remains the same, but a new block protocol is introduced. The libdriver library is replaced by two new libraries: libchardriver and libblockdriver. Their exposed API, and drivers that use them, have been updated accordingly. Together, libbdev and libblockdriver now completely abstract away the message format used by the block protocol. As the memory driver is both a character and a block device driver, it now implements its own message loop. The most important semantic change made to the block protocol is that it is no longer possible to return both partial results and an error for a single transfer. This simplifies the interaction between the caller and the driver, as the I/O vector no longer needs to be copied back. Also, drivers are now no longer supposed to decide based on the layout of the I/O vector when a transfer should be cut short. Put simply, transfers are now supposed to either succeed completely, or result in an error. After this patch, the state of the various pieces is as follows: - block protocol: stable - libbdev API: stable for synchronous communication - libblockdriver API: needs slight revision (the drvlib/partition API in particular; the threading API will also change shortly) - character protocol: needs cleanup - libchardriver API: needs cleanup accordingly - driver restarts: largely unsupported until endpoint changes are reintroduced As a side effect, this patch eliminates several bugs, hacks, and gcc -Wall and -W warnings all over the place. It probably introduces a few new ones, too. Update warning: this patch changes the protocol between MFS and disk drivers, so in order to use old/new images, the MFS from the ramdisk must be used to mount all file systems.
2011-11-22 13:27:53 +01:00
bdev_close(dev);
r = ENXIO;
}
2012-02-13 16:28:04 +01:00
unlock_bsf();
break;
2012-04-25 14:44:42 +02:00
case S_IFIFO:
/* Create a mapped inode on PFS which handles reads
and writes to this named pipe. */
VFS: fix locking bugs .sync and fsync used unnecessarily restrictive locking type .fsync violated locking order by obtaining a vmnt lock after a filp lock .fsync contained a TOCTOU bug .new_node violated locking rules (didn't upgrade lock upon file creation) .do_pipe used unnecessarily restrictive locking type .always lock pipes exclusively; even a read operation might require to do a write on a vnode object (update pipe size) .when opening a file with O_TRUNC, upgrade vnode lock when truncating .utime used unnecessarily restrictive locking type .path parsing: .always acquire VMNT_WRITE or VMNT_EXCL on vmnt and downgrade to VMNT_READ if that was what was actually requested. This prevents the following deadlock scenario: thread A: lock_vmnt(vmp, TLL_READSER); lock_vnode(vp, TLL_READSER); upgrade_vmnt_lock(vmp, TLL_WRITE); thread B: lock_vmnt(vmp, TLL_READ); lock_vnode(vp, TLL_READSER); thread A will be stuck in upgrade_vmnt_lock and thread B is stuck in lock_vnode. This happens when, for example, thread A tries create a new node (open.c:new_node) and thread B tries to do eat_path to change dir (stadir.c:do_chdir). When the path is being resolved, a vnode is always locked with VNODE_OPCL (TLL_READSER) and then downgraded to VNODE_READ if read-only is actually requested. Thread A locks the vmnt with VMNT_WRITE (TLL_READSER) which still allows VMNT_READ locks. Thread B can't acquire a lock on the vnode because thread A has it; Thread A can't upgrade its vmnt lock to VMNT_WRITE (TLL_WRITE) because thread B has a VMNT_READ lock on it. By serializing vmnt locks during path parsing, thread B can only acquire a lock on vmp when thread A has completely finished its operation.
2012-11-30 13:49:53 +01:00
upgrade_vnode_lock(vp);
2012-02-13 16:28:04 +01:00
r = map_vnode(vp, PFS_PROC_NR);
if (r == OK) {
if (vp->v_ref_count == 1) {
if (vp->v_size != 0)
r = truncate_vnode(vp, 0);
}
oflags |= O_APPEND; /* force append mode */
2012-02-13 16:28:04 +01:00
filp->filp_flags = oflags;
}
if (r == OK) {
r = pipe_open(vp, bits, oflags);
}
if (r != ENXIO) {
/* See if someone else is doing a rd or wt on
* the FIFO. If so, use its filp entry so the
* file position will be automatically shared.
*/
b = (bits & R_BIT ? R_BIT : W_BIT);
2012-02-13 16:28:04 +01:00
filp->filp_count = 0; /* don't find self */
if ((filp2 = find_filp(vp, b)) != NULL) {
/* Co-reader or writer found. Use it.*/
fp->fp_filp[scratch(fp).file.fd_nr] = filp2;
filp2->filp_count++;
filp2->filp_vno = vp;
filp2->filp_flags = oflags;
/* v_count was incremented after the vnode
* has been found. i_count was incremented
* incorrectly in FS, not knowing that we
* were going to use an existing filp
* entry. Correct this error.
*/
unlock_vnode(vp);
put_vnode(vp);
} else {
/* Nobody else found. Restore filp. */
filp->filp_count = 1;
}
2012-02-13 16:28:04 +01:00
}
break;
}
}
}
2012-02-13 16:28:04 +01:00
unlock_filp(filp);
/* If error, release inode. */
if (r != OK) {
2012-02-13 16:28:04 +01:00
if (r != SUSPEND) {
fp->fp_filp[scratch(fp).file.fd_nr] = NULL;
FD_CLR(scratch(fp).file.fd_nr, &fp->fp_filp_inuse);
filp->filp_count = 0;
filp->filp_vno = NULL;
VFS: make all IPC asynchronous By decoupling synchronous drivers from VFS, we are a big step closer to supporting driver crashes under all circumstances. That is, VFS can't become stuck on IPC with a synchronous driver (e.g., INET) and can recover from crashing block drivers during open/close/ioctl or during communication with an FS. In order to maintain serialized communication with a synchronous driver, the communication is wrapped by a mutex on a per driver basis (not major numbers as there can be multiple majors with identical endpoints). Majors that share a driver endpoint point to a single mutex object. In order to support crashes from block drivers, the file reopen tactic had to be changed; first reopen files associated with the crashed driver, then send the new driver endpoint to FSes. This solves a deadlock between the FS and the block driver; - VFS would send REQ_NEW_DRIVER to an FS, but he FS only receives it after retrying the current request to the newly started driver. - The block driver would refuse the retried request until all files had been reopened. - VFS would reopen files only after getting a reply from the initial REQ_NEW_DRIVER. When a character special driver crashes, all associated files have to be marked invalid and closed (or reopened if flagged as such). However, they can only be closed if a thread holds exclusive access to it. To obtain exclusive access, the worker thread (which handles the new driver endpoint event from DS) schedules a new job to garbage collect invalid files. This way, we can signal the worker thread that was talking to the crashed driver and will release exclusive access to a file associated with the crashed driver and prevent the garbage collecting worker thread from dead locking on that file. Also, when a character special driver crashes, RS will unmap the driver and remap it upon restart. 
During unmapping, associated files are marked invalid instead of waiting for an endpoint up event from DS, as that event might come later than new read/write/select requests and thus cause confusion in the freshly started driver. When locking a filp, the usage counters are no longer checked. The usage counter can legally go down to zero during filp invalidation while there are locks pending. DS events are handled by a separate worker thread instead of the main thread as reopening files could lead to another crash and a stuck thread. An additional worker thread is then necessary to unlock it. Finally, with everything asynchronous a race condition in do_select surfaced. A select entry was only marked in use after succesfully sending initial select requests to drivers and having to wait. When multiple select() calls were handled there was opportunity that these entries were overwritten. This had as effect that some select results were ignored (and select() remained blocking instead if returning) or do_select tried to access filps that were not present (because thrown away by secondary select()). This bug manifested itself with sendrecs, but was very hard to reproduce. However, it became awfully easy to trigger with asynsends only.
2012-08-28 16:06:51 +02:00
filp->filp_state &= ~FS_INVALIDATED; /* Prevent garbage col. */
2012-02-13 16:28:04 +01:00
put_vnode(vp);
}
} else {
r = scratch(fp).file.fd_nr;
}
2012-02-13 16:28:04 +01:00
return(r);
}
/*===========================================================================*
 *				new_node				     *
 *===========================================================================*/
static struct vnode *new_node(struct lookup *resolve, int oflags, mode_t bits)
2007-08-07 14:52:47 +02:00
{
/* Try to create a new inode and return a pointer to it. If the inode already
exists, return a pointer to it as well, but set err_code accordingly.
NULL is returned if the path cannot be resolved up to the last
directory, or when the inode cannot be created due to permissions or
2012-02-13 16:28:04 +01:00
otherwise. */
struct vnode *dirp, *vp;
2012-02-13 16:28:04 +01:00
struct vmnt *dir_vmp, *vp_vmp;
int r;
struct node_details res;
2012-02-13 16:28:04 +01:00
struct lookup findnode;
char *path;
path = resolve->l_path; /* For easy access */
lookup_init(&findnode, path, resolve->l_flags, &dir_vmp, &dirp);
findnode.l_vmnt_lock = VMNT_WRITE;
findnode.l_vnode_lock = VNODE_WRITE; /* dir node */
/* When O_CREAT and O_EXCL flags are set, the path may not be named by a
* symbolic link. */
2012-02-13 16:28:04 +01:00
if (oflags & O_EXCL) findnode.l_flags |= PATH_RET_SYMLINK;
/* See if the path can be opened down to the last directory. */
2012-02-13 16:28:04 +01:00
if ((dirp = last_dir(&findnode, fp)) == NULL) return(NULL);
/* The final directory is accessible. Get final component of the path. */
2012-02-13 16:28:04 +01:00
lookup_init(&findnode, findnode.l_path, findnode.l_flags, &vp_vmp, &vp);
findnode.l_vmnt_lock = VMNT_WRITE;
findnode.l_vnode_lock = (oflags & O_TRUNC) ? VNODE_WRITE : VNODE_OPCL;
vp = advance(dirp, &findnode, fp);
assert(vp_vmp == NULL); /* Lookup to last dir should have yielded lock
* on vmp or final component does not exist.
* Either way, vp_vmp ought to be not set.
*/
/* The combination of a symlink with absolute path followed by a danglink
* symlink results in a new path that needs to be re-resolved entirely. */
2012-02-13 16:28:04 +01:00
if (path[0] == '/') {
unlock_vnode(dirp);
unlock_vmnt(dir_vmp);
put_vnode(dirp);
if (vp != NULL) {
unlock_vnode(vp);
put_vnode(vp);
}
return new_node(resolve, oflags, bits);
}
if (vp == NULL && err_code == ENOENT) {
/* Last path component does not exist. Make a new directory entry. */
if ((vp = get_free_vnode()) == NULL) {
2012-02-13 16:28:04 +01:00
/* Can't create new entry: out of vnodes. */
unlock_vnode(dirp);
unlock_vmnt(dir_vmp);
put_vnode(dirp);
return(NULL);
2007-08-07 14:52:47 +02:00
}
2012-02-13 16:28:04 +01:00
lock_vnode(vp, VNODE_OPCL);
VFS: fix locking bugs .sync and fsync used unnecessarily restrictive locking type .fsync violated locking order by obtaining a vmnt lock after a filp lock .fsync contained a TOCTOU bug .new_node violated locking rules (didn't upgrade lock upon file creation) .do_pipe used unnecessarily restrictive locking type .always lock pipes exclusively; even a read operation might require to do a write on a vnode object (update pipe size) .when opening a file with O_TRUNC, upgrade vnode lock when truncating .utime used unnecessarily restrictive locking type .path parsing: .always acquire VMNT_WRITE or VMNT_EXCL on vmnt and downgrade to VMNT_READ if that was what was actually requested. This prevents the following deadlock scenario: thread A: lock_vmnt(vmp, TLL_READSER); lock_vnode(vp, TLL_READSER); upgrade_vmnt_lock(vmp, TLL_WRITE); thread B: lock_vmnt(vmp, TLL_READ); lock_vnode(vp, TLL_READSER); thread A will be stuck in upgrade_vmnt_lock and thread B is stuck in lock_vnode. This happens when, for example, thread A tries create a new node (open.c:new_node) and thread B tries to do eat_path to change dir (stadir.c:do_chdir). When the path is being resolved, a vnode is always locked with VNODE_OPCL (TLL_READSER) and then downgraded to VNODE_READ if read-only is actually requested. Thread A locks the vmnt with VMNT_WRITE (TLL_READSER) which still allows VMNT_READ locks. Thread B can't acquire a lock on the vnode because thread A has it; Thread A can't upgrade its vmnt lock to VMNT_WRITE (TLL_WRITE) because thread B has a VMNT_READ lock on it. By serializing vmnt locks during path parsing, thread B can only acquire a lock on vmp when thread A has completely finished its operation.
2012-11-30 13:49:53 +01:00
upgrade_vmnt_lock(dir_vmp); /* Creating file, need exclusive access */
2012-02-13 16:28:04 +01:00
if ((r = forbidden(fp, dirp, W_BIT|X_BIT)) != OK ||
(r = req_create(dirp->v_fs_e, dirp->v_inode_nr,bits, fp->fp_effuid,
2012-02-13 16:28:04 +01:00
fp->fp_effgid, path, &res)) != OK ) {
/* Can't create inode either due to permissions or some other
* problem. In case r is EEXIST, we might be dealing with a
* dangling symlink.*/
VFS: fix locking bugs .sync and fsync used unnecessarily restrictive locking type .fsync violated locking order by obtaining a vmnt lock after a filp lock .fsync contained a TOCTOU bug .new_node violated locking rules (didn't upgrade lock upon file creation) .do_pipe used unnecessarily restrictive locking type .always lock pipes exclusively; even a read operation might require to do a write on a vnode object (update pipe size) .when opening a file with O_TRUNC, upgrade vnode lock when truncating .utime used unnecessarily restrictive locking type .path parsing: .always acquire VMNT_WRITE or VMNT_EXCL on vmnt and downgrade to VMNT_READ if that was what was actually requested. This prevents the following deadlock scenario: thread A: lock_vmnt(vmp, TLL_READSER); lock_vnode(vp, TLL_READSER); upgrade_vmnt_lock(vmp, TLL_WRITE); thread B: lock_vmnt(vmp, TLL_READ); lock_vnode(vp, TLL_READSER); thread A will be stuck in upgrade_vmnt_lock and thread B is stuck in lock_vnode. This happens when, for example, thread A tries create a new node (open.c:new_node) and thread B tries to do eat_path to change dir (stadir.c:do_chdir). When the path is being resolved, a vnode is always locked with VNODE_OPCL (TLL_READSER) and then downgraded to VNODE_READ if read-only is actually requested. Thread A locks the vmnt with VMNT_WRITE (TLL_READSER) which still allows VMNT_READ locks. Thread B can't acquire a lock on the vnode because thread A has it; Thread A can't upgrade its vmnt lock to VMNT_WRITE (TLL_WRITE) because thread B has a VMNT_READ lock on it. By serializing vmnt locks during path parsing, thread B can only acquire a lock on vmp when thread A has completely finished its operation.
2012-11-30 13:49:53 +01:00
/* Downgrade lock to prevent deadlock during symlink resolving*/
downgrade_vmnt_lock(dir_vmp);
if (r == EEXIST) {
struct vnode *slp, *old_wd;
VFS: fix locking bugs .sync and fsync used unnecessarily restrictive locking type .fsync violated locking order by obtaining a vmnt lock after a filp lock .fsync contained a TOCTOU bug .new_node violated locking rules (didn't upgrade lock upon file creation) .do_pipe used unnecessarily restrictive locking type .always lock pipes exclusively; even a read operation might require to do a write on a vnode object (update pipe size) .when opening a file with O_TRUNC, upgrade vnode lock when truncating .utime used unnecessarily restrictive locking type .path parsing: .always acquire VMNT_WRITE or VMNT_EXCL on vmnt and downgrade to VMNT_READ if that was what was actually requested. This prevents the following deadlock scenario: thread A: lock_vmnt(vmp, TLL_READSER); lock_vnode(vp, TLL_READSER); upgrade_vmnt_lock(vmp, TLL_WRITE); thread B: lock_vmnt(vmp, TLL_READ); lock_vnode(vp, TLL_READSER); thread A will be stuck in upgrade_vmnt_lock and thread B is stuck in lock_vnode. This happens when, for example, thread A tries create a new node (open.c:new_node) and thread B tries to do eat_path to change dir (stadir.c:do_chdir). When the path is being resolved, a vnode is always locked with VNODE_OPCL (TLL_READSER) and then downgraded to VNODE_READ if read-only is actually requested. Thread A locks the vmnt with VMNT_WRITE (TLL_READSER) which still allows VMNT_READ locks. Thread B can't acquire a lock on the vnode because thread A has it; Thread A can't upgrade its vmnt lock to VMNT_WRITE (TLL_WRITE) because thread B has a VMNT_READ lock on it. By serializing vmnt locks during path parsing, thread B can only acquire a lock on vmp when thread A has completely finished its operation.
2012-11-30 13:49:53 +01:00
/* Resolve path up to symlink */
2012-02-13 16:28:04 +01:00
findnode.l_flags = PATH_RET_SYMLINK;
findnode.l_vnode_lock = VNODE_READ;
findnode.l_vnode = &slp;
slp = advance(dirp, &findnode, fp);
if (slp != NULL) {
if (S_ISLNK(slp->v_mode)) {
/* Get contents of link */
r = req_rdlink(slp->v_fs_e,
slp->v_inode_nr,
2012-02-13 16:28:04 +01:00
VFS_PROC_NR,
(vir_bytes) path,
2012-02-13 16:28:04 +01:00
PATH_MAX - 1, 0);
if (r < 0) {
/* Failed to read link */
2012-02-13 16:28:04 +01:00
unlock_vnode(slp);
unlock_vnode(dirp);
unlock_vmnt(dir_vmp);
put_vnode(slp);
put_vnode(dirp);
err_code = r;
return(NULL);
}
2012-02-13 16:28:04 +01:00
path[r] = '\0'; /* Terminate path */
}
unlock_vnode(slp);
put_vnode(slp);
}
/* Try to create the inode the dangling symlink was
* pointing to. We have to use dirp as starting point
* as there might be multiple successive symlinks
2012-02-13 16:28:04 +01:00
* crossing multiple mountpoints.
* Unlock vnodes and vmnts as we're going to recurse.
*/
unlock_vnode(dirp);
unlock_vnode(vp);
unlock_vmnt(dir_vmp);
old_wd = fp->fp_wd; /* Save orig. working dirp */
fp->fp_wd = dirp;
2012-02-13 16:28:04 +01:00
vp = new_node(resolve, oflags, bits);
fp->fp_wd = old_wd; /* Restore */
if (vp != NULL) {
put_vnode(dirp);
2012-02-13 16:28:04 +01:00
*(resolve->l_vnode) = vp;
return(vp);
}
r = err_code;
2012-02-13 16:28:04 +01:00
}
2012-02-13 16:28:04 +01:00
if (r == EEXIST)
err_code = EIO; /* Impossible, we have verified that
* the last component doesn't exist and
* is not a dangling symlink. */
else
err_code = r;
unlock_vmnt(dir_vmp);
2012-02-13 16:28:04 +01:00
unlock_vnode(dirp);
unlock_vnode(vp);
put_vnode(dirp);
return(NULL);
2007-08-07 14:52:47 +02:00
}
2012-02-13 16:28:04 +01:00
/* Store results and mark vnode in use */
2012-02-13 16:28:04 +01:00
vp->v_fs_e = res.fs_e;
vp->v_inode_nr = res.inode_nr;
vp->v_mode = res.fmode;
vp->v_size = res.fsize;
vp->v_uid = res.uid;
vp->v_gid = res.gid;
vp->v_sdev = res.dev;
vp->v_vmnt = dirp->v_vmnt;
vp->v_dev = vp->v_vmnt->m_dev;
vp->v_fs_count = 1;
vp->v_ref_count = 1;
} else {
2012-02-13 16:28:04 +01:00
/* Either last component exists, or there is some other problem. */
if (vp != NULL) {
r = EEXIST; /* File exists or a symlink names a file while
* O_EXCL is set. */
} else
r = err_code; /* Other problem. */
2007-08-07 14:52:47 +02:00
}
err_code = r;
2012-02-13 16:28:04 +01:00
/* When dirp equals vp, we shouldn't release the lock as a vp is locked only
* once. Releasing the lock would cause the resulting vp not be locked and
* cause mayhem later on. */
if (dirp != vp) {
unlock_vnode(dirp);
}
unlock_vmnt(dir_vmp);
put_vnode(dirp);
2012-02-13 16:28:04 +01:00
*(resolve->l_vnode) = vp;
return(vp);
2007-08-07 14:52:47 +02:00
}
/*===========================================================================*
* pipe_open *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
static int pipe_open(struct vnode *vp, mode_t bits, int oflags)
{
/* This function is called from common_open. It checks if
* there is at least one reader/writer pair for the pipe, if not
* it suspends the caller, otherwise it revives all other blocked
* processes hanging on the pipe.
*/
2012-02-13 16:28:04 +01:00
if ((bits & (R_BIT|W_BIT)) == (R_BIT|W_BIT)) return(ENXIO);
2012-02-13 16:28:04 +01:00
/* Find the reader/writer at the other end of the pipe */
if (find_filp(vp, bits & W_BIT ? R_BIT : W_BIT) == NULL) {
/* Not found */
if (oflags & O_NONBLOCK) {
if (bits & W_BIT) return(ENXIO);
} else {
2012-02-13 16:28:04 +01:00
/* Let's wait for the other side to show up */
suspend(FP_BLOCKED_ON_POPEN);
return(SUSPEND);
}
} else if (susp_count > 0) { /* revive blocked processes */
release(vp, OPEN, susp_count);
release(vp, CREAT, susp_count);
}
return(OK);
}
/*===========================================================================*
* do_mknod *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
int do_mknod()
{
/* Perform the mknod(name, mode, addr) system call. */
register mode_t bits, mode_bits;
int r;
2007-08-07 14:52:47 +02:00
struct vnode *vp;
2012-02-13 16:28:04 +01:00
struct vmnt *vmp;
char fullpath[PATH_MAX];
struct lookup resolve;
vir_bytes vname1;
size_t vname1_length;
dev_t dev;
vname1 = (vir_bytes) job_m_in.name1;
vname1_length = (size_t) job_m_in.name1_length;
mode_bits = (mode_t) job_m_in.mk_mode; /* mode of the inode */
dev = job_m_in.m1_i3;
2012-02-13 16:28:04 +01:00
lookup_init(&resolve, fullpath, PATH_NOFLAGS, &vmp, &vp);
resolve.l_vmnt_lock = VMNT_WRITE;
resolve.l_vnode_lock = VNODE_WRITE;
2012-02-13 16:28:04 +01:00
/* Only the super_user may make nodes other than fifos. */
if (!super_user && (!S_ISFIFO(mode_bits) && !S_ISSOCK(mode_bits))) {
2012-02-13 16:28:04 +01:00
return(EPERM);
}
2012-04-25 14:44:42 +02:00
bits = (mode_bits & S_IFMT) | (mode_bits & ACCESSPERMS & fp->fp_umask);
/* Open directory that's going to hold the new node. */
if (fetch_name(vname1, vname1_length, fullpath) != OK) return(err_code);
2012-02-13 16:28:04 +01:00
if ((vp = last_dir(&resolve, fp)) == NULL) return(err_code);
2007-08-07 14:52:47 +02:00
/* Make sure that the object is a directory */
if (!S_ISDIR(vp->v_mode)) {
2012-02-13 16:28:04 +01:00
r = ENOTDIR;
} else if ((r = forbidden(fp, vp, W_BIT|X_BIT)) == OK) {
r = req_mknod(vp->v_fs_e, vp->v_inode_nr, fullpath, fp->fp_effuid,
fp->fp_effgid, bits, dev);
2007-08-07 14:52:47 +02:00
}
2012-02-13 16:28:04 +01:00
unlock_vnode(vp);
unlock_vmnt(vmp);
2007-08-07 14:52:47 +02:00
put_vnode(vp);
return(r);
}
/*===========================================================================*
* do_mkdir *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
int do_mkdir()
{
/* Perform the mkdir(name, mode) system call. */
mode_t bits; /* mode bits for the new inode */
int r;
2007-08-07 14:52:47 +02:00
struct vnode *vp;
2012-02-13 16:28:04 +01:00
struct vmnt *vmp;
char fullpath[PATH_MAX];
struct lookup resolve;
vir_bytes vname1;
size_t vname1_length;
mode_t dirmode;
vname1 = (vir_bytes) job_m_in.name1;
vname1_length = (size_t) job_m_in.name1_length;
dirmode = (mode_t) job_m_in.mode;
2012-02-13 16:28:04 +01:00
lookup_init(&resolve, fullpath, PATH_NOFLAGS, &vmp, &vp);
resolve.l_vmnt_lock = VMNT_WRITE;
resolve.l_vnode_lock = VNODE_WRITE;
if (fetch_name(vname1, vname1_length, fullpath) != OK) return(err_code);
bits = I_DIRECTORY | (dirmode & RWX_MODES & fp->fp_umask);
2012-02-13 16:28:04 +01:00
if ((vp = last_dir(&resolve, fp)) == NULL) return(err_code);
2007-08-07 14:52:47 +02:00
/* Make sure that the object is a directory */
if (!S_ISDIR(vp->v_mode)) {
2012-02-13 16:28:04 +01:00
r = ENOTDIR;
} else if ((r = forbidden(fp, vp, W_BIT|X_BIT)) == OK) {
r = req_mkdir(vp->v_fs_e, vp->v_inode_nr, fullpath, fp->fp_effuid,
fp->fp_effgid, bits);
2007-08-07 14:52:47 +02:00
}
2012-02-13 16:28:04 +01:00
unlock_vnode(vp);
unlock_vmnt(vmp);
2007-08-07 14:52:47 +02:00
put_vnode(vp);
return(r);
}
/*===========================================================================*
* do_lseek *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
int do_lseek()
{
/* Perform the lseek(ls_fd, offset, whence) system call. */
register struct filp *rfilp;
int r = OK, seekfd, seekwhence;
off_t offset;
u64_t pos, newpos;
seekfd = job_m_in.ls_fd;
seekwhence = job_m_in.whence;
offset = (off_t) job_m_in.offset_lo;
/* Check to see if the file descriptor is valid. */
if ( (rfilp = get_filp(seekfd, VNODE_READ)) == NULL) return(err_code);
/* No lseek on pipes. */
if (S_ISFIFO(rfilp->filp_vno->v_mode)) {
2012-02-13 16:28:04 +01:00
unlock_filp(rfilp);
return(ESPIPE);
}
/* The value of 'whence' determines the start position to use. */
switch(seekwhence) {
2012-02-13 16:28:04 +01:00
case SEEK_SET: pos = cvu64(0); break;
case SEEK_CUR: pos = rfilp->filp_pos; break;
case SEEK_END: pos = cvul64(rfilp->filp_vno->v_size); break;
default: unlock_filp(rfilp); return(EINVAL);
}
if (offset >= 0)
2012-02-13 16:28:04 +01:00
newpos = add64ul(pos, offset);
else
2012-02-13 16:28:04 +01:00
newpos = sub64ul(pos, -offset);
/* Check for overflow. */
2012-02-13 16:28:04 +01:00
if (ex64hi(newpos) != 0) {
r = EOVERFLOW;
} else if ((off_t) ex64lo(newpos) < 0) { /* no negative file size */
r = EOVERFLOW;
} else {
/* insert the new position into the output message */
m_out.reply_l1 = ex64lo(newpos);
2012-02-13 16:28:04 +01:00
if (cmp64(newpos, rfilp->filp_pos) != 0) {
2012-07-19 16:36:51 +02:00
rfilp->filp_pos = newpos;
2012-02-13 16:28:04 +01:00
/* Inhibit read ahead request */
r = req_inhibread(rfilp->filp_vno->v_fs_e,
rfilp->filp_vno->v_inode_nr);
}
}
2012-02-13 16:28:04 +01:00
unlock_filp(rfilp);
return(r);
}
/*===========================================================================*
* do_llseek *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
int do_llseek()
{
/* Perform the llseek(ls_fd, offset, whence) system call. */
register struct filp *rfilp;
u64_t pos, newpos;
int r = OK, seekfd, seekwhence;
long off_hi, off_lo;
seekfd = job_m_in.ls_fd;
seekwhence = job_m_in.whence;
off_hi = job_m_in.offset_high;
off_lo = job_m_in.offset_lo;
/* Check to see if the file descriptor is valid. */
if ( (rfilp = get_filp(seekfd, VNODE_READ)) == NULL) return(err_code);
/* No lseek on pipes. */
if (S_ISFIFO(rfilp->filp_vno->v_mode)) {
2012-02-13 16:28:04 +01:00
unlock_filp(rfilp);
return(ESPIPE);
}
/* The value of 'whence' determines the start position to use. */
switch(seekwhence) {
2012-02-13 16:28:04 +01:00
case SEEK_SET: pos = cvu64(0); break;
case SEEK_CUR: pos = rfilp->filp_pos; break;
case SEEK_END: pos = cvul64(rfilp->filp_vno->v_size); break;
default: unlock_filp(rfilp); return(EINVAL);
}
newpos = add64(pos, make64(off_lo, off_hi));
/* Check for overflow. */
if ((off_hi > 0) && cmp64(newpos, pos) < 0)
2012-02-13 16:28:04 +01:00
r = EINVAL;
else if ((off_hi < 0) && cmp64(newpos, pos) > 0)
2012-02-13 16:28:04 +01:00
r = EINVAL;
else {
/* insert the new position into the output message */
m_out.reply_l1 = ex64lo(newpos);
m_out.reply_l2 = ex64hi(newpos);
if (cmp64(newpos, rfilp->filp_pos) != 0) {
2012-07-19 16:36:51 +02:00
rfilp->filp_pos = newpos;
2012-02-13 16:28:04 +01:00
/* Inhibit read ahead request */
r = req_inhibread(rfilp->filp_vno->v_fs_e,
rfilp->filp_vno->v_inode_nr);
}
}
2012-02-13 16:28:04 +01:00
unlock_filp(rfilp);
return(r);
}
/*===========================================================================*
* do_close *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
int do_close()
{
/* Perform the close(fd) system call. */
2012-02-13 16:28:04 +01:00
scratch(fp).file.fd_nr = job_m_in.fd;
return close_fd(fp, scratch(fp).file.fd_nr);
}
/*===========================================================================*
* close_fd *
*===========================================================================*/
2012-03-25 20:25:53 +02:00
int close_fd(rfp, fd_nr)
struct fproc *rfp;
int fd_nr;
{
/* Perform the close(fd) system call. */
register struct filp *rfilp;
register struct vnode *vp;
struct file_lock *flp;
int lock_count;
/* First locate the vnode that belongs to the file descriptor. */
2012-02-13 16:28:04 +01:00
if ( (rfilp = get_filp2(rfp, fd_nr, VNODE_OPCL)) == NULL) return(err_code);
vp = rfilp->filp_vno;
2012-02-13 16:28:04 +01:00
close_filp(rfilp);
rfp->fp_filp[fd_nr] = NULL;
2012-02-13 16:28:04 +01:00
FD_CLR(fd_nr, &rfp->fp_cloexec_set);
FD_CLR(fd_nr, &rfp->fp_filp_inuse);
/* Check to see if the file is locked. If so, release all locks. */
2012-02-13 16:28:04 +01:00
if (nr_locks > 0) {
lock_count = nr_locks; /* save count of locks */
for (flp = &file_lock[0]; flp < &file_lock[NR_LOCKS]; flp++) {
if (flp->lock_type == 0) continue; /* slot not in use */
if (flp->lock_vnode == vp && flp->lock_pid == rfp->fp_pid) {
flp->lock_type = 0;
nr_locks--;
}
}
2012-02-13 16:28:04 +01:00
if (nr_locks < lock_count)
lock_revive(); /* one or more locks released */
}
2012-02-13 16:28:04 +01:00
return(OK);
}
/*===========================================================================*
 *				close_reply				     *
 *===========================================================================*/
void close_reply()
{
/* Nothing needs to be done when a close reply arrives. */
}