minix/servers/vfs/lock.c

/* This file handles advisory file locking as required by POSIX.
 *
 * The entry points into this file are
 *   lock_op:	perform locking operations for FCNTL system call
 *   lock_revive: revive processes when a lock is released
 */

#include "fs.h"
#include <minix/com.h>
#include <minix/u64.h>
#include <fcntl.h>
#include <unistd.h>
#include "file.h"
#include "scratchpad.h"
#include "lock.h"
#include "vnode.h"

/*===========================================================================*
 *				lock_op					     *
 *===========================================================================*/
int lock_op(struct filp *f, int req)
{
/* Perform the advisory locking required by POSIX.
 *
 * Parameters:
 *   f    the open file description the fcntl call refers to
 *   req  the fcntl request: F_GETLK, F_SETLK or F_SETLKW
 *
 * The struct flock describing the region is copied in from the caller's
 * buffer (scratch(fp).io.io_buffer); for F_GETLK the result is copied back
 * to the same buffer.
 *
 * Return value:
 *   OK       lock set, region unlocked, or (F_GETLK) result copied out
 *   EINVAL   flock could not be fetched, bad l_type/l_whence, F_GETLK with
 *            F_UNLCK, file is neither regular nor block special, or the
 *            computed region is invalid
 *   EBADF    the file is not open for the access the lock type requires
 *   EAGAIN   F_SETLK and the region conflicts with an existing lock
 *   SUSPEND  F_SETLKW and the region conflicts; the caller was suspended
 *   ENOLCK   no free slot in the lock table
 *   other    error from copying the flock structure back (F_GETLK)
 *
 * The lock table (file_lock, nr_locks) and the calling process (fp, who_e)
 * are globals defined outside this file.
 */

  int r, ltype, i, conflict = 0, unlocking = 0;
  mode_t mo;
  off_t first, last;
  struct flock flock;
  struct file_lock *flp, *flp2, *empty;

  /* Fetch the flock structure from user space. */
  r = sys_datacopy_wrapper(who_e, (vir_bytes)scratch(fp).io.io_buffer, VFS_PROC_NR,
		   (vir_bytes) &flock, sizeof(flock));
  if (r != OK) return(EINVAL);

  /* Make some error checks. */
  ltype = flock.l_type;
  mo = f->filp_mode;
  if (ltype != F_UNLCK && ltype != F_RDLCK && ltype != F_WRLCK) return(EINVAL);
  if (req == F_GETLK && ltype == F_UNLCK) return(EINVAL);
  if (!S_ISREG(f->filp_vno->v_mode) && !S_ISBLK(f->filp_vno->v_mode))
	return(EINVAL);
  if (req != F_GETLK && ltype == F_RDLCK && (mo & R_BIT) == 0) return(EBADF);
  if (req != F_GETLK && ltype == F_WRLCK && (mo & W_BIT) == 0) return(EBADF);

  /* Compute the first and last bytes in the lock region. */
  switch (flock.l_whence) {
    case SEEK_SET:	first = 0; break;
    case SEEK_CUR:	first = f->filp_pos; break;
    case SEEK_END:	first = f->filp_vno->v_size; break;
    default:	return(EINVAL);
  }

  /* Check for overflow.  The sign of l_start is tested at its full width;
   * narrowing it to long first could hide the high bits of a wide off_t and
   * let an out-of-range offset slip past these checks.
   */
  if ((flock.l_start > 0) && ((first + flock.l_start) < first))
	return(EINVAL);
  if ((flock.l_start < 0) && ((first + flock.l_start) > first))
	return(EINVAL);
  first = first + flock.l_start;
  last = first + flock.l_len - 1;
  if (flock.l_len == 0) last = MAX_FILE_POS;	/* 0 means "to the end" */
  if (last < first) return(EINVAL);

  /* Check if this region conflicts with any existing lock. */
  empty = NULL;
  for (flp = &file_lock[0]; flp < &file_lock[NR_LOCKS]; flp++) {
	if (flp->lock_type == 0) {
		if (empty == NULL) empty = flp;	/* remember first free slot */
		continue;	/* 0 means unused slot */
	}
	if (flp->lock_vnode != f->filp_vno) continue;	/* different file */
	if (last < flp->lock_first) continue;	/* new one is in front */
	if (first > flp->lock_last) continue;	/* new one is afterwards */
	if (ltype == F_RDLCK && flp->lock_type == F_RDLCK) continue;
	if (ltype != F_UNLCK && flp->lock_pid == fp->fp_pid) continue;

	/* A process may only release its own locks; locks held by other
	 * processes are left untouched by an unlock request.
	 */
	if (ltype == F_UNLCK && flp->lock_pid != fp->fp_pid) continue;

	/* There might be a conflict.  Process it. */
	conflict = 1;
	if (req == F_GETLK) break;	/* flp now points at the blocking lock */

	/* If we are trying to set a lock, it just failed. */
	if (ltype == F_RDLCK || ltype == F_WRLCK) {
		if (req == F_SETLK) {
			/* For F_SETLK, just report back failure. */
			return(EAGAIN);
		} else {
			/* For F_SETLKW, suspend the process. */
			suspend(FP_BLOCKED_ON_LOCK);
			return(SUSPEND);
		}
	}

	/* We are clearing a lock and we found something that overlaps. */
	if (first <= flp->lock_first && last >= flp->lock_last) {
		flp->lock_type = 0;	/* mark slot as unused */
		nr_locks--;		/* number of locks is now 1 less */
		unlocking = 1;
		continue;
	}

	/* Part of a locked region has been unlocked. */
	if (first <= flp->lock_first) {
		flp->lock_first = last + 1;	/* trim the front */
		unlocking = 1;
		continue;
	}

	if (last >= flp->lock_last) {
		flp->lock_last = first - 1;	/* trim the tail */
		unlocking = 1;
		continue;
	}

	/* Bad luck. A lock has been split in two by unlocking the middle.
	 * A second table slot is needed for the part behind the hole.
	 */
	flp2 = NULL;
	if (nr_locks < NR_LOCKS) {
		for (i = 0; i < NR_LOCKS; i++) {
			if (file_lock[i].lock_type == 0) {
				flp2 = &file_lock[i];
				break;
			}
		}
	}
	if (flp2 == NULL) {
		/* Table full.  Earlier iterations may already have released
		 * or shrunk locks, so waiters still have to be woken up.
		 */
		if (unlocking) lock_revive();
		return(ENOLCK);
	}
	flp2->lock_type = flp->lock_type;
	flp2->lock_pid = flp->lock_pid;
	flp2->lock_vnode = flp->lock_vnode;
	flp2->lock_first = last + 1;
	flp2->lock_last = flp->lock_last;
	flp->lock_last = first - 1;
	nr_locks++;
	unlocking = 1;
  }
  if (unlocking) lock_revive();

  if (req == F_GETLK) {
	if (conflict) {
		/* GETLK and conflict. Report on the conflicting lock.  A lock
		 * that reaches MAX_FILE_POS is reported with l_len 0, the same
		 * encoding that is accepted on input for "to the end".
		 */
		flock.l_type = flp->lock_type;
		flock.l_whence = SEEK_SET;
		flock.l_start = flp->lock_first;
		if (flp->lock_last == MAX_FILE_POS)
			flock.l_len = 0;
		else
			flock.l_len = flp->lock_last - flp->lock_first + 1;
		flock.l_pid = flp->lock_pid;

	} else {
		/* It is GETLK and there is no conflict. */
		flock.l_type = F_UNLCK;
	}

	/* Copy the flock structure back to the caller. */
	r = sys_datacopy_wrapper(VFS_PROC_NR, (vir_bytes) &flock, who_e,
		(vir_bytes)scratch(fp).io.io_buffer, sizeof(flock));
	return(r);
  }

  if (ltype == F_UNLCK) return(OK);	/* unlocked a region with no locks */

  /* There is no conflict.  If space exists, store new lock in the table. */
  if (empty == NULL) return(ENOLCK);	/* table full */
  empty->lock_type = ltype;
  empty->lock_pid = fp->fp_pid;
  empty->lock_vnode = f->filp_vno;
  empty->lock_first = first;
  empty->lock_last = last;
  nr_locks++;
  return(OK);
}


/*===========================================================================*
 *				lock_revive				     *
 *===========================================================================*/
void lock_revive()
{
/* Wake up every process that is currently blocked waiting for a file lock.
 * No attempt is made to work out which waiters can actually proceed: all of
 * them are revived, and any whose region is still locked will simply block
 * again when they retry.  Doing this selectively would cost extra code for
 * a gain that only shows up when locking is actually in use.
 */

  int slot;
  struct fproc *rfp;

  for (slot = 0; slot < NR_PROCS; slot++) {
	rfp = &fproc[slot];

	/* Only in-use slots that are blocked on a lock are of interest. */
	if (rfp->fp_pid != PID_FREE &&
	    rfp->fp_blocked_on == FP_BLOCKED_ON_LOCK)
		revive(rfp->fp_endpoint, 0);
  }
}
Initial revision 2005-04-21 16:53:53 +02:00			`/* This file handles advisory file locking as required by POSIX.`
			`*`
			`* The entry points into this file are`
			`* lock_op: perform locking operations for FCNTL system call`
			`* lock_revive: revive processes when a lock is released`
			`*/`

			`#include "fs.h"`
			`#include <minix/com.h>`
First cut at 64-bit file offsets in block devices for mkfs/fsck. 2006-11-27 15:21:43 +01:00			`#include <minix/u64.h>`
Initial revision 2005-04-21 16:53:53 +02:00			`#include <fcntl.h>`
			`#include <unistd.h>`
			`#include "file.h"`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00			`#include "scratchpad.h"`
Initial revision 2005-04-21 16:53:53 +02:00			`#include "lock.h"`
Merge of VFS by Balasz Gerofi with Minix trunk. 2006-10-25 15:40:36 +02:00			`#include "vnode.h"`

Initial revision 2005-04-21 16:53:53 +02:00			`/===========================================================================`
			`* lock_op *`
			`===========================================================================/`
retire PUBLIC, PRIVATE and FORWARD 2012-03-25 20:25:53 +02:00			`int lock_op(f, req)`
Initial revision 2005-04-21 16:53:53 +02:00			`struct filp *f;`
			`int req; /* either F_SETLK or F_SETLKW */`
			`{`
			`/* Perform the advisory locking required by POSIX. */`

			`int r, ltype, i, conflict = 0, unlocking = 0;`
			`mode_t mo;`
			`off_t first, last;`
			`struct flock flock;`
			`struct file_lock flp, flp2, *empty;`

			`/* Fetch the flock structure from user space. */`
Message type for VFS_FCNTL Change-Id: I079f3d7902cf5501fbc594a5610acd370abea095 2014-05-12 14:58:20 +02:00			`r = sys_datacopy_wrapper(who_e, (vir_bytes)scratch(fp).io.io_buffer, VFS_PROC_NR,`
Remove support for obsolete 3.2.1 ABI Change-Id: I76b4960bda41f55d9c42f8c99c5beae3424ca851 2013-08-31 23:11:34 +02:00			`(vir_bytes) &flock, sizeof(flock));`
Initial revision 2005-04-21 16:53:53 +02:00			`if (r != OK) return(EINVAL);`

			`/* Make some error checks. */`
			`ltype = flock.l_type;`
			`mo = f->filp_mode;`
			`if (ltype != F_UNLCK && ltype != F_RDLCK && ltype != F_WRLCK) return(EINVAL);`
			`if (req == F_GETLK && ltype == F_UNLCK) return(EINVAL);`
VFS: use S_IS* macros 2012-04-25 14:44:42 +02:00			`if (!S_ISREG(f->filp_vno->v_mode) && !S_ISBLK(f->filp_vno->v_mode))`
			`return(EINVAL);`
Initial revision 2005-04-21 16:53:53 +02:00			`if (req != F_GETLK && ltype == F_RDLCK && (mo & R_BIT) == 0) return(EBADF);`
			`if (req != F_GETLK && ltype == F_WRLCK && (mo & W_BIT) == 0) return(EBADF);`

			`/* Compute the first and last bytes in the lock region. */`
			`switch (flock.l_whence) {`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00			`case SEEK_SET: first = 0; break;`
VFS: use 64-bit file offsets in all requests Change-Id: I735c4068135474aff2c397f4bc9fb147a618b453 2013-03-25 17:08:04 +01:00			`case SEEK_CUR: first = f->filp_pos; break;`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00			`case SEEK_END: first = f->filp_vno->v_size; break;`
			`default: return(EINVAL);`
Initial revision 2005-04-21 16:53:53 +02:00			`}`
Scan all processes for that might be blocked on a lock 2010-04-28 13:54:22 +02:00
Initial revision 2005-04-21 16:53:53 +02:00			`/* Check for overflow. */`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00			`if (((long) flock.l_start > 0) && ((first + flock.l_start) < first))`
Initial revision 2005-04-21 16:53:53 +02:00			`return(EINVAL);`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00			`if (((long) flock.l_start < 0) && ((first + flock.l_start) > first))`
Initial revision 2005-04-21 16:53:53 +02:00			`return(EINVAL);`
			`first = first + flock.l_start;`
			`last = first + flock.l_len - 1;`
			`if (flock.l_len == 0) last = MAX_FILE_POS;`
			`if (last < first) return(EINVAL);`

			`/* Check if this region conflicts with any existing lock. */`
Use of all NIL_* defines converted to NULL 2010-05-10 15:26:00 +02:00			`empty = NULL;`
Scan all processes for that might be blocked on a lock 2010-04-28 13:54:22 +02:00			`for (flp = &file_lock[0]; flp < &file_lock[NR_LOCKS]; flp++) {`
Initial revision 2005-04-21 16:53:53 +02:00			`if (flp->lock_type == 0) {`
Use of all NIL_* defines converted to NULL 2010-05-10 15:26:00 +02:00			`if (empty == NULL) empty = flp;`
Initial revision 2005-04-21 16:53:53 +02:00			`continue; /* 0 means unused slot */`
			`}`
Merge of VFS by Balasz Gerofi with Minix trunk. 2006-10-25 15:40:36 +02:00			`if (flp->lock_vnode != f->filp_vno) continue; /* different file */`
Initial revision 2005-04-21 16:53:53 +02:00			`if (last < flp->lock_first) continue; /* new one is in front */`
			`if (first > flp->lock_last) continue; /* new one is afterwards */`
			`if (ltype == F_RDLCK && flp->lock_type == F_RDLCK) continue;`
			`if (ltype != F_UNLCK && flp->lock_pid == fp->fp_pid) continue;`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00
Initial revision 2005-04-21 16:53:53 +02:00			`/* There might be a conflict. Process it. */`
			`conflict = 1;`
			`if (req == F_GETLK) break;`

			`/* If we are trying to set a lock, it just failed. */`
			`if (ltype == F_RDLCK \|\| ltype == F_WRLCK) {`
			`if (req == F_SETLK) {`
			`/* For F_SETLK, just report back failure. */`
			`return(EAGAIN);`
			`} else {`
			`/* For F_SETLKW, suspend the process. */`
Removed dependency of vfs on NR_TASKS macro - all macros in consts.h that depend on NR_TASKS replaced by a FP_BLOCKED_ON_* - fp_suspended removed and replaced by fp_blocked_on. Testing whether a process is supended is qeual to testing whether fp_blocked_on is FP_BLOCKED_ON_NONE or not - fp_task is valid only if fp_blocked_on == FP_BLOCKED_ON_OTHER - no need of special values that do not colide with valid and special endpoints since they are not used as endpoints anymore - suspend only takes FP_BLOCKED_ON_* values not endpoints anymore - suspend(task) replaced by wait_for(task) which sets fp_task so we remember who are we waiting for and suspend sets fp_blocked_on to FP_BLOCKED_ON_OTHER to signal that we are waiting for some other process - some functions should take endpoint_t instead of int, fixed 2009-09-22 23:48:26 +02:00			`suspend(FP_BLOCKED_ON_LOCK);`
Initial revision 2005-04-21 16:53:53 +02:00			`return(SUSPEND);`
			`}`
			`}`

			`/* We are clearing a lock and we found something that overlaps. */`
			`unlocking = 1;`
			`if (first <= flp->lock_first && last >= flp->lock_last) {`
			`flp->lock_type = 0; /* mark slot as unused */`
			`nr_locks--; /* number of locks is now 1 less */`
			`continue;`
			`}`

			`/* Part of a locked region has been unlocked. */`
			`if (first <= flp->lock_first) {`
			`flp->lock_first = last + 1;`
			`continue;`
			`}`

			`if (last >= flp->lock_last) {`
			`flp->lock_last = first - 1;`
			`continue;`
			`}`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00
Initial revision 2005-04-21 16:53:53 +02:00			`/* Bad luck. A lock has been split in two by unlocking the middle. */`
			`if (nr_locks == NR_LOCKS) return(ENOLCK);`
			`for (i = 0; i < NR_LOCKS; i++)`
			`if (file_lock[i].lock_type == 0) break;`
			`flp2 = &file_lock[i];`
			`flp2->lock_type = flp->lock_type;`
			`flp2->lock_pid = flp->lock_pid;`
Merge of VFS by Balasz Gerofi with Minix trunk. 2006-10-25 15:40:36 +02:00			`flp2->lock_vnode = flp->lock_vnode;`
Initial revision 2005-04-21 16:53:53 +02:00			`flp2->lock_first = last + 1;`
			`flp2->lock_last = flp->lock_last;`
			`flp->lock_last = first - 1;`
			`nr_locks++;`
			`}`
			`if (unlocking) lock_revive();`

			`if (req == F_GETLK) {`
			`if (conflict) {`
			`/* GETLK and conflict. Report on the conflicting lock. */`
			`flock.l_type = flp->lock_type;`
			`flock.l_whence = SEEK_SET;`
			`flock.l_start = flp->lock_first;`
			`flock.l_len = flp->lock_last - flp->lock_first + 1;`
			`flock.l_pid = flp->lock_pid;`

			`} else {`
			`/* It is GETLK and there is no conflict. */`
			`flock.l_type = F_UNLCK;`
			`}`

			`/* Copy the flock structure back to the caller. */`
make vfs & filesystems use failable copying Change the kernel to add features to vircopy and safecopies so that transparent copy fixing won't happen to avoid deadlocks, and such copies fail with EFAULT. Transparently making copying work from filesystems (as normally done by the kernel & VM when copying fails because of missing/readonly memory) is problematic as it can happen that, for file-mapped ranges, that that same filesystem that is blocked on the copy request is needed to satisfy the memory range, leading to deadlock. Dito for VFS itself, if done with a blocking call. This change makes the copying done from a filesystem fail in such cases with EFAULT by VFS adding the CPF_TRY flag to the grants. If a FS call fails with EFAULT, VFS will then request the range to be made available to VM after the FS is unblocked, allowing it to be used to satisfy the range if need be in another VFS thread. Similarly, for datacopies that VFS itself does, it uses the failable vircopy variant and callers use a wrapper that talk to VM if necessary to get the copy to work. . kernel: add CPF_TRY flag to safecopies . kernel: only request writable ranges to VM for the target buffer when copying fails . do copying in VFS TRY-first . some fixes in VM to build SANITYCHECK mode . add regression test for the cases where - a FS system call needs memory mapped in a process that the FS itself must map. - such a range covers more than one file-mapped region. . add 'try' mode to vircopy, physcopy . add flags field to copy kernel call messages . if CP_FLAG_TRY is set, do not transparently try to fix memory ranges . for use by VFS when accessing user buffers to avoid deadlock . remove some obsolete backwards compatability assignments . VFS: let thread scheduling work for VM requests too Allows VFS to make calls to VM while suspending and resuming the currently running thread. Does currently not work for the main thread. . 
VM: add fix memory range call for use by VFS Change-Id: I295794269cea51a3163519a9cfe5901301d90b32 2014-01-16 14:22:13 +01:00			`r = sys_datacopy_wrapper(VFS_PROC_NR, (vir_bytes) &flock, who_e,`
Message type for VFS_FCNTL Change-Id: I079f3d7902cf5501fbc594a5610acd370abea095 2014-05-12 14:58:20 +02:00			`(vir_bytes)scratch(fp).io.io_buffer, sizeof(flock));`
Initial revision 2005-04-21 16:53:53 +02:00			`return(r);`
			`}`

			`if (ltype == F_UNLCK) return(OK); /* unlocked a region with no locks */`

			`/* There is no conflict. If space exists, store new lock in the table. */`
Use of all NIL_* defines converted to NULL 2010-05-10 15:26:00 +02:00			`if (empty == NULL) return(ENOLCK); /* table full */`
Initial revision 2005-04-21 16:53:53 +02:00			`empty->lock_type = ltype;`
			`empty->lock_pid = fp->fp_pid;`
Merge of VFS by Balasz Gerofi with Minix trunk. 2006-10-25 15:40:36 +02:00			`empty->lock_vnode = f->filp_vno;`
Initial revision 2005-04-21 16:53:53 +02:00			`empty->lock_first = first;`
			`empty->lock_last = last;`
			`nr_locks++;`
			`return(OK);`
			`}`

- Introduce support for sticky bit. - Revise VFS-FS protocol and update VFS/MFS/ISOFS accordingly. - Clean up MFS by removing old, dead code (backwards compatibility is broken by the new VFS-FS protocol, anyway) and rewrite other parts. Also, make sure all functions have proper banners and prototypes. - VFS should always provide a (syntactically) valid path to the FS; no need for the FS to do sanity checks when leaving/entering mount points. - Fix several bugs in MFS: - Several path lookup bugs in MFS. - A link can be too big for the path buffer. - A mountpoint can become inaccessible when the creation of a new inode fails, because the inode already exists and is a mountpoint. - Introduce support for supplemental groups. - Add test 46 to test supplemental group functionality (and removed obsolete suppl. tests from test 2). - Clean up VFS (not everything is done yet). - ISOFS now opens device read-only. This makes the -r flag in the mount command unnecessary (but will still report to be mounted read-write). - Introduce PipeFS. PipeFS is a new FS that handles all anonymous and named pipes. However, named pipes still reside on the (M)FS, as they are part of the file system on disk. To make this work VFS now has a concept of 'mapped' inodes, which causes read, write, truncate and stat requests to be redirected to the mapped FS, and all other requests to the original FS. 2009-12-20 21:27:14 +01:00
Initial revision 2005-04-21 16:53:53 +02:00			`/===========================================================================`
			`* lock_revive *`
			`===========================================================================/`
retire PUBLIC, PRIVATE and FORWARD 2012-03-25 20:25:53 +02:00			`void lock_revive()`
Initial revision 2005-04-21 16:53:53 +02:00			`{`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00			`/* Go find all the processes that are waiting for any kind of lock and`
			`* revive them all. The ones that are still blocked will block again when`
			`* they run. The others will complete. This strategy is a space-time`
			`* tradeoff. Figuring out exactly which ones to unblock now would take`
			`* extra code, and the only thing it would win would be some performance in`
			`* extremely rare circumstances (namely, that somebody actually used`
Initial revision 2005-04-21 16:53:53 +02:00			`* locking).`
			`*/`

			`struct fproc *fptr;`

Scan all processes for that might be blocked on a lock 2010-04-28 13:54:22 +02:00			`for (fptr = &fproc[0]; fptr < &fproc[NR_PROCS]; fptr++){`
VFS: replace VFS with AVFS 2012-02-13 16:28:04 +01:00			`if (fptr->fp_pid == PID_FREE) continue;`
Removed dependency of vfs on NR_TASKS macro - all macros in consts.h that depend on NR_TASKS replaced by a FP_BLOCKED_ON_* - fp_suspended removed and replaced by fp_blocked_on. Testing whether a process is supended is qeual to testing whether fp_blocked_on is FP_BLOCKED_ON_NONE or not - fp_task is valid only if fp_blocked_on == FP_BLOCKED_ON_OTHER - no need of special values that do not colide with valid and special endpoints since they are not used as endpoints anymore - suspend only takes FP_BLOCKED_ON_* values not endpoints anymore - suspend(task) replaced by wait_for(task) which sets fp_task so we remember who are we waiting for and suspend sets fp_blocked_on to FP_BLOCKED_ON_OTHER to signal that we are waiting for some other process - some functions should take endpoint_t instead of int, fixed 2009-09-22 23:48:26 +02:00			`if (fptr->fp_blocked_on == FP_BLOCKED_ON_LOCK) {`
endpoint-aware conversion of servers. 'who', indicating caller number in pm and fs and some other servers, has been removed in favour of 'who_e' (endpoint) and 'who_p' (proc nr.). In both PM and FS, isokendpt() convert endpoints to process slot numbers, returning OK if it was a valid and consistent endpoint number. okendpt() does the same but panic()s if it doesn't succeed. (In PM, this is pm_isok..) pm and fs keep their own records of process endpoints in their proc tables, which are needed to make kernel calls about those processes. message field names have changed. fs drivers are endpoints. fs now doesn't try to get out of driver deadlock, as the protocol isn't supposed to let that happen any more. (A warning is printed if ELOCKED is detected though.) fproc[].fp_task (indicating which driver the process is suspended on) became an int. PM and FS now get endpoint numbers of initial boot processes from the kernel. These happen to be the same as the old proc numbers, to let user processes reach them with the old numbers, but FS and PM don't know that. All new processes after INIT, even after the generation number wraps around, get endpoint numbers with generation 1 and higher, so the first instances of the boot processes are the only processes ever to have endpoint numbers in the old proc number range. More return code checks of sys_* functions have been added. IS has become endpoint-aware. Ditched the 'text' and 'data' fields in the kernel dump (which show locations, not sizes, so aren't terribly useful) in favour of the endpoint number. Proc number is still visible. Some other dumps (e.g. dmap, rs) show endpoint numbers now too which got the formatting changed. PM reading segments using rw_seg() has changed - it uses other fields in the message now instead of encoding the segment and process number and fd in the fd field. For that it uses _read_pm() and _write_pm() which to _taskcall()s directly in pm/misc.c. 
PM now sys_exit()s itself on panic(), instead of sys_abort(). RS also talks in endpoints instead of process numbers. 2006-03-03 11:20:58 +01:00			`revive(fptr->fp_endpoint, 0);`
Initial revision 2005-04-21 16:53:53 +02:00			`}`
			`}`
			`}`